Data simulation
The following code was modified based on the simulation method described in the paper:
PASTE: Ron Zeira, Max Land, Alexander Strzalkowski, and Benjamin J. Raphael. “Alignment and integration of spatial transcriptomics data”. Nature Methods (2022). https://doi.org/10.1038/s41592-022-01459-6
[54]:
import math
import random
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib import style
import matplotlib as mpl
import scanpy as sc
import anndata
from sklearn.decomposition import NMF
import scipy
import os
import os.path as osp
mpl.rcParams['pdf.fonttype'] = 42 # for pdf to be editable
style.use('seaborn-dark')
mpl.rc('xtick', labelsize=14)
mpl.rc('ytick', labelsize=14)
/tmp/ipykernel_230457/3288172430.py:17: MatplotlibDeprecationWarning: The seaborn styles shipped by Matplotlib are deprecated since 3.6, as they no longer correspond to the styles shipped by seaborn. However, they will remain available as 'seaborn-v0_8-<style>'. Alternatively, directly use the seaborn API instead.
style.use('seaborn-dark')
Simulate Spatial Noise
Define Simulation functions
Generate grid points
Takes a DLPFC slice, rotates by angle. Next, we map all new coordinates to closest grid points.
This removes some points as some points move out from the grid or two points who share the same closest grid points will map to same coordinate.
[55]:
def rotate_spots(grid,spots,theta=0,translation=0,center_correction=0,figsize=(5,5),plot=True):
grid = grid.copy() + center_correction
spots = spots.copy() + center_correction
print(f"The number of the original spots {len(spots):.2f}")
R = np.array([[np.cos(theta),-np.sin(theta)],[np.sin(theta),np.cos(theta)]])
rotated_spots = np.array([R.dot(spots[i]) for i in range(len(spots))])
rotated_spots += translation
new_spots = grid[np.argmin(scipy.spatial.distance.cdist(rotated_spots,grid),axis=1)]
#print(len(new_spots))
grid -= center_correction
spots -= center_correction
rotated_spots -= center_correction
new_spots -= center_correction
seen = {}
mapping = []
for i in range(len(new_spots)):
if tuple(new_spots[i]) in seen: continue
seen[tuple(new_spots[i])] = 1
mapping.append(i)
print(f"The number of new spots {len(mapping):.2f}")
if plot:
fig = plt.figure(figsize=figsize) #s=100
sns.scatterplot(x = grid[:,0],y = grid[:,1],linewidth=0,s=50, marker=".",alpha=0.2,color='blue')
sns.scatterplot(x = rotated_spots[:,0],y = rotated_spots[:,1],linewidth=0,s=50, marker=".",color='red')
sns.scatterplot(x = new_spots[:,0],y = new_spots[:,1],linewidth=0,s=50, marker=".",color='green')
# plt.show()
return new_spots,mapping
#Generate points that are aligned in alternating rows in grid for DLPFC data
def generate_offset_grid(rows, cols, spacing, offset, x_range, y_range):
grid = []
for row in range(rows):
for col in range(cols):
x = x_range[0] + col * spacing*2+ (row % 2) * spacing
y = y_range[0] +row * offset
grid.append([x, y])
return grid
def euclidean_distance(point1, point2):
x1, y1 = point1
x2, y2 = point2
return math.sqrt((x2 - x1)**2 + (y2 - y1)**2)
def minimal_internal_space(points):
min_distance = float('inf') # Initialize with a large value
n = len(points)
for i in range(n):
for j in range(i + 1, n):
distance = euclidean_distance(points[i], points[j])
min_distance = min(min_distance, distance)
return min_distance
def simulate_spatial(adata, rotation_angle):
adata_sim = adata.copy()
min_space = minimal_internal_space(adata.obsm['spatial'])
print(f"The minimal internal space between any two adjacent points is {min_space:.2f}")
# generate grid
x_coordinates = adata.obsm['spatial'][:, 0] # Assuming x is in the first column
y_coordinates = adata.obsm['spatial'][:, 1] # Assuming y is in the second column
min_x = np.min(x_coordinates)
min_y = np.min(y_coordinates)
max_x = np.max(x_coordinates)
max_y = np.max(y_coordinates)
# print(min_x)
# print(max_x)
# print(min_y)
# print(max_y)
x_range = [min_x, max_x]
y_range = [min_y, max_y]
spacing = min_space/2 # this is the half of the distance between any two adjacent grid points
offset = min_space*math.sqrt(3)/2 # this is the y coordinate difference between any two verticallly adjacent points
rows = round((y_range[1]-y_range[0])/offset)+1
cols = round((x_range[1]-x_range[0])/(spacing*2))+1
grid = generate_offset_grid(rows, cols, spacing, offset, x_range, y_range)
layer_grid = np.array(grid)
#print(adata_sim.obsm['spatial'])
#print(layer_grid)
new_spots, mappings = rotate_spots(layer_grid, adata.obsm['spatial'], center_correction=0, theta= rotation_angle, plot=False)
adata_sim.obsm['spatial'] = new_spots
return adata_sim[mappings, :], mappings
Simulate Gene Expression
[56]:
def simulate_gene_exp(adata, pc = 0.25, factor = 1):
"""
Adds pertubations to gene expression data. The rows are simulated according to a Multinomial distribution,
with the total counts per spot drawn from a Negative Binomial Distribution.
param: pc- Pseudocount to be added to dataframe
param: factor - amount by which we scale the variance (to increase noise)
"""
adata_sim = adata.copy()
# for large dataset
df = pd.DataFrame(adata_sim.X.toarray())
# for small dataset
#df = pd.DataFrame(adata_sim.X)
# add pseudocounts
alpha = df.copy().to_numpy() + pc
# get vector of total counts per spot
n = df.sum(axis=1).to_numpy()
# Simulate total counts using negative binomial
mean = np.mean(n)
var = np.var(n)*factor
n = sample_nb(mean, var, len(n)).astype(int)
# Reassign zero counts so we don't divide by 0 in future calcuation
n[n == 0] = 1
# convert to float
alpha = np.array(alpha, dtype=np.float64)
n = np.array(n, dtype=np.float64)
# convert rows to unit vectors
alpha = alpha/alpha.sum(axis=1)[:, None]
dist = np.empty(df.shape)
for i in range(alpha.shape[0]):
dist[i] = np.random.multinomial(n[i], alpha[i])
new_df = pd.DataFrame(dist, index= df.index, columns= df.columns)
adata_sim.X = new_df
return adata_sim
def sample_nb(m, v, n = 1):
"""
param: m - mean
param: v - variance
param: n - number of samples
return: random sample from negative binomial distribution
"""
r = m**2/(v - m)
p = m/v
samples = np.random.negative_binomial(r, p, n)
return samples
Output simulated data
[57]:
def simulate_out(adata_layer, pseudocounts, slice_name, output):
#adata_layer_sim_spatial, mappings = simulate_spatial(adata_layer, math.pi/17.5) #% 80%
#adata_layer_sim_spatial, mappings = simulate_spatial(adata_layer, math.pi/10.3) #60%
#adata_layer_sim_spatial, mappings = simulate_spatial(adata_layer, math.pi/6.8) #40%
adata_layer_sim_spatial, mappings = simulate_spatial(adata_layer, math.pi/4.7) #20%
#adata_layer_sim_spatial, mappings = simulate_spatial(adata_layer, math.pi/17.5) #% 80%
decimal_format = '{:.6f}'
mm = round(adata_layer_sim_spatial.shape[0]/adata_layer.shape[0]*100)
print(f"We keep {mm}% of the original spots")
np.save(osp.join(output, slice_name+f'_overlap={mm}%'+'mappings'), mappings)
adata_layer.write_h5ad(
osp.join(output, slice_name+'original.h5'),
)
sns.scatterplot(x = adata_layer.obsm['spatial'][:,0],y = adata_layer.obsm['spatial'][:,1], label='DLPFC151673', linewidth=0,s=80, marker=".",alpha=0.2,color='blue')
sns.scatterplot(x = adata_layer_sim_spatial.obsm['spatial'][:,0],y = adata_layer_sim_spatial.obsm['spatial'][:,1], label=f'overlap={mm}%',linewidth=0,s=80, marker=".",alpha=0.2,color='red')
print(adata_layer[adata_layer.obs['original_clusters']=='1'])
plt.xticks([])
plt.yticks([])
legend=plt.legend(bbox_to_anchor=(1.005, 0.5), loc='center left', prop={'size': 8})
#plt.show()
figure_path = osp.join(output, slice_name+f'_overlap={mm}%'+'_fig.pdf')
plt.savefig(figure_path, bbox_inches='tight')
# save the correspoinding spot index
rows = adata_layer.shape[0]
columns = adata_layer_sim_spatial.shape[0]
mappings_list = [[0] * columns for _ in range(rows)]
for i in range(len(mappings)):
mappings_list[mappings[i]][i] = 1
pd.DataFrame(mappings_list).to_csv(output + '/'+f'_overlap={mm}%'+'mapping_matrix_ground_truth.csv')
# row_sums = [sum(row) for row in mappings_list]
# print("Sum of each row:", row_sums)
# print(max(row_sums))
# print(min(row_sums))
# because we are varying pseudocounts, want to resimulate gene expression
for p in pseudocounts:
if p == 0:
adata_layer_sim_spatial.write_h5ad(
osp.join(output, slice_name+f'_overlap={mm}%'+f'_pseudocount_0.h5'),
)
else:
adata_layer_sim_both = adata_layer_sim_spatial.copy()
adata_layer_sim_both = simulate_gene_exp(adata_layer_sim_both, pc = p)
adata_layer_sim_both.write_h5ad(
osp.join(output, slice_name+f'_overlap={mm}%'+f'_pseudocount_{p}.h5'),
)
return p, mm, adata_layer_sim_both
Run Experiment
Read in Data
[58]:
def load_DLPFC(root_dir, section_id):
# 151507, ..., 151676
# 12 in total
ad = sc.read_visium(path=os.path.join(root_dir, section_id), count_file=section_id+'_filtered_feature_bc_matrix.h5')
ad.var_names_make_unique()
gt_dir = os.path.join(root_dir, section_id, 'gt')
gt_df = pd.read_csv(os.path.join(gt_dir, 'tissue_positions_list_GTs.txt'), sep=',', header=None, index_col=0)
ad.obs['original_clusters'] = gt_df.loc[:, 6]
keep_bcs = ad.obs.dropna().index
ad = ad[keep_bcs].copy()
ad.obs['original_clusters'] = ad.obs['original_clusters'].astype(int).astype(str)
return ad
[59]:
DLPFC_layer_151673= load_DLPFC(root_dir='/home/xiem6/0Data/DLPFC12', section_id='151673')
/home/xiem6/0Virtual_Environment/F_MaskGraphene/lib/python3.9/site-packages/anndata/_core/anndata.py:1840: UserWarning: Variable names are not unique. To make them unique, call `.var_names_make_unique`.
utils.warn_names_duplicates("var")
/home/xiem6/0Virtual_Environment/F_MaskGraphene/lib/python3.9/site-packages/anndata/_core/anndata.py:1113: FutureWarning: is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead
if not is_categorical_dtype(df_full[k]):
/home/xiem6/0Virtual_Environment/F_MaskGraphene/lib/python3.9/site-packages/anndata/_core/anndata.py:1840: UserWarning: Variable names are not unique. To make them unique, call `.var_names_make_unique`.
utils.warn_names_duplicates("var")
/home/xiem6/0Virtual_Environment/F_MaskGraphene/lib/python3.9/site-packages/anndata/_core/anndata.py:1113: FutureWarning: is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead
if not is_categorical_dtype(df_full[k]):
Results
[60]:
slices = {
"DLPFC_151673" : DLPFC_layer_151673
}
[61]:
# Number of runs per experiment
N_RUNS = 1
def my_range(start, num_elements, step):
return range(start, start+step*num_elements, step)
pseudocounts = list(np.arange(0, 3.5, 0.5))
#pseudocounts = list(np.arange(0, 2, 1))
print(pseudocounts)
[0.0, 0.5, 1.0, 1.5, 2.0, 2.5, 3.0]
Plot simulated data
[62]:
outPut='/home/xiem6/0Projects/Benchmark_STdata/tutorials/'
for slice_name, adata in slices.items():
p, mm, adata_layer_sim_both_final = simulate_out(adata, pseudocounts, slice_name, output = outPut)
The minimal internal space between any two adjacent points is 137.00
The number of the original spots 3611.00
The number of new spots 739.00
We keep 20% of the original spots
/home/xiem6/0Virtual_Environment/F_MaskGraphene/lib/python3.9/site-packages/anndata/_core/anndata.py:1113: FutureWarning: is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead
if not is_categorical_dtype(df_full[k]):
/home/xiem6/0Virtual_Environment/F_MaskGraphene/lib/python3.9/site-packages/seaborn/_oldcore.py:1498: FutureWarning: is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead
if pd.api.types.is_categorical_dtype(vector):
/home/xiem6/0Virtual_Environment/F_MaskGraphene/lib/python3.9/site-packages/seaborn/_oldcore.py:1498: FutureWarning: is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead
if pd.api.types.is_categorical_dtype(vector):
/home/xiem6/0Virtual_Environment/F_MaskGraphene/lib/python3.9/site-packages/seaborn/_oldcore.py:1498: FutureWarning: is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead
if pd.api.types.is_categorical_dtype(vector):
/home/xiem6/0Virtual_Environment/F_MaskGraphene/lib/python3.9/site-packages/seaborn/_oldcore.py:1498: FutureWarning: is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead
if pd.api.types.is_categorical_dtype(vector):
/home/xiem6/0Virtual_Environment/F_MaskGraphene/lib/python3.9/site-packages/anndata/_core/anndata.py:1113: FutureWarning: is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead
if not is_categorical_dtype(df_full[k]):
View of AnnData object with n_obs × n_vars = 273 × 33538
obs: 'in_tissue', 'array_row', 'array_col', 'original_clusters'
var: 'gene_ids', 'feature_types', 'genome'
uns: 'spatial'
obsm: 'spatial'
/home/xiem6/0Virtual_Environment/F_MaskGraphene/lib/python3.9/site-packages/anndata/_core/anndata.py:1230: ImplicitModificationWarning: Trying to modify attribute `.obs` of view, initializing view as actual.
df[key] = c
/home/xiem6/0Virtual_Environment/F_MaskGraphene/lib/python3.9/site-packages/anndata/_core/anndata.py:1230: ImplicitModificationWarning: Trying to modify attribute `.var` of view, initializing view as actual.
df[key] = c
/home/xiem6/0Virtual_Environment/F_MaskGraphene/lib/python3.9/site-packages/anndata/_core/anndata.py:1230: ImplicitModificationWarning: Trying to modify attribute `.var` of view, initializing view as actual.
df[key] = c
[63]:
plt.savefig('spatial.png')
<Figure size 640x480 with 0 Axes>