STAligner integration tutorial
[2]:
import warnings
warnings.filterwarnings("ignore")
import STAligner
# the location of R (used for the mclust clustering)
import os
import rpy2.robjects as robjects
import rpy2.robjects.numpy2ri
from st_loading_utils import load_mHypothalamus, load_DLPFC, load_spacelhBC
import anndata
import scanpy as sc
import pandas as pd
import numpy as np
import scipy.sparse as sp
import scipy.linalg
from sklearn.metrics import adjusted_rand_score as ari_score
import time
import torch
used_device = torch.device('cuda:1' if torch.cuda.is_available() else 'cpu')
iters = 1
save_dir_gt = './'
Mouse Hypothalamus data integration (pair-wise)
[ ]:
"""integration mhypo"""
section_ids_list = [['-0.04', '-0.09'], ['-0.09', '-0.14'], ['-0.14', '-0.19'], ['-0.19', '-0.24']]
run_times = []
for iter_ in range(iters):
for section_ids in section_ids_list:
Batch_list = []
adj_list = []
dataset = section_ids[0] + '_' + section_ids[1]
print(dataset)
start_time = time.time()
for section_id in section_ids:
adata = load_mHypothalamus(root_dir='/home/yunfei/spatial_benchmarking/benchmarking_data/mHypothalamus', section_id=section_id)
adata.var_names_make_unique(join="++")
# make spot name unique
adata.obs_names = [x+'_'+section_id for x in adata.obs_names]
adata.X = sp.csr_matrix(adata.X)
# Constructing the spatial network
STAligner.Cal_Spatial_Net(adata, rad_cutoff=35) # the spatial network are saved in adata.uns[‘adj’]
# STAligner.Stats_Spatial_Net(adata) # plot the number of spatial neighbors
# Normalization
sc.pp.highly_variable_genes(adata, flavor="seurat_v3", n_top_genes=5000)
sc.pp.normalize_total(adata, target_sum=1e4)
sc.pp.log1p(adata)
adata = adata[:, adata.var['highly_variable']]
adj_list.append(adata.uns['adj'])
Batch_list.append(adata)
adata_concat = anndata.concat(Batch_list, label="slice_name", keys=section_ids)
adata_concat.obs['original_clusters'] = adata_concat.obs['original_clusters'].astype('category')
adata_concat.obs["batch_name"] = adata_concat.obs["slice_name"].astype('category')
print('adata_concat.shape: ', adata_concat.shape)
adj_concat = np.asarray(adj_list[0].todense())
for batch_id in range(1,len(section_ids)):
adj_concat = scipy.linalg.block_diag(adj_concat, np.asarray(adj_list[batch_id].todense()))
adata_concat.uns['edgeList'] = np.nonzero(adj_concat)
adata_concat = STAligner.train_STAligner(adata_concat, verbose=True, knn_neigh = 50, device=used_device)
STAligner.mclust_R(adata_concat, num_cluster=7, used_obsm='STAligner')
adata_concat = adata_concat[adata_concat.obs['original_clusters']!='unknown']
end_time = time.time()
run_times.append(end_time - start_time)
print('mclust, ARI = %01.3f' % ari_score(adata_concat.obs['original_clusters'], adata_concat.obs['mclust']))
if not os.path.exists(os.path.join(save_dir_gt, dataset)):
os.makedirs(os.path.join(save_dir_gt, dataset))
# embedding in adata.obsm:key='STAligner'
# print(adata_concat.obs)
# print(adata_concat.obsm['STAligner'])
df_labels = adata_concat.obs[['original_clusters', 'mclust']]
embed_ = adata_concat.obsm['STAligner']
# adata_concat.write_h5ad(os.path.join(save_dir_gt, dataset, 'iter'+str(iter_)+'.h5ad'))
np.save(os.path.join(save_dir_gt, dataset, 'iter'+str(iter_)+'embedding.npy'), embed_)
df_labels.to_csv(os.path.join(save_dir_gt, dataset, 'iter'+str(iter_)+'labels.csv'))
DLPFC data integration (pair-wise)
[ ]:
"""integration dlpfc"""
section_ids_list = [['151507', '151508'], ['151508', '151509'], ['151509', '151510'], ['151673', '151674'], ['151674', '151675'], ['151675', '151676']]
run_times = []
for iter_ in range(iters):
for section_ids in section_ids_list:
Batch_list = []
adj_list = []
dataset = section_ids[0] + '_' + section_ids[1]
print(dataset)
start_time = time.time()
for section_id in section_ids:
adata = load_DLPFC(root_dir='/home/yunfei/spatial_benchmarking/benchmarking_data/DLPFC12/', section_id=section_id)
adata.var_names_make_unique(join="++")
# make spot name unique
adata.obs_names = [x+'_'+section_id for x in adata.obs_names]
adata.X = sp.csr_matrix(adata.X)
# Constructing the spatial network
STAligner.Cal_Spatial_Net(adata, rad_cutoff=150) # the spatial network are saved in adata.uns[‘adj’]
# STAligner.Stats_Spatial_Net(adata) # plot the number of spatial neighbors
# Normalization
sc.pp.highly_variable_genes(adata, flavor="seurat_v3", n_top_genes=5000)
sc.pp.normalize_total(adata, target_sum=1e4)
sc.pp.log1p(adata)
adata = adata[:, adata.var['highly_variable']]
adj_list.append(adata.uns['adj'])
Batch_list.append(adata)
adata_concat = anndata.concat(Batch_list, label="slice_name", keys=section_ids)
adata_concat.obs['original_clusters'] = adata_concat.obs['original_clusters'].astype('category')
adata_concat.obs["batch_name"] = adata_concat.obs["slice_name"].astype('category')
print('adata_concat.shape: ', adata_concat.shape)
adj_concat = np.asarray(adj_list[0].todense())
for batch_id in range(1,len(section_ids)):
adj_concat = scipy.linalg.block_diag(adj_concat, np.asarray(adj_list[batch_id].todense()))
adata_concat.uns['edgeList'] = np.nonzero(adj_concat)
adata_concat = STAligner.train_STAligner(adata_concat, verbose=True, knn_neigh = 50, device=used_device)
STAligner.mclust_R(adata_concat, num_cluster=7, used_obsm='STAligner')
adata_concat = adata_concat[adata_concat.obs['original_clusters']!='unknown']
end_time = time.time()
run_times.append(end_time - start_time)
print('mclust, ARI = %01.3f' % ari_score(adata_concat.obs['original_clusters'], adata_concat.obs['mclust']))
if not os.path.exists(os.path.join(save_dir_gt, dataset)):
os.makedirs(os.path.join(save_dir_gt, dataset))
# embedding in adata.obsm:key='STAligner'
# print(adata_concat.obs)
# print(adata_concat.obsm['STAligner'])
df_labels = adata_concat.obs[['original_clusters', 'mclust']]
embed_ = adata_concat.obsm['STAligner']
# adata_concat.write_h5ad(os.path.join(save_dir_gt, dataset, 'iter'+str(iter_)+'.h5ad'))
np.save(os.path.join(save_dir_gt, dataset, 'iter'+str(iter_)+'embedding.npy'), embed_)
df_labels.to_csv(os.path.join(save_dir_gt, dataset, 'iter'+str(iter_)+'labels.csv'))
section_ids_list = [['151669', '151670'], ['151670', '151671'], ['151671', '151672']]
run_times = []
for iter_ in range(iters):
for section_ids in section_ids_list:
Batch_list = []
adj_list = []
dataset = section_ids[0] + '_' + section_ids[1]
print(dataset)
start_time = time.time()
for section_id in section_ids:
adata = load_DLPFC(root_dir='/home/yunfei/spatial_benchmarking/benchmarking_data/DLPFC12/', section_id=section_id)
adata.var_names_make_unique(join="++")
# make spot name unique
adata.obs_names = [x+'_'+section_id for x in adata.obs_names]
adata.X = sp.csr_matrix(adata.X)
# Constructing the spatial network
STAligner.Cal_Spatial_Net(adata, rad_cutoff=150) # the spatial network are saved in adata.uns[‘adj’]
# STAligner.Stats_Spatial_Net(adata) # plot the number of spatial neighbors
# Normalization
sc.pp.highly_variable_genes(adata, flavor="seurat_v3", n_top_genes=5000)
sc.pp.normalize_total(adata, target_sum=1e4)
sc.pp.log1p(adata)
adata = adata[:, adata.var['highly_variable']]
adj_list.append(adata.uns['adj'])
Batch_list.append(adata)
adata_concat = anndata.concat(Batch_list, label="slice_name", keys=section_ids)
adata_concat.obs['original_clusters'] = adata_concat.obs['original_clusters'].astype('category')
adata_concat.obs["batch_name"] = adata_concat.obs["slice_name"].astype('category')
print('adata_concat.shape: ', adata_concat.shape)
adj_concat = np.asarray(adj_list[0].todense())
for batch_id in range(1,len(section_ids)):
adj_concat = scipy.linalg.block_diag(adj_concat, np.asarray(adj_list[batch_id].todense()))
adata_concat.uns['edgeList'] = np.nonzero(adj_concat)
adata_concat = STAligner.train_STAligner(adata_concat, verbose=True, knn_neigh = 50, device=used_device)
STAligner.mclust_R(adata_concat, num_cluster=7, used_obsm='STAligner')
adata_concat = adata_concat[adata_concat.obs['original_clusters']!='unknown']
end_time = time.time()
run_times.append(end_time - start_time)
print('mclust, ARI = %01.3f' % ari_score(adata_concat.obs['original_clusters'], adata_concat.obs['mclust']))
if not os.path.exists(os.path.join(save_dir_gt, dataset)):
os.makedirs(os.path.join(save_dir_gt, dataset))
# embedding in adata.obsm:key='STAligner'
# print(adata_concat.obs)
# print(adata_concat.obsm['STAligner'])
df_labels = adata_concat.obs[['original_clusters', 'mclust']]
embed_ = adata_concat.obsm['STAligner']
# adata_concat.write_h5ad(os.path.join(save_dir_gt, dataset, 'iter'+str(iter_)+'.h5ad'))
np.save(os.path.join(save_dir_gt, dataset, 'iter'+str(iter_)+'embedding.npy'), embed_)
df_labels.to_csv(os.path.join(save_dir_gt, dataset, 'iter'+str(iter_)+'labels.csv'))
Mouse Hypothalamus data integration (multi-slices)
[ ]:
section_ids_list = [['-0.04', '-0.09', '-0.14', '-0.19', '-0.24']]
run_times = []
for iter_ in range(iters):
for section_ids in section_ids_list:
Batch_list = []
adj_list = []
dataset = section_ids[0] + '_' + section_ids[1] + '_' + section_ids[2] + '_' + section_ids[3] + '_' + section_ids[4]
print(dataset)
start_time = time.time()
for section_id in section_ids:
adata = load_mHypothalamus(root_dir='/home/yunfei/spatial_benchmarking/benchmarking_data/mHypothalamus', section_id=section_id)
adata.var_names_make_unique(join="++")
# make spot name unique
adata.obs_names = [x+'_'+section_id for x in adata.obs_names]
adata.X = sp.csr_matrix(adata.X)
# Constructing the spatial network
STAligner.Cal_Spatial_Net(adata, rad_cutoff=35) # the spatial network are saved in adata.uns[‘adj’d]
# STAligner.Stats_Spatial_Net(adata) # plot the number of spatial neighbors
# Normalization
sc.pp.highly_variable_genes(adata, flavor="seurat_v3", n_top_genes=5000)
sc.pp.normalize_total(adata, target_sum=1e4)
sc.pp.log1p(adata)
adata = adata[:, adata.var['highly_variable']]
adj_list.append(adata.uns['adj'])
Batch_list.append(adata)
adata_concat = anndata.concat(Batch_list, label="slice_name", keys=section_ids)
adata_concat.obs['original_clusters'] = adata_concat.obs['original_clusters'].astype('category')
adata_concat.obs["batch_name"] = adata_concat.obs["slice_name"].astype('category')
print('adata_concat.shape: ', adata_concat.shape)
adj_concat = np.asarray(adj_list[0].todense())
for batch_id in range(1,len(section_ids)):
adj_concat = scipy.linalg.block_diag(adj_concat, np.asarray(adj_list[batch_id].todense()))
adata_concat.uns['edgeList'] = np.nonzero(adj_concat)
adata_concat = STAligner.train_STAligner(adata_concat, verbose=True, knn_neigh = 50, device=used_device)
STAligner.mclust_R(adata_concat, num_cluster=7, used_obsm='STAligner')
adata_concat = adata_concat[adata_concat.obs['original_clusters']!='unknown']
end_time = time.time()
run_times.append(end_time - start_time)
print('mclust, ARI = %01.3f' % ari_score(adata_concat.obs['original_clusters'], adata_concat.obs['mclust']))
if not os.path.exists(os.path.join(save_dir_gt, dataset)):
os.makedirs(os.path.join(save_dir_gt, dataset))
# embedding in adata.obsm:key='STAligner'
# print(adata_concat.obs)
# print(adata_concat.obsm['STAligner'])
df_labels = adata_concat.obs[['original_clusters', 'mclust']]
embed_ = adata_concat.obsm['STAligner']
# adata_concat.write_h5ad(os.path.join(save_dir_gt, dataset, 'iter'+str(iter_)+'.h5ad'))
np.save(os.path.join(save_dir_gt, dataset, 'iter'+str(iter_)+'embedding.npy'), embed_)
df_labels.to_csv(os.path.join(save_dir_gt, dataset, 'iter'+str(iter_)+'labels.csv'))
DLPFC data integration (multi-slices)
[ ]:
section_ids_list = [['151507', '151508', '151509', '151510']]
run_times = []
for iter_ in range(iters):
for section_ids in section_ids_list:
Batch_list = []
adj_list = []
dataset = section_ids[0] + '_' + section_ids[1] + '_' + section_ids[2] + '_' + section_ids[3]
print(dataset)
start_time = time.time()
for section_id in section_ids:
adata = load_DLPFC(root_dir='/home/yunfei/spatial_benchmarking/benchmarking_data/DLPFC12/', section_id=section_id)
adata.var_names_make_unique(join="++")
# make spot name unique
adata.obs_names = [x+'_'+section_id for x in adata.obs_names]
adata.X = sp.csr_matrix(adata.X)
# Constructing the spatial network
STAligner.Cal_Spatial_Net(adata, rad_cutoff=150) # the spatial network are saved in adata.uns[‘adj’]
# STAligner.Stats_Spatial_Net(adata) # plot the number of spatial neighbors
# Normalization
sc.pp.highly_variable_genes(adata, flavor="seurat_v3", n_top_genes=5000)
sc.pp.normalize_total(adata, target_sum=1e4)
sc.pp.log1p(adata)
adata = adata[:, adata.var['highly_variable']]
adj_list.append(adata.uns['adj'])
Batch_list.append(adata)
adata_concat = anndata.concat(Batch_list, label="slice_name", keys=section_ids)
adata_concat.obs['original_clusters'] = adata_concat.obs['original_clusters'].astype('category')
adata_concat.obs["batch_name"] = adata_concat.obs["slice_name"].astype('category')
print('adata_concat.shape: ', adata_concat.shape)
adj_concat = np.asarray(adj_list[0].todense())
for batch_id in range(1,len(section_ids)):
adj_concat = scipy.linalg.block_diag(adj_concat, np.asarray(adj_list[batch_id].todense()))
adata_concat.uns['edgeList'] = np.nonzero(adj_concat)
adata_concat = STAligner.train_STAligner(adata_concat, verbose=True, knn_neigh = 50, device=used_device)
STAligner.mclust_R(adata_concat, num_cluster=7, used_obsm='STAligner')
adata_concat = adata_concat[adata_concat.obs['original_clusters']!='unknown']
end_time = time.time()
run_times.append(end_time - start_time)
print('mclust, ARI = %01.3f' % ari_score(adata_concat.obs['original_clusters'], adata_concat.obs['mclust']))
if not os.path.exists(os.path.join(save_dir_gt, dataset)):
os.makedirs(os.path.join(save_dir_gt, dataset))
# embedding in adata.obsm:key='STAligner'
# print(adata_concat.obs)
# print(adata_concat.obsm['STAligner'])
df_labels = adata_concat.obs[['original_clusters', 'mclust']]
embed_ = adata_concat.obsm['STAligner']
# adata_concat.write_h5ad(os.path.join(save_dir_gt, dataset, 'iter'+str(iter_)+'.h5ad'))
np.save(os.path.join(save_dir_gt, dataset, 'iter'+str(iter_)+'embedding.npy'), embed_)
df_labels.to_csv(os.path.join(save_dir_gt, dataset, 'iter'+str(iter_)+'labels.csv'))