STAligner integration tutorial

[2]:
import warnings
warnings.filterwarnings("ignore")

import STAligner

# the location of R (used for the mclust clustering)
import os
import rpy2.robjects as robjects
import rpy2.robjects.numpy2ri

from st_loading_utils import load_mHypothalamus, load_DLPFC, load_spacelhBC
import anndata
import scanpy as sc
import pandas as pd
import numpy as np
import scipy.sparse as sp
import scipy.linalg
from sklearn.metrics import adjusted_rand_score as ari_score
import time
import torch
used_device = torch.device('cuda:1' if torch.cuda.is_available() else 'cpu')
iters = 1

save_dir_gt = './'

Mouse Hypothalamus data integration (pair-wise)

[ ]:
"""integration mhypo"""

section_ids_list = [['-0.04', '-0.09'], ['-0.09', '-0.14'], ['-0.14', '-0.19'], ['-0.19', '-0.24']]
run_times = []
for iter_ in range(iters):
    for section_ids in section_ids_list:
        Batch_list = []
        adj_list = []
        dataset = section_ids[0] + '_' + section_ids[1]
        print(dataset)
        start_time = time.time()
        for section_id in section_ids:
            adata = load_mHypothalamus(root_dir='/home/yunfei/spatial_benchmarking/benchmarking_data/mHypothalamus', section_id=section_id)
            adata.var_names_make_unique(join="++")

            # make spot name unique
            adata.obs_names = [x+'_'+section_id for x in adata.obs_names]

            adata.X = sp.csr_matrix(adata.X)

            # Constructing the spatial network
            STAligner.Cal_Spatial_Net(adata, rad_cutoff=35) # the spatial network are saved in adata.uns[‘adj’]
            # STAligner.Stats_Spatial_Net(adata) # plot the number of spatial neighbors

            # Normalization
            sc.pp.highly_variable_genes(adata, flavor="seurat_v3", n_top_genes=5000)
            sc.pp.normalize_total(adata, target_sum=1e4)
            sc.pp.log1p(adata)
            adata = adata[:, adata.var['highly_variable']]

            adj_list.append(adata.uns['adj'])
            Batch_list.append(adata)

        adata_concat = anndata.concat(Batch_list, label="slice_name", keys=section_ids)
        adata_concat.obs['original_clusters'] = adata_concat.obs['original_clusters'].astype('category')
        adata_concat.obs["batch_name"] = adata_concat.obs["slice_name"].astype('category')
        print('adata_concat.shape: ', adata_concat.shape)

        adj_concat = np.asarray(adj_list[0].todense())
        for batch_id in range(1,len(section_ids)):
            adj_concat = scipy.linalg.block_diag(adj_concat, np.asarray(adj_list[batch_id].todense()))
        adata_concat.uns['edgeList'] = np.nonzero(adj_concat)


        adata_concat = STAligner.train_STAligner(adata_concat, verbose=True, knn_neigh = 50, device=used_device)
        STAligner.mclust_R(adata_concat, num_cluster=7, used_obsm='STAligner')
        adata_concat = adata_concat[adata_concat.obs['original_clusters']!='unknown']
        end_time = time.time()
        run_times.append(end_time - start_time)

        print('mclust, ARI = %01.3f' % ari_score(adata_concat.obs['original_clusters'], adata_concat.obs['mclust']))
        if not os.path.exists(os.path.join(save_dir_gt, dataset)):
            os.makedirs(os.path.join(save_dir_gt, dataset))
        # embedding in adata.obsm:key='STAligner'
        # print(adata_concat.obs)
        # print(adata_concat.obsm['STAligner'])
        df_labels = adata_concat.obs[['original_clusters', 'mclust']]
        embed_ = adata_concat.obsm['STAligner']
        # adata_concat.write_h5ad(os.path.join(save_dir_gt, dataset, 'iter'+str(iter_)+'.h5ad'))
        np.save(os.path.join(save_dir_gt, dataset, 'iter'+str(iter_)+'embedding.npy'), embed_)
        df_labels.to_csv(os.path.join(save_dir_gt, dataset, 'iter'+str(iter_)+'labels.csv'))

DLPFC data integration (pair-wise)

[ ]:
"""integration dlpfc"""

section_ids_list = [['151507', '151508'], ['151508', '151509'], ['151509', '151510'], ['151673', '151674'], ['151674', '151675'], ['151675', '151676']]

run_times = []
for iter_ in range(iters):
    for section_ids in section_ids_list:
        Batch_list = []
        adj_list = []
        dataset = section_ids[0] + '_' + section_ids[1]
        print(dataset)
        start_time = time.time()
        for section_id in section_ids:
            adata = load_DLPFC(root_dir='/home/yunfei/spatial_benchmarking/benchmarking_data/DLPFC12/', section_id=section_id)
            adata.var_names_make_unique(join="++")

            # make spot name unique
            adata.obs_names = [x+'_'+section_id for x in adata.obs_names]

            adata.X = sp.csr_matrix(adata.X)

            # Constructing the spatial network
            STAligner.Cal_Spatial_Net(adata, rad_cutoff=150) # the spatial network are saved in adata.uns[‘adj’]
            # STAligner.Stats_Spatial_Net(adata) # plot the number of spatial neighbors

            # Normalization
            sc.pp.highly_variable_genes(adata, flavor="seurat_v3", n_top_genes=5000)
            sc.pp.normalize_total(adata, target_sum=1e4)
            sc.pp.log1p(adata)
            adata = adata[:, adata.var['highly_variable']]

            adj_list.append(adata.uns['adj'])
            Batch_list.append(adata)

        adata_concat = anndata.concat(Batch_list, label="slice_name", keys=section_ids)
        adata_concat.obs['original_clusters'] = adata_concat.obs['original_clusters'].astype('category')
        adata_concat.obs["batch_name"] = adata_concat.obs["slice_name"].astype('category')
        print('adata_concat.shape: ', adata_concat.shape)

        adj_concat = np.asarray(adj_list[0].todense())
        for batch_id in range(1,len(section_ids)):
            adj_concat = scipy.linalg.block_diag(adj_concat, np.asarray(adj_list[batch_id].todense()))
        adata_concat.uns['edgeList'] = np.nonzero(adj_concat)


        adata_concat = STAligner.train_STAligner(adata_concat, verbose=True, knn_neigh = 50, device=used_device)
        STAligner.mclust_R(adata_concat, num_cluster=7, used_obsm='STAligner')
        adata_concat = adata_concat[adata_concat.obs['original_clusters']!='unknown']
        end_time = time.time()
        run_times.append(end_time - start_time)

        print('mclust, ARI = %01.3f' % ari_score(adata_concat.obs['original_clusters'], adata_concat.obs['mclust']))
        if not os.path.exists(os.path.join(save_dir_gt, dataset)):
            os.makedirs(os.path.join(save_dir_gt, dataset))
        # embedding in adata.obsm:key='STAligner'
        # print(adata_concat.obs)
        # print(adata_concat.obsm['STAligner'])
        df_labels = adata_concat.obs[['original_clusters', 'mclust']]
        embed_ = adata_concat.obsm['STAligner']
        # adata_concat.write_h5ad(os.path.join(save_dir_gt, dataset, 'iter'+str(iter_)+'.h5ad'))
        np.save(os.path.join(save_dir_gt, dataset, 'iter'+str(iter_)+'embedding.npy'), embed_)
        df_labels.to_csv(os.path.join(save_dir_gt, dataset, 'iter'+str(iter_)+'labels.csv'))

section_ids_list = [['151669', '151670'], ['151670', '151671'], ['151671', '151672']]

run_times = []
for iter_ in range(iters):
    for section_ids in section_ids_list:
        Batch_list = []
        adj_list = []
        dataset = section_ids[0] + '_' + section_ids[1]
        print(dataset)
        start_time = time.time()
        for section_id in section_ids:
            adata = load_DLPFC(root_dir='/home/yunfei/spatial_benchmarking/benchmarking_data/DLPFC12/', section_id=section_id)
            adata.var_names_make_unique(join="++")

            # make spot name unique
            adata.obs_names = [x+'_'+section_id for x in adata.obs_names]

            adata.X = sp.csr_matrix(adata.X)

            # Constructing the spatial network
            STAligner.Cal_Spatial_Net(adata, rad_cutoff=150) # the spatial network are saved in adata.uns[‘adj’]
            # STAligner.Stats_Spatial_Net(adata) # plot the number of spatial neighbors

            # Normalization
            sc.pp.highly_variable_genes(adata, flavor="seurat_v3", n_top_genes=5000)
            sc.pp.normalize_total(adata, target_sum=1e4)
            sc.pp.log1p(adata)
            adata = adata[:, adata.var['highly_variable']]

            adj_list.append(adata.uns['adj'])
            Batch_list.append(adata)

        adata_concat = anndata.concat(Batch_list, label="slice_name", keys=section_ids)
        adata_concat.obs['original_clusters'] = adata_concat.obs['original_clusters'].astype('category')
        adata_concat.obs["batch_name"] = adata_concat.obs["slice_name"].astype('category')
        print('adata_concat.shape: ', adata_concat.shape)

        adj_concat = np.asarray(adj_list[0].todense())
        for batch_id in range(1,len(section_ids)):
            adj_concat = scipy.linalg.block_diag(adj_concat, np.asarray(adj_list[batch_id].todense()))
        adata_concat.uns['edgeList'] = np.nonzero(adj_concat)


        adata_concat = STAligner.train_STAligner(adata_concat, verbose=True, knn_neigh = 50, device=used_device)
        STAligner.mclust_R(adata_concat, num_cluster=7, used_obsm='STAligner')
        adata_concat = adata_concat[adata_concat.obs['original_clusters']!='unknown']
        end_time = time.time()
        run_times.append(end_time - start_time)

        print('mclust, ARI = %01.3f' % ari_score(adata_concat.obs['original_clusters'], adata_concat.obs['mclust']))
        if not os.path.exists(os.path.join(save_dir_gt, dataset)):
            os.makedirs(os.path.join(save_dir_gt, dataset))
        # embedding in adata.obsm:key='STAligner'
        # print(adata_concat.obs)
        # print(adata_concat.obsm['STAligner'])
        df_labels = adata_concat.obs[['original_clusters', 'mclust']]
        embed_ = adata_concat.obsm['STAligner']
        # adata_concat.write_h5ad(os.path.join(save_dir_gt, dataset, 'iter'+str(iter_)+'.h5ad'))
        np.save(os.path.join(save_dir_gt, dataset, 'iter'+str(iter_)+'embedding.npy'), embed_)
        df_labels.to_csv(os.path.join(save_dir_gt, dataset, 'iter'+str(iter_)+'labels.csv'))

Mouse Hypothalamus data integration (multi-slices)

[ ]:
section_ids_list = [['-0.04', '-0.09', '-0.14', '-0.19', '-0.24']]
run_times = []
for iter_ in range(iters):
    for section_ids in section_ids_list:
        Batch_list = []
        adj_list = []
        dataset = section_ids[0] + '_' + section_ids[1] + '_' + section_ids[2] + '_' + section_ids[3] + '_' + section_ids[4]
        print(dataset)
        start_time = time.time()
        for section_id in section_ids:
            adata = load_mHypothalamus(root_dir='/home/yunfei/spatial_benchmarking/benchmarking_data/mHypothalamus', section_id=section_id)
            adata.var_names_make_unique(join="++")

            # make spot name unique
            adata.obs_names = [x+'_'+section_id for x in adata.obs_names]

            adata.X = sp.csr_matrix(adata.X)

            # Constructing the spatial network
            STAligner.Cal_Spatial_Net(adata, rad_cutoff=35) # the spatial network are saved in adata.uns[‘adj’d]
            # STAligner.Stats_Spatial_Net(adata) # plot the number of spatial neighbors

            # Normalization
            sc.pp.highly_variable_genes(adata, flavor="seurat_v3", n_top_genes=5000)
            sc.pp.normalize_total(adata, target_sum=1e4)
            sc.pp.log1p(adata)
            adata = adata[:, adata.var['highly_variable']]

            adj_list.append(adata.uns['adj'])
            Batch_list.append(adata)

        adata_concat = anndata.concat(Batch_list, label="slice_name", keys=section_ids)
        adata_concat.obs['original_clusters'] = adata_concat.obs['original_clusters'].astype('category')
        adata_concat.obs["batch_name"] = adata_concat.obs["slice_name"].astype('category')
        print('adata_concat.shape: ', adata_concat.shape)

        adj_concat = np.asarray(adj_list[0].todense())
        for batch_id in range(1,len(section_ids)):
            adj_concat = scipy.linalg.block_diag(adj_concat, np.asarray(adj_list[batch_id].todense()))
        adata_concat.uns['edgeList'] = np.nonzero(adj_concat)


        adata_concat = STAligner.train_STAligner(adata_concat, verbose=True, knn_neigh = 50, device=used_device)
        STAligner.mclust_R(adata_concat, num_cluster=7, used_obsm='STAligner')
        adata_concat = adata_concat[adata_concat.obs['original_clusters']!='unknown']
        end_time = time.time()
        run_times.append(end_time - start_time)

        print('mclust, ARI = %01.3f' % ari_score(adata_concat.obs['original_clusters'], adata_concat.obs['mclust']))
        if not os.path.exists(os.path.join(save_dir_gt, dataset)):
            os.makedirs(os.path.join(save_dir_gt, dataset))
        # embedding in adata.obsm:key='STAligner'
        # print(adata_concat.obs)
        # print(adata_concat.obsm['STAligner'])
        df_labels = adata_concat.obs[['original_clusters', 'mclust']]
        embed_ = adata_concat.obsm['STAligner']
        # adata_concat.write_h5ad(os.path.join(save_dir_gt, dataset, 'iter'+str(iter_)+'.h5ad'))
        np.save(os.path.join(save_dir_gt, dataset, 'iter'+str(iter_)+'embedding.npy'), embed_)
        df_labels.to_csv(os.path.join(save_dir_gt, dataset, 'iter'+str(iter_)+'labels.csv'))

DLPFC data integration (multi-slices)

[ ]:
section_ids_list = [['151507', '151508', '151509', '151510']]
run_times = []
for iter_ in range(iters):
    for section_ids in section_ids_list:
        Batch_list = []
        adj_list = []
        dataset = section_ids[0] + '_' + section_ids[1] + '_' + section_ids[2] + '_' + section_ids[3]
        print(dataset)
        start_time = time.time()
        for section_id in section_ids:
            adata = load_DLPFC(root_dir='/home/yunfei/spatial_benchmarking/benchmarking_data/DLPFC12/', section_id=section_id)
            adata.var_names_make_unique(join="++")

            # make spot name unique
            adata.obs_names = [x+'_'+section_id for x in adata.obs_names]

            adata.X = sp.csr_matrix(adata.X)

            # Constructing the spatial network
            STAligner.Cal_Spatial_Net(adata, rad_cutoff=150) # the spatial network are saved in adata.uns[‘adj’]
            # STAligner.Stats_Spatial_Net(adata) # plot the number of spatial neighbors

            # Normalization
            sc.pp.highly_variable_genes(adata, flavor="seurat_v3", n_top_genes=5000)
            sc.pp.normalize_total(adata, target_sum=1e4)
            sc.pp.log1p(adata)
            adata = adata[:, adata.var['highly_variable']]

            adj_list.append(adata.uns['adj'])
            Batch_list.append(adata)

        adata_concat = anndata.concat(Batch_list, label="slice_name", keys=section_ids)
        adata_concat.obs['original_clusters'] = adata_concat.obs['original_clusters'].astype('category')
        adata_concat.obs["batch_name"] = adata_concat.obs["slice_name"].astype('category')
        print('adata_concat.shape: ', adata_concat.shape)

        adj_concat = np.asarray(adj_list[0].todense())
        for batch_id in range(1,len(section_ids)):
            adj_concat = scipy.linalg.block_diag(adj_concat, np.asarray(adj_list[batch_id].todense()))
        adata_concat.uns['edgeList'] = np.nonzero(adj_concat)


        adata_concat = STAligner.train_STAligner(adata_concat, verbose=True, knn_neigh = 50, device=used_device)
        STAligner.mclust_R(adata_concat, num_cluster=7, used_obsm='STAligner')
        adata_concat = adata_concat[adata_concat.obs['original_clusters']!='unknown']
        end_time = time.time()
        run_times.append(end_time - start_time)

        print('mclust, ARI = %01.3f' % ari_score(adata_concat.obs['original_clusters'], adata_concat.obs['mclust']))
        if not os.path.exists(os.path.join(save_dir_gt, dataset)):
            os.makedirs(os.path.join(save_dir_gt, dataset))
        # embedding in adata.obsm:key='STAligner'
        # print(adata_concat.obs)
        # print(adata_concat.obsm['STAligner'])
        df_labels = adata_concat.obs[['original_clusters', 'mclust']]
        embed_ = adata_concat.obsm['STAligner']
        # adata_concat.write_h5ad(os.path.join(save_dir_gt, dataset, 'iter'+str(iter_)+'.h5ad'))
        np.save(os.path.join(save_dir_gt, dataset, 'iter'+str(iter_)+'embedding.npy'), embed_)
        df_labels.to_csv(os.path.join(save_dir_gt, dataset, 'iter'+str(iter_)+'labels.csv'))