DeepST integration tutorial

[1]:
import os
from DeepST import run
import matplotlib.pyplot as plt
from pathlib import Path
import scanpy as sc
import numpy as np
from sklearn import metrics
import json
import time
import os.path as osp

iters = 1
save_dir_gt = '/home/yunfei/spatial_benchmarking/BenchmarkST/sim_data_results/deepst'
/home/yunfei/anaconda3/envs/DeepST/lib/python3.9/site-packages/tqdm/auto.py:22: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html
  from .autonotebook import tqdm as notebook_tqdm

Mouse Hypothalamus data integration (pair-wise)

[ ]:
"""mhypo"""

section_ids_list = [['-0.04', '-0.09'], ['-0.09', '-0.14'], ['-0.14', '-0.19'], ['-0.19', '-0.24']]
run_times = []

for iter_ in range(iters):
    for section_ids in section_ids_list:
        dataset = section_ids[0] + '_' + section_ids[1]
        start_time = time.time()
        # slice1 = load_mHypothalamus(section_id=section_ids[0])
        # slice2 = load_mHypothalamus(section_id=section_ids[1])

        # run deeepst pairwise alignment
        data_path = "/home/yunfei/spatial_benchmarking/benchmarking_data/mHypothalamus"
        data_name_list = section_ids
        save_path = "../Results"
        n_domains = 8
        deepen = run(task = "Integration",
            pre_epochs = 800, #### According to your own hardware, choose the number of training
            epochs = 1000, #### According to your own hardware, choose the number of training
            use_gpu = True
            )
        augement_data_list = []
        graph_list = []
        for i in range(len(section_ids)):
            adata = deepen._get_adata(platform="benchmark_test", data_path=data_path, data_name=data_name_list[i])
            # adata = deepen._get_image_crop(adata, data_name=data_name_list[i])
            # adata.obs = adata.obs.rename(columns={'imagerow': 'array_row', 'imagecol': 'array_col'})
            # print(adata.obs)
            adata = deepen._get_augment(adata, spatial_type="KDTree", use_morphological=False)
            graph_dict = deepen._get_graph(adata.obsm["spatial"], distType = "KDTree")
            augement_data_list.append(adata)
            graph_list.append(graph_dict)

        ######## Synthetic Datasets and Graphs
        multiple_adata, multiple_graph = deepen._get_multiple_adata(adata_list = augement_data_list, data_name_list = data_name_list, graph_list = graph_list)

        ###### Enhanced data preprocessing
        data = deepen._data_process(multiple_adata, pca_n_comps = 150)

        deepst_embed = deepen._fit(
                data = data,
                graph_dict = multiple_graph,
                domains = multiple_adata.obs["batch"].values,  ##### Input to Domain Adversarial Model
                n_domains = len(data_name_list))
        multiple_adata.obsm["DeepST_embed"] = deepst_embed
        multiple_adata = deepen._get_cluster_data(multiple_adata, n_domains=n_domains, priori = True)

        # np.save(os.path.join(save_dir_r, 'deepst_' + sec1 + '_' + sec2 + '_iter_' + str(e) + '_deepst_embedding.npy'), adata.obsm['DeepST_embed'])

        # save alignment matrix
        if not os.path.exists(os.path.join(save_dir_gt, dataset)):
            os.makedirs(os.path.join(save_dir_gt, dataset))
        np.save(osp.join(save_dir_gt, dataset, 'iter'+str(iter_)+'embedding.npy'), multiple_adata.obsm['DeepST_embed'])
        np.save(osp.join(save_dir_gt, dataset, 'iter'+str(iter_)+'prediction.npy'), multiple_adata.obs['DeepST_refine_domain'])

        end_time = time.time()
        run_times.append(end_time - start_time)

        np.save(osp.join(save_dir_gt, dataset, 'iter'+str(iter_)+'labels.npy'), multiple_adata.obs['original_clusters'])

        multiple_adata = deepen._get_cluster_data(multiple_adata, n_domains=n_domains, priori = True)
        for data_name in data_name_list:
            adata = multiple_adata[multiple_adata.obs["batch_name"]==data_name]
            np.save(osp.join(save_dir_gt, dataset, 'iter'+data_name+'_'+str(iter_)+'preds.npy'), adata.obs['DeepST_refine_domain'])

DLPFC data integration (pair-wise)

[ ]:
"""dlpfc"""
section_ids_list = [['151507', '151508'], ['151508', '151509'], ['151509', '151510'], ['151673', '151674'], ['151674', '151675'], ['151675', '151676']]
run_times = []
for section_ids in section_ids_list:
    for iter_ in range(iters):
        dataset = section_ids[0] + '_' + section_ids[1]
        start_time = time.time()
        # slice1 = load_mHypothalamus(section_id=section_ids[0])
        # slice2 = load_mHypothalamus(section_id=section_ids[1])

        # run deeepst pairwise alignment
        data_path = "/home/yunfei/spatial_benchmarking/benchmarking_data/DLPFC12"
        data_name_list = section_ids
        save_path = "../Results"
        n_domains = 7
        deepen = run(task = "Integration",
            pre_epochs = 800, #### According to your own hardware, choose the number of training
            epochs = 1000, #### According to your own hardware, choose the number of training
            use_gpu = True
            )
        augement_data_list = []
        graph_list = []
        for i in range(len(section_ids)):
            adata = deepen._get_adata(platform="Visium", data_path=data_path, data_name=data_name_list[i])
            # adata = deepen._get_image_crop(adata, data_name=data_name_list[i])
            # adata.obs = adata.obs.rename(columns={'imagerow': 'array_row', 'imagecol': 'array_col'})
            # print(adata.obs)
            adata = deepen._get_augment(adata, spatial_type="KDTree", use_morphological=False)
            graph_dict = deepen._get_graph(adata.obsm["spatial"], distType = "KDTree")
            augement_data_list.append(adata)
            graph_list.append(graph_dict)

        ######## Synthetic Datasets and Graphs
        multiple_adata, multiple_graph = deepen._get_multiple_adata(adata_list = augement_data_list, data_name_list = data_name_list, graph_list = graph_list)

        ###### Enhanced data preprocessing
        data = deepen._data_process(multiple_adata, pca_n_comps = 200)

        deepst_embed = deepen._fit(
                data = data,
                graph_dict = multiple_graph,
                domains = multiple_adata.obs["batch"].values,  ##### Input to Domain Adversarial Model
                n_domains = len(data_name_list))
        multiple_adata.obsm["DeepST_embed"] = deepst_embed
        multiple_adata = deepen._get_cluster_data(multiple_adata, n_domains=n_domains, priori = True)

        # np.save(os.path.join(save_dir_r, 'deepst_' + sec1 + '_' + sec2 + '_iter_' + str(e) + '_deepst_embedding.npy'), adata.obsm['DeepST_embed'])

        # save alignment matrix
        if not os.path.exists(os.path.join(save_dir_gt, dataset)):
            os.makedirs(os.path.join(save_dir_gt, dataset))
        np.save(osp.join(save_dir_gt, dataset, 'iter'+str(iter_)+'embedding.npy'), multiple_adata.obsm['DeepST_embed'])
        np.save(osp.join(save_dir_gt, dataset, 'iter'+str(iter_)+'prediction.npy'), multiple_adata.obs['DeepST_refine_domain'])

        end_time = time.time()
        run_times.append(end_time - start_time)

        # save labels
        np.save(osp.join(save_dir_gt, dataset, 'iter'+str(iter_)+'labels.npy'), multiple_adata.obs['original_clusters'])
        multiple_adata = deepen._get_cluster_data(multiple_adata, n_domains=n_domains, priori = True)
        for data_name in data_name_list:
            adata = multiple_adata[multiple_adata.obs["batch_name"]==data_name]
            np.save(osp.join(save_dir_gt, dataset, 'iter'+data_name+'_'+str(iter_)+'preds.npy'), adata.obs['DeepST_refine_domain'])


section_ids_list = [['151669', '151670'], ['151670', '151671'], ['151671', '151672']]
run_times = []
for section_ids in section_ids_list:
    for iter_ in range(iters):
        dataset = section_ids[0] + '_' + section_ids[1]
        start_time = time.time()
        # slice1 = load_mHypothalamus(section_id=section_ids[0])
        # slice2 = load_mHypothalamus(section_id=section_ids[1])

        # run deeepst pairwise alignment
        data_path = "/home/yunfei/spatial_benchmarking/benchmarking_data/DLPFC12"
        data_name_list = section_ids
        save_path = "../Results"
        n_domains = 5
        deepen = run(task = "Integration",
            pre_epochs = 800, #### According to your own hardware, choose the number of training
            epochs = 1000, #### According to your own hardware, choose the number of training
            use_gpu = True
            )
        augement_data_list = []
        graph_list = []
        for i in range(len(section_ids)):
            adata = deepen._get_adata(platform="Visium", data_path=data_path, data_name=data_name_list[i])
            # adata = deepen._get_image_crop(adata, data_name=data_name_list[i])
            # adata.obs = adata.obs.rename(columns={'imagerow': 'array_row', 'imagecol': 'array_col'})
            # print(adata.obs)
            adata = deepen._get_augment(adata, spatial_type="KDTree", use_morphological=False)
            graph_dict = deepen._get_graph(adata.obsm["spatial"], distType = "KDTree")
            augement_data_list.append(adata)
            graph_list.append(graph_dict)

        ######## Synthetic Datasets and Graphs
        multiple_adata, multiple_graph = deepen._get_multiple_adata(adata_list = augement_data_list, data_name_list = data_name_list, graph_list = graph_list)

        ###### Enhanced data preprocessing
        data = deepen._data_process(multiple_adata, pca_n_comps = 200)

        deepst_embed = deepen._fit(
                data = data,
                graph_dict = multiple_graph,
                domains = multiple_adata.obs["batch"].values,  ##### Input to Domain Adversarial Model
                n_domains = len(data_name_list))
        multiple_adata.obsm["DeepST_embed"] = deepst_embed
        multiple_adata = deepen._get_cluster_data(multiple_adata, n_domains=n_domains, priori = True)

        # np.save(os.path.join(save_dir_r, 'deepst_' + sec1 + '_' + sec2 + '_iter_' + str(e) + '_deepst_embedding.npy'), adata.obsm['DeepST_embed'])

        # save alignment matrix
        if not os.path.exists(os.path.join(save_dir_gt, dataset)):
            os.makedirs(os.path.join(save_dir_gt, dataset))
        np.save(osp.join(save_dir_gt, dataset, 'iter'+str(iter_)+'embedding.npy'), multiple_adata.obsm['DeepST_embed'])
        np.save(osp.join(save_dir_gt, dataset, 'iter'+str(iter_)+'prediction.npy'), multiple_adata.obs['DeepST_refine_domain'])

        end_time = time.time()
        run_times.append(end_time - start_time)

        # save labels
        np.save(osp.join(save_dir_gt, dataset, 'iter'+str(iter_)+'labels.npy'), multiple_adata.obs['original_clusters'])
        multiple_adata = deepen._get_cluster_data(multiple_adata, n_domains=n_domains, priori = True)
        for data_name in data_name_list:
            adata = multiple_adata[multiple_adata.obs["batch_name"]==data_name]
            np.save(osp.join(save_dir_gt, dataset, 'iter'+data_name+'_'+str(iter_)+'preds.npy'), adata.obs['DeepST_refine_domain'])

Mouse Hypothalamus data integration (multi-slices)

[ ]:
"""mhypo"""

section_ids_list = [['-0.04', '-0.09', '-0.14', '-0.19', '-0.24']]
run_times = []

for iter_ in range(iters):
    for section_ids in section_ids_list:
        dataset = section_ids[0] + '_' + section_ids[1] + '_' + section_ids[2] + '_' + section_ids[3] + '_' + section_ids[4]
        start_time = time.time()

        # run deeepst pairwise alignment
        data_path = "/home/yunfei/spatial_benchmarking/benchmarking_data/mHypothalamus"
        data_name_list = section_ids
        save_path = "../Results"
        n_domains = 8
        deepen = run(task = "Integration",
            pre_epochs = 800, #### According to your own hardware, choose the number of training
            epochs = 1000, #### According to your own hardware, choose the number of training
            use_gpu = True
            )
        augement_data_list = []
        graph_list = []
        for i in range(len(section_ids)):
            adata = deepen._get_adata(platform="benchmark_test", data_path=data_path, data_name=data_name_list[i])
            # adata = deepen._get_image_crop(adata, data_name=data_name_list[i])
            # adata.obs = adata.obs.rename(columns={'imagerow': 'array_row', 'imagecol': 'array_col'})
            # print(adata.obs)
            adata = deepen._get_augment(adata, spatial_type="KDTree", use_morphological=False)
            graph_dict = deepen._get_graph(adata.obsm["spatial"], distType = "KDTree")
            augement_data_list.append(adata)
            graph_list.append(graph_dict)

        ######## Synthetic Datasets and Graphs
        multiple_adata, multiple_graph = deepen._get_multiple_adata(adata_list = augement_data_list, data_name_list = data_name_list, graph_list = graph_list)

        ###### Enhanced data preprocessing
        data = deepen._data_process(multiple_adata, pca_n_comps = 150)

        deepst_embed = deepen._fit(
                data = data,
                graph_dict = multiple_graph,
                domains = multiple_adata.obs["batch"].values,  ##### Input to Domain Adversarial Model
                n_domains = len(data_name_list))
        multiple_adata.obsm["DeepST_embed"] = deepst_embed
        multiple_adata = deepen._get_cluster_data(multiple_adata, n_domains=n_domains, priori = True)

        # np.save(os.path.join(save_dir_r, 'deepst_' + sec1 + '_' + sec2 + '_iter_' + str(e) + '_deepst_embedding.npy'), adata.obsm['DeepST_embed'])

        # save alignment matrix
        if not os.path.exists(os.path.join(save_dir_gt, dataset)):
            os.makedirs(os.path.join(save_dir_gt, dataset))
        np.save(osp.join(save_dir_gt, dataset, 'iter'+str(iter_)+'embedding.npy'), multiple_adata.obsm['DeepST_embed'])
        np.save(osp.join(save_dir_gt, dataset, 'iter'+str(iter_)+'prediction.npy'), multiple_adata.obs['DeepST_refine_domain'])

        end_time = time.time()
        run_times.append(end_time - start_time)

        np.save(osp.join(save_dir_gt, dataset, 'iter'+str(iter_)+'labels.npy'), multiple_adata.obs['original_clusters'])

        multiple_adata = deepen._get_cluster_data(multiple_adata, n_domains=n_domains, priori = True)
        for data_name in data_name_list:
            adata = multiple_adata[multiple_adata.obs["batch_name"]==data_name]
            np.save(osp.join(save_dir_gt, dataset, 'iter'+data_name+'_'+str(iter_)+'preds.npy'), adata.obs['DeepST_refine_domain'])

DLPFC data integration (multi-slices)

[ ]:
"""dlpfc"""
section_ids_list = [['151507', '151508', '151509', '151510'], ['151673', '151674', '151675', '151676']]
run_times = []
for section_ids in section_ids_list:
    for iter_ in range(iters):
        dataset = section_ids[0] + '_' + section_ids[1] + '_' + section_ids[2] + '_' + section_ids[3]
        start_time = time.time()
        # slice1 = load_mHypothalamus(section_id=section_ids[0])
        # slice2 = load_mHypothalamus(section_id=section_ids[1])

        # run deeepst pairwise alignment
        data_path = "/home/yunfei/spatial_benchmarking/benchmarking_data/DLPFC12"
        data_name_list = section_ids
        save_path = "../Results"
        n_domains = 7
        deepen = run(task = "Integration",
            pre_epochs = 800, #### According to your own hardware, choose the number of training
            epochs = 1000, #### According to your own hardware, choose the number of training
            use_gpu = True
            )
        augement_data_list = []
        graph_list = []
        for i in range(len(section_ids)):
            adata = deepen._get_adata(platform="Visium", data_path=data_path, data_name=data_name_list[i])
            # adata = deepen._get_image_crop(adata, data_name=data_name_list[i])
            # adata.obs = adata.obs.rename(columns={'imagerow': 'array_row', 'imagecol': 'array_col'})
            # print(adata.obs)
            adata = deepen._get_augment(adata, spatial_type="KDTree", use_morphological=False)
            graph_dict = deepen._get_graph(adata.obsm["spatial"], distType = "KDTree")
            augement_data_list.append(adata)
            graph_list.append(graph_dict)

        ######## Synthetic Datasets and Graphs
        multiple_adata, multiple_graph = deepen._get_multiple_adata(adata_list = augement_data_list, data_name_list = data_name_list, graph_list = graph_list)

        ###### Enhanced data preprocessing
        data = deepen._data_process(multiple_adata, pca_n_comps = 200)

        deepst_embed = deepen._fit(
                data = data,
                graph_dict = multiple_graph,
                domains = multiple_adata.obs["batch"].values,  ##### Input to Domain Adversarial Model
                n_domains = len(data_name_list))
        multiple_adata.obsm["DeepST_embed"] = deepst_embed
        multiple_adata = deepen._get_cluster_data(multiple_adata, n_domains=n_domains, priori = True)

        # np.save(os.path.join(save_dir_r, 'deepst_' + sec1 + '_' + sec2 + '_iter_' + str(e) + '_deepst_embedding.npy'), adata.obsm['DeepST_embed'])

        # save alignment matrix
        if not os.path.exists(os.path.join(save_dir_gt, dataset)):
            os.makedirs(os.path.join(save_dir_gt, dataset))
        np.save(osp.join(save_dir_gt, dataset, 'iter'+str(iter_)+'embedding.npy'), multiple_adata.obsm['DeepST_embed'])
        np.save(osp.join(save_dir_gt, dataset, 'iter'+str(iter_)+'prediction.npy'), multiple_adata.obs['DeepST_refine_domain'])

        end_time = time.time()
        run_times.append(end_time - start_time)

        # save labels
        np.save(osp.join(save_dir_gt, dataset, 'iter'+str(iter_)+'labels.npy'), multiple_adata.obs['original_clusters'])
        multiple_adata = deepen._get_cluster_data(multiple_adata, n_domains=n_domains, priori = True)
        for data_name in data_name_list:
            adata = multiple_adata[multiple_adata.obs["batch_name"]==data_name]
            np.save(osp.join(save_dir_gt, dataset, 'iter'+data_name+'_'+str(iter_)+'preds.npy'), adata.obs['DeepST_refine_domain'])


section_ids_list = [['151669', '151670', '151671', '151672']]
run_times = []
for section_ids in section_ids_list:
    for iter_ in range(iters):
        dataset = section_ids[0] + '_' + section_ids[1] + '_' + section_ids[2] + '_' + section_ids[3]
        start_time = time.time()
        # slice1 = load_mHypothalamus(section_id=section_ids[0])
        # slice2 = load_mHypothalamus(section_id=section_ids[1])

        # run deeepst pairwise alignment
        data_path = "/home/yunfei/spatial_benchmarking/benchmarking_data/DLPFC12"
        data_name_list = section_ids
        save_path = "../Results"
        n_domains = 5
        deepen = run(task = "Integration",
            pre_epochs = 800, #### According to your own hardware, choose the number of training
            epochs = 1000, #### According to your own hardware, choose the number of training
            use_gpu = True
            )
        augement_data_list = []
        graph_list = []
        for i in range(len(section_ids)):
            adata = deepen._get_adata(platform="Visium", data_path=data_path, data_name=data_name_list[i])
            # adata = deepen._get_image_crop(adata, data_name=data_name_list[i])
            # adata.obs = adata.obs.rename(columns={'imagerow': 'array_row', 'imagecol': 'array_col'})
            # print(adata.obs)
            adata = deepen._get_augment(adata, spatial_type="KDTree", use_morphological=False)
            graph_dict = deepen._get_graph(adata.obsm["spatial"], distType = "KDTree")
            augement_data_list.append(adata)
            graph_list.append(graph_dict)

        ######## Synthetic Datasets and Graphs
        multiple_adata, multiple_graph = deepen._get_multiple_adata(adata_list = augement_data_list, data_name_list = data_name_list, graph_list = graph_list)

        ###### Enhanced data preprocessing
        data = deepen._data_process(multiple_adata, pca_n_comps = 200)

        deepst_embed = deepen._fit(
                data = data,
                graph_dict = multiple_graph,
                domains = multiple_adata.obs["batch"].values,  ##### Input to Domain Adversarial Model
                n_domains = len(data_name_list))
        multiple_adata.obsm["DeepST_embed"] = deepst_embed
        multiple_adata = deepen._get_cluster_data(multiple_adata, n_domains=n_domains, priori = True)

        # np.save(os.path.join(save_dir_r, 'deepst_' + sec1 + '_' + sec2 + '_iter_' + str(e) + '_deepst_embedding.npy'), adata.obsm['DeepST_embed'])

        # save alignment matrix
        if not os.path.exists(os.path.join(save_dir_gt, dataset)):
            os.makedirs(os.path.join(save_dir_gt, dataset))
        np.save(osp.join(save_dir_gt, dataset, 'iter'+str(iter_)+'embedding.npy'), multiple_adata.obsm['DeepST_embed'])
        np.save(osp.join(save_dir_gt, dataset, 'iter'+str(iter_)+'prediction.npy'), multiple_adata.obs['DeepST_refine_domain'])

        end_time = time.time()
        run_times.append(end_time - start_time)

        # save labels
        np.save(osp.join(save_dir_gt, dataset, 'iter'+str(iter_)+'labels.npy'), multiple_adata.obs['original_clusters'])
        multiple_adata = deepen._get_cluster_data(multiple_adata, n_domains=n_domains, priori = True)
        for data_name in data_name_list:
            adata = multiple_adata[multiple_adata.obs["batch_name"]==data_name]
            np.save(osp.join(save_dir_gt, dataset, 'iter'+data_name+'_'+str(iter_)+'preds.npy'), adata.obs['DeepST_refine_domain'])