BANKSY tutorial
Dependencies
library(Banksy)
library(SummarizedExperiment)
library(SpatialExperiment)
library(scuttle)
library(scater)
library(cowplot)
library(ggplot2)
library(Seurat)
library(hdf5r)
Data loading: DLPFC
sample.name <- "151573"
cluster.number <- 7
dir.input <- file.path('/home/xiem6/0Data/DLPFC12/', sample.name)
dir.output <- file.path('/maiziezhou_lab/manfeifei/0Projects/Benchmark_STdata/R/Banksy/output/', sample.name, '/')
meta.input <- file.path('/home/xiem6/0Data/DLPFC12/', sample.name, 'gt')
layer.input <- file.path('/home/xiem6/0Data/DLPFC12/', sample.name, 'gt/layered')
if(!dir.exists(file.path(dir.output))){
dir.create(file.path(dir.output), recursive = TRUE)
}
filename <- paste0(sample.name, "_filtered_feature_bc_matrix.h5")
sp_data <- Load10X_Spatial(dir.input, filename = filename)
df_meta <- read.table(file.path(meta.input, 'tissue_positions_list_GTs.txt'))
original_row_names <- row.names(df_meta)
split_data <- strsplit(df_meta$V1, split = ",")
df_meta <- do.call(rbind, lapply(split_data, function(x) {
data.frame(V1=x[1], V2=x[2], V3=x[3], V4=x[4], V5=x[5], V6=x[6], V7=x[7])
}))
row.names(df_meta) <- df_meta$V1
df_meta$V3 <- as.numeric(df_meta$V3)
df_meta$V4 <- as.numeric(df_meta$V4)
# Identify the cells that are in both sp_data and df_meta
common_cells <- colnames(sp_data[["Spatial"]]) %in% rownames(df_meta)
# Subset sp_data to keep only these cells
sp_data <- sp_data[, common_cells]
# Initialize an empty dataframe to hold the final results
layer.data <- data.frame()
if(as.numeric(cluster.number) == 5) {
for(i in 3:6){
file.name <- paste0(sample.name, "_L", i, "_barcodes.txt")
file.path <- file.path(layer.input, file.name)
data.temp <- read.table(file.path, header = FALSE, stringsAsFactors = FALSE) # assuming the file has no header
data.temp <- data.frame(barcode = data.temp[,1], layer = paste0("layer", i), row.names = data.temp[,1])
# Append to the final dataframe
layer.data <- rbind(layer.data, data.temp)
}
} else {
for(i in 1:6){
file.name <- paste0(sample.name, "_L", i, "_barcodes.txt")
file.path <- file.path(layer.input, file.name)
data.temp <- read.table(file.path, header = FALSE, stringsAsFactors = FALSE) # assuming the file has no header
data.temp <- data.frame(barcode = data.temp[,1], layer = paste0("layer", i), row.names = data.temp[,1])
# Append to the final dataframe
layer.data <- rbind(layer.data, data.temp)
}
}
# For the WM file
file.name <- paste0(sample.name, "_WM_barcodes.txt")
file.path <- file.path(layer.input, file.name)
data.temp <- read.table(file.path, header = FALSE, stringsAsFactors = FALSE) # assuming the file has no header
data.temp <- data.frame(barcode = data.temp[,1], layer = "WM", row.names = data.temp[,1])
# Append to the final dataframe
layer.data <- rbind(layer.data, data.temp)
sp_data <- AddMetaData(sp_data,
metadata = df_meta['V3'],
col.name = 'sdimx') #row'
sp_data <- AddMetaData(sp_data,
metadata = df_meta['V4'],
col.name = 'sdimy') #col
sp_data <- AddMetaData(sp_data,
metadata = layer.data['layer'],
col.name = 'layer_guess_reordered')
#gcm <- sp_data@assays$Spatial@counts # this is used in Seurat ‘4.3.0’
gcm <- sp_data@assays$Spatial@layers$counts # this is used in Seurat ‘5.0.3’
# get coordinates
locs <- data.frame(sdimx=sp_data@meta.data$sdimx, sdimy=sp_data@meta.data$sdimy)
row.names(locs) <- row.names(sp_data@meta.data)
spatial_coor <- as.matrix(locs)
rownames(spatial_coor) <- row.names(locs)
Run BANKSY
run_analysis <- function(respa) {
#Initialize a SpatialExperiment object and perform basic quality control and normalization.
se <- SpatialExperiment(assay = list(counts = gcm), spatialCoords = spatial_coor)
print("finish Spatial exp")
imgData(se) <- NULL
assay(se, "logcounts") <- NULL
reducedDims(se) <- NULL
rowData(se) <- NULL
colData(se) <- DataFrame(
sample_id = sample.name,
clust_annotation = factor(
addNA(sp_data@meta.data$layer_guess_reordered),
exclude = NULL, labels = seq(length(unique(sp_data@meta.data$layer_guess_reordered)))
),
row.names = row.names(locs)
)
rownames(se) <- row.names(sp_data)
#' Remove NA spots optionally
se = se[as.numeric(se$clust_annotation) <= cluster.number, ]
se$clust_annotation = droplevels.factor(se$clust_annotation)
# Normalization to mean library size
se <- computeLibraryFactors(se)
print("finish computeLibraryFactors")
aname = "logcounts"
assay(se, aname) <- normalizeCounts(se, log = TRUE)
print("finish normalizeCounts")
#' Find variable features
feat = VariableFeatures(FindVariableFeatures(as.Seurat(se)))
se = se[feat, ]
#Compute the neighborhood matrices for BANKSY. Setting compute_agf=TRUE computes both the weighted neighborhood mean (M
#) and the azimuthal Gabor filter (G). The number of spatial neighbors used to compute M and G
#are k_geom[1]=15 and k_geom[2]=30 respectively. We run BANKSY at lambda=0 corresponding to non-spatial clustering, and lambda=0.2 corresponding to BANKSY for cell-typing.
lambda <- c(0, 0.2)
#k_geom <- c(15, 30) # this is default
k_geom = c(18, 18) # this is for DLPFC
se <- Banksy::computeBanksy(se, assay_name = aname, compute_agf = TRUE, k_geom = k_geom)
print("finish computeBansky")
# run PCA on the BANKSY matrix and perform clustering. Setting use_agf=TRUE uses both
# and to construct the BANKSY matrix.
set.seed(1000)
se <- Banksy::runBanksyPCA(se, use_agf = TRUE, lambda = lambda)
print("finish PCA")
se <- Banksy::runBanksyUMAP(se, use_agf = TRUE, lambda = lambda)
print("finish UMAP")
se <- Banksy::clusterBanksy(se, use_agf = TRUE, lambda = lambda, resolution = respa)
print("finish cluster")
#Different clustering runs can be relabeled to minimise their differences with connectClusters:
se <- Banksy::connectClusters(se)
print("finish connect")
#Visualise the clustering output for non-spatial clustering (lambda=0) and BANKSY clustering (lambda=0.2).
cnames <- colnames(colData(se))
cnames <- cnames[grep("^clust", cnames)]
colData(se) <- cbind(colData(se), spatialCoords(se))
# plot_nsp <- plotColData(se,
# x = "sdimx", y = "sdimy",
# point_size = 0.6, colour_by = cnames[1]
# )
# plot_bank <- plotColData(se,
# x = "sdimx", y = "sdimy",
# point_size = 0.6, colour_by = cnames[2]
# )
#plot_grid(plot_nsp + coord_equal(), plot_bank + coord_equal(), ncol = 2)
name <- paste0("clust_M1_lam0.2_k50_res", respa)
zlabels <- colData(se)[[name]]
# spatial domain labels, the class of zlabels is factor
print("obatained spots no.")
print(length(zlabels))
ob_clusternumber <- nlevels(zlabels) # the no. of obtained clusters
print("obtained num. of clusters")
print(ob_clusternumber)
gtlabels <- list(colData(se)$clust_annotation)
print("length of gtlables")
print(length(gtlabels[[1]]))
print("original num. of clusters:")
print(length(unique(gtlabels[[1]])))
if (ob_clusternumber == 0) {
df_i <- data.frame()
ari_Bansky <- 0
} else {
df_i <- data.frame(
row = colData(se)$sdimx,
col = colData(se)$sdimy,
slot2 = zlabels,
stringsAsFactors = FALSE
)
# Set row names of df_i
rownames(df_i) <- rownames(colData(se))
colnames(df_i)[ncol(df_i)] <- "spatial cluster"
ari_Bansky <- mclust::adjustedRandIndex(zlabels, gtlabels[[1]])
}
print("ari")
print(ari_Bansky)
return(list(ari_Bansky = ari_Bansky, respa = respa, cluster.number = length(unique(gtlabels[[1]])), ob_clusternumber = ob_clusternumber, df_i=df_i, dirOut=dir.output))
}
# Initialize the result dataframe
result_df <- data.frame(ari_Bansky = numeric(), res_pa = numeric(), ori_cluster_no = numeric(), ob_cluster_no = numeric())
df_list <- list()
# Run the main function with different resolution parameters
respa <- seq(0.1, 1.5, by = 0.1)
for(i in 1:length(respa)){
result <- run_analysis(respa[i])
result_df <- rbind(result_df, data.frame(ari_Bansky = result$ari_Bansky, resolution_para = result$respa, ori_cluster_no = result$cluster.number, ob_cluster_no = result$ob_clusternumber))
df_list <- c(df_list, list(result$df_i))
}
Save the output
Write the original result dataframe to a txt file
dir.output <- result$dirOut
write.table(result_df, file = file.path(dir.output, "ori_ari.txt"), sep = "\t", row.names = FALSE, col.names=TRUE)
result_df_filtered <- result_df[result_df$ori_cluster_no == result_df$ob_cluster_no, ]
print("filtered")
print(result_df_filtered)
# Check if result_df_filtered is not empty
if (nrow(result_df_filtered) > 0) {
# Find the row indices with ari_Bansky closest to the median value
median_index <- which.min(abs(result_df_filtered$ari_Bansky - median(result_df_filtered$ari_Bansky)))
# Extract the rows closest to the median value, just keep the first one
median_df <- result_df_filtered[median_index[1], ]
write.table(median_df, file = file.path(dir.output, "ari.txt"), sep = "\t", row.names = FALSE, col.names=TRUE)
# Print the result
print(median_df)
# get the corresponding row number in the original results
para_median <- median_df$resolution_para
print("para_median")
print(para_median)
row_number <- which(result_df$resolution_para == para_median)
print("row number:")
row_number
df <- df_list[[row_number]]
filename <- paste0(sample.name, "_output.csv")
write.table(df, file = file.path(dir.output, filename), sep = "\t", qmethod = "double", col.names = NA)
} else {
print("Can't find a result that obtained cluster no. equal to the given cluster no.")
lower <- result_df$ori_cluster_no - 1
upper <- result_df$ori_cluster_no +1
result_df_filtered <- result_df[lower <= result_df$ob_cluster_no & result_df$ob_cluster_no <= upper, ]
print("filtered")
print(result_df_filtered)
if (nrow(result_df_filtered) > 0) {
# Find the row indices with ari_Bansky closest to the median value
median_index <- which.min(abs(result_df_filtered$ari_Bansky - median(result_df_filtered$ari_Bansky)))
# Extract the rows closest to the median value, just keep the first one
median_df <- result_df_filtered[median_index[1], ]
write.table(median_df, file = file.path(dir.output, "ari.txt"), sep = "\t", row.names = FALSE, col.names=TRUE)
# Print the result
print(median_df)
# get the corresponding row number in the original results
para_median <- median_df$resolution_para
print("para_median")
print(para_median)
row_number <- which(result_df$resolution_para == para_median)
print("row number:")
row_number
df <- df_list[[row_number]]
filename <- paste0(sample.name, "_output.csv")
write.table(df, file = file.path(dir.output, filename), sep = "\t", qmethod = "double", col.names = NA)
} else {
print("Can't find a result that obtained cluster no. in [ori_cluster_no-1, ori_cluster_no +1]")
}
}