## ----eval=FALSE--------------------------------------------------------------- # install.packages("BiocManager") # BiocManager::install("nnSVG") ## ----message=FALSE------------------------------------------------------------ library(SpatialExperiment) library(STexampleData) library(scran) library(nnSVG) library(ggplot2) ## ----------------------------------------------------------------------------- # load example dataset from STexampleData package # see '?Visium_humanDLPFC' for more details spe <- Visium_humanDLPFC() dim(spe) ## ----------------------------------------------------------------------------- # preprocessing steps # keep only spots over tissue spe <- spe[, colData(spe)$in_tissue == 1] dim(spe) # skip spot-level quality control, since this has been performed previously # on this dataset # filter low-expressed and mitochondrial genes # using default filtering parameters spe <- filter_genes(spe) # calculate logcounts (log-transformed normalized counts) using scran package # using library size factors spe <- computeLibraryFactors(spe) spe <- logNormCounts(spe) assayNames(spe) ## ----------------------------------------------------------------------------- # select small set of random genes and several known SVGs for # faster runtime in this example set.seed(123) ix_random <- sample(seq_len(nrow(spe)), 10) known_genes <- c("MOBP", "PCP4", "SNAP25", "HBB", "IGKC", "NPY") ix_known <- which(rowData(spe)$gene_name %in% known_genes) ix <- c(ix_known, ix_random) spe <- spe[ix, ] dim(spe) ## ----------------------------------------------------------------------------- # run nnSVG # set seed for reproducibility set.seed(123) # using a single thread in this example spe <- nnSVG(spe) # show results rowData(spe) ## ----------------------------------------------------------------------------- # number of significant SVGs table(rowData(spe)$padj <= 0.05) ## ----------------------------------------------------------------------------- # show results for top n SVGs rowData(spe)[order(rowData(spe)$rank)[1:10], ] ## ----fig.small=TRUE----------------------------------------------------------- # plot spatial expression of top-ranked SVG ix <- which(rowData(spe)$rank == 1) ix_name <- rowData(spe)$gene_name[ix] ix_name df <- as.data.frame( cbind(spatialCoords(spe), expr = counts(spe)[ix, ])) ggplot(df, aes(x = pxl_col_in_fullres, y = pxl_row_in_fullres, color = expr)) + geom_point(size = 0.8) + coord_fixed() + scale_y_reverse() + scale_color_gradient(low = "gray90", high = "blue", trans = "sqrt", breaks = range(df$expr), name = "counts") + ggtitle(ix_name) + theme_bw() + theme(plot.title = element_text(face = "italic"), panel.grid = element_blank(), axis.title = element_blank(), axis.text = element_blank(), axis.ticks = element_blank()) ## ----message=FALSE------------------------------------------------------------ library(SpatialExperiment) library(STexampleData) library(scran) library(nnSVG) library(ggplot2) ## ----------------------------------------------------------------------------- # load example dataset from STexampleData package # see '?SlideSeqV2_mouseHPC' for more details spe <- SlideSeqV2_mouseHPC() dim(spe) ## ----------------------------------------------------------------------------- # preprocessing steps # remove spots with NA cell type labels spe <- spe[, !is.na(colData(spe)$celltype)] dim(spe) # check cell type labels table(colData(spe)$celltype) # filter low-expressed and mitochondrial genes # using adjusted filtering parameters for this platform spe <- filter_genes( spe, filter_genes_ncounts = 1, filter_genes_pcspots = 1, filter_mito = TRUE ) dim(spe) # calculate log-transformed normalized counts using scran package # using library size normalization spe <- computeLibraryFactors(spe) spe <- logNormCounts(spe) assayNames(spe) ## ----------------------------------------------------------------------------- # select small set of random genes and several known SVGs for # faster runtime in this example set.seed(123) ix_random <- sample(seq_len(nrow(spe)), 10) known_genes <- c("Cpne9", "Rgs14") ix_known <- which(rowData(spe)$gene_name %in% known_genes) ix <- c(ix_known, ix_random) spe <- spe[ix, ] dim(spe) ## ----------------------------------------------------------------------------- # run nnSVG with covariates # create model matrix for cell type labels X <- model.matrix(~ colData(spe)$celltype) dim(X) stopifnot(nrow(X) == ncol(spe)) # set seed for reproducibility set.seed(123) # using a single thread in this example spe <- nnSVG(spe, X = X) # show results rowData(spe) ## ----------------------------------------------------------------------------- # create model matrix after dropping empty factor levels X <- model.matrix(~ droplevels(as.factor(colData(spe)$celltype))) ## ----------------------------------------------------------------------------- # number of significant SVGs table(rowData(spe)$padj <= 0.05) ## ----------------------------------------------------------------------------- # show results for top n SVGs rowData(spe)[order(rowData(spe)$rank)[1:10], ] ## ----fig.small=TRUE----------------------------------------------------------- # plot spatial expression of top-ranked SVG ix <- which(rowData(spe)$rank == 1) ix_name <- rowData(spe)$gene_name[ix] ix_name df <- as.data.frame( cbind(spatialCoords(spe), expr = counts(spe)[ix, ])) ggplot(df, aes(x = xcoord, y = ycoord, color = expr)) + geom_point(size = 0.1) + coord_fixed() + scale_color_gradient(low = "gray90", high = "blue", trans = "sqrt", breaks = range(df$expr), name = "counts") + ggtitle(ix_name) + theme_bw() + theme(plot.title = element_text(face = "italic"), panel.grid = element_blank(), axis.title = element_blank(), axis.text = element_blank(), axis.ticks = element_blank()) ## ----message=FALSE------------------------------------------------------------ library(SpatialExperiment) library(WeberDivechaLCdata) library(nnSVG) ## ----------------------------------------------------------------------------- # load example dataset from WeberDivechaLCdata package # see '?WeberDivechaLCdata_Visium' for more details spe <- WeberDivechaLCdata_Visium() dim(spe) # check sample IDs table(colData(spe)$sample_id) ## ----------------------------------------------------------------------------- # select sample IDs for example samples_keep <- c("Br6522_LC_1_round1", "Br6522_LC_2_round1") # subset SpatialExperiment object to keep only these samples spe <- spe[, colData(spe)$sample_id %in% samples_keep] # remove empty levels from factor of sample IDs colData(spe)$sample_id <- droplevels(colData(spe)$sample_id) # check dim(spe) table(colData(spe)$sample_id) ## ----------------------------------------------------------------------------- # preprocessing steps # keep only spots over tissue spe <- spe[, colData(spe)$in_tissue == 1] dim(spe) # skip spot-level quality control, since this has been performed previously # on this dataset ## ----------------------------------------------------------------------------- # filter low-expressed genes # filter out genes with extremely low expression # using simple threshold on total UMI counts summed across all spots # approximate threshold of 10 UMI counts per sample n_umis <- 20 ix_low_genes <- rowSums(counts(spe)) < n_umis table(ix_low_genes) spe <- spe[!ix_low_genes, ] dim(spe) # filter any new zeros created after filtering low-expressed genes # remove genes with zero expression ix_zero_genes <- rowSums(counts(spe)) == 0 table(ix_zero_genes) if (sum(ix_zero_genes) > 0) { spe <- spe[!ix_zero_genes, ] } dim(spe) # remove spots with zero expression ix_zero_spots <- colSums(counts(spe)) == 0 table(ix_zero_spots) if (sum(ix_zero_spots) > 0) { spe <- spe[, !ix_zero_spots] } dim(spe) ## ----------------------------------------------------------------------------- # run nnSVG once per sample and store lists of top SVGs sample_ids <- levels(colData(spe)$sample_id) res_list <- as.list(rep(NA, length(sample_ids))) names(res_list) <- sample_ids for (s in seq_along(sample_ids)) { # select sample ix <- colData(spe)$sample_id == sample_ids[s] spe_sub <- spe[, ix] dim(spe_sub) # run nnSVG filtering for mitochondrial genes and low-expressed genes # (note: set 'filter_mito = TRUE' in most datasets) spe_sub <- filter_genes( spe_sub, filter_genes_ncounts = 3, filter_genes_pcspots = 0.5, filter_mito = FALSE ) # remove any zeros introduced by filtering ix_zeros <- colSums(counts(spe_sub)) == 0 if (sum(ix_zeros) > 0) { spe_sub <- spe_sub[, !ix_zeros] } dim(spe_sub) # re-calculate logcounts after filtering spe_sub <- computeLibraryFactors(spe_sub) spe_sub <- logNormCounts(spe_sub) # subsample small set of random genes for faster runtime in this example # (same genes for each sample) # (note: skip this step in a full analysis) if (s == 1) { set.seed(123) ix_random <- sample(seq_len(nrow(spe_sub)), 30) genes_random <- rownames(spe_sub)[ix_random] } spe_sub <- spe_sub[rownames(spe_sub) %in% genes_random, ] dim(spe_sub) # run nnSVG set.seed(123) spe_sub <- nnSVG(spe_sub) # store results for this sample res_list[[s]] <- rowData(spe_sub) } ## ----------------------------------------------------------------------------- # combine results for multiple samples by averaging gene ranks # to generate an overall ranking # number of genes that passed filtering (and subsampling) for each sample sapply(res_list, nrow) # match results from each sample and store in matching rows res_ranks <- matrix(NA, nrow = nrow(spe), ncol = length(sample_ids)) rownames(res_ranks) <- rownames(spe) colnames(res_ranks) <- sample_ids for (s in seq_along(sample_ids)) { stopifnot(colnames(res_ranks)[s] == sample_ids[s]) stopifnot(colnames(res_ranks)[s] == names(res_list)[s]) rownames_s <- rownames(res_list[[s]]) res_ranks[rownames_s, s] <- res_list[[s]][, "rank"] } # remove genes that were filtered out in all samples ix_allna <- apply(res_ranks, 1, function(r) all(is.na(r))) res_ranks <- res_ranks[!ix_allna, ] dim(res_ranks) # calculate average ranks # note missing values due to filtering for samples avg_ranks <- rowMeans(res_ranks, na.rm = TRUE) # calculate number of samples where each gene is within top 100 ranked SVGs # for that sample n_withinTop100 <- apply(res_ranks, 1, function(r) sum(r <= 100, na.rm = TRUE)) # summary table df_summary <- data.frame( gene_id = names(avg_ranks), gene_name = rowData(spe)[names(avg_ranks), "gene_name"], gene_type = rowData(spe)[names(avg_ranks), "gene_type"], overall_rank = rank(avg_ranks), average_rank = unname(avg_ranks), n_withinTop100 = unname(n_withinTop100), row.names = names(avg_ranks) ) # sort by average rank df_summary <- df_summary[order(df_summary$average_rank), ] head(df_summary) # top n genes # (note: NAs in this example due to subsampling genes for faster runtime) top100genes <- df_summary$gene_name[1:100] # summary table of "replicated" SVGs (i.e. genes that are highly ranked in at # least x samples) df_summaryReplicated <- df_summary[df_summary$n_withinTop100 >= 2, ] # re-calculate rank within this set df_summaryReplicated$overall_rank <- rank(df_summaryReplicated$average_rank) dim(df_summaryReplicated) head(df_summaryReplicated) # top "replicated" SVGs topSVGsReplicated <- df_summaryReplicated$gene_name ## ----------------------------------------------------------------------------- # plot one of the top SVGs head(df_summary, 5) ix_gene <- which(rowData(spe)$gene_name == df_summary[2, "gene_name"]) df <- as.data.frame(cbind( colData(spe), spatialCoords(spe), gene = counts(spe)[ix_gene, ] )) ggplot(df, aes(x = pxl_col_in_fullres, y = pxl_row_in_fullres, color = gene)) + facet_wrap(~ sample_id, nrow = 1, scales = "free") + geom_point(size = 0.6) + scale_color_gradient(low = "gray80", high = "red", trans = "sqrt", name = "counts", breaks = range(df$gene)) + scale_y_reverse() + ggtitle(paste0(rowData(spe)$gene_name[ix_gene], " expression")) + theme_bw() + theme(aspect.ratio = 1, panel.grid = element_blank(), plot.title = element_text(face = "italic"), axis.title = element_blank(), axis.text = element_blank(), axis.ticks = element_blank()) ## ----eval=FALSE--------------------------------------------------------------- # # filter any new zeros created after filtering low-expressed genes # # # remove genes with zero expression # ix_zero_genes <- rowSums(counts(spe)) == 0 # table(ix_zero_genes) # # if (sum(ix_zero_genes) > 0) { # spe <- spe[!ix_zero_genes, ] # } # # dim(spe) # # # remove spots with zero expression # ix_zero_spots <- colSums(counts(spe)) == 0 # table(ix_zero_spots) # # if (sum(ix_zero_spots) > 0) { # spe <- spe[, !ix_zero_spots] # } # # dim(spe) ## ----------------------------------------------------------------------------- sessionInfo()