## ----options, echo=FALSE, results="hide",message=FALSE, error=FALSE, include=FALSE, autodep=TRUE---- if(packageVersion("zinbwave")<"0.99.6") stop("must have current devel version to avoid bugs") if(packageVersion("clusterExperiment")<"1.3.2") stop("must have current devel version to avoid bugs") if(packageVersion("scone")<"1.1.2") stop("must have current devel version to avoid bugs") knitr::opts_chunk$set(cache=TRUE, error=FALSE, message=FALSE, warning=FALSE) knitr::opts_chunk$set(fig.align="center", fig.width=5, fig.height=5) ## ----schema, echo=FALSE, out.width="90%", fig.cap="Workflow for analyzing scRNA-seq datasets. On the right, main plots generated by the workflow."---- knitr::include_graphics("schema_workflow.png") ## ----stemcelldiff, echo=FALSE, out.width="60%", fig.align="center", fig.cap = "Stem cell differentiation in the mouse olfactory epithelium. This figure was reproduced with kind permission from Fletcher et al. (2017)."---- knitr::include_graphics("stemcelldiff_Fletcher2017_2e.png") ## ----packages------------------------------------------------------------ # Bioconductor library(BiocParallel) library(clusterExperiment) library(scone) library(zinbwave) # GitHub library(slingshot) # CRAN library(doParallel) library(gam) library(RColorBrewer) set.seed(20) ## ------------------------------------------------------------------------ register(SerialParam()) ## ----parallel,eval=FALSE------------------------------------------------- # NCORES <- 2 # mysystem = Sys.info()[["sysname"]] # if (mysystem == "Darwin"){ # registerDoParallel(NCORES) # register(DoparParam()) # }else if (mysystem == "Linux"){ # register(bpstart(MulticoreParam(workers=NCORES))) # }else{ # print("Please change this to allow parallel computing on your computer.") # register(SerialParam()) # } ## ----datadir------------------------------------------------------------- data_dir <- "../data/" ## ----datain,eval=FALSE--------------------------------------------------- # if (!dir.exists(data_dir)) system(sprintf('mkdir %s', data_dir)) # # urls = c("https://www.ncbi.nlm.nih.gov/geo/download/?acc=GSE95601&format=file&file=GSE95601%5FoeHBCdiff%5FCufflinks%5FeSet%2ERda%2Egz", # "https://raw.githubusercontent.com/rufletch/p63-HBC-diff/master/ref/oeHBCdiff_clusterLabels.txt") # # if(!file.exists(paste0(data_dir, "GSE95601_oeHBCdiff_Cufflinks_eSet.Rda"))) { # if (!dir.exists(data_dir)) system(sprintf('mkdir %s', data_dir)) # download.file(urls[1], paste0(data_dir, "GSE95601_oeHBCdiff_Cufflinks_eSet.Rda.gz")) # R.utils::gunzip(paste0(data_dir, "GSE95601_oeHBCdiff_Cufflinks_eSet.Rda.gz")) # assayData(Cufflinks_eSet)$exprs = NULL # assayData(Cufflinks_eSet)$fpkm_table = NULL # assayData(Cufflinks_eSet)$tpm_table = NULL # save(Cufflinks_eSet, file='data/GSE95601_oeHBCdiff_Cufflinks_eSet_reduced.Rda') # } # # if(!file.exists(paste0(data_dir, "oeHBCdiff_clusterLabels.txt"))) { # download.file(urls[2], paste0(data_dir, "oeHBCdiff_clusterLabels.txt")) # } ## ------------------------------------------------------------------------ load(paste0(data_dir, "GSE95601_oeHBCdiff_Cufflinks_eSet_reduced.Rda")) # Count matrix E <- assayData(Cufflinks_eSet)$counts_table # Remove undetected genes E <- na.omit(E) E <- E[rowSums(E)>0,] dim(E) ## ------------------------------------------------------------------------ # Remove ERCC and CreER genes cre <- E["CreER",] ercc <- E[grep("^ERCC-", rownames(E)),] E <- E[grep("^ERCC-", rownames(E), invert = TRUE), ] E <- E[-which(rownames(E)=="CreER"), ] dim(E) ## ------------------------------------------------------------------------ # Extract QC metrics qc <- as.matrix(protocolData(Cufflinks_eSet)@data)[,c(1:5, 10:18)] qc <- cbind(qc, CreER = cre, ERCC_reads = colSums(ercc)) # Extract metadata batch <- droplevels(pData(Cufflinks_eSet)$MD_c1_run_id) bio <- droplevels(pData(Cufflinks_eSet)$MD_expt_condition) clusterLabels <- read.table(paste0(data_dir, "oeHBCdiff_clusterLabels.txt"), sep = "\t", stringsAsFactors = FALSE) m <- match(colnames(E), clusterLabels[, 1]) # Create metadata data.frame metadata <- data.frame("Experiment" = bio, "Batch" = batch, "publishedClusters" = clusterLabels[m,2], qc) # Symbol for cells not assigned to a lineage in original data metadata$publishedClusters[is.na(metadata$publishedClusters)] <- -2 se <- SummarizedExperiment(assays = list(counts = E), colData = metadata) se ## ----scone, fig.cap="SCONE: Filtering of low-quality cells."------------- # QC-metric-based sample-filtering data("housekeeping") hk = rownames(se)[toupper(rownames(se)) %in% housekeeping$V1] mfilt <- metric_sample_filter(assay(se), nreads = colData(se)$NREADS, ralign = colData(se)$RALIGN, pos_controls = rownames(se) %in% hk, zcut = 3, mixture = FALSE, plot = TRUE) ## ----sconeFilt----------------------------------------------------------- # Simplify to a single logical mfilt <- !apply(simplify2array(mfilt[!is.na(mfilt)]), 1, any) se <- se[, mfilt] dim(se) ## ------------------------------------------------------------------------ # Filtering to top 1,000 most variable genes vars <- rowVars(log1p(assay(se))) names(vars) <- rownames(se) vars <- sort(vars, decreasing = TRUE) core <- se[names(vars)[1:1000],] ## ------------------------------------------------------------------------ core ## ----batch--------------------------------------------------------------- batch <- colData(core)$Batch col_batch = c(brewer.pal(9, "Set1"), brewer.pal(8, "Dark2"), brewer.pal(8, "Accent")[1]) names(col_batch) = unique(batch) table(batch) ## ----original------------------------------------------------------------ publishedClusters <- colData(core)[, "publishedClusters"] col_clus <- c("transparent", "#1B9E77", "antiquewhite2", "cyan", "#E7298A", "#A6CEE3", "#666666", "#E6AB02", "#FFED6F", "darkorchid2", "#B3DE69", "#FF7F00", "#A6761D", "#1F78B4") names(col_clus) <- sort(unique(publishedClusters)) table(publishedClusters) ## ----clustbatch---------------------------------------------------------- table(data.frame(batch = as.vector(batch), cluster = publishedClusters)) ## ----zinbschema, echo=FALSE, out.width="95%", fig.cap="ZINB-WaVE: Schematic view of the ZINB-WaVE model. This figure was reproduced with kind permission from Risso et al. (2017)."---- knitr::include_graphics("zinb_schema.png") ## ----zinb,eval=FALSE----------------------------------------------------- # print(system.time(se <- zinbwave(core, K = 50, X = "~ Batch", # residuals = TRUE, # normalizedValues = TRUE))) # save(se, file = 'se_after_zinbwave.rda') ## ------------------------------------------------------------------------ load(sprintf('%sse_after_zinbwave.rda', data_dir)) ## ----norm---------------------------------------------------------------- norm <- assays(se)$normalizedValues norm[1:3,1:3] ## ----boxplotNorm, fig.cap="ZINB-WaVE: Boxplots of normalized expression measures (deviance residuals), color-coded by batch."---- norm_order <- norm[, order(as.numeric(batch))] col_order <- col_batch[batch[order(as.numeric(batch))]] boxplot(norm_order, col = col_order, staplewex = 0, outline = 0, border = col_order, xaxt = "n", ylab="Expression measure") abline(h=0) ## ----pcanorm, fig.cap="ZINB-WaVE: PCA of normalized expression measures, where each point represents a cell. Cells are color-coded by batch (left panel) and by original published clustering (right panel)."---- pca <- prcomp(t(norm)) par(mfrow = c(1,2)) plot(pca$x, col = col_batch[batch], pch = 20, main = "") plot(pca$x, col = col_clus[as.character(publishedClusters)], pch = 20, main = "") ## ----mdsW, fig.cap="ZINB-WaVE: MDS of the low-dimensional matrix W, where each point represents a cell and cells are color-coded by original published clustering."---- W <- colData(se)[, grepl("^W", colnames(colData(se)))] W <- as.matrix(W) d <- dist(W) fit <- cmdscale(d, eig = TRUE, k = 2) plot(fit$points, col = col_clus[as.character(publishedClusters)], main = "", pch = 20, xlab = "Component 1", ylab = "Component 2") legend(x = "topleft", legend = unique(names(col_clus)), cex = .5, fill = unique(col_clus), title = "Sample") ## ----rsec_50,eval=FALSE-------------------------------------------------- # seObj <- SummarizedExperiment(t(W), colData = colData(core)) # print(system.time(ceObj <- RSEC(seObj, k0s = 4:15, alphas = c(0.1), # betas = 0.8, dimReduce="none", # clusterFunction = "hierarchical01", minSizes=1, # ncores = NCORES, isCount=FALSE, # dendroReduce="none",dendroNDims=NA, # subsampleArgs = list(resamp.num=100, # clusterFunction="kmeans", # clusterArgs=list(nstart=10)), # verbose=TRUE, # combineProportion = 0.7, # mergeMethod = "none", random.seed=424242, # combineMinSize = 10))) # save(seObj, file= 'seObj_after_RSEC.rda') # save(ceObj, file= 'ceObj_after_RSEC.rda') ## ------------------------------------------------------------------------ load(sprintf('%sceObj_after_RSEC.rda', data_dir)) load(sprintf('%sseObj_after_RSEC.rda', data_dir)) ## ----examineCombineMany, fig.cap="RSEC: Candidate clusterings found using the function RSEC from the clusterExperiment package."---- plotClusters(ceObj, colPalette = c(bigPalette, rainbow(199))) ## ----plotcoclust, fig.cap="RSEC: Heatmap of co-clustering matrix."------- plotCoClustering(ceObj) ## ----tableclust---------------------------------------------------------- table(primaryClusterNamed(ceObj)) ## ----barplotOurs, fig.cap="RSEC: Barplot of number of cells per cluster for our workflow's RSEC clustering."---- plotBarplot(ceObj, legend = FALSE) ## ----addPublishedClusters, fig.cap="RSEC: Barplot of number of cells per cluster, for our workflow's RSEC clustering, color-coded by original published clustering."---- ceObj <- addClusters(ceObj, colData(ceObj)$publishedClusters, clusterLabel = "publishedClusters") ## change default color to match with Figure 7 clusterLegend(ceObj)$publishedClusters[, "color"] <- col_clus[clusterLegend(ceObj)$publishedClusters[, "name"]] plotBarplot(ceObj, whichClusters=c("combineMany","publishedClusters"), xlab = "", legend = FALSE) ## ----heatmapsClusters, fig.cap="RSEC: Heatmap of the normalized expression measures for the 1,000 most variable genes, where rows correspond to genes and columns to cells ordered by RSEC clusters."---- # Set colors for cell clusterings colData(ceObj)$publishedClusters <- as.factor(colData(ceObj)$publishedClusters) origClusterColors <- bigPalette[1:nlevels(colData(ceObj)$publishedClusters)] experimentColors <- bigPalette[1:nlevels(colData(ceObj)$Experiment)] batchColors <- bigPalette[1:nlevels(colData(ceObj)$Batch)] metaColors <- list("Experiment" = experimentColors, "Batch" = batchColors, "publishedClusters" = origClusterColors) plotHeatmap(ceObj, visualizeData = assays(se)$normalizedValues, whichClusters = "primary", clusterFeaturesData = "all", clusterSamplesData = "dendrogramValue", breaks = 0.99, sampleData = c("publishedClusters", "Batch", "Experiment"), clusterLegend = metaColors, annLegend = FALSE, main = "") ## ----mdsWce, fig.cap="RSEC: MDS of the low-dimensional matrix W, where each point represents a cell and cells are color-coded by RSEC clustering."---- palDF <- ceObj@clusterLegend[[1]] pal <- palDF[, "color"] names(pal) <- palDF[, "name"] pal["-1"] = "transparent" plot(fit$points, col = pal[primaryClusterNamed(ceObj)], main = "", pch = 20, xlab = "Component1", ylab = "Component2") legend(x = "topleft", legend = names(pal), cex = .5, fill = pal, title = "Sample") ## ----tabagain------------------------------------------------------------ table(data.frame(original = publishedClusters, ours = primaryClusterNamed(ceObj))) ## ----boxplotReg3g, fig.width=6, fig.height=4, fig.cap="RSEC: Boxplots of the log count of gene Reg3g stratified by cluster."---- c4 <- rep("other clusters", ncol(se)) c4[primaryClusterNamed(ceObj) == "c4"] <- "cluster c4" boxplot(log1p(assay(se)["Reg3g", ]) ~ primaryClusterNamed(ceObj), ylab = "Reg3g log counts", cex.axis = .8, cex.lab = .8) ## ------------------------------------------------------------------------ our_cl <- primaryClusterNamed(ceObj) cl <- our_cl[!our_cl %in% c("-1", "c4")] pal <- pal[!names(pal) %in% c("-1", "c4")] X <- W[!our_cl %in% c("-1", "c4"), ] X <- cmdscale(dist(X), k = 4) lineages <- getLineages(X, clusterLabels = cl, start.clus = "c1") ## ----tree, fig.cap="Slingshot: Cells color-coded by cluster in a 4-dimensional MDS space, with connecting lines between cluster centers representing the inferred global lineage structure."---- pairs(lineages, type="lineages", col = pal[cl]) ## ----curves, fig.cap="Slingshot: Cells color-coded by cluster in a 4-dimensional MDS space, with smooth curves representing each inferred lineage."---- lineages <- getCurves(lineages) pairs(lineages, type="curves", col = pal[cl]) ## ----lineages------------------------------------------------------------ lineages ## ----slingshotsupervised, eval=FALSE------------------------------------- # lineages <- getLineages(X, clusterLabels = cl, start.clus = "c1", # end.clus = c("c3", "c7")) # lineagees <- getCurves(lineages) # pairs(lineages, type="curves", col = pal[primaryClusterNamed(ceObj)]) # pairs(lineages, type="lineages", col = pal[primaryClusterNamed(ceObj)], # show.constraints = TRUE) # # lineages ## ----fitgam-------------------------------------------------------------- t <- pseudotime(lineages)[,1] y <- assays(se)$normalizedValues[, !our_cl %in% c("-1", "c4")] gam.pval <- apply(y,1,function(z){ d <- data.frame(z=z, t=t) tmp <- gam(z ~ lo(t), data=d) p <- summary(tmp)[4][[1]][1,5] p }) ## ----heatmapsignificant, fig.cap="DE: Heatmap of the normalized expression measures for the 100 most significantly DE genes for the neuronal lineage, where rows correspond to genes and columns to cells ordered by pseudotime."---- topgenes <- names(sort(gam.pval, decreasing = FALSE))[1:100] heatdata <- y[rownames(se) %in% topgenes, order(t, na.last = NA)] heatclus <- cl[order(t, na.last = NA)] ce <- clusterExperiment(heatdata, heatclus, transformation = identity) #match to existing colors cols <- clusterLegend(ceObj)$combineMany[, "color"] names(cols) <- clusterLegend(ceObj)$combineMany[, "name"] clusterLegend(ce)$cluster1[, "color"] <- cols[clusterLegend(ce)$cluster1[, "name"]] plotHeatmap(ce, clusterSamplesData = "orderSamplesValue", breaks = .99) ## ------------------------------------------------------------------------ sessionInfo()