## ----options, echo=FALSE, results="hide",message=FALSE, error=FALSE, include=FALSE, autodep=TRUE----
if(packageVersion("zinbwave")<"0.99.6") stop("must have current devel version to avoid bugs")
if(packageVersion("clusterExperiment")<"1.3.2") stop("must have current devel version to avoid bugs")
if(packageVersion("scone")<"1.1.2") stop("must have current devel version to avoid bugs")
knitr::opts_chunk$set(cache=TRUE, error=FALSE, message=FALSE, warning=FALSE)
knitr::opts_chunk$set(fig.align="center", fig.width=5, fig.height=5)

## ----schema, echo=FALSE, out.width="90%", fig.cap="Workflow for analyzing scRNA-seq datasets. On the right, main plots generated by the workflow."----
knitr::include_graphics("schema_workflow.png")

## ----stemcelldiff, echo=FALSE, out.width="60%", fig.align="center", fig.cap = "Stem cell differentiation in the mouse olfactory epithelium. This figure was reproduced with kind permission from Fletcher et al. (2017)."----
knitr::include_graphics("stemcelldiff_Fletcher2017_2e.png")

## ----packages------------------------------------------------------------
# Bioconductor
library(BiocParallel)
library(clusterExperiment)
library(scone)
library(zinbwave)

# GitHub
library(slingshot)

# CRAN
library(doParallel)
library(gam)
library(RColorBrewer)

set.seed(20)

## ------------------------------------------------------------------------
register(SerialParam())

## ----parallel,eval=FALSE-------------------------------------------------
#  NCORES <- 2
#  mysystem = Sys.info()[["sysname"]]
#  if (mysystem == "Darwin"){
#    registerDoParallel(NCORES)
#    register(DoparParam())
#  }else if (mysystem == "Linux"){
#    register(bpstart(MulticoreParam(workers=NCORES)))
#  }else{
#    print("Please change this to allow parallel computing on your computer.")
#    register(SerialParam())
#  }

## ----datadir-------------------------------------------------------------
data_dir <- "../data/"

## ----datain,eval=FALSE---------------------------------------------------
#  if (!dir.exists(data_dir)) system(sprintf('mkdir %s', data_dir))
#  
#  urls = c("https://www.ncbi.nlm.nih.gov/geo/download/?acc=GSE95601&format=file&file=GSE95601%5FoeHBCdiff%5FCufflinks%5FeSet%2ERda%2Egz",
#           "https://raw.githubusercontent.com/rufletch/p63-HBC-diff/master/ref/oeHBCdiff_clusterLabels.txt")
#  
#  if(!file.exists(paste0(data_dir, "GSE95601_oeHBCdiff_Cufflinks_eSet.Rda"))) {
#    if (!dir.exists(data_dir)) system(sprintf('mkdir %s', data_dir))
#    download.file(urls[1], paste0(data_dir, "GSE95601_oeHBCdiff_Cufflinks_eSet.Rda.gz"))
#    R.utils::gunzip(paste0(data_dir, "GSE95601_oeHBCdiff_Cufflinks_eSet.Rda.gz"))
#    assayData(Cufflinks_eSet)$exprs = NULL
#    assayData(Cufflinks_eSet)$fpkm_table = NULL
#    assayData(Cufflinks_eSet)$tpm_table = NULL
#    save(Cufflinks_eSet, file='data/GSE95601_oeHBCdiff_Cufflinks_eSet_reduced.Rda')
#  }
#  
#  if(!file.exists(paste0(data_dir, "oeHBCdiff_clusterLabels.txt"))) {
#    download.file(urls[2], paste0(data_dir, "oeHBCdiff_clusterLabels.txt"))
#  }

## ------------------------------------------------------------------------
load(paste0(data_dir, "GSE95601_oeHBCdiff_Cufflinks_eSet_reduced.Rda"))

# Count matrix
E <- assayData(Cufflinks_eSet)$counts_table

# Remove undetected genes
E <- na.omit(E)
E <- E[rowSums(E)>0,]
dim(E)

## ------------------------------------------------------------------------
# Remove ERCC and CreER genes
cre <- E["CreER",]
ercc <- E[grep("^ERCC-", rownames(E)),]
E <- E[grep("^ERCC-", rownames(E), invert = TRUE), ]
E <- E[-which(rownames(E)=="CreER"), ]
dim(E)

## ------------------------------------------------------------------------
# Extract QC metrics
qc <- as.matrix(protocolData(Cufflinks_eSet)@data)[,c(1:5, 10:18)]
qc <- cbind(qc, CreER = cre, ERCC_reads = colSums(ercc))

# Extract metadata
batch <- droplevels(pData(Cufflinks_eSet)$MD_c1_run_id)
bio <- droplevels(pData(Cufflinks_eSet)$MD_expt_condition)
clusterLabels <- read.table(paste0(data_dir, "oeHBCdiff_clusterLabels.txt"),
                            sep = "\t", stringsAsFactors = FALSE)
m <- match(colnames(E), clusterLabels[, 1])

# Create metadata data.frame
metadata <- data.frame("Experiment" = bio,
                       "Batch" = batch,
                       "publishedClusters" = clusterLabels[m,2],
                       qc)

# Symbol for cells not assigned to a lineage in original data
metadata$publishedClusters[is.na(metadata$publishedClusters)] <- -2

se <- SummarizedExperiment(assays = list(counts = E),
                           colData = metadata)
se

## ----scone, fig.cap="SCONE: Filtering of low-quality cells."-------------
# QC-metric-based sample-filtering
data("housekeeping")
hk = rownames(se)[toupper(rownames(se)) %in% housekeeping$V1]

mfilt <- metric_sample_filter(assay(se), 
                              nreads = colData(se)$NREADS,
                              ralign = colData(se)$RALIGN,
                              pos_controls = rownames(se) %in% hk,
                              zcut = 3, mixture = FALSE,
                              plot = TRUE)

## ----sconeFilt-----------------------------------------------------------
# Simplify to a single logical
mfilt <- !apply(simplify2array(mfilt[!is.na(mfilt)]), 1, any)
se <- se[, mfilt]
dim(se)

## ------------------------------------------------------------------------
# Filtering to top 1,000 most variable genes
vars <- rowVars(log1p(assay(se)))
names(vars) <- rownames(se)
vars <- sort(vars, decreasing = TRUE)
core <- se[names(vars)[1:1000],]

## ------------------------------------------------------------------------
core

## ----batch---------------------------------------------------------------
batch <- colData(core)$Batch
col_batch = c(brewer.pal(9, "Set1"), brewer.pal(8, "Dark2"), 
              brewer.pal(8, "Accent")[1])
names(col_batch) = unique(batch)
table(batch)

## ----original------------------------------------------------------------
publishedClusters <- colData(core)[, "publishedClusters"]
col_clus <- c("transparent", "#1B9E77", "antiquewhite2", "cyan", "#E7298A", 
              "#A6CEE3", "#666666", "#E6AB02", "#FFED6F", "darkorchid2", 
              "#B3DE69", "#FF7F00", "#A6761D", "#1F78B4")
names(col_clus) <- sort(unique(publishedClusters))
table(publishedClusters)

## ----clustbatch----------------------------------------------------------
table(data.frame(batch = as.vector(batch),
                 cluster = publishedClusters))

## ----zinbschema, echo=FALSE, out.width="95%", fig.cap="ZINB-WaVE: Schematic view of the ZINB-WaVE model. This figure was reproduced with kind permission from Risso et al. (2017)."----
knitr::include_graphics("zinb_schema.png")

## ----zinb,eval=FALSE-----------------------------------------------------
#  print(system.time(se <- zinbwave(core, K = 50, X = "~ Batch",
#                                   residuals = TRUE,
#                                   normalizedValues = TRUE)))
#  save(se, file = 'se_after_zinbwave.rda')

## ------------------------------------------------------------------------
load(sprintf('%sse_after_zinbwave.rda', data_dir))

## ----norm----------------------------------------------------------------
norm <- assays(se)$normalizedValues
norm[1:3,1:3]

## ----boxplotNorm, fig.cap="ZINB-WaVE: Boxplots of normalized expression measures (deviance residuals), color-coded by batch."----
norm_order <- norm[, order(as.numeric(batch))]
col_order <- col_batch[batch[order(as.numeric(batch))]]
boxplot(norm_order, col = col_order, staplewex = 0, outline = 0,
        border = col_order, xaxt = "n", ylab="Expression measure")
abline(h=0)

## ----pcanorm, fig.cap="ZINB-WaVE: PCA of normalized expression measures, where each point represents a cell. Cells are color-coded by batch (left panel) and by original published clustering (right panel)."----
pca <- prcomp(t(norm))
par(mfrow = c(1,2))
plot(pca$x, col = col_batch[batch], pch = 20, main = "")
plot(pca$x, col = col_clus[as.character(publishedClusters)], pch = 20, main = "")

## ----mdsW, fig.cap="ZINB-WaVE: MDS of the low-dimensional matrix W, where each point represents a cell and cells are color-coded by original published clustering."----
W <- colData(se)[, grepl("^W", colnames(colData(se)))]
W <- as.matrix(W)
d <- dist(W)
fit <- cmdscale(d, eig = TRUE, k = 2)
plot(fit$points, col = col_clus[as.character(publishedClusters)], main = "",
     pch = 20, xlab = "Component 1", ylab = "Component 2")
legend(x = "topleft", legend = unique(names(col_clus)), cex = .5,
       fill = unique(col_clus), title = "Sample")

## ----rsec_50,eval=FALSE--------------------------------------------------
#  seObj <- SummarizedExperiment(t(W), colData = colData(core))
#  print(system.time(ceObj <- RSEC(seObj, k0s = 4:15, alphas = c(0.1),
#                                  betas = 0.8, dimReduce="none",
#                                  clusterFunction = "hierarchical01", minSizes=1,
#                                  ncores = NCORES, isCount=FALSE,
#                                  dendroReduce="none",dendroNDims=NA,
#                                  subsampleArgs = list(resamp.num=100,
#                                                       clusterFunction="kmeans",
#                                                       clusterArgs=list(nstart=10)),
#                                  verbose=TRUE,
#                                  combineProportion = 0.7,
#                                  mergeMethod = "none", random.seed=424242,
#                                  combineMinSize = 10)))
#  save(seObj, file= 'seObj_after_RSEC.rda')
#  save(ceObj, file= 'ceObj_after_RSEC.rda')

## ------------------------------------------------------------------------
load(sprintf('%sceObj_after_RSEC.rda', data_dir))
load(sprintf('%sseObj_after_RSEC.rda', data_dir))

## ----examineCombineMany, fig.cap="RSEC: Candidate clusterings found using the function RSEC from the clusterExperiment package."----
plotClusters(ceObj, colPalette = c(bigPalette, rainbow(199)))

## ----plotcoclust, fig.cap="RSEC: Heatmap of co-clustering matrix."-------
plotCoClustering(ceObj)

## ----tableclust----------------------------------------------------------
table(primaryClusterNamed(ceObj))

## ----barplotOurs, fig.cap="RSEC: Barplot of number of cells per cluster for our workflow's RSEC clustering."----
plotBarplot(ceObj, legend = FALSE) 

## ----addPublishedClusters, fig.cap="RSEC: Barplot of number of cells per cluster, for our workflow's RSEC clustering, color-coded by original published clustering."----
ceObj <- addClusters(ceObj, colData(ceObj)$publishedClusters, 
                     clusterLabel = "publishedClusters")

## change default color to match with Figure 7
clusterLegend(ceObj)$publishedClusters[, "color"] <- 
  col_clus[clusterLegend(ceObj)$publishedClusters[, "name"]]

plotBarplot(ceObj, whichClusters=c("combineMany","publishedClusters"),
            xlab = "", legend = FALSE)

## ----heatmapsClusters, fig.cap="RSEC: Heatmap of the normalized expression measures for the 1,000 most variable genes, where rows correspond to genes and columns to cells ordered by RSEC clusters."----
# Set colors for cell clusterings
colData(ceObj)$publishedClusters <- as.factor(colData(ceObj)$publishedClusters)
origClusterColors <- bigPalette[1:nlevels(colData(ceObj)$publishedClusters)]
experimentColors <- bigPalette[1:nlevels(colData(ceObj)$Experiment)]
batchColors <- bigPalette[1:nlevels(colData(ceObj)$Batch)]
metaColors <- list("Experiment" = experimentColors,
                   "Batch" = batchColors,
                   "publishedClusters" = origClusterColors)

plotHeatmap(ceObj, visualizeData = assays(se)$normalizedValues,
            whichClusters = "primary", clusterFeaturesData = "all",
            clusterSamplesData = "dendrogramValue", breaks = 0.99,
            sampleData = c("publishedClusters", "Batch", "Experiment"),
            clusterLegend = metaColors, annLegend = FALSE, main = "")

## ----mdsWce, fig.cap="RSEC: MDS of the low-dimensional matrix W, where each point represents a cell and cells are color-coded by RSEC clustering."----
palDF <- ceObj@clusterLegend[[1]]
pal <- palDF[, "color"]
names(pal) <- palDF[, "name"]
pal["-1"] = "transparent"
plot(fit$points, col = pal[primaryClusterNamed(ceObj)], main = "", pch = 20,
     xlab = "Component1", ylab = "Component2")
legend(x = "topleft", legend = names(pal), cex = .5,
       fill = pal, title = "Sample")

## ----tabagain------------------------------------------------------------
table(data.frame(original = publishedClusters, ours = primaryClusterNamed(ceObj)))

## ----boxplotReg3g, fig.width=6, fig.height=4, fig.cap="RSEC: Boxplots of the log count of gene Reg3g stratified by cluster."----
c4 <- rep("other clusters", ncol(se)) 
c4[primaryClusterNamed(ceObj) == "c4"] <- "cluster c4"
boxplot(log1p(assay(se)["Reg3g", ]) ~ primaryClusterNamed(ceObj), 
        ylab = "Reg3g log counts", cex.axis = .8, cex.lab = .8)

## ------------------------------------------------------------------------
our_cl <- primaryClusterNamed(ceObj)
cl <- our_cl[!our_cl %in% c("-1", "c4")]
pal <- pal[!names(pal) %in% c("-1", "c4")]
X <- W[!our_cl %in% c("-1", "c4"), ]
X <- cmdscale(dist(X), k = 4)

lineages <- getLineages(X, clusterLabels = cl, start.clus = "c1")

## ----tree, fig.cap="Slingshot: Cells color-coded by cluster in a 4-dimensional MDS space, with connecting lines between cluster centers representing the inferred global lineage structure."----
pairs(lineages, type="lineages", col = pal[cl])

## ----curves, fig.cap="Slingshot: Cells color-coded by cluster in a 4-dimensional MDS space, with smooth curves representing each inferred lineage."----
lineages <- getCurves(lineages)
pairs(lineages, type="curves", col = pal[cl])

## ----lineages------------------------------------------------------------
lineages

## ----slingshotsupervised, eval=FALSE-------------------------------------
#  lineages <- getLineages(X, clusterLabels = cl, start.clus = "c1",
#                           end.clus = c("c3", "c7"))
#  lineagees <- getCurves(lineages)
#  pairs(lineages, type="curves", col = pal[primaryClusterNamed(ceObj)])
#  pairs(lineages, type="lineages", col = pal[primaryClusterNamed(ceObj)],
#        show.constraints = TRUE)
#  
#  lineages

## ----fitgam--------------------------------------------------------------
t <- pseudotime(lineages)[,1]
y <- assays(se)$normalizedValues[, !our_cl %in% c("-1", "c4")]
gam.pval <- apply(y,1,function(z){
  d <- data.frame(z=z, t=t)
  tmp <- gam(z ~ lo(t), data=d)
  p <- summary(tmp)[4][[1]][1,5]
  p
})

## ----heatmapsignificant, fig.cap="DE: Heatmap of the normalized expression measures for the 100 most significantly DE genes for the neuronal lineage, where rows correspond to genes and columns to cells ordered by pseudotime."----
topgenes <- names(sort(gam.pval, decreasing = FALSE))[1:100]
heatdata <- y[rownames(se) %in% topgenes, order(t, na.last = NA)]
heatclus <- cl[order(t, na.last = NA)]
ce <- clusterExperiment(heatdata, heatclus, transformation = identity)

#match to existing colors
cols <- clusterLegend(ceObj)$combineMany[, "color"]
names(cols) <- clusterLegend(ceObj)$combineMany[, "name"]
clusterLegend(ce)$cluster1[, "color"] <- cols[clusterLegend(ce)$cluster1[, "name"]]

plotHeatmap(ce, clusterSamplesData = "orderSamplesValue", breaks = .99)

## ------------------------------------------------------------------------
sessionInfo()