## ----include = FALSE---------------------------------------------------------- knitr::opts_chunk$set( collapse = TRUE, comment = "#>" ) set.seed(2) ## ----------------------------------------------------------------------------- library(glmGamPoi) ## ----------------------------------------------------------------------------- # overdispersion = 1/size counts <- rnbinom(n = 10, mu = 5, size = 1/0.7) # design = ~ 1 means that an intercept-only model is fit fit <- glm_gp(counts, design = ~ 1) fit # Internally fit is just a list: as.list(fit)[1:2] ## ----warning=FALSE, message = FALSE------------------------------------------- library(SummarizedExperiment) library(DelayedMatrixStats) ## ----------------------------------------------------------------------------- # The full dataset with 33,000 genes and 4340 cells # The first time this is run, it will download the data pbmcs <- TENxPBMCData::TENxPBMCData("pbmc4k") # I want genes where at least some counts are non-zero non_empty_rows <- which(rowSums2(assay(pbmcs)) > 0) pbmcs_subset <- pbmcs[sample(non_empty_rows, 300), ] pbmcs_subset ## ----------------------------------------------------------------------------- fit <- glm_gp(pbmcs_subset, on_disk = FALSE) summary(fit) ## ----warning=FALSE------------------------------------------------------------ # Explicitly realize count matrix in memory so that it is a fair comparison pbmcs_subset <- as.matrix(assay(pbmcs_subset)) model_matrix <- matrix(1, nrow = ncol(pbmcs_subset)) bench::mark( glmGamPoi_in_memory = { glm_gp(pbmcs_subset, design = model_matrix, on_disk = FALSE) }, glmGamPoi_on_disk = { glm_gp(pbmcs_subset, design = model_matrix, on_disk = TRUE) }, DESeq2 = suppressMessages({ dds <- DESeq2::DESeqDataSetFromMatrix(pbmcs_subset, colData = data.frame(name = seq_len(4340)), design = ~ 1) dds <- DESeq2::estimateSizeFactors(dds, "poscounts") dds <- DESeq2::estimateDispersions(dds, quiet = TRUE) dds <- DESeq2::nbinomWaldTest(dds, minmu = 1e-6) }), edgeR = { edgeR_data <- edgeR::DGEList(pbmcs_subset) edgeR_data <- edgeR::calcNormFactors(edgeR_data) edgeR_data <- edgeR::estimateDisp(edgeR_data, model_matrix) edgeR_fit <- edgeR::glmFit(edgeR_data, design = model_matrix) }, check = FALSE, min_iterations = 3 ) ## ----message=FALSE, warning=FALSE--------------------------------------------- # Results with my method fit <- glm_gp(pbmcs_subset, design = model_matrix, on_disk = FALSE) # DESeq2 dds <- DESeq2::DESeqDataSetFromMatrix(pbmcs_subset, colData = data.frame(name = seq_len(4340)), design = ~ 1) sizeFactors(dds) <- fit$size_factors dds <- DESeq2::estimateDispersions(dds, quiet = TRUE) dds <- DESeq2::nbinomWaldTest(dds, minmu = 1e-6) #edgeR edgeR_data <- edgeR::DGEList(pbmcs_subset, lib.size = fit$size_factors) edgeR_data <- edgeR::estimateDisp(edgeR_data, model_matrix) edgeR_fit <- edgeR::glmFit(edgeR_data, design = model_matrix) ## ----coefficientComparison, fig.height=5, fig.width=10, warning=FALSE, echo = FALSE---- par(mfrow = c(2, 4), cex.main = 2, cex.lab = 1.5) plot(fit$Beta[,1], coef(dds)[,1] / log2(exp(1)), pch = 16, main = "Beta Coefficients", xlab = "glmGamPoi", ylab = "DESeq2") abline(0,1) plot(fit$Beta[,1], edgeR_fit$unshrunk.coefficients[,1], pch = 16, main = "Beta Coefficients", xlab = "glmGamPoi", ylab = "edgeR") abline(0,1) plot(fit$Mu[,1], assay(dds, "mu")[,1], pch = 16, log="xy", main = "Gene Mean", xlab = "glmGamPoi", ylab = "DESeq2") abline(0,1) plot(fit$Mu[,1], edgeR_fit$fitted.values[,1], pch = 16, log="xy", main = "Gene Mean", xlab = "glmGamPoi", ylab = "edgeR") abline(0,1) plot(fit$overdispersions, rowData(dds)$dispGeneEst, pch = 16, log="xy", main = "Overdispersion", xlab = "glmGamPoi", ylab = "DESeq2") abline(0,1) plot(fit$overdispersions, edgeR_fit$dispersion, pch = 16, log="xy", main = "Overdispersion", xlab = "glmGamPoi", ylab = "edgeR") abline(0,1) ## ----------------------------------------------------------------------------- sce <- muscData::Kang18_8vs8() colData(sce) ## ----------------------------------------------------------------------------- set.seed(1) # Take highly expressed genes and proper cells: sce_subset <- sce[rowSums(counts(sce)) > 100, sample(which(sce$multiplets == "singlet" & ! is.na(sce$cell) & sce$cell %in% c("CD4 T cells", "B cells", "NK cells")), 1000)] # Convert counts to dense matrix counts(sce_subset) <- as.matrix(counts(sce_subset)) # Remove empty levels because glm_gp() will complain otherwise sce_subset$cell <- droplevels(sce_subset$cell) ## ----------------------------------------------------------------------------- fit <- glm_gp(sce_subset, design = ~ cell + stim + stim:cell - 1, reference_level = "NK cells") summary(fit) ## ----------------------------------------------------------------------------- colnames(fit$Beta) ## ----------------------------------------------------------------------------- # The contrast argument specifies what we want to compare # We test the expression difference of stimulated and control T-cells # # There is no sample label in the colData, so we create it on the fly # from `stim` and `ind` columns in colData(fit$data). de_res <- test_de(fit, contrast = `stimstim` + `cellCD4 T cells:stimstim`, pseudobulk_by = paste0(stim, "-", ind)) # The large `lfc` values come from groups were nearly all counts are 0 # Setting them to Inf makes the plots look nicer de_res$lfc <- ifelse(abs(de_res$lfc) > 20, sign(de_res$lfc) * Inf, de_res$lfc) # Most different genes head(de_res[order(de_res$pval), ]) ## ----------------------------------------------------------------------------- library(ggplot2) ggplot(de_res, aes(x = lfc, y = -log10(pval))) + geom_point(size = 0.6, aes(color = adj_pval < 0.1)) + ggtitle("Volcano Plot", "Genes that change most through interferon-beta treatment in T cells") ## ----------------------------------------------------------------------------- marker_genes <- test_de(fit, `cellCD4 T cells` - `cellB cells`, sort_by = pval) head(marker_genes) ## ----------------------------------------------------------------------------- marker_genes2 <- test_de(fit, (`cellCD4 T cells` + `cellCD4 T cells:stimstim`) - (`cellB cells` + `cellB cells:stimstim`), sort_by = pval) head(marker_genes2) ## ----------------------------------------------------------------------------- # Create a data.frame with the expression values, gene names, and cell types tmp <- data.frame(gene = rep(marker_genes$name[1:6], times = ncol(sce_subset)), expression = c(counts(sce_subset)[marker_genes$name[1:6], ]), celltype = rep(sce_subset$cell, each = 6)) ggplot(tmp, aes(x = celltype, y = expression)) + geom_jitter(height = 0.1) + stat_summary(geom = "crossbar", fun = "mean", color = "red") + facet_wrap(~ gene, scales = "free_y") + ggtitle("Marker genes of B vs. T cells") ## ----------------------------------------------------------------------------- sessionInfo()