## ----style, echo=FALSE, message=FALSE, warning=FALSE, results="asis"---------- library(knitr) library(rmarkdown) opts_chunk$set(message=TRUE, warning=FALSE, error=FALSE, cache=FALSE, fig.width=5, fig.height=5) ## ----diagram, echo=FALSE, out.width="100%", fig.cap="Diagram of the methods presented in this workflow. The left-side shows two paths for performing differential transcript usage (DTU) using Bioconductor packages and the right-side shows two paths for performing differential gene expression (DGE). DTU and DGE are complementary analyses of changes in transcription across conditions. This workflow focuses mostly on DTU, as there are a number of other published Bioconductor workflows for DGE. In bold are the recommended choices for quantification and filtering Salmon transcript-level data as input to the statistical methods. The recommended filters implemented in DRIMSeq, and applied upstream of DRIMSeq and DEXSeq, are discussed in this workflow."---- knitr::include_graphics("figs/swimdown_diagram.png") ## ----ma-simulated, message=FALSE, echo=FALSE, dev="png", out.width="75%", fig.cap="MA plot of simulated abundances. Each point depicts a transcript, with the average log2 abundance in transcripts-per-million (TPM) on the x-axis and the difference between the two groups on the y-axis. Of the 35,850 transcripts which are expressed with TPM > 1 in at least one group, 77\\% (n=27,429) are null transcripts (grey), which fall by construction on the M=0 line, and 23\\% (n=8,421) are differentially expressed (green, orange, or purple). This filtering of 1 TPM is for visualization only and unrelated to the DRIMSeq filtering used in the workflow. As transcripts can belong to multiple categories of differential gene expression (DGE), differential transcript expression (DTE), and differential transcript usage (DTU), here the transcripts are colored by which genes they belong to (those selected to be DGE-, DTE-, or DTU-by-construction)."---- library(rnaseqDTU) library(rafalib) data(simulate) bigpar() pc <- 1 col <- rep(8, nrow(tpms)) col[iso.dge] <- 1 col[iso.dte] <- 2 col[iso.dtu] <- 3 maplot(log2(tpms[,1]+pc), log2(tpms[,2]+pc), n=nrow(tpms), curve.add=FALSE, col=col, cex=.25, pch=20) legend("bottomright", c("DGE","DTE","DTU","null"), col=c(1:3,8), pch=20, bty="n") ## ----------------------------------------------------------------------------- library(rnaseqDTU) csv.dir <- system.file("extdata", package="rnaseqDTU") ## ----------------------------------------------------------------------------- samps <- read.csv(file.path(csv.dir, "samples.csv")) head(samps) samps$condition <- factor(samps$condition) table(samps$condition) files <- file.path("/path/to/dir", samps$sample_id, "quant.sf") names(files) <- samps$sample_id head(files) ## ----eval=FALSE--------------------------------------------------------------- # library(tximport) # txi <- tximport(files, type="salmon", txOut=TRUE, # countsFromAbundance="scaledTPM") # cts <- txi$counts # cts <- cts[rowSums(cts) > 0,] ## ----eval=FALSE--------------------------------------------------------------- # library(GenomicFeatures) # gtf <- "gencode.v28.annotation.gtf.gz" # txdb.filename <- "gencode.v28.annotation.sqlite" # txdb <- makeTxDbFromGFF(gtf) # saveDb(txdb, txdb.filename) ## ----eval=FALSE--------------------------------------------------------------- # txdb <- loadDb(txdb.filename) # txdf <- select(txdb, keys(txdb, "GENEID"), "TXNAME", "GENEID") # tab <- table(txdf$GENEID) # txdf$ntx <- tab[match(txdf$GENEID, names(tab))] ## ----------------------------------------------------------------------------- data(salmon_cts) cts[1:3,1:3] range(colSums(cts)/1e6) ## ----------------------------------------------------------------------------- data(simulate) head(txdf) all(rownames(cts) %in% txdf$TXNAME) txdf <- txdf[match(rownames(cts),txdf$TXNAME),] all(rownames(cts) == txdf$TXNAME) ## ----------------------------------------------------------------------------- counts <- data.frame(gene_id=txdf$GENEID, feature_id=txdf$TXNAME, cts) ## ----------------------------------------------------------------------------- library(DRIMSeq) d <- dmDSdata(counts=counts, samples=samps) d ## ----------------------------------------------------------------------------- methods(class=class(d)) counts(d[1,])[,1:4] ## ----------------------------------------------------------------------------- n <- 12 n.small <- 6 d <- dmFilter(d, min_samps_feature_expr=n.small, min_feature_expr=10, min_samps_feature_prop=n.small, min_feature_prop=0.1, min_samps_gene_expr=n, min_gene_expr=10) d ## ----------------------------------------------------------------------------- table(table(counts(d)$gene_id)) ## ----------------------------------------------------------------------------- design_full <- model.matrix(~condition, data=DRIMSeq::samples(d)) colnames(design_full) ## ----------------------------------------------------------------------------- d <- d[1:250,] 7764 / 250 ## ----------------------------------------------------------------------------- set.seed(1) system.time({ d <- dmPrecision(d, design=design_full) d <- dmFit(d, design=design_full) d <- dmTest(d, coef="condition2") }) ## ----------------------------------------------------------------------------- res <- DRIMSeq::results(d) head(res) res.txp <- DRIMSeq::results(d, level="feature") head(res.txp) ## ----------------------------------------------------------------------------- no.na <- function(x) ifelse(is.na(x), 1, x) res$pvalue <- no.na(res$pvalue) res.txp$pvalue <- no.na(res.txp$pvalue) ## ----plot-prop, out.width="75%", fig.cap="Estimated proportions for one of the significant genes."---- idx <- which(res$adj_pvalue < 0.05)[1] res[idx,] plotProportions(d, res$gene_id[idx], "condition") ## ----------------------------------------------------------------------------- data(drim_tables) nrow(res) nrow(res.txp) ## ----------------------------------------------------------------------------- pScreen <- res$pvalue strp <- function(x) substr(x,1,15) names(pScreen) <- strp(res$gene_id) ## ----------------------------------------------------------------------------- pConfirmation <- matrix(res.txp$pvalue, ncol=1) rownames(pConfirmation) <- strp(res.txp$feature_id) ## ----------------------------------------------------------------------------- tx2gene <- res.txp[,c("feature_id", "gene_id")] for (i in 1:2) tx2gene[,i] <- strp(tx2gene[,i]) ## ----message=FALSE------------------------------------------------------------ library(stageR) stageRObj <- stageRTx(pScreen=pScreen, pConfirmation=pConfirmation, pScreenAdjusted=FALSE, tx2gene=tx2gene) stageRObj <- stageWiseAdjustment(stageRObj, method="dtu", alpha=0.05) suppressWarnings({ drim.padj <- getAdjustedPValues(stageRObj, order=FALSE, onlySignificantGenes=TRUE) }) head(drim.padj) ## ----------------------------------------------------------------------------- res.txp.filt <- DRIMSeq::results(d, level="feature") smallProportionSD <- function(d, filter=0.1) { cts <- as.matrix(subset(counts(d), select=-c(gene_id, feature_id))) gene.cts <- rowsum(cts, counts(d)$gene_id) total.cts <- gene.cts[match(counts(d)$gene_id, rownames(gene.cts)),] props <- cts/total.cts propSD <- sqrt(rowVars(props)) propSD < filter } filt <- smallProportionSD(d) res.txp.filt$pvalue[filt] <- 1 res.txp.filt$adj_pvalue[filt] <- 1 ## ----message=FALSE------------------------------------------------------------ library(DEXSeq) sample.data <- DRIMSeq::samples(d) count.data <- round(as.matrix(counts(d)[,-c(1:2)])) dxd <- DEXSeqDataSet(countData=count.data, sampleData=sample.data, design=~sample + exon + condition:exon, featureID=counts(d)$feature_id, groupID=counts(d)$gene_id) ## ----------------------------------------------------------------------------- system.time({ dxd <- estimateSizeFactors(dxd) dxd <- estimateDispersions(dxd, quiet=TRUE) dxd <- testForDEU(dxd, reducedModel=~sample + exon) }) ## ----------------------------------------------------------------------------- dxr <- DEXSeqResults(dxd, independentFiltering=FALSE) qval <- perGeneQValue(dxr) dxr.g <- data.frame(gene=names(qval),qval) ## ----------------------------------------------------------------------------- columns <- c("featureID","groupID","pvalue") dxr <- as.data.frame(dxr[,columns]) head(dxr) ## ----------------------------------------------------------------------------- data(dex_tables) ## ----------------------------------------------------------------------------- library(stageR) strp <- function(x) substr(x,1,15) pConfirmation <- matrix(dxr$pvalue,ncol=1) dimnames(pConfirmation) <- list(strp(dxr$featureID),"transcript") pScreen <- qval names(pScreen) <- strp(names(pScreen)) tx2gene <- as.data.frame(dxr[,c("featureID", "groupID")]) for (i in 1:2) tx2gene[,i] <- strp(tx2gene[,i]) ## ----------------------------------------------------------------------------- stageRObj <- stageRTx(pScreen=pScreen, pConfirmation=pConfirmation, pScreenAdjusted=TRUE, tx2gene=tx2gene) stageRObj <- stageWiseAdjustment(stageRObj, method="dtu", alpha=0.05) suppressWarnings({ dex.padj <- getAdjustedPValues(stageRObj, order=FALSE, onlySignificantGenes=TRUE) }) head(dex.padj) ## ----eval=FALSE--------------------------------------------------------------- # txi.g <- tximport(files, type="salmon", tx2gene=txdf[,2:1]) ## ----------------------------------------------------------------------------- data(salmon_gene_txi) library(DESeq2) dds <- DESeqDataSetFromTximport(txi.g, samps, ~condition) ## ----message=FALSE------------------------------------------------------------ dds <- DESeq(dds) dres <- DESeq2::results(dds) ## ----------------------------------------------------------------------------- all(dxr.g$gene %in% rownames(dres)) dres <- dres[dxr.g$gene,] # we can only color because we simulated... col <- rep(8, nrow(dres)) col[rownames(dres) %in% dge.genes] <- 1 col[rownames(dres) %in% dte.genes] <- 2 col[rownames(dres) %in% dtu.genes] <- 3 ## ----tuge-plot, dev="png", out.width="75%", fig.cap="Transcript usage over gene expression plot. Each point represents a gene, and plotted are -log10 adjusted p-values for DEXSeq's test of differential transcript usage (y-axis) and DESeq2's test of differential gene expression (x-axis). Because we simulated the data we can color the genes according to their true category."---- bigpar() # here cap the smallest DESeq2 adj p-value cap.padj <- pmin(-log10(dres$padj), 100) # this vector only used for plotting jitter.padj <- -log10(dxr.g$qval + 1e-20) jp.idx <- jitter.padj == 20 jitter.padj[jp.idx] <- rnorm(sum(jp.idx),20,.25) plot(cap.padj, jitter.padj, col=col, xlab="Gene expression", ylab="Transcript usage") legend("topright", c("DGE","DTE","DTU","null"), col=c(1:3,8), pch=20, bty="n") ## ----message=FALSE------------------------------------------------------------ library(edgeR) cts.g <- txi.g$counts normMat <- txi.g$length normMat <- normMat / exp(rowMeans(log(normMat))) o <- log(calcNormFactors(cts.g/normMat)) + log(colSums(cts.g/normMat)) y <- DGEList(cts.g) y <- scaleOffset(y, t(t(log(normMat)) + o)) keep <- filterByExpr(y) y <- y[keep,] ## ----------------------------------------------------------------------------- y <- estimateDisp(y, design_full) fit <- glmFit(y, design_full) lrt <- glmLRT(fit) tt <- topTags(lrt, n=nrow(y), sort="none")[[1]] ## ----------------------------------------------------------------------------- common <- intersect(res$gene_id, rownames(tt)) tt <- tt[common,] res.sub <- res[match(common, res$gene_id),] # we can only color because we simulated... col <- rep(8, nrow(tt)) col[rownames(tt) %in% dge.genes] <- 1 col[rownames(tt) %in% dte.genes] <- 2 col[rownames(tt) %in% dtu.genes] <- 3 ## ----tuge-drim-edger, dev="png", out.width="75%", fig.cap="Transcript usage over gene expression plot, as previously, but for DRIMSeq and edgeR."---- bigpar() plot(-log10(tt$FDR), -log10(res.sub$adj_pvalue), col=col, xlab="Gene expression", ylab="Transcript usage") legend("topright", c("DGE","DTE","DTU","null"), col=c(1:3,8), pch=20, bty="n") ## ----tuge-drim-zoom, dev="png", out.width="75%", fig.cap="Transcript usage over gene expression plot, zooming in on the DRIMSeq adjusted p-values."---- bigpar() plot(-log10(tt$FDR), -log10(res.sub$adj_pvalue), col=col, xlab="Gene expression", ylab="Transcript usage", ylim=c(0,20)) legend("topright", c("DGE","DTE","DTU","null"), col=c(1:3,8), pch=20, bty="n") ## ----------------------------------------------------------------------------- devtools::session_info()