options(digits=3, width=80) library( "parathyroidSE" ) extDataDir <- system.file("extdata", package = "parathyroidSE", mustWork = TRUE) extDataDir list.files( extDataDir ) sampleTable <- read.csv( "/path/to/your/sampleTable.csv", header=TRUE ) sampleTable bamFiles <- file.path( extDataDir, sampleTable$fileName ) library( "Rsamtools" ) seqinfo( BamFile( bamFiles[1] ) ) library( "GenomicFeatures" ) hse <- makeTranscriptDbFromGFF( "/path/to/your/genemodel_file.GTF", format="gtf" ) exonsByGene <- exonsBy( hse, by="gene" ) exonsByGene library( "GenomicAlignments" ) se <- summarizeOverlaps( exonsByGene, BamFileList( bamFiles ), mode="Union", singleEnd=FALSE, ignore.strand=TRUE, fragments=TRUE ) se head( assay(se) ) colSums( assay(se) ) colData(se) rowData(se) colData(se) <- DataFrame( sampleTable ) colData(se)$treatment se$treatment colnames(se) <- sampleTable$sampleName head( assay(se) ) data( "parathyroidGenesSE" ) se <- parathyroidGenesSE str( metadata( rowData(se) ) ) colData(se)[1:5,1:4] library( "DESeq2" ) ddsFull <- DESeqDataSet( se, design = ~ patient + treatment ) as.data.frame( colData( ddsFull )[ ,c("sample","patient","treatment","time") ] ) ddsCollapsed <- collapseReplicates( ddsFull, groupby = ddsFull$sample, run = ddsFull$run ) head( as.data.frame( colData(ddsCollapsed)[ ,c("sample","runsCollapsed") ] ), 12 ) original <- rowSums( counts(ddsFull)[ , ddsFull$sample == "SRS308873" ] ) all( original == counts(ddsCollapsed)[ ,"SRS308873" ] ) dds <- ddsCollapsed[ , ddsCollapsed$time == "48h" ] dds$time <- droplevels( dds$time ) dds$treatment <- relevel( dds$treatment, "Control" ) as.data.frame( colData(dds) ) dds <- dds[ rowSums( counts(dds) ) > 0 , ] design(dds) dds <- DESeq(dds) res <- results( dds ) res mcols(res, use.names=TRUE) idx <- which.min(res$pvalue) counts(dds)[ idx, ] counts(dds, normalized=TRUE)[ idx, ] resOHT <- res res <- results( dds, contrast = c("treatment", "DPN", "Control") ) res library( "org.Hs.eg.db" ) columns(org.Hs.eg.db) convertIDs <- function( ids, fromKey, toKey, db, ifMultiple=c( "putNA", "useFirst" ) ) { stopifnot( inherits( db, "AnnotationDb" ) ) ifMultiple <- match.arg( ifMultiple ) suppressWarnings( selRes <- AnnotationDbi::select( db, keys=ids, keytype=fromKey, columns=c(fromKey,toKey) ) ) if( ifMultiple == "putNA" ) { duplicatedIds <- selRes[ duplicated( selRes[,1] ), 1 ] selRes <- selRes[ ! selRes[,1] %in% duplicatedIds, ] } return( selRes[ match( ids, selRes[,1] ), 2 ] ) } res$hgnc_symbol <- convertIDs( row.names(res), "ENSEMBL", "SYMBOL", org.Hs.eg.db ) res$entrezid <- convertIDs( row.names(res), "ENSEMBL", "ENTREZID", org.Hs.eg.db ) head(res, 4) sum( res$pvalue < 0.01, na.rm=TRUE ) table( is.na(res$pvalue) ) sum( res$padj < 0.1, na.rm=TRUE ) resSig <- subset(res, res$padj < 0.1 ) head( resSig[ order( resSig$log2FoldChange ), ], 4) head( resSig[ order( -resSig$log2FoldChange ), ], 4) plotMA( res, ylim = c(-3, 3) ) plotDispEsts( dds, ylim = c(1e-6, 1e1) ) hist( res$pvalue, breaks=20, col="grey" ) # create bins using the quantile function qs <- c( 0, quantile( res$baseMean[res$baseMean > 0], 0:7/7 ) ) # "cut" the genes into the bins bins <- cut( res$baseMean, qs ) # rename the levels of the bins using the middle point levels(bins) <- paste0("~",round(.5*qs[-1] + .5*qs[-length(qs)])) # calculate the ratio of p values less than .01 for each bin ratios <- tapply( res$pvalue, bins, function(p) mean( p < .01, na.rm=TRUE ) ) # plot these ratios barplot(ratios, xlab="mean normalized count", ylab="ratio of small p values") attr(res,"filterThreshold") plot(attr(res,"filterNumRej"),type="b", xlab="quantiles of 'baseMean'", ylab="number of rejections") ## write.csv( as.data.frame(res), file="results.csv" ) library( "reactome.db" ) res2 <- res[ res$entrezid %in% keys( reactome.db, "ENTREZID" ) & !is.na( res$padj ) , ] head(res2) reactomeTable <- AnnotationDbi::select( reactome.db, keys=as.character(res2$entrezid), keytype="ENTREZID", columns=c("ENTREZID","REACTOMEID") ) head(reactomeTable) incm <- do.call( rbind, with(reactomeTable, tapply( ENTREZID, factor(REACTOMEID), function(x) res2$entrezid %in% x ) )) colnames(incm) <- res2$entrez str(incm) within <- function(x, lower, upper) (x>=lower & x<=upper) incm <- incm[ within(rowSums(incm), lower=20, upper=80), ] testCategory <- function( reactomeID ) { isMember <- incm[ reactomeID, ] data.frame( reactomeID = reactomeID, numGenes = sum( isMember ), avgLFC = mean( res2$log2FoldChange[isMember] ), sdLFC = sd( res2$log2FoldChange[isMember] ), zValue = mean( res2$log2FoldChange[isMember] ) / sd( res2$log2FoldChange[isMember] ), strength = sum( res2$log2FoldChange[isMember] ) / sqrt(sum(isMember)), pvalue = t.test( res2$log2FoldChange[ isMember ] )$p.value, reactomeName = reactomePATHID2NAME[[reactomeID]], stringsAsFactors = FALSE ) } testCategory("109606") reactomeResult <- do.call( rbind, lapply( rownames(incm), testCategory ) ) reactomeResult$padjust <- p.adjust( reactomeResult$pvalue, "BH" ) reactomeResultSignif <- reactomeResult[ reactomeResult$padjust < 0.05, ] head( reactomeResultSignif[ order(-reactomeResultSignif$strength), ] ) head( reactomeResult[ order(-reactomeResult$zValue), ] ) rld <- rlog( dds ) assay(rld)[ 1:3, 1:3] par( mfrow = c( 1, 2 ) ) plot( log2( 1+counts(dds, normalized=TRUE)[, 1:2] ), col="#00000020", pch=20, cex=0.3 ) plot( assay(rld)[, 1:2], col="#00000020", pch=20, cex=0.3 ) sampleDists <- dist( t( assay(rld) ) ) as.matrix( sampleDists )[ 1:3, 1:3 ] sampleDistMatrix <- as.matrix( sampleDists ) rownames(sampleDistMatrix) <- paste( rld$treatment, rld$patient, sep="-" ) colnames(sampleDistMatrix) <- NULL library( "gplots" ) library( "RColorBrewer" ) colours = colorRampPalette( rev(brewer.pal(9, "Blues")) )(255) heatmap.2( sampleDistMatrix, trace="none", col=colours) colours <- c(rgb(1:3/4,0,0),rgb(0,1:3/4,0),rgb(0,0,1:3/4),rgb(1:3/4,0,1:3/4)) plotPCA( rld, intgroup = c("patient","treatment"), col=colours ) library( "genefilter" ) topVarGenes <- head( order( rowVars( assay(rld) ), decreasing=TRUE ), 35 ) heatmap.2( assay(rld)[ topVarGenes, ], scale="row", trace="none", dendrogram="column", col = colorRampPalette( rev(brewer.pal(9, "RdBu")) )(255), ColSideColors = c( Control="gray", DPN="darkgreen", OHT="orange" )[ colData(rld)$treatment ] ) recount2SE <- function(name) { filename <- paste0(name,"_eset.RData") if (!file.exists(filename)) download.file(paste0( "http://bowtie-bio.sourceforge.net/recount/ExpressionSets/", filename),filename) load(filename) e <- get(paste0(name,".eset")) se <- SummarizedExperiment(SimpleList(counts=exprs(e)), colData=DataFrame(pData(e))) se } toLatex(sessionInfo())