options(digits=3, width=80)


library( "parathyroidSE" )


extDataDir <- system.file("extdata", package = "parathyroidSE", mustWork = TRUE)
extDataDir


list.files( extDataDir )


sampleTable <- read.csv( "/path/to/your/sampleTable.csv", header=TRUE )


sampleTable


bamFiles <- file.path( extDataDir, sampleTable$fileName )


library( "Rsamtools" )
seqinfo( BamFile( bamFiles[1] ) )


library( "GenomicFeatures" )
hse <- makeTranscriptDbFromGFF( "/path/to/your/genemodel_file.GTF", format="gtf" )
exonsByGene <- exonsBy( hse, by="gene" )


exonsByGene


library( "GenomicAlignments" )
se <- summarizeOverlaps( exonsByGene, BamFileList( bamFiles ), mode="Union",
	singleEnd=FALSE, ignore.strand=TRUE, fragments=TRUE )


se
head( assay(se) )
colSums( assay(se) )
colData(se)
rowData(se)


colData(se) <- DataFrame( sampleTable )


colData(se)$treatment
se$treatment


colnames(se) <- sampleTable$sampleName
head( assay(se) )


data( "parathyroidGenesSE" )
se <- parathyroidGenesSE


str( metadata( rowData(se) ) )


colData(se)[1:5,1:4]


library( "DESeq2" )
ddsFull <- DESeqDataSet( se, design = ~ patient + treatment )


as.data.frame( colData( ddsFull )[ ,c("sample","patient","treatment","time") ] )


ddsCollapsed <- collapseReplicates( ddsFull,
                                   groupby = ddsFull$sample, 
                                   run = ddsFull$run )
head( as.data.frame( colData(ddsCollapsed)[ ,c("sample","runsCollapsed") ] ), 12 )


original <- rowSums( counts(ddsFull)[ , ddsFull$sample == "SRS308873" ] )
all( original == counts(ddsCollapsed)[ ,"SRS308873" ] )


dds <- ddsCollapsed[ , ddsCollapsed$time == "48h" ]


dds$time <- droplevels( dds$time )


dds$treatment <- relevel( dds$treatment, "Control" )


as.data.frame( colData(dds) )


dds <- dds[ rowSums( counts(dds) ) > 0 , ]


design(dds)


dds <- DESeq(dds)


res <- results( dds )
res


mcols(res, use.names=TRUE)


idx <- which.min(res$pvalue)
counts(dds)[ idx, ]
counts(dds, normalized=TRUE)[ idx, ]


resOHT <- res


res <- results( dds, contrast = c("treatment", "DPN", "Control") )
res


library( "org.Hs.eg.db" )


columns(org.Hs.eg.db)


convertIDs <- function( ids, fromKey, toKey, db, ifMultiple=c( "putNA", "useFirst" ) ) {
   stopifnot( inherits( db, "AnnotationDb" ) )
   ifMultiple <- match.arg( ifMultiple )
   suppressWarnings( selRes <- AnnotationDbi::select( 
      db, keys=ids, keytype=fromKey, columns=c(fromKey,toKey) ) )
   if( ifMultiple == "putNA" ) {
      duplicatedIds <- selRes[ duplicated( selRes[,1] ), 1 ]   
      selRes <- selRes[ ! selRes[,1] %in% duplicatedIds, ] }
   return( selRes[ match( ids, selRes[,1] ), 2 ] )
}


res$hgnc_symbol <- convertIDs( row.names(res), "ENSEMBL", "SYMBOL", org.Hs.eg.db )
res$entrezid <- convertIDs( row.names(res), "ENSEMBL", "ENTREZID", org.Hs.eg.db )


head(res, 4)


sum( res$pvalue < 0.01, na.rm=TRUE )
table( is.na(res$pvalue) )


sum( res$padj < 0.1, na.rm=TRUE )


resSig <- subset(res, res$padj < 0.1 )
head( resSig[ order( resSig$log2FoldChange ), ], 4)


head( resSig[ order( -resSig$log2FoldChange ), ], 4)


plotMA( res, ylim = c(-3, 3) )


plotDispEsts( dds, ylim = c(1e-6, 1e1) )


hist( res$pvalue, breaks=20, col="grey" )


# create bins using the quantile function
qs <- c( 0, quantile( res$baseMean[res$baseMean > 0], 0:7/7 ) )
# "cut" the genes into the bins
bins <- cut( res$baseMean, qs )
# rename the levels of the bins using the middle point
levels(bins) <- paste0("~",round(.5*qs[-1] + .5*qs[-length(qs)]))
# calculate the ratio of p values less than .01 for each bin
ratios <- tapply( res$pvalue, bins, function(p) mean( p < .01, na.rm=TRUE ) )
# plot these ratios
barplot(ratios, xlab="mean normalized count", ylab="ratio of small p values")


attr(res,"filterThreshold")
plot(attr(res,"filterNumRej"),type="b",
     xlab="quantiles of 'baseMean'",
     ylab="number of rejections")


## write.csv( as.data.frame(res), file="results.csv" )


library( "reactome.db" )


res2 <- res[ res$entrezid %in% keys( reactome.db, "ENTREZID" ) & 
   !is.na( res$padj ) , ]
head(res2)


reactomeTable <- AnnotationDbi::select( reactome.db, 
   keys=as.character(res2$entrezid), keytype="ENTREZID", 
   columns=c("ENTREZID","REACTOMEID") )
head(reactomeTable)


incm <- do.call( rbind, with(reactomeTable, tapply( 
 ENTREZID, factor(REACTOMEID), function(x) res2$entrezid %in% x ) ))
colnames(incm) <- res2$entrez

str(incm)


within <- function(x, lower, upper) (x>=lower & x<=upper)
incm <- incm[ within(rowSums(incm), lower=20, upper=80), ]


testCategory <- function( reactomeID ) {
  isMember <- incm[ reactomeID, ]
  data.frame( 
     reactomeID  = reactomeID,
     numGenes    = sum( isMember ),
     avgLFC      = mean( res2$log2FoldChange[isMember] ),
     sdLFC       = sd( res2$log2FoldChange[isMember] ),
     zValue      = mean( res2$log2FoldChange[isMember] ) / 
                     sd( res2$log2FoldChange[isMember] ),
     strength    = sum( res2$log2FoldChange[isMember] ) / sqrt(sum(isMember)),
     pvalue      = t.test( res2$log2FoldChange[ isMember ] )$p.value,
     reactomeName = reactomePATHID2NAME[[reactomeID]],
     stringsAsFactors = FALSE ) }


testCategory("109606")


reactomeResult <- do.call( rbind, lapply( rownames(incm), testCategory ) )


reactomeResult$padjust <- p.adjust( reactomeResult$pvalue, "BH" )


reactomeResultSignif <- reactomeResult[ reactomeResult$padjust < 0.05, ]
head( reactomeResultSignif[ order(-reactomeResultSignif$strength), ] )


head( reactomeResult[ order(-reactomeResult$zValue), ] )


rld <- rlog( dds )
assay(rld)[ 1:3, 1:3]


par( mfrow = c( 1, 2 ) )
plot( log2( 1+counts(dds, normalized=TRUE)[, 1:2] ), col="#00000020", pch=20, cex=0.3 )
plot( assay(rld)[, 1:2], col="#00000020", pch=20, cex=0.3 )


sampleDists <- dist( t( assay(rld) ) )
as.matrix( sampleDists )[ 1:3, 1:3 ]


sampleDistMatrix <- as.matrix( sampleDists )
rownames(sampleDistMatrix) <- paste( rld$treatment, 
   rld$patient, sep="-" )
colnames(sampleDistMatrix) <- NULL   
library( "gplots" )
library( "RColorBrewer" )
colours = colorRampPalette( rev(brewer.pal(9, "Blues")) )(255)
heatmap.2( sampleDistMatrix, trace="none", col=colours)


colours <- c(rgb(1:3/4,0,0),rgb(0,1:3/4,0),rgb(0,0,1:3/4),rgb(1:3/4,0,1:3/4))
plotPCA( rld, intgroup = c("patient","treatment"), col=colours )


library( "genefilter" )
topVarGenes <- head( order( rowVars( assay(rld) ), decreasing=TRUE ), 35 )


heatmap.2( assay(rld)[ topVarGenes, ], scale="row", 
     trace="none", dendrogram="column", 
     col = colorRampPalette( rev(brewer.pal(9, "RdBu")) )(255),
     ColSideColors = c( Control="gray", DPN="darkgreen", OHT="orange" )[
        colData(rld)$treatment ] )


recount2SE <- function(name) {
  filename <- paste0(name,"_eset.RData")
  if (!file.exists(filename)) download.file(paste0(
    "http://bowtie-bio.sourceforge.net/recount/ExpressionSets/",
    filename),filename)
  load(filename)
  e <- get(paste0(name,".eset"))
  se <- SummarizedExperiment(SimpleList(counts=exprs(e)),
                             colData=DataFrame(pData(e)))
  se                   
}


toLatex(sessionInfo())