Ad hoc exercises

Annotation

Use packages Txdb.Hsapiens.UCSC.hg19.knownGene and BSgenome.Hsapiens.UCSC.hg19 and the functions promoters() and getSeq() to retrieve the DNA sequences of all promoters. Modify the arguments of promoters() so that this means the 5000 nucleotides upstream of the transcription start site. What challenges are introduced by trying to reduce this to one ‘promoter’ per gene?

require(TxDb.Hsapiens.UCSC.hg19.knownGene)
require(BSgenome.Hsapiens.UCSC.hg19)
p <- promoters(TxDb.Hsapiens.UCSC.hg19.knownGene)
dna <- getSeq(BSgenome.Hsapiens.UCSC.hg19, p)
dna

##   A DNAStringSet instance of length 82960
##         width seq
##     [1]  2200 NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN...GGCCTGTGCCAGGGTGCAAGCTGAGCACTGGAGTGGAGTTT
##     [2]  2200 NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN...GGCCTGTGCCAGGGTGCAAGCTGAGCACTGGAGTGGAGTTT
##     [3]  2200 NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN...GGCCTGTGCCAGGGTGCAAGCTGAGCACTGGAGTGGAGTTT
##     [4]  2200 TTAAGGTCTATTCTAAATTGCACACTTTGATTCAAAAGAAAC...TTCCTGCTAGCCAACCTCTCACTCATTGATCTGTCTCTGTC
##     [5]  2200 ATTGTGAAGGATACATCTCAGAAACAGTCAATGAAAGAGACG...CTCCAGGCTCTGAACTTTCTCAGTAAGTTCAGGTAGCTGGG
##     ...   ... ...
## [82956]  2200 AGAGAGGGCACAGAGCTCATGGTTTATGGTGTAGGGGCTGGG...GGCTCTCCAGGTCCCCTGGAATGTTCAGCATGGGCCGAGGT
## [82957]  2200 GAGGGTAACTGGGTAAAGAGCTGCAGTGTGGGCAGAAGTGTA...CTGCCCCCTGGCTGATGTACTTTCCTGCAGGAGGACACGGC
## [82958]  2200 CACTGCCTGGTTCAGGAAAGCCCTCAGCTGTAGCCATTATTA...GAAGCACTACTGCTGGCCAGCAGGCTCATGCACCTTGGCCT
## [82959]  2200 GGACTGCCATGTCACCTACGGAGTACAGTCTGGCCCTGACAG...TATGATGTTCCCAGGTCTCTGGCCGCCTGAGTCCAGCCCCT
## [82960]  2200 CCTCACCCTCACCCTCACCCTAACCCTACCCTAACCCCTAAC...GGCCTGTGCCAGGGTGCAAGCTGAGCACTGGAGTGGAGTTT

Input & representation of standard file formats

The experiment data package RNAseqData.HNRNPC.bam.chr14 contains 8 BAM files from an experiment involving knockdown of gene HNRNPC. Use org.Hs.eg.db and the mapIds() function to map from this gene symbol to ENTREZ id, and TxDb.Hsapiens.UCSC.hg19.knownGene genes() function and vals= argument to retrieve genomic coordinates of this gene. Use GenomicAlignments readGAlignemntsList() to input the paired-end reads for the HNRNPC gene for one BAM file. Write a short function to input and count the number of reads overlapping HNRNPC in a single BAM file. Use sapply() to summarize the number of reads in each BAM file. Can you guess, based on reads per region, which 4 samples are control and which are knockdown?

## HNRNPC --> EntrezID --> exonsBy gene
require(org.Hs.eg.db)
require(TxDb.Hsapiens.UCSC.hg19.knownGene)
egid <- mapIds(org.Hs.eg.db, "HNRNPC", "ENTREZID", "SYMBOL")
egid

## HNRNPC 
## "3183"

hnrnpc <- genes(TxDb.Hsapiens.UCSC.hg19.knownGene,vals=list(gene_id=egid))
hnrnpc

## GRanges object with 1 range and 1 metadata column:
##        seqnames               ranges strand |     gene_id
##           <Rle>            <IRanges>  <Rle> | <character>
##   3183    chr14 [21677296, 21737638]      - |        3183
##   -------
##   seqinfo: 93 sequences (1 circular) from hg19 genome

## reads overlapping regions of interest
require(RNAseqData.HNRNPC.bam.chr14)
require(GenomicAlignments)

fls <- RNAseqData.HNRNPC.bam.chr14_BAMFILES
param <- ScanBamParam(which=hnrnpc)
readGAlignmentsList(fls[1], param=param)

## GAlignmentsList object of length 2711:
## [[1]] 
## GAlignments object with 2 alignments and 0 metadata columns:
##       seqnames strand cigar qwidth    start      end width njunc
##   [1]    chr14      +   72M     72 21702274 21702345    72     0
##   [2]    chr14      -   72M     72 21702313 21702384    72     0
## 
## [[2]] 
## GAlignments object with 2 alignments and 0 metadata columns:
##       seqnames strand               cigar qwidth    start      end width njunc
##   [1]    chr14      +                 72M     72 21702261 21702332    72     0
##   [2]    chr14      - 33M29081N26M330N13M     72 21702356 21731838 29483     2
## 
## [[3]] 
## GAlignments object with 2 alignments and 0 metadata columns:
##       seqnames strand                cigar qwidth    start      end width njunc
##   [1]    chr14      -                  72M     72 21737491 21737562    72     0
##   [2]    chr14      + 19M29081N26M5961N27M     72 21702370 21737483 35114     2
## 
## ...
## <2708 more elements>
## -------
## seqinfo: 93 sequences from an unspecified genome

## count
counter <- function(fl, param)
    length(readGAlignmentsList(fl, param=param))
counter(fls[1], param)

## [1] 2711

sapply(fls, counter, param)

## ERR127306 ERR127307 ERR127308 ERR127309 ERR127302 ERR127303 ERR127304 ERR127305 
##      2711      3160      2947      2779        86        98       158       141

Statistical analysis of differential expression – `DESeq2`

The summarizeOverlaps() function is a more sophisticated (compared to the simple function of the previous exercise) way to count reads overlapping regions of interest. Use it to count reads overlapping the HNRNPC region of interest. The return value is a SummarizedExperiment class, which coordinates row and column information with counts. How does our naive counting compare to summarizeOverlaps()?

se1 <- summarizeOverlaps(hnrnpc, fls, singleEnd=FALSE, ignore.strand=TRUE)
assay(se1)

##      ERR127306 ERR127307 ERR127308 ERR127309 ERR127302 ERR127303 ERR127304 ERR127305
## 3183      2711      3160      2946      2779        86        98       158       141

Use summarizeOverlaps() to count reads overlapping each gene on chr14

exByGn <- exonsBy(TxDb.Hsapiens.UCSC.hg19.knownGene, "gene")
se2 <- summarizeOverlaps(exByGn, fls, singleEnd=FALSE, ignore.strand=TRUE)

Run the airway example, and produce a ‘volcano plot’ summarizing the relationship between -10 log(p) and log fold change

library(DESeq2)
library("airway")
data(airway)
airway <- airway[rowSums(assay(airway)) != 0, ]
dds <- DESeqDataSet(airway, design = ~ cell + dex)
dds <- DESeq(dds)

## estimating size factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing

res <- results(dds)
plot(-10*log10(pvalue) ~ log2FoldChange, res)

Conclusion

Acknowledgements

Core (Seattle): Sonali Arora, Marc Carlson, Nate Hayden, Jim Hester, Valerie Obenchain, Hervé Pagès, Paul Shannon, Dan Tenenbaum.
The research reported in this presentation was supported by the National Cancer Institute and the National Human Genome Research Institute of the National Institutes of Health under Award numbers U24CA180996 and U41HG004059, and the National Science Foundation under Award number 1247813. The content is solely the responsibility of the authors and does not necessarily represent the official views of the National Institutes of Health or the National Science Foundation.

BioC 2015 Annual Conference, Seattle, WA, 20-22 July.

Key references

Irizarry R, et al. (2015) Biomedical Data Science. Course Notes, EdX PH525.1x.
Huber W, et al. (2015) Orchestrating high-throughput genomic analysis with Bioconductor. Nature Methods 12:115-121; doi:10.1038/nmeth.3252 (full-text free with registration).
Lawrence M, Huber W, Pag&egraves;s H, Aboyoun P, Carlson M, et al. (2013) Software for Computing and Annotating Genomic Ranges. PLoS Comput Biol 9(8): e1003118. doi: 10.1371/journal.pcbi.1003118

sessionInfo()

## R version 3.2.1 Patched (2015-06-19 r68553)
## Platform: x86_64-unknown-linux-gnu (64-bit)
## Running under: Ubuntu 14.04.2 LTS
## 
## locale:
##  [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C               LC_TIME=en_US.UTF-8       
##  [4] LC_COLLATE=en_US.UTF-8     LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8   
##  [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                  LC_ADDRESS=C              
## [10] LC_TELEPHONE=C             LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       
## 
## attached base packages:
## [1] stats4    parallel  stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
##  [1] DESeq2_1.8.1                            RcppArmadillo_0.5.200.1.0              
##  [3] Rcpp_0.11.6                             airway_0.102.0                         
##  [5] GenomicAlignments_1.4.1                 Rsamtools_1.20.4                       
##  [7] RNAseqData.HNRNPC.bam.chr14_0.6.0       org.Hs.eg.db_3.1.2                     
##  [9] RSQLite_1.0.0                           DBI_0.3.1                              
## [11] BSgenome.Hsapiens.UCSC.hg19_1.4.0       BSgenome_1.36.1                        
## [13] rtracklayer_1.28.5                      Biostrings_2.36.1                      
## [15] XVector_0.8.0                           TxDb.Hsapiens.UCSC.hg19.knownGene_3.1.2
## [17] GenomicFeatures_1.20.1                  AnnotationDbi_1.30.1                   
## [19] Biobase_2.28.0                          GenomicRanges_1.20.5                   
## [21] GenomeInfoDb_1.4.1                      IRanges_2.2.4                          
## [23] S4Vectors_0.6.0                         BiocGenerics_0.14.0                    
## [25] BiocStyle_1.6.0                         BiocInstaller_1.18.3                   
## 
## loaded via a namespace (and not attached):
##  [1] locfit_1.5-9.1       lattice_0.20-31      digest_0.6.8         plyr_1.8.3          
##  [5] futile.options_1.0.0 acepack_1.3-3.3      evaluate_0.7         ggplot2_1.0.1       
##  [9] zlibbioc_1.14.0      annotate_1.46.0      rpart_4.1-9          rmarkdown_0.7       
## [13] proto_0.3-10         splines_3.2.1        BiocParallel_1.2.5   geneplotter_1.46.0  
## [17] stringr_1.0.0        foreign_0.8-63       RCurl_1.95-4.6       biomaRt_2.24.0      
## [21] munsell_0.4.2        htmltools_0.2.6      nnet_7.3-9           gridExtra_0.9.1     
## [25] codetools_0.2-11     Hmisc_3.16-0         XML_3.98-1.2         MASS_7.3-41         
## [29] bitops_1.0-6         grid_3.2.1           xtable_1.7-4         gtable_0.1.2        
## [33] magrittr_1.5         formatR_1.2          scales_0.2.5         stringi_0.5-2       
## [37] reshape2_1.4.1       genefilter_1.50.0    latticeExtra_0.6-26  futile.logger_1.4.1 
## [41] Formula_1.2-1        lambda.r_1.1.7       RColorBrewer_1.1-2   tools_3.2.1         
## [45] survival_2.38-2      yaml_2.1.13          colorspace_1.2-6     cluster_2.0.2       
## [49] knitr_1.10.5

Bioconductor for High Throughput Sequence Analysis

Ad hoc exercises

Annotation

Input & representation of standard file formats

Statistical analysis of differential expression – DESeq2

Conclusion

Statistical analysis of differential expression – `DESeq2`