Authors: Martin Morgan (mtmorgan@fhcrc.org), Sonali Arora (sarora@fredhutch.org)
Date: 16 June, 2015

Basic manipulation of BSgenome annotation resources

library(Biostrings)
library(BSgenome.Hsapiens.UCSC.hg19)
Hsapiens                                # also BSgenome.Hsapiens.UCSC.hg19
## Human genome:
## # organism: Homo sapiens (Human)
## # provider: UCSC
## # provider version: hg19
## # release date: Feb. 2009
## # release name: Genome Reference Consortium GRCh37
## # 93 sequences:
## #   chr1                  chr2                  chr3                 
## #   chr4                  chr5                  chr6                 
## #   chr7                  chr8                  chr9                 
## #   ...                   ...                   ...                  
## #   chrUn_gl000244        chrUn_gl000245        chrUn_gl000246       
## #   chrUn_gl000247        chrUn_gl000248        chrUn_gl000249       
## # (use 'seqnames()' to see all the sequence names, use the '$' or '[['
## # operator to access a given sequence)
Hsapiens[["chr19"]]                     # load single chromosome
##   59128983-letter "DNAString" instance
## seq: NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN...NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN
letterFrequency(Hsapiens[["chr19"]], "GC")
##      G|C 
## 26989400
methods(class=class(Hsapiens))          # e.g., getSeq(), matchPWM()
##  [1] [[              $               as.list         bsgenomeName   
##  [5] coerce          coerce<-        commonName      countPWM       
##  [9] export          getSeq          injectSNPs      length         
## [13] masknames       matchPWM        mseqnames       names          
## [17] organism        provider        providerVersion releaseDate    
## [21] releaseName     seqinfo         seqinfo<-       seqnames       
## [25] seqnames<-      show            snpcount        SNPcount       
## [29] SNPlocs_pkgname snplocs         SNPlocs         sourceUrl      
## [33] species         toString        vcountPattern   vcountPDict    
## [37] Views           vmatchPattern   vmatchPDict    
## see '?methods' for accessing help and source code

exonsBy() and friends for TxDb (gene model) packages

library(TxDb.Hsapiens.UCSC.hg19.knownGene)
txdb <- TxDb.Hsapiens.UCSC.hg19.knownGene   # easy-to-type alias
egids <- c(BRCA1="672", PTEN="5728")
genes(txdb, vals=list(gene_id=egids))   # start / end coordinates for two genes
## GRanges object with 2 ranges and 1 metadata column:
##        seqnames               ranges strand |     gene_id
##           <Rle>            <IRanges>  <Rle> | <character>
##   5728    chr10 [89623195, 89728532]      + |        5728
##    672    chr17 [41196312, 41322420]      - |         672
##   -------
##   seqinfo: 93 sequences (1 circular) from hg19 genome
exByGn <- exonsBy(txdb, "gene")         # exons grouped by gene
methods(class=class(txdb))              # cds, transcripts, promoters, ...
##  [1] $                      $<-                    annotatedDataFrameFrom
##  [4] as.list                asBED                  asGFF                 
##  [7] assayData              assayData<-            cds                   
## [10] cdsBy                  cdsByOverlaps          coerce                
## [13] columns                combine                contents              
## [16] dbconn                 dbfile                 dbInfo                
## [19] dbmeta                 dbschema               disjointExons         
## [22] distance               exons                  exonsBy               
## [25] exonsByOverlaps        ExpressionSet          extractUpstreamSeqs   
## [28] featureNames           featureNames<-         fiveUTRsByTranscript  
## [31] genes                  initialize             intronsByTranscript   
## [34] isActiveSeq            isActiveSeq<-          isNA                  
## [37] keys                   keytypes               mapIds                
## [40] mappedkeys             mapToTranscripts       metadata              
## [43] microRNAs              nhit                   organism              
## [46] promoters              revmap                 sample                
## [49] sampleNames            sampleNames<-          saveDb                
## [52] select                 seqinfo                seqinfo<-             
## [55] seqlevels0             show                   species               
## [58] storageMode            storageMode<-          threeUTRsByTranscript 
## [61] transcripts            transcriptsBy          transcriptsByOverlaps 
## [64] tRNAs                  updateObject          
## see '?methods' for accessing help and source code

GC content of exons in UCSC hg19 knownGene track

library(BSgenome.Hsapiens.UCSC.hg19)
library(TxDb.Hsapiens.UCSC.hg19.knownGene)
txdb <- TxDb.Hsapiens.UCSC.hg19.knownGene

ex <- getSeq(Hsapiens, exons(txdb))
ex
##   A DNAStringSet instance of length 289969
##           width seq
##      [1]    354 CTTGCCGTCAGCCTTTTCTTTGACCTCT...GCATCAACTTCTCTCACAACCTAGGCCA
##      [2]    127 GCTCCTGTCTCCCCCCAGGTGTGTGGTG...TCTTGTGAGTGTCCCCAGTGTTGCAGAG
##      [3]    109 GTGTGTGGTGATGCCAGGCATGCCCTTC...TCTTGTGAGTGTCCCCAGTGTTGCAGAG
##      ...    ... ...
## [289968]    109 GTGTGTGGTGATGCCAGGCATGCCCTTC...TCTTGTGAGTGTCCCCAGTGTTGCAGAG
## [289969]    354 CTTGCCGTCAGCCTTTTCTTTGACCTCT...GCATCAACTTCTCTGACAACCTAGGCCA
hist(letterFrequency(ex, "GC", as.prob=TRUE))

Eagle-eyed audience members noted that the first and last exon were the same widht, and share many nucleotides. Mike Love notes that these exons are from genes in the DDX11L1 family, which occurs in subtelomeres, where. duplication is not surprising because of telomere sequence similarity. This paper provides some context.

Roadmap epigenomics BED files

From the vignette AnnotationHub How-To’s

library(AnnotationHub)
hub <- AnnotationHub()
## requires 'devel' version of Bioconductor
query(hub , c("EpigenomeRoadMap", "E126", "H3K4ME2"))
E126 <- hub[["AH29817"]]
E126
## GRanges object with 153266 ranges and 6 metadata columns:
##            seqnames                 ranges strand   |        name
##               <Rle>              <IRanges>  <Rle>   | <character>
##        [1]     chr1   [28994424, 28996659]      *   |      Rank_1
##        [2]     chr4   [54957157, 54959151]      *   |      Rank_2
##        [3]    chr14   [75760095, 75763324]      *   |      Rank_3
##        ...      ...                    ...    ... ...         ...
##   [153265]     chr8 [ 57901188,  57901416]      *   | Rank_153265
##   [153266]     chr7 [158387833, 158388077]      *   | Rank_153266
##                score signalValue    pValue    qValue      peak
##            <numeric>   <numeric> <numeric> <numeric> <numeric>
##        [1]       189        <NA>  10.55845  22.01316  18.99911
##        [2]       188        <NA>   8.11483  21.80441  18.80662
##        [3]       180        <NA>   8.89834  20.97714  18.02816
##        ...       ...         ...       ...       ...       ...
##   [153265]         0        <NA>   1.51067   1.00321         0
##   [153266]         0        <NA>   1.50505   1.00238         0
##   -------
##   seqinfo: 93 sequences (1 circular) from hg19 genome

liftOver from hg19 to hg38 coordinates

query(hub , c("hg38", "hg19", "chainfile"))
## AnnotationHub with 2 records
## # snapshotDate(): 2015-05-26 
## # $dataprovider: UCSC
## # $species: Homo sapiens
## # $rdataclass: ChainFile
## # additional mcols(): taxonomyid, genome, description, tags,
## #   sourceurl, sourcetype 
## # retrieve records with, e.g., 'object[["AH14108"]]' 
## 
##             title                   
##   AH14108 | hg38ToHg19.over.chain.gz
##   AH14150 | hg19ToHg38.over.chain.gz
E126hg38 <- liftOver(E126, hub[["AH14150"]])
E126hg38
## GRangesList object of length 153266:
## $1 
## GRanges object with 1 range and 6 metadata columns:
##       seqnames               ranges strand |        name     score
##          <Rle>            <IRanges>  <Rle> | <character> <numeric>
##   [1]     chr1 [28667912, 28670147]      * |      Rank_1       189
##       signalValue    pValue    qValue      peak
##         <numeric> <numeric> <numeric> <numeric>
##   [1]        <NA>  10.55845  22.01316  18.99911
## 
## $2 
## GRanges object with 1 range and 6 metadata columns:
##       seqnames               ranges strand |   name score signalValue
##   [1]     chr4 [54090990, 54092984]      * | Rank_2   188        <NA>
##        pValue   qValue     peak
##   [1] 8.11483 21.80441 18.80662
## 
## $3 
## GRanges object with 1 range and 6 metadata columns:
##       seqnames               ranges strand |   name score signalValue
##   [1]    chr14 [75293392, 75296621]      * | Rank_3   180        <NA>
##        pValue   qValue     peak
##   [1] 8.89834 20.97714 18.02816
## 
## ...
## <153263 more elements>
## -------
## seqinfo: 23 sequences from an unspecified genome; no seqlengths
## GRangesList because some peaks lift over to multiple genomic locations
table(elementLengths(E126hg38))
## 
##      0      1      2      3      4      5      6      7      8     10 
##     31 152680    338    105     35     32     13      6      9      3 
##     11     12     16     17     18     19     25 
##      4      3      3      1      1      1      1

summarizeOverlaps between aligned reads and known genes

See exercise in Lab 1.3.

Working with Hits object

See the “Working with dbSNP Variants” section of the AnnotationHub How-To vignette.