Bioc2016: RNA-seq EDA

Michael Love

26 June 2016

Goals of this workshop

We will not cover, but you can read about elsewhere:

Gene annotation

We start by looking at two genes, USF2 and CHPF. Let’s obtain some information on these genes from the Homo.sapiens package. We will pull out the transcript ID and name that go along with the gene symbols.

library(Homo.sapiens)
columns(Homo.sapiens)
##  [1] "ACCNUM"       "ALIAS"        "CDSCHROM"     "CDSEND"      
##  [5] "CDSID"        "CDSNAME"      "CDSSTART"     "CDSSTRAND"   
##  [9] "DEFINITION"   "ENSEMBL"      "ENSEMBLPROT"  "ENSEMBLTRANS"
## [13] "ENTREZID"     "ENZYME"       "EVIDENCE"     "EVIDENCEALL" 
## [17] "EXONCHROM"    "EXONEND"      "EXONID"       "EXONNAME"    
## [21] "EXONRANK"     "EXONSTART"    "EXONSTRAND"   "GENEID"      
## [25] "GENENAME"     "GO"           "GOALL"        "GOID"        
## [29] "IPI"          "MAP"          "OMIM"         "ONTOLOGY"    
## [33] "ONTOLOGYALL"  "PATH"         "PFAM"         "PMID"        
## [37] "PROSITE"      "REFSEQ"       "SYMBOL"       "TERM"        
## [41] "TXCHROM"      "TXEND"        "TXID"         "TXNAME"      
## [45] "TXSTART"      "TXSTRAND"     "TXTYPE"       "UCSCKG"      
## [49] "UNIGENE"      "UNIPROT"
g <- list()
g[["USF2"]] <- select(Homo.sapiens, "USF2", c("TXID","TXNAME"), "SYMBOL")
## 'select()' returned 1:many mapping between keys and columns
g[["CHPF"]] <- select(Homo.sapiens, "CHPF", c("TXID","TXNAME"), "SYMBOL")
## 'select()' returned 1:many mapping between keys and columns
g
## $USF2
##   SYMBOL  TXID     TXNAME
## 1   USF2 66826 uc010xss.1
## 2   USF2 66827 uc002nyq.1
## 3   USF2 66828 uc002nyr.1
## 4   USF2 66829 uc002nyt.1
## 5   USF2 66830 uc002nyv.1
## 
## $CHPF
##   SYMBOL  TXID     TXNAME
## 1   CHPF 12798 uc010zlh.2
## 2   CHPF 12799 uc002vmc.4

We can use a TxDb object to get the exons for the transcripts of these two genes. The exonsBy function returns a GRangesList object, and by specifying by="tx", within this GRangesList we can find the GRanges for each exon of every transcript. In ebt we have the exons of every transcript, labelled by transcript ID. We then use the information from the previous chunk to pull out the transcripts of USF2 and CHPF:

library(TxDb.Hsapiens.UCSC.hg19.knownGene)
txdb <- TxDb.Hsapiens.UCSC.hg19.knownGene
ebt <- exonsBy(txdb, by="tx")
head(names(ebt))
## [1] "1" "2" "3" "4" "5" "6"
usf2 <- ebt[ g[["USF2"]]$TXID ]
chpf <- ebt[ g[["CHPF"]]$TXID ]

We can take a look at these:

usf2
## GRangesList object of length 5:
## $66826 
## GRanges object with 9 ranges and 3 metadata columns:
##       seqnames               ranges strand |   exon_id   exon_name
##          <Rle>            <IRanges>  <Rle> | <integer> <character>
##   [1]    chr19 [35759896, 35760066]      + |    239103        <NA>
##   [2]    chr19 [35760349, 35760395]      + |    239104        <NA>
##   [3]    chr19 [35760484, 35760602]      + |    239105        <NA>
##   [4]    chr19 [35760706, 35760906]      + |    239106        <NA>
##   [5]    chr19 [35761350, 35761500]      + |    239107        <NA>
##   [6]    chr19 [35761621, 35761708]      + |    239108        <NA>
##   [7]    chr19 [35761986, 35762044]      + |    239110        <NA>
##   [8]    chr19 [35769601, 35769695]      + |    239111        <NA>
##   [9]    chr19 [35769849, 35770211]      + |    239113        <NA>
##       exon_rank
##       <integer>
##   [1]         1
##   [2]         2
##   [3]         3
##   [4]         4
##   [5]         5
##   [6]         6
##   [7]         7
##   [8]         8
##   [9]         9
## 
## $66827 
## GRanges object with 10 ranges and 3 metadata columns:
##        seqnames               ranges strand | exon_id exon_name exon_rank
##    [1]    chr19 [35759896, 35760066]      + |  239103      <NA>         1
##    [2]    chr19 [35760349, 35760395]      + |  239104      <NA>         2
##    [3]    chr19 [35760484, 35760602]      + |  239105      <NA>         3
##    [4]    chr19 [35760706, 35760906]      + |  239106      <NA>         4
##    [5]    chr19 [35761350, 35761500]      + |  239107      <NA>         5
##    [6]    chr19 [35761621, 35761708]      + |  239108      <NA>         6
##    [7]    chr19 [35761986, 35762044]      + |  239110      <NA>         7
##    [8]    chr19 [35769601, 35769695]      + |  239111      <NA>         8
##    [9]    chr19 [35769849, 35769977]      + |  239112      <NA>         9
##   [10]    chr19 [35770070, 35770718]      + |  239115      <NA>        10
## 
## ...
## <3 more elements>
## -------
## seqinfo: 93 sequences (1 circular) from hg19 genome
chpf
## GRangesList object of length 2:
## $12798 
## GRanges object with 4 ranges and 3 metadata columns:
##       seqnames                 ranges strand |   exon_id   exon_name
##          <Rle>              <IRanges>  <Rle> | <integer> <character>
##   [1]     chr2 [220407635, 220407819]      - |     46734        <NA>
##   [2]     chr2 [220406338, 220406911]      - |     46733        <NA>
##   [3]     chr2 [220405668, 220405847]      - |     46732        <NA>
##   [4]     chr2 [220403669, 220405364]      - |     46731        <NA>
##       exon_rank
##       <integer>
##   [1]         1
##   [2]         2
##   [3]         3
##   [4]         4
## 
## $12799 
## GRanges object with 4 ranges and 3 metadata columns:
##       seqnames                 ranges strand | exon_id exon_name exon_rank
##   [1]     chr2 [220407947, 220408487]      - |   46735      <NA>         1
##   [2]     chr2 [220406338, 220406911]      - |   46733      <NA>         2
##   [3]     chr2 [220405668, 220405847]      - |   46732      <NA>         3
##   [4]     chr2 [220403669, 220405364]      - |   46731      <NA>         4
## 
## -------
## seqinfo: 93 sequences (1 circular) from hg19 genome

If we ask about the width of the exons, we get an IntegerList. If we follow that command with sum, we get the length of each transcript.

width(usf2)
## IntegerList of length 5
## [["66826"]] 171 47 119 201 151 88 59 95 363
## [["66827"]] 171 47 119 201 151 88 59 95 129 649
## [["66828"]] 171 47 119 151 88 59 95 129 649
## [["66829"]] 171 47 119 47 59 95 129 649
## [["66830"]] 119 201 151 88 59 95 105 649
sum(width(usf2))
## 66826 66827 66828 66829 66830 
##  1294  1709  1508  1316  1467

It will be useful to calculate the total range of the exons of the transcripts of the genes:

usf2.r <- range(unlist(usf2))
chpf.r <- range(unlist(chpf))

Plotting gene models

Let’s now visualize the exons of the transcripts using base R graphics. First, let’s write a line of code that draws the exons of a single transcript of USF2:

library(rafalib)
lens <- length(usf2)
nullplot(start(usf2.r),end(usf2.r),0,2)
segments(start(usf2[[1]]), rep(1,lens[1]), 
           end(usf2[[1]]), rep(1,lens[1]), lwd=3)

Now, to avoid repeating this code over and over for each gene, let’s write a general function for drawing the exons of each transcript of a gene, contained in a GRangesList x.

plotGRangesList <- function(x,name="") {
  r <- range(unlist(range(x)))
  nullplot(start(r),end(r),0.5,length(x)+0.5,
           main=name,xlab=seqnames(x[[1]][1]))
  lens <- elementNROWS(x)
  for (i in seq_along(x)) {
    segments(start(x[[i]]), rep(i,lens[i]), 
             end(x[[i]]), rep(i,lens[i]), lwd=3)
  }
}
plotGRangesList(usf2, "USF2")

plotGRangesList(chpf, "CHPF")

Reading in alignments

We have provided four BAM files for use in this workshop, so that we can explore how real RNA-seq fragments look using Bioconductor. These four files are associated with GEUVADIS RNA-seq samples, aligned using STAR read aligner version 2.5.0, aligning to the hg19 genome packaged by Illumina iGenomes. The BAM files contain just a subset of the total number of paired-end reads, to limit the disk space required for this workshop. The files contain only the reads that overlap the genes USF2 and CHPF. Full details about the experiments can be found by searching using the ERR ID on the ENA website.

dir <- system.file("extdata", package="bioc2016eda")
samples <- read.csv(file.path(dir,"samples.csv"))
samples
##         run pop center
## 1 ERR188204 TSI      1
## 2 ERR188317 TSI      1
## 3 ERR188297 TSI      2
## 4 ERR188088 TSI      2

We create a vector that points to these files on disk.

library(GenomicAlignments)
bamfiles <- file.path(dir,paste0(samples$run,".bam"))
file.exists(bamfiles)
## [1] TRUE TRUE TRUE TRUE

We did not sort or include an index for these BAM files, so first we must sort and index them. Normally, you would sort and index these outside of R using, e.g. Samtools.

for (i in seq_along(bamfiles)) {
  sortBam(bamfiles[i], destination=file.path(dir,paste0(samples$run[i],"_sort")))
  indexBam(file.path(dir,paste0(samples$run[i],"_sort.bam")))
}
bamfiles <- file.path(dir,paste0(samples$run,"_sort.bam"))
file.exists(bamfiles)
## [1] TRUE TRUE TRUE TRUE

Now we can read in some paired alignments for the first BAM files, specifically those alignments that cover USF2. We need to remove some of the seqnames from the usf2.r ranges, as these do not exist in the BAM header.

usf2.r <- keepStandardChromosomes(usf2.r)
gap <- readGAlignmentPairs(bamfiles[1], 
                           param=ScanBamParam(which=usf2.r))
gap
## GAlignmentPairs object with 2453 pairs, strandMode=1, and 0 metadata columns:
##          seqnames strand   :               ranges  --               ranges
##             <Rle>  <Rle>   :            <IRanges>  --            <IRanges>
##      [1]    chr19      +   : [35504852, 35770379]  -- [35770345, 35770420]
##      [2]    chr19      +   : [35760492, 35760567]  -- [35760548, 35760726]
##      [3]    chr19      +   : [35760495, 35760570]  -- [35760578, 35760756]
##      [4]    chr19      +   : [35760496, 35760571]  -- [35760539, 35760717]
##      [5]    chr19      +   : [35760501, 35760576]  -- [35760539, 35760717]
##      ...      ...    ... ...                  ... ...                  ...
##   [2449]    chr19      -   : [35770623, 35770698]  -- [35770573, 35770648]
##   [2450]    chr19      -   : [35770623, 35770698]  -- [35770572, 35770647]
##   [2451]    chr19      -   : [35770627, 35770701]  -- [35770569, 35770644]
##   [2452]    chr19      -   : [35770639, 35770714]  -- [35770605, 35770680]
##   [2453]    chr19      -   : [35770643, 35770718]  -- [35770598, 35770673]
##   -------
##   seqinfo: 25 sequences from an unspecified genome

Computing genomic coverage

It’s helpful to be able to quickly visualize these alignments, and one such summary visualization is to see where the reads fall on the genome. This plot can be made using the coverage function:

cov <- coverage(gap)
cov.genome <- as.integer(cov[usf2.r][["chr19"]])
pos.genome <- seq(from=start(usf2.r),to=end(usf2.r))
plot(pos.genome, cov.genome, xlab="position (genome)", ylab="coverage")

Find compatible overlaps

The function findCompatibleOverlaps can be used to do just that: find which paired alignments are compatible with which transcripts. “Compatible” in this case means that the splicing must be compatible: the read junctions must align exactly with the introns of the transcript. For more details, see vignette("OverlapEncodings").

The returned object is of the class Hits, and can be interrogated with queryHits and subjectHits. Transcripts 2 and 3 have the most compatible alignments, though these are also the longest transcripts.

fco <- findCompatibleOverlaps(gap, usf2)
fco
## Hits object with 8052 hits and 0 metadata columns:
##          queryHits subjectHits
##          <integer>   <integer>
##      [1]         2           1
##      [2]         2           2
##      [3]         2           5
##      [4]         3           1
##      [5]         3           2
##      ...       ...         ...
##   [8048]      2452           5
##   [8049]      2453           2
##   [8050]      2453           3
##   [8051]      2453           4
##   [8052]      2453           5
##   -------
##   queryLength: 2453 / subjectLength: 5
countSubjectHits(fco)
## [1] 1295 2004 1971 1360 1422

Note that elements of the query (paired alignments) can be compatible with multiple elements in the subject (transcripts). We may therefore want to categorize the paired alignments into equivalence classes, defined by the set of transcripts that the alignments are compatible with. An early proposal for collapsing RNA-seq reads into equivalence classes can be found in Statistical Modeling of RNA-Seq Data by Salzman, J., Jiang, H., and Wong, W.H. Another reference on the use of equivalence classes for RNA-seq data is the Sailfish paper, discussing equivalence classes for read k-mers.

Here we will rename each paired alignment using a 1 or 0 if it is or is not compatible with a transcript. So 1-1-0-0-1 means that a paired alignment is compatible with transcripts 1, 2, and 5.

tab <- table(queryHits(fco), subjectHits(fco))
tab <- as.matrix(tab)
frags <- apply(tab, 1, paste, collapse="-")
frags[1]
##           2 
## "1-1-0-0-1"

We can then tabulate the paired alignments using their new names. Note that we have the same number of alignments this way as if we count the unique number of query hits.

table(frags)
## frags
## 0-0-0-0-1 0-0-0-1-0 0-0-1-0-0 0-1-1-0-1 0-1-1-1-0 0-1-1-1-1 1-0-0-0-0 
##        14         1        11         1       139       583        14 
## 1-1-0-0-1 1-1-1-0-0 1-1-1-0-1 1-1-1-1-0 1-1-1-1-1 
##        44        35       565       422       215
sum(table(frags))
## [1] 2044
length(unique(queryHits(fco)))
## [1] 2044

We can see by scanning the table that many reads are not compatible with transcript 1, 4, or 5.

Map to transcripts

Another useful operation is to map from genomic coordinates to transcript coordinates. Note that, if we only wanted to have transcript alignments from the start, we could have used software such as RSEM or RapMap to produce a BAM file contain transcriptome alignments. Such a file will necessarily contain, for many pairs, the alignment of the paired reads to the many isoforms of a gene with which it is compatible. The genomic alignment file will only contain multiple alignments for pairs when they are at different genomic loci.

To map our genomic coordinates to transcript coordinates, we have to map the start and end of each fragment separately, because mapToTranscripts does not work on GAlignmentPairs. Furthermore, from the man page we can read that, “A successful mapping occurs when x is completely within the transcripts range”. So we will break apart each paired alignment that is compatible with our transcript of interest into a start position and an end position, map these positions to transcript coordinates, then rebuild the fragment afterward.

We don’t keep track of fragment strand in this example, but one could do this simply by taking care of the plus and minus strand fragments separately.

idx <- queryHits(fco)[subjectHits(fco) == 3]
gr <- as(gap[idx],"GRanges")
strand(gr) <- "*"
m2tx.start <- mapToTranscripts(resize(gr, width=1, fix="start"), usf2[3])
m2tx.start
## GRanges object with 1971 ranges and 2 metadata columns:
##          seqnames       ranges strand |     xHits transcriptsHits
##             <Rle>    <IRanges>  <Rle> | <integer>       <integer>
##      [1]    66828   [236, 236]      + |         1               1
##      [2]    66828   [238, 238]      + |         2               1
##      [3]    66828   [238, 238]      + |         3               1
##      [4]    66828   [241, 241]      + |         4               1
##      [5]    66828   [251, 251]      + |         5               1
##      ...      ...          ...    ... .       ...             ...
##   [1967]    66828 [1363, 1363]      + |      1967               1
##   [1968]    66828 [1362, 1362]      + |      1968               1
##   [1969]    66828 [1359, 1359]      + |      1969               1
##   [1970]    66828 [1395, 1395]      + |      1970               1
##   [1971]    66828 [1388, 1388]      + |      1971               1
##   -------
##   seqinfo: 1 sequence from an unspecified genome; no seqlengths

Now we do the same for the ends as we did for the starts, mapping them to the transcript coordinates. We then create a GRanges in the transcript coordinate space, using the starts and ends of the paired alignments.

The following code works for a plus strand transcript, because the position of the fragment given by start (the left-most position of the range) is to the left in the transcript coordinates as well. For a minus strand transcript, we would have to reverse these.

m2tx.end <- mapToTranscripts(resize(gr, width=1, fix="end"), usf2[3])
m2tx <- GRanges(seqnames(m2tx.start), 
                IRanges(start(m2tx.start),start(m2tx.end)))
m2tx
## GRanges object with 1971 ranges and 0 metadata columns:
##          seqnames       ranges strand
##             <Rle>    <IRanges>  <Rle>
##      [1]    66828   [236, 330]      *
##      [2]    66828   [238, 334]      *
##      [3]    66828   [238, 363]      *
##      [4]    66828   [241, 469]      *
##      [5]    66828   [251, 387]      *
##      ...      ...          ...    ...
##   [1967]    66828 [1363, 1488]      *
##   [1968]    66828 [1362, 1488]      *
##   [1969]    66828 [1359, 1491]      *
##   [1970]    66828 [1395, 1504]      *
##   [1971]    66828 [1388, 1508]      *
##   -------
##   seqinfo: 1 sequence from an unspecified genome; no seqlengths

It will be convenient to turn this set of operations into a function so we can repeat the operation for a generic BAM file and transcript.

readTxFragments <- function(file, transcript) {
  r <- range(transcript[[1]])
  r <- keepStandardChromosomes(r)
  # suppress warnings about alignments with ambiguous pairing
  suppressWarnings({ 
    gap <- readGAlignmentPairs(file, param=ScanBamParam(which=r))
  })
  fco <- findCompatibleOverlaps(gap, transcript)
  idx <- queryHits(fco)
  gr <- as(gap[idx],"GRanges")
  strand(gr) <- "*"
  m2tx.start <- mapToTranscripts(resize(gr, width=1, fix="start"), 
                                 transcript)
  m2tx.end <- mapToTranscripts(resize(gr, width=1, fix="end"), 
                               transcript)
  tx.strand <- as.character(strand(transcript)[[1]][1])
  if (tx.strand == "+") {
    m2tx <- GRanges(seqnames(m2tx.start), 
                    IRanges(start(m2tx.start),start(m2tx.end)))
  } else {
    m2tx <- GRanges(seqnames(m2tx.start), 
                    IRanges(start(m2tx.end),start(m2tx.start)))   
  }
  m2tx
}

We can then compute transcript coverage for the different files. We color the coverage by the sequencing center to identify differences in coverage that correlate.

mypar()
nullplot(0,1600,0,900,xlab="position (tx)",ylab="coverage")
for (i in seq_along(bamfiles)) {
  frags <- readTxFragments(bamfiles[i], usf2[3])
  cov.tx <- coverage(frags)
  lines(as.integer(cov.tx[[1]]), col=samples$center[i], lwd=2)
}

And the same for the second transcript of CHPF:

mypar()
nullplot(0,3100,0,150,xlab="position (tx)",ylab="coverage")
for (i in seq_along(bamfiles)) {
  frags <- readTxFragments(bamfiles[i], chpf[2])
  cov.tx <- coverage(frags)
  lines(as.integer(cov.tx[[1]]), col=samples$center[i], lwd=2)
}

Fragment length distribution

Once we have our fragments in the transcript coordinates it is easy to observe the fragment length distribution. For a more comprehensive view, obviously one would want to take fragments from many or all genes.

frags <- readTxFragments(bamfiles[1], usf2[3])
hist(width(frags), col="grey", border="white")

Fragment start and length plot

A density plot of fragment length over fragment start was proposed by Evans, S.N., Hower, V., and Pachter, L. Coverage statistics for sequence census methods and can be seen in Figure 6 in that reference. This plot is useful for reminding us that RNA-seq data is 2-dimensional, and for observing a bias affecting both the fragment starts (fragment position closest to transcript 5’) and the ends (closest to transcript 3’). We can see the bias as vertical and downward-sloping 45 degree lines. We use transparency to allow overplotting at the same or nearby locations in the (start,length) plane.

mypar(4,1,mar=c(2,2,1,1))
for (f in bamfiles) {
  frags <- readTxFragments(f, usf2[3])
  plot(start(frags), width(frags),
       xlim=c(200,1300), ylim=c(50,400),
       pch=15, col=rgb(0,0,0,.2), cex=.5, 
       xlab="", ylab="")
}

For CHPF it is more difficult to see the pattern, as the coverage is lower:

mypar(4,1,mar=c(2,2,1,1))
for (f in bamfiles) {
  frags <- readTxFragments(f, chpf[2])
  plot(start(frags), width(frags),
       xlim=c(0,3000), ylim=c(50,400),
       pch=15, col=rgb(0,0,0,.2), cex=.5, 
       xlab="", ylab="")
}

Extract transcript sequence

Up until this point, we have not yet taken a look at the transcript sequence. In order to see if the distribution of paired alignments is correlated with sequence features, we will need to extract the transcript sequence. Again, we will focus on the third transcript of USF2 and the second of CHPF. We can extract transcript sequence with the following lines:

library(BSgenome.Hsapiens.UCSC.hg19)
usf2.seq <- extractTranscriptSeqs(Hsapiens, usf2[3])[[1]]
chpf.seq <- extractTranscriptSeqs(Hsapiens, chpf[2])[[1]]
usf2.seq
##   1508-letter "DNAString" instance
## seq: GTGAGCGCCGGGCTCGGGGCCCCCCCGGCCGCCC...AACTGCAATAAACTGGCCCCATGTGGCCCCCGC
chpf.seq
##   2991-letter "DNAString" instance
## seq: GCCGAGCGCAAGAACCCTGCGCAGCCCAGAGCAG...CCTTTAATAAACTGGCCAAGTGTGGACTTGTGA

Common sequences at read starts

We can look at, for a single transcript, the sequence beneath the read closest to the 5’ end of the transcript. For uniformly distributed reads, we would expect to see, for example, all 64 of the 3-mers showing up equally:

frags <- readTxFragments(bamfiles[1], usf2[3])
length(frags) / 4^3
## [1] 30.79688

Instead we see some 3-mers are much more common, which leads to the vertical lines in the previous plots.

start.seq <- as(Views(usf2.seq, start=start(frags), width=3), "DNAStringSet")
seq.tab <- table(start.seq)
sort(seq.tab, decreasing=TRUE)
## start.seq
## CGG CAG CCC CCA GGA CCG CAA CAC GCC CTG CGA CCT AGA GCA GGG AAA TGC CTT 
## 132 119 116  97  89  86  80  79  78  67  58  57  56  55  54  49  40  39 
## GAG ATC GAC GAA GCG GTC TGG TTT AGG CGC CTC ACA GTG GTT AAC TGT AGC GAT 
##  37  36  29  28  28  27  27  26  24  23  19  18  18  18  17  16  15  15 
## GCT CTA GGC TCC CAT TAC AAG ACG TCT AAT ATG ATT CGT TGA TTC ACC ACT TTA 
##  15  14  13  13  11  11  10   9   9   8   8   8   8   8   8   7   7   7 
## TTG GTA TAG TCA AGT ATA GGT TAA TCG 
##   6   5   4   4   2   1   1   1   1

Likewise, for the 3-mers for the read closest to the 3’ end of the transcript.

end.seq <- as(Views(usf2.seq, end=end(frags), width=3), "DNAStringSet")
end.seq <- reverseComplement(end.seq)
seq.tab <- table(end.seq)
sort(seq.tab, decreasing=TRUE)
## end.seq
## CTC CTG CCT CTT CCC GGG CCG GTC GTT CGG CAG TCT CGC TCC CCA GCC GGC GCG 
## 275 155 127 118  93  78  73  72  59  58  57  53  52  49  47  47  43  40 
## GCT GGA TTT CGT TGG TTC GCA TGC TCG TGA AGG ATC CAC CGA GTG CAT TTG GAA 
##  30  26  26  24  24  21  20  20  18  18  17  17  15  15  15  14  13  12 
## GAC ATT TGT GTA CAA ACA AGC CTA GGT GAG TCA AAG AGT TTA ACC AGA TAC AAT 
##  12  10  10   9   8   7   7   7   7   6   6   5   5   5   4   4   4   3 
## ACT GAT TAG AAA AAC ATG TAA 
##   3   2   2   1   1   1   1

To properly infer the bias at the starts and ends of the fragments, one would need to look over many or all transcripts. Such a bias calculation is performed by a number of software including Cufflinks, eXpress, Sailfish, Salmon, kallisto and alpine.

Fragment sequence features

Finally, the last kind of bias we will look into is whether the fragment sequence itself has an effect in the distribution of fragments on the transcript. We can extract the fragment sequence with code similar to the lines above.

frag.seq <- as(Views(usf2.seq, start=start(frags), end(frags)), "DNAStringSet")

We can easily find summaries of the fragment sequence, for example, the GC content of the fragments:

frag.gc <- letterFrequency(frag.seq, "GC", as.prob=TRUE)
plot(density(frag.gc))

Let’s calculate what we might expect if fragments were distributed uniformly along the transcript:

uniform.gc <- letterFrequencyInSlidingView(usf2.seq, 
                                           median(width(frags)), 
                                           "GC", as.prob=TRUE) 
plot(density(uniform.gc))

Now let’s combine these plots, iterating over the different samples and coloring by sequencing center:

plot(density(uniform.gc), ylim=c(0,12), lwd=2,
     xlab="fragment GC content", main="")
for (i in seq_along(bamfiles)) {
  frags <- readTxFragments(bamfiles[i], usf2[3])
  frag.seq <- as(Views(usf2.seq, start=start(frags), end(frags)), 
                 "DNAStringSet")
  frag.gc <- letterFrequency(frag.seq, "GC", as.prob=TRUE)
  lines(density(frag.gc), col=samples$center[i], lwd=2)
}

To summarize this plot, it’s clear that the samples from one of the sequencing centers have much less representation than the other sequencing center in the range of 0.65-0.75 GC content.

Again, in order to properly calculate this bias, one would need to look over many of all transcripts, and to adjust for the potentially confounding effect of the read start sequence bias. Such a calculation is performed by the alpine software.

Other QA software

Session Information

sessionInfo()
## R version 3.3.0 (2016-05-03)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 14.04.2 LTS
## 
## locale:
##  [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
##  [3] LC_TIME=en_US.UTF-8        LC_COLLATE=C              
##  [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8   
##  [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                 
##  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       
## 
## attached base packages:
## [1] parallel  stats4    stats     graphics  grDevices utils     datasets 
## [8] methods   base     
## 
## other attached packages:
##  [1] BSgenome.Hsapiens.UCSC.hg19_1.4.0      
##  [2] BSgenome_1.41.2                        
##  [3] rtracklayer_1.33.7                     
##  [4] GenomicAlignments_1.9.3                
##  [5] Rsamtools_1.25.0                       
##  [6] Biostrings_2.41.4                      
##  [7] XVector_0.13.2                         
##  [8] SummarizedExperiment_1.3.5             
##  [9] rafalib_1.0.0                          
## [10] Homo.sapiens_1.3.1                     
## [11] TxDb.Hsapiens.UCSC.hg19.knownGene_3.2.2
## [12] org.Hs.eg.db_3.3.0                     
## [13] GO.db_3.3.0                            
## [14] OrganismDbi_1.15.1                     
## [15] GenomicFeatures_1.25.12                
## [16] GenomicRanges_1.25.4                   
## [17] GenomeInfoDb_1.9.1                     
## [18] AnnotationDbi_1.35.3                   
## [19] IRanges_2.7.6                          
## [20] S4Vectors_0.11.5                       
## [21] Biobase_2.33.0                         
## [22] BiocGenerics_0.19.1                    
## 
## loaded via a namespace (and not attached):
##  [1] Rcpp_0.12.5          RColorBrewer_1.1-2   BiocInstaller_1.23.4
##  [4] formatR_1.4          bitops_1.0-6         tools_3.3.0         
##  [7] zlibbioc_1.19.0      biomaRt_2.29.2       digest_0.6.9        
## [10] RSQLite_1.0.0        evaluate_0.9         graph_1.51.0        
## [13] DBI_0.4-1            yaml_2.1.13          stringr_1.0.0       
## [16] knitr_1.13           XML_3.98-1.4         RBGL_1.49.1         
## [19] BiocParallel_1.7.4   rmarkdown_0.9.6      magrittr_1.5        
## [22] htmltools_0.3.5      stringi_1.1.1        RCurl_1.95-4.8