Contents

1 History

2 High-throughput sequence work flow

3 DNA Sequences

library(Biostrings)

dna <- DNAStringSet(c("AAACTG", "CCCTTCAAC", "TACGAA"))
dna
##   A DNAStringSet instance of length 3
##     width seq
## [1]     6 AAACTG
## [2]     9 CCCTTCAAC
## [3]     6 TACGAA
length(dna)
## [1] 3
dna[c(1, 3, 1)]
##   A DNAStringSet instance of length 3
##     width seq
## [1]     6 AAACTG
## [2]     6 TACGAA
## [3]     6 AAACTG
width(dna)
## [1] 6 9 6
reverseComplement(dna)
##   A DNAStringSet instance of length 3
##     width seq
## [1]     6 CAGTTT
## [2]     9 GTTGAAGGG
## [3]     6 TTCGTA

Biostrings themes

Help!

methods(class="DNAStringSet")
?"DNAStringSet"
browseVignettes(package="Biostrings")

4 Genomic Ranges

library(GenomicRanges)

gr <- GRanges(c("chr1:100-120", "chr1:115-130", "chr2:200-220"))
gr
## GRanges object with 3 ranges and 0 metadata columns:
##       seqnames     ranges strand
##          <Rle>  <IRanges>  <Rle>
##   [1]     chr1 [100, 120]      *
##   [2]     chr1 [115, 130]      *
##   [3]     chr2 [200, 220]      *
##   -------
##   seqinfo: 2 sequences from an unspecified genome; no seqlengths
shift(gr, 1)
## GRanges object with 3 ranges and 0 metadata columns:
##       seqnames     ranges strand
##          <Rle>  <IRanges>  <Rle>
##   [1]     chr1 [101, 121]      *
##   [2]     chr1 [116, 131]      *
##   [3]     chr2 [201, 221]      *
##   -------
##   seqinfo: 2 sequences from an unspecified genome; no seqlengths
reduce(gr)
## GRanges object with 2 ranges and 0 metadata columns:
##       seqnames     ranges strand
##          <Rle>  <IRanges>  <Rle>
##   [1]     chr1 [100, 130]      *
##   [2]     chr2 [200, 220]      *
##   -------
##   seqinfo: 2 sequences from an unspecified genome; no seqlengths
anno <- GRanges(c("chr1:110-150", "chr2:150-210"))
countOverlaps(anno, gr)
## [1] 2 1

GenomicRanges

Intra-range operations

Inter-range operations

Between-range operations

Help!

methods(class="GRanges")
methods(class="GRangesList")
?"GRanges"
?"GRangesList"
browseVignettes(package="GenomicRanges")

4.1 Lists of Genomic Ranges

  • e.g., exons-within-transcripts, alignments-within-reads

5 Summarized Experiments

Component parts

counts <- read.csv("lecture-02-data/airway_counts.csv", row.names=1)
counts <- as.matrix(counts)
head(counts, 3)
##                 SRR1039508 SRR1039509 SRR1039512 SRR1039513 SRR1039516
## ENSG00000000003        679        448        873        408       1138
## ENSG00000000419        467        515        621        365        587
## ENSG00000000457        260        211        263        164        245
##                 SRR1039517 SRR1039520 SRR1039521
## ENSG00000000003       1047        770        572
## ENSG00000000419        799        417        508
## ENSG00000000457        331        233        229
colData <- read.csv("lecture-02-data/airway_colData.csv", row.names=1)
colData[, 1:4]
##            SampleName    cell   dex albut
## SRR1039508 GSM1275862  N61311 untrt untrt
## SRR1039509 GSM1275863  N61311   trt untrt
## SRR1039512 GSM1275866 N052611 untrt untrt
## SRR1039513 GSM1275867 N052611   trt untrt
## SRR1039516 GSM1275870 N080611 untrt untrt
## SRR1039517 GSM1275871 N080611   trt untrt
## SRR1039520 GSM1275874 N061011 untrt untrt
## SRR1039521 GSM1275875 N061011   trt untrt
rowRanges <- readRDS("lecture-02-data/airway_rowRanges.rds")
rowRanges
## GRangesList object of length 33469:
## $ENSG00000000003 
## GRanges object with 17 ranges and 2 metadata columns:
##        seqnames               ranges strand |   exon_id       exon_name
##           <Rle>            <IRanges>  <Rle> | <integer>     <character>
##    [1]        X [99883667, 99884983]      - |    667145 ENSE00001459322
##    [2]        X [99885756, 99885863]      - |    667146 ENSE00000868868
##    [3]        X [99887482, 99887565]      - |    667147 ENSE00000401072
##    [4]        X [99887538, 99887565]      - |    667148 ENSE00001849132
##    [5]        X [99888402, 99888536]      - |    667149 ENSE00003554016
##    ...      ...                  ...    ... .       ...             ...
##   [13]        X [99890555, 99890743]      - |    667156 ENSE00003512331
##   [14]        X [99891188, 99891686]      - |    667158 ENSE00001886883
##   [15]        X [99891605, 99891803]      - |    667159 ENSE00001855382
##   [16]        X [99891790, 99892101]      - |    667160 ENSE00001863395
##   [17]        X [99894942, 99894988]      - |    667161 ENSE00001828996
## 
## ...
## <33468 more elements>
## -------
## seqinfo: 722 sequences (1 circular) from an unspecified genome

Could manipulate independently…

cidx <- colData$dex == "trt"
plot(
    rowMeans(1 + counts[, cidx]) ~ rowMeans(1 + counts[, !cidx]),
    log="xy"
)

Solution: coordinate in a single object – SummarizedExperiment

library(SummarizedExperiment)

se <- SummarizedExperiment(counts, rowRanges = rowRanges, colData = colData)
cidx <- se$dex == "trt"
plot(
    rowMeans(1 + assay(se)[, cidx]) ~ rowMeans(1 + assay(se)[, !cidx]),
    log="xy"
)

Help!

methods(class="SummarizedExperiment")
?"SummarizedExperiment"
browseVignettes(package="SummarizedExperiment")

6 Bioconductor packages for high-throughput genomic analysis