Author: Martin Morgan (mtmorgan@fredhutch.org)
Date: 7 September, 2015
Back to Workshop Outline

The material in this document requires R version 3.2 and Bioconductor version 3.1

stopifnot(
    getRversion() >= '3.2' && getRversion() < '3.3',
    BiocInstaller::biocVersion() >= "3.1"
)

Statistical analysis of differential expression – DESeq2

  1. Experimental design
  2. Wet-lab preparation
  3. High-throughput sequencing
  4. Alignment
  5. Summary
  6. Statistical analysis
  7. Comprehension

More extensive material

Challenges & solutions

Starting point

Normalization

Error model

Limited sample size

Multiple testing

Work flow

Data representation

Three types of information

SummarizedExperiment coordinates this information

library("airway")
data(airway)
airway
## class: RangedSummarizedExperiment 
## dim: 64102 8 
## metadata(1): ''
## assays(1): counts
## rownames(64102): ENSG00000000003 ENSG00000000005 ... LRG_98 LRG_99
## rowRanges metadata column names(0):
## colnames(8): SRR1039508 SRR1039509 ... SRR1039520 SRR1039521
## colData names(9): SampleName cell ... Sample BioSample
## main components of SummarizedExperiment
head(assay(airway))
##                 SRR1039508 SRR1039509 SRR1039512 SRR1039513 SRR1039516 SRR1039517 SRR1039520
## ENSG00000000003        679        448        873        408       1138       1047        770
## ENSG00000000005          0          0          0          0          0          0          0
## ENSG00000000419        467        515        621        365        587        799        417
## ENSG00000000457        260        211        263        164        245        331        233
## ENSG00000000460         60         55         40         35         78         63         76
## ENSG00000000938          0          0          2          0          1          0          0
##                 SRR1039521
## ENSG00000000003        572
## ENSG00000000005          0
## ENSG00000000419        508
## ENSG00000000457        229
## ENSG00000000460         60
## ENSG00000000938          0
colData(airway)
## DataFrame with 8 rows and 9 columns
##            SampleName     cell      dex    albut        Run avgLength Experiment    Sample
##              <factor> <factor> <factor> <factor>   <factor> <integer>   <factor>  <factor>
## SRR1039508 GSM1275862   N61311    untrt    untrt SRR1039508       126  SRX384345 SRS508568
## SRR1039509 GSM1275863   N61311      trt    untrt SRR1039509       126  SRX384346 SRS508567
## SRR1039512 GSM1275866  N052611    untrt    untrt SRR1039512       126  SRX384349 SRS508571
## SRR1039513 GSM1275867  N052611      trt    untrt SRR1039513        87  SRX384350 SRS508572
## SRR1039516 GSM1275870  N080611    untrt    untrt SRR1039516       120  SRX384353 SRS508575
## SRR1039517 GSM1275871  N080611      trt    untrt SRR1039517       126  SRX384354 SRS508576
## SRR1039520 GSM1275874  N061011    untrt    untrt SRR1039520       101  SRX384357 SRS508579
## SRR1039521 GSM1275875  N061011      trt    untrt SRR1039521        98  SRX384358 SRS508580
##               BioSample
##                <factor>
## SRR1039508 SAMN02422669
## SRR1039509 SAMN02422675
## SRR1039512 SAMN02422678
## SRR1039513 SAMN02422670
## SRR1039516 SAMN02422682
## SRR1039517 SAMN02422673
## SRR1039520 SAMN02422683
## SRR1039521 SAMN02422677
rowRanges(airway)
## GRangesList object of length 64102:
## $ENSG00000000003 
## GRanges object with 17 ranges and 2 metadata columns:
##        seqnames               ranges strand   |   exon_id       exon_name
##           <Rle>            <IRanges>  <Rle>   | <integer>     <character>
##    [1]        X [99883667, 99884983]      -   |    667145 ENSE00001459322
##    [2]        X [99885756, 99885863]      -   |    667146 ENSE00000868868
##    [3]        X [99887482, 99887565]      -   |    667147 ENSE00000401072
##    [4]        X [99887538, 99887565]      -   |    667148 ENSE00001849132
##    [5]        X [99888402, 99888536]      -   |    667149 ENSE00003554016
##    ...      ...                  ...    ... ...       ...             ...
##   [13]        X [99890555, 99890743]      -   |    667156 ENSE00003512331
##   [14]        X [99891188, 99891686]      -   |    667158 ENSE00001886883
##   [15]        X [99891605, 99891803]      -   |    667159 ENSE00001855382
##   [16]        X [99891790, 99892101]      -   |    667160 ENSE00001863395
##   [17]        X [99894942, 99894988]      -   |    667161 ENSE00001828996
## 
## ...
## <64101 more elements>
## -------
## seqinfo: 722 sequences (1 circular) from an unspecified genome
## e.g., coordinated subset to include dex 'trt'  samples
airway[, airway$dex == "trt"]
## class: RangedSummarizedExperiment 
## dim: 64102 4 
## metadata(1): ''
## assays(1): counts
## rownames(64102): ENSG00000000003 ENSG00000000005 ... LRG_98 LRG_99
## rowRanges metadata column names(0):
## colnames(4): SRR1039509 SRR1039513 SRR1039517 SRR1039521
## colData names(9): SampleName cell ... Sample BioSample
## e.g., keep only rows with non-zero counts
airway <- airway[rowSums(assay(airway)) != 0, ]

DESeq2 work flow

  1. Add experimental design information to the SummarizedExperiment

    library(DESeq2)
    dds <- DESeqDataSet(airway, design = ~ cell + dex)
    
  2. Peform the essential work flow steps

    dds <- DESeq(dds)
    
    ## estimating size factors
    ## estimating dispersions
    ## gene-wise dispersion estimates
    ## mean-dispersion relationship
    ## final dispersion estimates
    ## fitting model and testing
    
    dds
    
    ## class: DESeqDataSet 
    ## dim: 33469 8 
    ## metadata(1): ''
    ## assays(3): counts mu cooks
    ## rownames(33469): ENSG00000000003 ENSG00000000419 ... ENSG00000273492 ENSG00000273493
    ## rowRanges metadata column names(46): baseMean baseVar ... deviance maxCooks
    ## colnames(8): SRR1039508 SRR1039509 ... SRR1039520 SRR1039521
    ## colData names(10): SampleName cell ... BioSample sizeFactor
    
  3. Extract results

    res <- results(dds)
    res
    
    ## log2 fold change (MAP): dex untrt vs trt 
    ## Wald test p-value: dex untrt vs trt 
    ## DataFrame with 33469 rows and 6 columns
    ##                    baseMean log2FoldChange      lfcSE       stat       pvalue        padj
    ##                   <numeric>      <numeric>  <numeric>  <numeric>    <numeric>   <numeric>
    ## ENSG00000000003 708.6021697     0.37424998 0.09873107  3.7906000 0.0001502838 0.001287304
    ## ENSG00000000419 520.2979006    -0.20215550 0.10929899 -1.8495642 0.0643763883 0.197080529
    ## ENSG00000000457 237.1630368    -0.03624826 0.13684258 -0.2648902 0.7910940570 0.914355830
    ## ENSG00000000460  57.9326331     0.08523370 0.24654400  0.3457140 0.7295576915 0.883713089
    ## ENSG00000000938   0.3180984     0.11555962 0.14630523  0.7898530 0.4296136448          NA
    ## ...                     ...            ...        ...        ...          ...         ...
    ## ENSG00000273487   8.1632350    -0.56331132  0.3736236 -1.5076976    0.1316319   0.3293588
    ## ENSG00000273488   8.5844790    -0.10805538  0.3684853 -0.2932420    0.7693372   0.9032832
    ## ENSG00000273489   0.2758994    -0.11282164  0.1424265 -0.7921393    0.4282794          NA
    ## ENSG00000273492   0.1059784     0.07644378  0.1248627  0.6122225    0.5403906          NA
    ## ENSG00000273493   0.1061417     0.07628747  0.1250713  0.6099516    0.5418939          NA