Contents

1 Theory

See [slides][]

1.1 Benchmarking: a recent tweet…

A recent tweet provides a nice summary of efforts to benchmark gene set enrichement analysis methods using the GSEABenchmarkR package.

library(EnrichmentBrowser)

2 Practice

Data input and massage

library(airway)
data(airway)
airway$dex <- relevel(airway$dex, "untrt")

Differential expression analysis

library(DESeq2)
des <- DESeqDataSet(airway, design = ~ cell + dex)
des <- DESeq(des)
## estimating size factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing
res <- results(des)

Transition to tidy data

library(dplyr)
library(tibble)
tbl <- res %>%
    as.data.frame() %>%
    rownames_to_column("ENSEMBL") %>%
    as_tibble()
tbl
## # A tibble: 64,102 x 7
##    ENSEMBL      baseMean log2FoldChange   lfcSE   stat     pvalue      padj
##    <chr>           <dbl>          <dbl>   <dbl>  <dbl>      <dbl>     <dbl>
##  1 ENSG0000000…  709.           -0.381   0.101  -3.79     1.52e-4   1.28e-3
##  2 ENSG0000000…    0            NA      NA      NA       NA        NA      
##  3 ENSG0000000…  520.            0.207   0.112   1.84     6.53e-2   1.97e-1
##  4 ENSG0000000…  237.            0.0379  0.143   0.264    7.92e-1   9.11e-1
##  5 ENSG0000000…   57.9          -0.0882  0.287  -0.307    7.59e-1   8.95e-1
##  6 ENSG0000000…    0.318        -1.38    3.50   -0.394    6.94e-1  NA      
##  7 ENSG0000000… 5817.            0.426   0.0883  4.83     1.38e-6   1.82e-5
##  8 ENSG0000000… 1282.           -0.241   0.0887 -2.72     6.58e-3   3.28e-2
##  9 ENSG0000000…  610.           -0.0476  0.167  -0.286    7.75e-1   9.03e-1
## 10 ENSG0000000…  369.           -0.500   0.121  -4.14     3.48e-5   3.42e-4
## # … with 64,092 more rows

2.1 Example: hypergeometric test using limma::goana()

Requires ENTREZ identifiers

library(org.Hs.eg.db)
tbl <- tbl %>%
    mutate(
        ENTREZID = mapIds(
            org.Hs.eg.db, ENSEMBL, "ENTREZID", "ENSEMBL"
        ) %>% unname()
    )
## 'select()' returned 1:many mapping between keys and columns
tbl
## # A tibble: 64,102 x 8
##    ENSEMBL baseMean log2FoldChange   lfcSE   stat   pvalue     padj
##    <chr>      <dbl>          <dbl>   <dbl>  <dbl>    <dbl>    <dbl>
##  1 ENSG00…  709.           -0.381   0.101  -3.79   1.52e-4  1.28e-3
##  2 ENSG00…    0            NA      NA      NA     NA       NA      
##  3 ENSG00…  520.            0.207   0.112   1.84   6.53e-2  1.97e-1
##  4 ENSG00…  237.            0.0379  0.143   0.264  7.92e-1  9.11e-1
##  5 ENSG00…   57.9          -0.0882  0.287  -0.307  7.59e-1  8.95e-1
##  6 ENSG00…    0.318        -1.38    3.50   -0.394  6.94e-1 NA      
##  7 ENSG00… 5817.            0.426   0.0883  4.83   1.38e-6  1.82e-5
##  8 ENSG00… 1282.           -0.241   0.0887 -2.72   6.58e-3  3.28e-2
##  9 ENSG00…  610.           -0.0476  0.167  -0.286  7.75e-1  9.03e-1
## 10 ENSG00…  369.           -0.500   0.121  -4.14   3.48e-5  3.42e-4
## # … with 64,092 more rows, and 1 more variable: ENTREZID <chr>

Universe – must be testable for DE

tbl <- tbl %>%
    filter(!is.na(padj), !is.na(ENTREZID))
tbl
## # A tibble: 14,550 x 8
##    ENSEMBL   baseMean log2FoldChange  lfcSE   stat  pvalue    padj ENTREZID
##    <chr>        <dbl>          <dbl>  <dbl>  <dbl>   <dbl>   <dbl> <chr>   
##  1 ENSG0000…    709.         -0.381  0.101  -3.79  1.52e-4 1.28e-3 7105    
##  2 ENSG0000…    520.          0.207  0.112   1.84  6.53e-2 1.97e-1 8813    
##  3 ENSG0000…    237.          0.0379 0.143   0.264 7.92e-1 9.11e-1 57147   
##  4 ENSG0000…     57.9        -0.0882 0.287  -0.307 7.59e-1 8.95e-1 55732   
##  5 ENSG0000…   5817.          0.426  0.0883  4.83  1.38e-6 1.82e-5 3075    
##  6 ENSG0000…   1282.         -0.241  0.0887 -2.72  6.58e-3 3.28e-2 2519    
##  7 ENSG0000…    610.         -0.0476 0.167  -0.286 7.75e-1 9.03e-1 2729    
##  8 ENSG0000…    369.         -0.500  0.121  -4.14  3.48e-5 3.42e-4 4800    
##  9 ENSG0000…    183.         -0.124  0.180  -0.689 4.91e-1 7.24e-1 90529   
## 10 ENSG0000…   2814.         -0.0411 0.103  -0.400 6.89e-1 8.57e-1 57185   
## # … with 14,540 more rows

limma::goana() – Hypergeometric

library(limma)
go <-
    goana(tbl$ENTREZID[tbl$padj < .05], tbl$ENTREZID, "Hs") %>%
    as_tibble()

Hmm, goana() returns GO terms, but we also need GO identifiers

library(GO.db)
go <-
    go %>%
    mutate(
        GOID = mapIds(
            GO.db, .$Term, "GOID", "TERM"
        ) %>% unname()
    ) %>%
    dplyr::select(GOID, everything()) %>%
    arrange(P.DE)
## 'select()' returned 1:1 mapping between keys and columns

Sanity check

go %>% filter(grepl("glucocorticoid", Term))
## # A tibble: 22 x 6
##    GOID     Term                                 Ont       N    DE     P.DE
##    <chr>    <chr>                                <chr> <dbl> <dbl>    <dbl>
##  1 GO:0051… response to glucocorticoid           BP       92    43  1.11e-5
##  2 GO:0071… cellular response to glucocorticoid… BP       42    21  6.42e-4
##  3 GO:2000… positive regulation of glucocortico… BP        2     2  6.64e-2
##  4 GO:2000… regulation of glucocorticoid recept… BP        8     4  1.25e-1
##  5 GO:0006… glucocorticoid biosynthetic process  BP        8     4  1.25e-1
##  6 GO:0043… glucocorticoid mediated signaling p… BP        3     2  1.65e-1
##  7 GO:0008… glucocorticoid metabolic process     BP       13     5  2.26e-1
##  8 GO:0004… glucocorticoid receptor activity     MF        1     1  2.58e-1
##  9 GO:0031… negative regulation of glucocortico… BP        4     2  2.75e-1
## 10 GO:0031… negative regulation of glucocortico… BP        4     2  2.75e-1
## # … with 12 more rows

What genes in set?

genesets <-
    AnnotationDbi::select(org.Hs.eg.db, tbl$ENTREZID, "GO", "ENTREZID") %>%
    as_tibble() %>%
    dplyr::select(ENTREZID, GO, ONTOLOGY) %>%
    distinct()
## 'select()' returned many:many mapping between keys and columns
genesets
## # A tibble: 191,100 x 3
##    ENTREZID GO         ONTOLOGY
##    <chr>    <chr>      <chr>   
##  1 7105     GO:0005515 MF      
##  2 7105     GO:0039532 BP      
##  3 7105     GO:0043123 BP      
##  4 7105     GO:0070062 CC      
##  5 7105     GO:1901223 BP      
##  6 8813     GO:0004169 MF      
##  7 8813     GO:0004582 MF      
##  8 8813     GO:0005515 MF      
##  9 8813     GO:0005634 CC      
## 10 8813     GO:0005783 CC      
## # … with 191,090 more rows

3 Provenance

sessionInfo()
## R version 3.6.0 Patched (2019-04-26 r76431)
## Platform: x86_64-apple-darwin17.7.0 (64-bit)
## Running under: macOS High Sierra 10.13.6
## 
## Matrix products: default
## BLAS:   /Users/ma38727/bin/R-3-6-branch/lib/libRblas.dylib
## LAPACK: /Users/ma38727/bin/R-3-6-branch/lib/libRlapack.dylib
## 
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
## 
## attached base packages:
## [1] stats4    parallel  stats     graphics  grDevices utils     datasets 
## [8] methods   base     
## 
## other attached packages:
##  [1] GO.db_3.8.2                 tibble_2.1.3               
##  [3] dplyr_0.8.2                 limma_3.41.6               
##  [5] org.Hs.eg.db_3.8.2          AnnotationDbi_1.47.0       
##  [7] airway_1.5.0                DESeq2_1.25.4              
##  [9] EnrichmentBrowser_2.15.4    graph_1.63.0               
## [11] SummarizedExperiment_1.15.5 DelayedArray_0.11.2        
## [13] BiocParallel_1.19.0         matrixStats_0.54.0         
## [15] Biobase_2.45.0              GenomicRanges_1.37.14      
## [17] GenomeInfoDb_1.21.1         IRanges_2.19.10            
## [19] S4Vectors_0.23.17           BiocGenerics_0.31.4        
## [21] BiocStyle_2.13.2           
## 
## loaded via a namespace (and not attached):
##  [1] bitops_1.0-6           bit64_0.9-7            RColorBrewer_1.1-2    
##  [4] tools_3.6.0            backports_1.1.4        utf8_1.1.4            
##  [7] R6_2.4.0               rpart_4.1-15           Hmisc_4.2-0           
## [10] DBI_1.0.0              lazyeval_0.2.2         colorspace_1.4-1      
## [13] nnet_7.3-12            tidyselect_0.2.5       gridExtra_2.3         
## [16] bit_1.1-14             compiler_3.6.0         cli_1.1.0             
## [19] htmlTable_1.13.1       bookdown_0.11          KEGGgraph_1.45.0      
## [22] scales_1.0.0           checkmate_1.9.3        genefilter_1.67.1     
## [25] rappdirs_0.3.1         stringr_1.4.0          digest_0.6.19         
## [28] foreign_0.8-71         rmarkdown_1.13         XVector_0.25.0        
## [31] base64enc_0.1-3        pkgconfig_2.0.2        htmltools_0.3.6       
## [34] htmlwidgets_1.3        rlang_0.4.0            rstudioapi_0.10       
## [37] RSQLite_2.1.1          acepack_1.4.1          RCurl_1.95-4.12       
## [40] magrittr_1.5           GenomeInfoDbData_1.2.1 Formula_1.2-3         
## [43] Matrix_1.2-17          fansi_0.4.0            Rcpp_1.0.1            
## [46] munsell_0.5.0          stringi_1.4.3          yaml_2.2.0            
## [49] zlibbioc_1.31.0        grid_3.6.0             blob_1.1.1            
## [52] crayon_1.3.4           lattice_0.20-38        splines_3.6.0         
## [55] annotate_1.63.0        locfit_1.5-9.1         zeallot_0.1.0         
## [58] knitr_1.23             pillar_1.4.2           codetools_0.2-16      
## [61] geneplotter_1.63.0     XML_3.98-1.20          glue_1.3.1            
## [64] evaluate_0.14          latticeExtra_0.6-28    data.table_1.12.2     
## [67] BiocManager_1.30.5.1   vctrs_0.1.0            gtable_0.3.0          
## [70] purrr_0.3.2            assertthat_0.2.1       ggplot2_3.2.0         
## [73] xfun_0.8               xtable_1.8-4           survival_2.44-1.1     
## [76] memoise_1.1.0          cluster_2.1.0          GSEABase_1.47.0