Contents

1 Theory

See slides.

A recent tweet provides a nice summary of efforts to benchmark gene set enrichement analysis methods using the GSEABenchmarkR package.

2 Practice

Data input and massage

library(airway)
data(airway)
airway$dex <- relevel(airway$dex, "untrt")

Differential expression analysis

library(DESeq2)
des <- DESeqDataSet(airway, design = ~ cell + dex)
des <- DESeq(des)
res <- results(des)

Transition to tidy data

library(dplyr)
library(tibble)
tbl <- res %>%
    as.data.frame() %>%
    as_tibble(rownames = "ENSEMBL")
tbl
## # A tibble: 64,102 x 7
##    ENSEMBL      baseMean log2FoldChange   lfcSE   stat     pvalue      padj
##    <chr>           <dbl>          <dbl>   <dbl>  <dbl>      <dbl>     <dbl>
##  1 ENSG0000000…  709.           -0.381   0.101  -3.79     1.52e-4   1.28e-3
##  2 ENSG0000000…    0            NA      NA      NA       NA        NA      
##  3 ENSG0000000…  520.            0.207   0.112   1.84     6.53e-2   1.97e-1
##  4 ENSG0000000…  237.            0.0379  0.143   0.264    7.92e-1   9.11e-1
##  5 ENSG0000000…   57.9          -0.0882  0.287  -0.307    7.59e-1   8.95e-1
##  6 ENSG0000000…    0.318        -1.38    3.50   -0.394    6.94e-1  NA      
##  7 ENSG0000000… 5817.            0.426   0.0883  4.83     1.38e-6   1.82e-5
##  8 ENSG0000000… 1282.           -0.241   0.0887 -2.72     6.58e-3   3.28e-2
##  9 ENSG0000000…  610.           -0.0476  0.167  -0.286    7.75e-1   9.03e-1
## 10 ENSG0000000…  369.           -0.500   0.121  -4.14     3.48e-5   3.42e-4
## # … with 64,092 more rows

2.1 Example: hypergeometric test using limma::goana()

Requires ENTREZ identifiers

library(org.Hs.eg.db)
tbl <- tbl %>%
    mutate(
        ENTREZID = mapIds(org.Hs.eg.db, ENSEMBL, "ENTREZID", "ENSEMBL")
    ) %>%
    dplyr::select(ENSEMBL, ENTREZID, everything())
tbl
## # A tibble: 64,102 x 8
##    ENSEMBL ENTREZID baseMean log2FoldChange   lfcSE   stat   pvalue
##    <chr>   <chr>       <dbl>          <dbl>   <dbl>  <dbl>    <dbl>
##  1 ENSG00… 7105      709.           -0.381   0.101  -3.79   1.52e-4
##  2 ENSG00… 64102       0            NA      NA      NA     NA      
##  3 ENSG00… 8813      520.            0.207   0.112   1.84   6.53e-2
##  4 ENSG00… 57147     237.            0.0379  0.143   0.264  7.92e-1
##  5 ENSG00… 55732      57.9          -0.0882  0.287  -0.307  7.59e-1
##  6 ENSG00… 2268        0.318        -1.38    3.50   -0.394  6.94e-1
##  7 ENSG00… 3075     5817.            0.426   0.0883  4.83   1.38e-6
##  8 ENSG00… 2519     1282.           -0.241   0.0887 -2.72   6.58e-3
##  9 ENSG00… 2729      610.           -0.0476  0.167  -0.286  7.75e-1
## 10 ENSG00… 4800      369.           -0.500   0.121  -4.14   3.48e-5
## # … with 64,092 more rows, and 1 more variable: padj <dbl>

Universe – must be testable for DE

tbl <- tbl %>%
    filter(!is.na(padj), !is.na(ENTREZID))
tbl
## # A tibble: 14,550 x 8
##    ENSEMBL   ENTREZID baseMean log2FoldChange  lfcSE   stat  pvalue    padj
##    <chr>     <chr>       <dbl>          <dbl>  <dbl>  <dbl>   <dbl>   <dbl>
##  1 ENSG0000… 7105        709.         -0.381  0.101  -3.79  1.52e-4 1.28e-3
##  2 ENSG0000… 8813        520.          0.207  0.112   1.84  6.53e-2 1.97e-1
##  3 ENSG0000… 57147       237.          0.0379 0.143   0.264 7.92e-1 9.11e-1
##  4 ENSG0000… 55732        57.9        -0.0882 0.287  -0.307 7.59e-1 8.95e-1
##  5 ENSG0000… 3075       5817.          0.426  0.0883  4.83  1.38e-6 1.82e-5
##  6 ENSG0000… 2519       1282.         -0.241  0.0887 -2.72  6.58e-3 3.28e-2
##  7 ENSG0000… 2729        610.         -0.0476 0.167  -0.286 7.75e-1 9.03e-1
##  8 ENSG0000… 4800        369.         -0.500  0.121  -4.14  3.48e-5 3.42e-4
##  9 ENSG0000… 90529       183.         -0.124  0.180  -0.689 4.91e-1 7.24e-1
## 10 ENSG0000… 57185      2814.         -0.0411 0.103  -0.400 6.89e-1 8.57e-1
## # … with 14,540 more rows

limma::goana() – Hypergeometric

library(limma)
go <-
    goana(tbl$ENTREZID[tbl$padj < .05], tbl$ENTREZID, "Hs") %>%
    as_tibble(rownames = "GOID") %>%
    dplyr::select(GOID, Ont, everything())
go
## # A tibble: 20,656 x 6
##    GOID      Ont   Term                                    N    DE     P.DE
##    <chr>     <chr> <chr>                               <dbl> <dbl>    <dbl>
##  1 GO:00017… BP    cell activation                       939   333 6.80e-12
##  2 GO:00022… BP    immune effector process               808   251 2.98e- 4
##  3 GO:00022… BP    cell activation involved in immune…   498   148 2.43e- 2
##  4 GO:00022… BP    myeloid leukocyte activation          463   147 2.01e- 3
##  5 GO:00022… BP    myeloid cell activation involved i…   399   118 4.61e- 2
##  6 GO:00022… BP    neutrophil activation involved in …   361   108 4.08e- 2
##  7 GO:00023… BP    leukocyte activation involved in i…   494   148 1.86e- 2
##  8 GO:00023… BP    immune system process                1958   652 8.14e-16
##  9 GO:00024… BP    leukocyte mediated immunity           538   165 5.33e- 3
## 10 GO:00024… BP    myeloid leukocyte mediated immunity   408   124 1.89e- 2
## # … with 20,646 more rows

What GO id’s are most differentially expressed? (Why do these gene sets seem to have a large size, N?)

go %>% arrange(P.DE)
## # A tibble: 20,656 x 6
##    GOID       Ont   Term                                   N    DE     P.DE
##    <chr>      <chr> <chr>                              <dbl> <dbl>    <dbl>
##  1 GO:0071944 CC    cell periphery                      3081  1104 8.45e-45
##  2 GO:0050896 BP    response to stimulus                5799  1859 8.77e-45
##  3 GO:0032502 BP    developmental process               4168  1410 9.73e-44
##  4 GO:0048856 BP    anatomical structure development    3891  1330 3.45e-43
##  5 GO:0005886 CC    plasma membrane                     3013  1078 3.83e-43
##  6 GO:0032501 BP    multicellular organismal process    4678  1546 1.83e-42
##  7 GO:0048731 BP    system development                  3202  1128 7.52e-42
##  8 GO:0023052 BP    signaling                           4060  1372 7.66e-42
##  9 GO:0007275 BP    multicellular organism development  3579  1231 1.46e-40
## 10 GO:0007154 BP    cell communication                  4076  1371 1.57e-40
## # … with 20,646 more rows

Sanity check?

go %>%
    filter(grepl("glucocorticoid", Term)) %>%
    arrange(P.DE)
## # A tibble: 22 x 6
##    GOID     Ont   Term                                     N    DE     P.DE
##    <chr>    <chr> <chr>                                <dbl> <dbl>    <dbl>
##  1 GO:0051… BP    response to glucocorticoid              92    43  1.11e-5
##  2 GO:0071… BP    cellular response to glucocorticoid…    42    21  6.42e-4
##  3 GO:2000… BP    positive regulation of glucocortico…     2     2  6.64e-2
##  4 GO:2000… BP    regulation of glucocorticoid recept…     8     4  1.25e-1
##  5 GO:0006… BP    glucocorticoid biosynthetic process      8     4  1.25e-1
##  6 GO:0043… BP    glucocorticoid mediated signaling p…     3     2  1.65e-1
##  7 GO:0008… BP    glucocorticoid metabolic process        13     5  2.26e-1
##  8 GO:0004… MF    glucocorticoid receptor activity         1     1  2.58e-1
##  9 GO:0031… BP    negative regulation of glucocortico…     4     2  2.75e-1
## 10 GO:0031… BP    negative regulation of glucocortico…     4     2  2.75e-1
## # … with 12 more rows

What genes in set?

genesets <-
    AnnotationDbi::select(org.Hs.eg.db, tbl$ENTREZID, "GOALL", "ENTREZID") %>%
    as_tibble() %>%
    dplyr::select(GOID = GOALL, Ont = ONTOLOGYALL, ENTREZID) %>%
    distinct() %>%
    arrange(Ont, GOID)
genesets
## # A tibble: 1,574,080 x 3
##    GOID       Ont   ENTREZID
##    <chr>      <chr> <chr>   
##  1 GO:0000002 BP    3980    
##  2 GO:0000002 BP    1890    
##  3 GO:0000002 BP    50484   
##  4 GO:0000002 BP    4205    
##  5 GO:0000002 BP    9093    
##  6 GO:0000002 BP    56652   
##  7 GO:0000002 BP    10891   
##  8 GO:0000002 BP    55186   
##  9 GO:0000002 BP    4358    
## 10 GO:0000002 BP    10000   
## # … with 1,574,070 more rows

3 Provenance

sessionInfo()
## R version 3.6.1 Patched (2019-07-16 r76845)
## Platform: x86_64-apple-darwin17.7.0 (64-bit)
## Running under: macOS High Sierra 10.13.6
## 
## Matrix products: default
## BLAS:   /Users/ma38727/bin/R-3-6-branch/lib/libRblas.dylib
## LAPACK: /Users/ma38727/bin/R-3-6-branch/lib/libRlapack.dylib
## 
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
## 
## attached base packages:
## [1] stats4    parallel  stats     graphics  grDevices utils     datasets 
## [8] methods   base     
## 
## other attached packages:
##  [1] tibble_2.1.3                limma_3.41.14              
##  [3] GO.db_3.8.2                 org.Hs.eg.db_3.8.2         
##  [5] AnnotationDbi_1.47.0        dplyr_0.8.3                
##  [7] airway_1.5.0                DESeq2_1.25.7              
##  [9] EnrichmentBrowser_2.15.5    graph_1.63.0               
## [11] SummarizedExperiment_1.15.5 DelayedArray_0.11.4        
## [13] BiocParallel_1.19.0         matrixStats_0.54.0         
## [15] Biobase_2.45.0              GenomicRanges_1.37.14      
## [17] GenomeInfoDb_1.21.1         IRanges_2.19.10            
## [19] S4Vectors_0.23.17           BiocGenerics_0.31.5        
## [21] BiocStyle_2.13.2           
## 
## loaded via a namespace (and not attached):
##  [1] bitops_1.0-6           bit64_0.9-7            RColorBrewer_1.1-2    
##  [4] tools_3.6.1            backports_1.1.4        utf8_1.1.4            
##  [7] R6_2.4.0               rpart_4.1-15           Hmisc_4.2-0           
## [10] DBI_1.0.0              lazyeval_0.2.2         colorspace_1.4-1      
## [13] nnet_7.3-12            tidyselect_0.2.5       gridExtra_2.3         
## [16] bit_1.1-14             compiler_3.6.1         cli_1.1.0             
## [19] htmlTable_1.13.1       bookdown_0.12          KEGGgraph_1.45.1      
## [22] scales_1.0.0           checkmate_1.9.4        genefilter_1.67.1     
## [25] rappdirs_0.3.1         stringr_1.4.0          digest_0.6.20         
## [28] foreign_0.8-71         rmarkdown_1.14         XVector_0.25.0        
## [31] base64enc_0.1-3        pkgconfig_2.0.2        htmltools_0.3.6       
## [34] htmlwidgets_1.3        rlang_0.4.0            rstudioapi_0.10       
## [37] RSQLite_2.1.2          acepack_1.4.1          RCurl_1.95-4.12       
## [40] magrittr_1.5           GenomeInfoDbData_1.2.1 Formula_1.2-3         
## [43] Matrix_1.2-17          fansi_0.4.0            Rcpp_1.0.1            
## [46] munsell_0.5.0          stringi_1.4.3          yaml_2.2.0            
## [49] zlibbioc_1.31.0        grid_3.6.1             blob_1.2.0            
## [52] crayon_1.3.4           lattice_0.20-38        splines_3.6.1         
## [55] annotate_1.63.0        locfit_1.5-9.1         zeallot_0.1.0         
## [58] knitr_1.23             pillar_1.4.2           codetools_0.2-16      
## [61] geneplotter_1.63.0     XML_3.98-1.20          glue_1.3.1            
## [64] evaluate_0.14          latticeExtra_0.6-28    data.table_1.12.2     
## [67] BiocManager_1.30.4     vctrs_0.2.0            gtable_0.3.0          
## [70] purrr_0.3.2            assertthat_0.2.1       ggplot2_3.2.0         
## [73] xfun_0.8               xtable_1.8-4           survival_2.44-1.1     
## [76] memoise_1.1.0          cluster_2.1.0          GSEABase_1.47.0