Contents

Install packages

Installing all necessary packages can be accomplished by the following command:

library(BiocInstaller)
biocLite(c("waldronlab/MicrobiomeWorkshop", "epiviz/metavizr@bioc-workshop"),
         dependencies=TRUE)

Double-check that “metavizr” is installed from GitHub:

if(!("metavizr" %in% installed.packages()) || "1.1.3" != installed.packages()["metavizr","Version"])
  devtools::install_github("epiviz/metavizr@bioc-workshop", build_vignettes = TRUE)
## Downloading GitHub repo epiviz/metavizr@bioc-workshop
## from URL https://api.github.com/repos/epiviz/metavizr/zipball/bioc-workshop
## Installing metavizr
## '/usr/local/lib/R/bin/R' --no-site-file --no-environ --no-save  \
##   --no-restore --quiet CMD build  \
##   '/tmp/RtmpF7ZUtk/devtools1d676e54f4b/epiviz-metavizr-f4884e2'  \
##   --no-resave-data --no-manual
## 
## Installation failed: Command failed (1)

Load all needed packages:

cranpkgs=c("ggplot2","devtools","vegan","httr")
BioCpkgs=c( "curatedMetagenomicData", "phyloseq")
ghpkgs= "metavizr"
allpkg <- c(cranpkgs, BioCpkgs, ghpkgs)
suppressPackageStartupMessages(sapply(allpkg, require, character.only = TRUE))
## No methods found in "RSQLite" for requests: dbGetQuery
##                ggplot2               devtools                  vegan 
##                   TRUE                   TRUE                   TRUE 
##                   httr curatedMetagenomicData               phyloseq 
##                   TRUE                   TRUE                   TRUE 
##               metavizr 
##                   TRUE

curatedMetagenomicData

curatedMetagenomicData provides 6 types of data for each dataset:

  1. Species-level taxonomic profiles, expressed as relative abundance from kingdom to strain level
  2. Presence of unique, clade-specific markers
  3. Abundance of unique, clade-specific markers
  4. Abundance of gene families
  5. Metabolic pathway coverage
  6. Metabolic pathway abundance

Types 1-3 are generated by MetaPhlAn2; 4-6 are generated by HUMAnN2.

Currently, curatedMetagenomicData provides:

These datasets are documented in the reference manuals of the release or devel versions of Bioconductor. The development version has approximately twice as many datasets and significantly more functionality. The AMI provided at the Bioc2017 workshop runs the development version of Bioconductor; see http://bioconductor.org/developers/how-to/useDevel/ for instructions on upgrading your own machine to the development version and switching between devel and release.

Using curatedMetagenomicData Resources

Use of the resources in curatedMetagenomicData is simplified with the use of Bioconductor’s ExperimentHub platform, which allows for the accessing of data through an intuitive interface. First, curatedMetagenomicData is installed using BiocInstaller and then called as a library - the process allows for the user to simply call datasets as functions because the package is aware of the resources present in ExperimentHub S3 buckets.

# BiocInstaller::biocLite("curatedMetagenomicData")  #Bioconductor version
# BiocInstaller::biocLite("waldronlab/curatedMetagenomicData")  #bleeding edge version
suppressPackageStartupMessages(library(curatedMetagenomicData))

Available samples and metadata

The manually curated metadata for all available samples are provided in a single table combined_metadata:

?combined_metadata
View(combined_metadata)
table(combined_metadata$antibiotics_current_use)
## 
##   no  yes 
## 1855  558
table(combined_metadata$disease)
## 
##                                   AD                                AD;AR 
##                                   10                                   16 
##                            AD;asthma                         AD;asthma;AR 
##                                    4                                    8 
##                                   AR                               CDI;NA 
##                                    2                                   14 
##                       CDI;cellulitis                   CDI;osteoarthritis 
##                                    2                                    1 
##                        CDI;pneumonia                    CDI;ureteralstone 
##                                   15                                    1 
##                                  CRC                 CRC;T2D;hypertension 
##                                  231                                    2 
##                      CRC;fatty_liver     CRC;fatty_liver;T2D;hypertension 
##                                    3                                    4 
##         CRC;fatty_liver;hypertension                     CRC;hypertension 
##                                   12                                   12 
##                                  IBD                                  IGT 
##                                  148                                   49 
##                                   NK                                 STEC 
##                                    1                                   43 
##                                  T1D                      T1D;coeliac;IBD 
##                                   24                                    3 
##                                  T2D                     T2D;hypertension 
##                                  223                                    1 
##                      abdominalhernia                              adenoma 
##                                    2                                   50 
##                          adenoma;T2D             adenoma;T2D;hypertension 
##                                    1                                    2 
##                  adenoma;fatty_liver              adenoma;fatty_liver;T2D 
##                                   12                                    1 
## adenoma;fatty_liver;T2D;hypertension     adenoma;fatty_liver;hypertension 
##                                    1                                   14 
##                 adenoma;hypertension                            arthritis 
##                                    8                                   11 
##                            asthma;AR                           bronchitis 
##                                    8                                   18 
##                           cellulitis                            cirrhosis 
##                                    6                                    8 
##                        cirrhosis;HBV                    cirrhosis;HBV;HDV 
##                                   39                                    3 
##            cirrhosis;HBV;HDV;ascites                     cirrhosis;HBV;HE 
##                                    2                                    2 
##                    cirrhosis;HBV;HEV            cirrhosis;HBV;HEV;ascites 
##                                    3                                    4 
##                cirrhosis;HBV;ascites    cirrhosis;HBV;schistosoma;ascites 
##                                   44                                    1 
##         cirrhosis;HBV;wilson;ascites                        cirrhosis;HCV 
##                                    1                                    1 
##                     cirrhosis;HEV;HE                    cirrhosis;ascites 
##                                    1                                    9 
##        cirrhosis;pbcirrhosis;ascites    cirrhosis;schistosoma;HEV;ascites 
##                                    1                                    2 
##        cirrhosis;schistosoma;ascites             cirrhosis;wilson;ascites 
##                                    1                                    1 
##     coeliac;gestational_diabetes;CMV                                cough 
##                                    2                                    2 
##                             cystitis                          fatty_liver 
##                                    1                                    8 
##                      fatty_liver;T2D         fatty_liver;T2D;hypertension 
##                                    3                                    9 
##             fatty_liver;hypertension                                fever 
##                                   13                                    3 
##                             gangrene                              healthy 
##                                    1                                 2455 
##                            hepatitis                         hypertension 
##                                    3                                   11 
##            infectiousgastroenteritis                       osteoarthritis 
##                                    5                                    1 
##                               otitis                            pneumonia 
##                                  107                                    7 
##                            psoriasis                  psoriasis;arthritis 
##                                   36                                   12 
##                        pyelonefritis                       pyelonephritis 
##                                    2                                    6 
##                       respiratoryinf                        salmonellosis 
##                                   13                                    1 
##                        schizophrenia                     schizophrenia;CD 
##                                   12                                    1 
##                    schizophrenia;T2D                               sepsis 
##                                    3                                    1 
##                              skininf                           stomatitis 
##                                    2                                    2 
##                              suspinf                          tonsillitis 
##                                    1                                    3

Accessing datasets

Individual data projects can be fetched via per-dataset functions or the curatedMetagenomicData() function. A function call to a dataset name returns a Bioconductor ExpressionSet object:

suppressPackageStartupMessages(library(curatedMetagenomicData))
zeller.eset = ZellerG_2014.metaphlan_bugs_list.stool()

The following creates a list of two ExpressionSet objects providing the BritoIL_2016 and Castro-NallarE_2015 oral cavity taxonomic profiles (this method works in release or devel, but these datasets are available in devel only). See a complete list of available datasets in the PDF manual.

oral <- c("BritoIL_2016.metaphlan_bugs_list.oralcavity",
          "Castro-NallarE_2015.metaphlan_bugs_list.oralcavity")
esl <- curatedMetagenomicData(oral, dryrun = FALSE)
esl
esl[[1]]
esl[[2]]

And the following would provide all stool metaphlan datasets if dryrun = FALSE were set:

curatedMetagenomicData("*metaphlan_bugs_list.stool*", dryrun = TRUE)
## Dry run: see return values for datasets that would be downloaded. Run with `dryrun=FALSE` to actually download these datasets.
##  [1] "AsnicarF_2017.metaphlan_bugs_list.stool"       
##  [2] "BritoIL_2016.metaphlan_bugs_list.stool"        
##  [3] "FengQ_2015.metaphlan_bugs_list.stool"          
##  [4] "HMP_2012.metaphlan_bugs_list.stool"            
##  [5] "Heitz-BuschartA_2016.metaphlan_bugs_list.stool"
##  [6] "KarlssonFH_2013.metaphlan_bugs_list.stool"     
##  [7] "LeChatelierE_2013.metaphlan_bugs_list.stool"   
##  [8] "LiuW_2016.metaphlan_bugs_list.stool"           
##  [9] "LomanNJ_2013.metaphlan_bugs_list.stool"        
## [10] "NielsenHB_2014.metaphlan_bugs_list.stool"      
## [11] "Obregon-TitoAJ_2015.metaphlan_bugs_list.stool" 
## [12] "QinJ_2012.metaphlan_bugs_list.stool"           
## [13] "QinN_2014.metaphlan_bugs_list.stool"           
## [14] "RampelliS_2015.metaphlan_bugs_list.stool"      
## [15] "RaymondF_2016.metaphlan_bugs_list.stool"       
## [16] "SchirmerM_2016.metaphlan_bugs_list.stool"      
## [17] "VatanenT_2016.metaphlan_bugs_list.stool"       
## [18] "VincentC_2016.metaphlan_bugs_list.stool"       
## [19] "VogtmannE_2016.metaphlan_bugs_list.stool"      
## [20] "XieH_2016.metaphlan_bugs_list.stool"           
## [21] "YuJ_2015.metaphlan_bugs_list.stool"            
## [22] "ZellerG_2014.metaphlan_bugs_list.stool"

Merging multiple datasets

The following merges the two oral cavity datasets downloaded above into a single ExpressionSet (devel only).

eset <- mergeData(esl)
eset
## ExpressionSet (storageMode: lockedEnvironment)
## assayData: 1914 features, 172 samples 
##   element names: exprs 
## protocolData: none
## phenoData
##   sampleNames: BritoIL_2016.metaphlan_bugs_list.oralcavity:M1.1.SA
##     BritoIL_2016.metaphlan_bugs_list.oralcavity:M1.10.SA ...
##     Castro-NallarE_2015.metaphlan_bugs_list.oralcavity:ES_080 (172
##     total)
##   varLabels: body_site antibiotics_current_use ... studyID (19
##     total)
##   varMetadata: labelDescription
## featureData: none
## experimentData: use 'experimentData(object)'
## Annotation:

This works for any number of datasets. The function will not stop you from merging different data types (e.g. metaphlan bugs lists with gene families), but you probably don’t want to do that.

Using ExpressionSet Objects

All datasets are represented as ExpressionSet objects because of the integrative nature of the class and its ability to bind data and metadata. There are three main functions, from the Biobase package, that provide access to experiment-level metadata, subject-level metadata, and the data itself.

To access the experiment-level metadata the experimentData() function is used to return a MIAME (Minimum Information About a Microarray Experiment) object.

experimentData( zeller.eset )
## Experiment data
##   Experimenter name: Zeller G, Tap J, Voigt AY, Sunagawa S, Kultima JR, Costea PI, Amiot A, Böhm J, Brunetti F, Habermann N, Hercog R, Koch M, Luciani A, Mende DR, Schneider MA, Schrotz-King P, Tournigand C, Tran Van Nhieu J, Yamada T, Zimmermann J, Benes V, Kloor M, Ulrich CM, von Knebel Doeberitz M, Sobhani I, Bork P 
##   Laboratory: Structural and Computational Biology Unit, European Molecular Biology Laboratory, Heidelberg, Germany. 
##   Contact information:  
##   Title: Potential of fecal microbiota for early-stage detection of colorectal cancer. 
##   URL:  
##   PMIDs: 25432777 
## 
##   Abstract: A 175 word abstract is available. Use 'abstract' method.
##   notes:
##    Sequencing platform:     
##       IlluminaHiSeq

To access the subject-level metadata the pData() function is used to return a data.frame containing subject-level variables of study.

head( pData( zeller.eset ) )
##                    subjectID body_site antibiotics_current_use
## CCIS27304052ST-3-0    FR-001     stool                    <NA>
## CCIS13047523ST-4-0    FR-003     stool                    <NA>
## CCIS15794887ST-4-0    FR-024     stool                    <NA>
## CCIS94603952ST-4-0    FR-025     stool                    <NA>
## CCIS74726977ST-3-0    FR-026     stool                    <NA>
## CCIS02856720ST-4-0    FR-027     stool                    <NA>
##                    study_condition disease disease_subtype age
## CCIS27304052ST-3-0         control    <NA>            <NA>  52
## CCIS13047523ST-4-0         control    <NA>            <NA>  70
## CCIS15794887ST-4-0         control    <NA>            <NA>  37
## CCIS94603952ST-4-0         control    <NA>            <NA>  80
## CCIS74726977ST-3-0         adenoma adenoma    smalladenoma  66
## CCIS02856720ST-4-0         control    <NA>            <NA>  74
##                    age_category gender BMI country non_westernized
## CCIS27304052ST-3-0        adult female  20     FRA              no
## CCIS13047523ST-4-0       senior   male  22     FRA              no
## CCIS15794887ST-4-0        adult female  18     FRA              no
## CCIS94603952ST-4-0       senior female  21     FRA              no
## CCIS74726977ST-3-0       senior   male  24     FRA              no
## CCIS02856720ST-4-0       senior   male  27     FRA              no
##                    DNA_extraction_kit number_reads number_bases
## CCIS27304052ST-3-0              Gnome     48552680           NA
## CCIS13047523ST-4-0              Gnome     62891719           NA
## CCIS15794887ST-4-0              Gnome     45764894           NA
## CCIS94603952ST-4-0              Gnome     69804753           NA
## CCIS74726977ST-3-0              Gnome    140945980           NA
## CCIS02856720ST-4-0              Gnome     73567419           NA
##                    minimum_read_length median_read_length ajcc fobt  tnm
## CCIS27304052ST-3-0                  45                 NA <NA>   no <NA>
## CCIS13047523ST-4-0                  45                 NA <NA>   no <NA>
## CCIS15794887ST-4-0                  45                 NA <NA>   no <NA>
## CCIS94603952ST-4-0                   1                 NA <NA>   no <NA>
## CCIS74726977ST-3-0                  45                 NA <NA>   no <NA>
## CCIS02856720ST-4-0                   1                 NA <NA>  yes <NA>
##                                                                                     NCBI_accession
## CCIS27304052ST-3-0                                         ERR480588;ERR480587;ERR479092;ERR479091
## CCIS13047523ST-4-0 ERR480532;ERR480529;ERR480531;ERR480530;ERR479036;ERR479034;ERR479035;ERR479033
## CCIS15794887ST-4-0                                         ERR480538;ERR480537;ERR479042;ERR479041
## CCIS94603952ST-4-0 ERR480895;ERR480894;ERR480893;ERR480896;ERR479400;ERR479399;ERR479398;ERR479397
## CCIS74726977ST-3-0                                                             ERR480794;ERR479298
## CCIS02856720ST-4-0 ERR480469;ERR480468;ERR480467;ERR480466;ERR478973;ERR478972;ERR478971;ERR478970

To access the data itself (in this case relative abundance), the exprs() function returns a variables by samples (rows by columns) numeric matrix. Note the presence of “synthetic” clades at all levels of the taxonomy, starting with kingdom, e.g. k__bacteria here:

exprs( zeller.eset )[1:6, 1:5]  #first 6 rows and 5 columns
##                              CCIS27304052ST-3-0 CCIS13047523ST-4-0
## k__Bacteria                            95.80672           98.39758
## k__Archaea                              3.11078            0.00847
## k__Viruses                              1.08249            1.59396
## k__Bacteria|p__Firmicutes              69.64557           72.81784
## k__Bacteria|p__Bacteroidetes           21.37090           24.69795
## k__Archaea|p__Euryarchaeota             3.11078            0.00847
##                              CCIS15794887ST-4-0 CCIS94603952ST-4-0
## k__Bacteria                            99.99562          100.00000
## k__Archaea                              0.00438            0.00000
## k__Viruses                              0.00000            0.00000
## k__Bacteria|p__Firmicutes              55.31008           71.41404
## k__Bacteria|p__Bacteroidetes           23.73706           16.01887
## k__Archaea|p__Euryarchaeota             0.00438            0.00000
##                              CCIS74726977ST-3-0
## k__Bacteria                            90.08853
## k__Archaea                              1.98404
## k__Viruses                              7.92743
## k__Bacteria|p__Firmicutes              58.24313
## k__Bacteria|p__Bacteroidetes           27.91643
## k__Archaea|p__Euryarchaeota             1.98404

Bioconductor provides further documentation of the ExpressionSet class and has published an excellent introduction.

Creating phyloseq and metagenomeSeq objects

curatedMetagenomicData provides convenience functions for converting its ExpressionSet objects to phyloseq::phyloseq and metagenomeSeq:MRExperiment class objects for downstream analysis. For example, to convert the zeller.eset object to these two classes:

ExpressionSet2phyloseq(zeller.eset)
## phyloseq-class experiment-level object
## otu_table()   OTU Table:         [ 10503 taxa and 199 samples ]
## sample_data() Sample Data:       [ 199 samples by 21 sample variables ]
## tax_table()   Taxonomy Table:    [ 10503 taxa by 8 taxonomic ranks ]
ExpressionSet2MRexperiment(zeller.eset)
## MRexperiment (storageMode: environment)
## assayData: 10503 features, 199 samples 
##   element names: counts 
## protocolData: none
## phenoData
##   sampleNames: CCIS27304052ST-3-0 CCIS13047523ST-4-0 ...
##     MMPU72854103ST (199 total)
##   varLabels: subjectID body_site ... NCBI_accession (21 total)
##   varMetadata: labelDescription
## featureData
##   featureNames: k__Bacteria k__Archaea ... t__PRJNA15006 (10503
##     total)
##   fvarLabels: Kingdom Phylum ... Strain (8 total)
##   fvarMetadata: labelDescription
## experimentData: use 'experimentData(object)'
## Annotation:

This also works on the merged eset object from above:

ExpressionSet2phyloseq(eset)
## phyloseq-class experiment-level object
## otu_table()   OTU Table:         [ 1914 taxa and 172 samples ]
## sample_data() Sample Data:       [ 172 samples by 19 sample variables ]
## tax_table()   Taxonomy Table:    [ 1914 taxa by 8 taxonomic ranks ]
ExpressionSet2MRexperiment(eset)
## MRexperiment (storageMode: environment)
## assayData: 1914 features, 172 samples 
##   element names: counts 
## protocolData: none
## phenoData
##   sampleNames: BritoIL_2016.metaphlan_bugs_list.oralcavity:M1.1.SA
##     BritoIL_2016.metaphlan_bugs_list.oralcavity:M1.10.SA ...
##     Castro-NallarE_2015.metaphlan_bugs_list.oralcavity:ES_080 (172
##     total)
##   varLabels: body_site antibiotics_current_use ... studyID (19
##     total)
##   varMetadata: labelDescription
## featureData
##   featureNames: k__Bacteria k__Viruses ...
##     t__Jonquetella_anthropi_unclassified (1914 total)
##   fvarLabels: Kingdom Phylum ... Strain (8 total)
##   fvarMetadata: labelDescription
## experimentData: use 'experimentData(object)'
## Annotation:

Examples of phyloseq analyses using curatedMetagenomicData datasets are provided in the curatedMetagenomicData vignette.

phylogenetic trees and UniFrac distances

Set phylogenetictree = TRUE to include a phylogenetic tree in the phyloseq object (bioc-devel only):

zeller.tree <- ExpressionSet2phyloseq( zeller.eset, phylogenetictree = TRUE)
wt = UniFrac(zeller.tree, weighted=TRUE, normalized=FALSE, 
             parallel=FALSE, fast=TRUE)
plot(hclust(wt), main="Weighted UniFrac distances")

Exploratory analysis of E. coli prevalence

Here’s a direct, exploratory analysis of E. coli prevalence in the zeller dataset using the ExpressionSet object. More elegant solutions will be provided later using subsetting methods provided by the phyloseq package, but for users familiar with grep() and the ExpressionSet object, such manual methods may suffice.

First, which E. coli-related taxa are available?

grep("coli", rownames(zeller.eset), value=TRUE)
##  [1] "k__Bacteria|p__Proteobacteria|c__Gammaproteobacteria|o__Enterobacteriales|f__Enterobacteriaceae|g__Escherichia|s__Escherichia_coli"                                            
##  [2] "k__Bacteria|p__Proteobacteria|c__Gammaproteobacteria|o__Enterobacteriales|f__Enterobacteriaceae|g__Escherichia|s__Escherichia_coli|t__Escherichia_coli_unclassified"           
##  [3] "k__Bacteria|p__Firmicutes|c__Clostridia|o__Clostridiales|f__Ruminococcaceae|g__Anaerotruncus|s__Anaerotruncus_colihominis"                                                     
##  [4] "k__Bacteria|p__Firmicutes|c__Clostridia|o__Clostridiales|f__Ruminococcaceae|g__Anaerotruncus|s__Anaerotruncus_colihominis|t__GCF_000154565"                                    
##  [5] "k__Bacteria|p__Firmicutes|c__Clostridia|o__Clostridiales|f__Peptostreptococcaceae|g__Peptostreptococcaceae_noname|s__Clostridium_glycolicum"                                   
##  [6] "k__Bacteria|p__Firmicutes|c__Clostridia|o__Clostridiales|f__Peptostreptococcaceae|g__Peptostreptococcaceae_noname|s__Clostridium_glycolicum|t__GCF_000373865"                  
##  [7] "k__Bacteria|p__Proteobacteria|c__Epsilonproteobacteria|o__Campylobacterales|f__Campylobacteraceae|g__Campylobacter|s__Campylobacter_coli"                                      
##  [8] "k__Bacteria|p__Proteobacteria|c__Epsilonproteobacteria|o__Campylobacterales|f__Campylobacteraceae|g__Campylobacter|s__Campylobacter_coli|t__Campylobacter_coli_unclassified"   
##  [9] "k__Bacteria|p__Actinobacteria|c__Actinobacteria|o__Actinomycetales|f__Mycobacteriaceae|g__Amycolicicoccus"                                                                     
## [10] "k__Bacteria|p__Firmicutes|c__Clostridia|o__Clostridiales|f__Peptococcaceae|g__Syntrophobotulus|s__Syntrophobotulus_glycolicus"                                                 
## [11] "k__Bacteria|p__Actinobacteria|c__Actinobacteria|o__Actinomycetales|f__Mycobacteriaceae|g__Amycolicicoccus|s__Amycolicicoccus_subflavus"                                        
## [12] "k__Bacteria|p__Firmicutes|c__Clostridia|o__Clostridiales|f__Peptococcaceae|g__Syntrophobotulus|s__Syntrophobotulus_glycolicus|t__GCF_000190635"                                
## [13] "k__Bacteria|p__Actinobacteria|c__Actinobacteria|o__Actinomycetales|f__Mycobacteriaceae|g__Amycolicicoccus|s__Amycolicicoccus_subflavus|t__GCF_000214175"                       
## [14] "k__Bacteria|p__Proteobacteria|c__Gammaproteobacteria|o__Enterobacteriales|f__Enterobacteriaceae|g__Yersinia|s__Yersinia_enterocolitica"                                        
## [15] "k__Bacteria|p__Proteobacteria|c__Gammaproteobacteria|o__Vibrionales|f__Vibrionaceae|g__Vibrio|s__Vibrio_halioticoli"                                                           
## [16] "k__Bacteria|p__Spirochaetes|c__Spirochaetia|o__Spirochaetales|f__Brachyspiraceae|g__Brachyspira|s__Brachyspira_pilosicoli"                                                     
## [17] "k__Bacteria|p__Proteobacteria|c__Gammaproteobacteria|o__Enterobacteriales|f__Enterobacteriaceae|g__Yersinia|s__Yersinia_enterocolitica|t__Yersinia_enterocolitica_unclassified"
## [18] "k__Bacteria|p__Proteobacteria|c__Gammaproteobacteria|o__Vibrionales|f__Vibrionaceae|g__Vibrio|s__Vibrio_halioticoli|t__GCF_000496695"                                          
## [19] "k__Bacteria|p__Spirochaetes|c__Spirochaetia|o__Spirochaetales|f__Brachyspiraceae|g__Brachyspira|s__Brachyspira_pilosicoli|t__Brachyspira_pilosicoli_unclassified"              
## [20] "k__Bacteria|p__Firmicutes|c__Clostridia|o__Clostridiales|f__Clostridiaceae|g__Clostridium|s__Clostridium_colicanis"                                                            
## [21] "k__Bacteria|p__Firmicutes|c__Clostridia|o__Clostridiales|f__Clostridiaceae|g__Clostridium|s__Clostridium_colicanis|t__GCF_000371465"

Create a vector of E. coli relative abundances. This grep call with a “$” at the end selects the only row that ends with “s__Escherichia_coli“:

x = exprs( zeller.eset )[grep("s__Escherichia_coli$", rownames( zeller.eset)), ]
summary( x )
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
##  0.00000  0.01985  0.15273  1.64121  1.37392 37.54376

This could be plotted as a histogram:

hist( x, xlab = "Relative Abundance", main="Prevalence of E. Coli",
      breaks="FD")

Estimating Absolute Raw Count Data

Absolute raw count data can be estimated from the relative count data by multiplying the columns of the ExpressionSet data by the number of reads for each sample, as found in the pData column “number_reads”. For demo purposes you could (but don’t have to!) do this manually by dividing by 100 and multiplying by the number of reads:

zeller.counts = sweep(exprs( zeller.eset ), 2, zeller.eset$number_reads / 100, "*")
zeller.counts = round(zeller.counts)
zeller.counts[1:6, 1:5]
##                              CCIS27304052ST-3-0 CCIS13047523ST-4-0
## k__Bacteria                            46516730           61883930
## k__Archaea                              1510367               5327
## k__Viruses                               525578            1002469
## k__Bacteria|p__Firmicutes              33814791           45796391
## k__Bacteria|p__Bacteroidetes           10376145           15532965
## k__Archaea|p__Euryarchaeota             1510367               5327
##                              CCIS15794887ST-4-0 CCIS94603952ST-4-0
## k__Bacteria                            45762889           69804753
## k__Archaea                                 2005                  0
## k__Viruses                                    0                  0
## k__Bacteria|p__Firmicutes              25312599           49850394
## k__Bacteria|p__Bacteroidetes           10863240           11181933
## k__Archaea|p__Euryarchaeota                2005                  0
##                              CCIS74726977ST-3-0
## k__Bacteria                           126976161
## k__Archaea                              2796425
## k__Viruses                             11173394
## k__Bacteria|p__Firmicutes              82091350
## k__Bacteria|p__Bacteroidetes           39347086
## k__Archaea|p__Euryarchaeota             2796425

or just set the counts argument in curatedMetagenomicData() to TRUE:

zeller.eset2 = curatedMetagenomicData("ZellerG_2014.metaphlan_bugs_list.stool",
                                     counts = TRUE, 
                                     dryrun = FALSE)[[1]]
## Working on ZellerG_2014.metaphlan_bugs_list.stool
## snapshotDate(): 2017-06-09
## see ?curatedMetagenomicData and browseVignettes('curatedMetagenomicData') for documentation
## loading from cache '/home/ubuntu//.ExperimentHub/535'
all.equal(exprs(zeller.eset2), zeller.counts)
## [1] TRUE

Taxonomy-Aware Analysis using phyloseq

For the MetaPhlAn2 bugs datasets (but not other data types), you gain a lot of phylogeny-aware, ecological analysis and plotting by conversion to a phyloseq class object. curatedMetagenomicData provides the ExpressionSet2phyloseq() function to make this easy:

suppressPackageStartupMessages(library(phyloseq))
zeller.pseq = ExpressionSet2phyloseq( zeller.eset )

Note the following useful arguments to ExpressionSet2phyloseq:

Visualization with Metavizr

We can now examine the zeller dataset with Metavizr. This visualization tool allows interactive exploration of the taxonomic hierarchy and statistically-guided visual analysis. Specifically, metavizr enables feature selection through a navigation mechanism called FacetZoom. We first create an MRexperiment for the zeller dataset using ExpressionSet2MRexperiment. For demonstration purposes in this workshop, we will subset the analysis to samples under 60 years old and either control or CRC study_condition.

zeller_MR_expr = ExpressionSet2MRexperiment(zeller.eset)
zeller_MR_expr <- zeller_MR_expr[, which(pData(zeller_MR_expr)$age < 60)]
zeller_MR_expr <- zeller_MR_expr[,-which(pData(zeller_MR_expr)$study_condition == "adenoma")]

public_ipv4 <- try(httr::content(httr::GET("http://169.254.169.254/latest/meta-data/public-ipv4")), silent = TRUE)

if(grepl("Timeout was reached", simpleMessage(public_ipv4))){
  app <- startMetaviz(host="http://metaviz-dev.cbcb.umd.edu")
} else{
  app <- startMetaviz(host="http://metaviz-dev.cbcb.umd.edu", ws_host = public_ipv4, daemonized = FALSE, try_ports = TRUE)
}

From the metavizr vignette: Once the browser is open we can visualize the metagenomic features from the zeller_MR_expr object. We use the plot method to do so. The plot function can take as input any of the classes registered with epivirData.

Bioconductor AMI note: On rstudio-server version < 1.1.273, the user need to run the app$service() command after each user input on the UI. There is a bug with Rstudio-server interferring with the R event loop. metavizr should run without issue on rstudio desktop.

facetZoom <- app$plot(zeller_MR_expr, type = "InnerNodeCounts", datasource_name = "zeller", feature_order = colnames(fData(zeller_MR_expr)))
app$service()

From the metavizr Vignette: You should now see a FacetZoom to explore the hierarchy of the metagenomic features from the MRexperiment object. To navigate the complex, hierarchical structure of the feature space, we made use of the FacetZoom visualization design and extended it to metagenomic data. Because of the limitations in the screen size and performance rendering big trees, this technique is an efficient visualization for navigating trees. The FacetZoom helps zoom in and out of trees and traverse subtrees. Every node in the tree has a state associated with it and are three possible states for a node 1) expand - use all subtree nodes during analysis 2) collapse - aggregate all nodes under the subtree to the selected node 3) remove - discard all nodes in the subtree for the analysis. The state of a node is also propagated to all its children and can be identified by the opacity of the node. Row level controls are available on the left side - to set the state of all nodes at a selected depth/taxonomic class of the hierarchy. The right hand side of the FacetZoom shows the location of the current subtree. Users can set states on the nodes to define a cut over the feature space. The cut defines how the count data is visualized in linked plots and charts. In addition to defining the cut, we also have a navigation bar to limit the range of features when querying count data. Navigation controls move the bar left/right and extend over the entire range of the current tree in the icicle. Each of these actions queries the underlying data structure and automatically propagates the changes to other visualizations in the workspace. When navigating outside the scope of the navigation bar, chevrons (left/right) appear on the navigation bar to help identify the current position. Hovering over a node highlights the entire lineage in the FacetZoom along with all other linked charts and plots.

We can now add a heatmap showing the count data from the MRexperiment. After the app$service() call and while ‘Serving Epiviz, escape to continue interactive session…’ is shown on the R command prompt, we can click on the center icon of the node label P on the left hand side of the FacetZoom. This will update the heatmap from showing counts at the Class level to show counts at the Phylum level. The message ‘zeller_1 datasourceGroup caches cleared’ should appear in the R session. We can also make a selection in the UI and run app$service() to process the request.

heatmap_plot <- app$chart_mgr$revisualize(chart_type = "HeatmapPlot", chart = facetZoom)
app$service()

We can remove lineages and show counts at any level of the taxonomy by setting nodes states in the FacetZoom. Once we are done exploring the data, we can remove these charts by calling the chart manager.

app$chart_mgr$rm_all_charts()

Compute differential abundance with metagenomeSeq and use features selections in metaviz

Now we will use metagenomeSeq to compute differential abundance for the zeller dataset at different levels of the taxonomy. For the metagenomeSeq data model, we will keep only the leaf counts and then use the metagenomeSeq function aggregateByTaxonomy to compute counts at different levels of the hierarchy.

# Removes counts and features all levels of the taxonomy except for the lowest level
zeller_MR_expr <- zeller_MR_expr[-unname(which(is.na(fData(zeller_MR_expr)$Strain))),]

# Remove the t__ from each leaf level feature in the MRexperiment counts data
rownames(zeller_MR_expr) <- sapply(strsplit(rownames(zeller_MR_expr), "__"), function(i){unname(i)[2]})

Now we can use the fitFeatureModel function to estimate log fold change for the features at the lowest level of the taxonomy, in this case Strain, between samples in the control and crc study conditions. We first normalize the count data using the cumulative sum scaling method.

# Normalize counts using cummulative sum scaling method.
zeller_MR_expr <- filterData(zeller_MR_expr, present = 5)
zeller_MR_expr <- cumNorm(zeller_MR_expr, p = 0.75)

# Set model to study_condition and use the fitFeatureModel to test
zeller_sample_data <-  pData(zeller_MR_expr)
mod <-  model.matrix(~1+study_condition, data = zeller_sample_data)
results_zeller <-  fitFeatureModel(zeller_MR_expr, mod)
logFC_zeller <- MRcoefs(results_zeller, number = nrow(results_zeller))

metagenomeSeq provides a variety of testing methods. We could also use the fitZig method to test for study_condition controlling for gender. fitZig adds posterior probabilities to the MRexperiment and so we create a separate MRexperiment to pass to that function.

# Creating a separate MRexperiment to pass to fitZig 
zig_zeller_MR_expr = ExpressionSet2MRexperiment(zeller.eset)
zig_zeller_MR_expr <- zig_zeller_MR_expr[, which(pData(zig_zeller_MR_expr)$age < 60)]
zig_zeller_MR_expr <- zig_zeller_MR_expr[,-which(pData(zig_zeller_MR_expr)$study_condition == "adenoma")]
zig_zeller_MR_expr <- zig_zeller_MR_expr[-unname(which(is.na(fData(zig_zeller_MR_expr)$Strain))),]
rownames(zig_zeller_MR_expr) <- sapply(strsplit(rownames(zig_zeller_MR_expr), "__"), function(i){unname(i)[2]})
zig_zeller_MR_expr <- filterData(zig_zeller_MR_expr, present = 5)
zig_zeller_MR_expr <- cumNorm(zig_zeller_MR_expr, p = 0.75)

zig_zeller_sample_data <-  pData(zig_zeller_MR_expr)

mod_zig <-  model.matrix(~1+study_condition+gender, data = zig_zeller_sample_data)
results_zeller_zig <-  fitZig(zig_zeller_MR_expr, mod_zig)
coefs_zeller_zig <- MRtable(results_zeller_zig, number = nrow(results_zeller_zig))

The metagenomeSeq package vignette has further details on the available tests implemented as well as other functions useful for analysis.

We will use the result from fitFeatureModel as it provides logFC estimates for the features at this level between samples in the control and crc groups. We can examine them in tabular form. We can also use these results to select feature names to remove from consideration in a Metaviz. After removing features using a logFC threshold and adjusted p-value cutoff, we can interactively explore the results for the remaining features at different levels of the hierarchy.

features <- rownames(logFC_zeller)

featuresToKeep_names <- features[which(logFC_zeller[which(abs(logFC_zeller$logFC) > 2),]$adjPvalues <.1)]
featuresToKeep <- rep(2, length(featuresToKeep_names))
names(featuresToKeep) <-  featuresToKeep_names

featuresToRemove_names <- features[!(features %in% featuresToKeep_names)]
featuresToRemove <- rep(0, length(featuresToRemove_names))
names(featuresToRemove) <- featuresToRemove_names
featureSelection = c(featuresToKeep, featuresToRemove)

We can add a new FacetZoom, heatmap, and stacked plot and then select features of interest. Once the heatmap and stacked plot are added, we can navigate to the lowest level of the taxonomy and see the features removed from consideration.

facetZoom <- app$plot(zeller_MR_expr, type = "LeafCounts", datasource_name="zeller_leaf_level", feature_order = colnames(fData(zeller_MR_expr)))
app$service()
heatmap <- app$chart_mgr$revisualize(chart_type = "HeatmapPlot", chart = facetZoom)
app$service()
stackedPlot <- app$chart_mgr$revisualize(chart_type ="StackedLinePlot", chart = facetZoom)
app$service()

Now we can call propagateHierarchyChanges with the features of interest. The Strain features should be removed that do not show differential abundance.

app$get_ms_object(facetZoom)$propagateHierarchyChanges(featureSelection, request_with_labels = TRUE)

We can also perform the same fitFeatureModel calculation with counts aggregated to another level of the taxonomy and propagate those changes to the existing visualizations. In this case we will aggregate counts to the Class level of the taxonomy and then keep only features that show a logFC greater than 1 between the two groups.

#Aggregate to the Class level
aggregation_level = "Class"
agg_zeller_MR_expr <- aggregateByTaxonomy(zeller_MR_expr, lvl=aggregation_level)

agg_zeller_MR_expr <-  cumNorm(agg_zeller_MR_expr, p = 0.75)
agg_mod <-  model.matrix(~1+study_condition, data = pData(agg_zeller_MR_expr))
agg_results_zeller <-  fitFeatureModel(agg_zeller_MR_expr, agg_mod)
agg_logFC_zeller <- MRcoefs(agg_results_zeller, number = nrow(agg_results_zeller))

agg_features <- rownames(agg_logFC_zeller)
agg_featuresToKeep_names <- agg_features[which(agg_logFC_zeller[which(abs(agg_logFC_zeller$logFC) > 1),]$adjPvalues <.1)]
agg_featuresToKeep <- rep(2, length(agg_featuresToKeep_names))
names(agg_featuresToKeep) <- agg_featuresToKeep_names

agg_featuresToRemove_names <- agg_features[!(agg_features %in% agg_featuresToKeep_names)]
agg_featuresToRemove <- rep(0, length(agg_featuresToRemove_names))
names(agg_featuresToRemove) <- agg_featuresToRemove_names

agg_selectionUpdate <- c(agg_featuresToKeep, agg_featuresToRemove)
app$get_ms_object(facetZoom)$propagateHierarchyChanges(agg_selectionUpdate, request_with_labels = TRUE)