## ----options, include=FALSE, echo=FALSE--------------------------------------- knitr::opts_chunk$set( warning = FALSE, error = FALSE, message = FALSE) ## ----eval=FALSE--------------------------------------------------------------- # if (!require("BiocManager", quietly = TRUE)) # install.packages("BiocManager") # # BiocManager::install("benchdamic") ## ----eval=FALSE--------------------------------------------------------------- # if (!require("devtools", quietly = TRUE)) # install.packages("devtools") # # devtools::install_github("mcalgaro93/benchdamic") ## ----load_packs--------------------------------------------------------------- library(benchdamic) # Parallel computation library(BiocParallel) # Generate simulated data library(SPsimSeq) # Data management library(phyloseq) library(SummarizedExperiment) library(plyr) # Graphics and tables library(ggplot2) library(cowplot) library(kableExtra) ## ----dataloading_stool-------------------------------------------------------- data("ps_stool_16S") ps_stool_16S ## ----dataloading_plaque------------------------------------------------------- data("ps_plaque_16S") ps_plaque_16S ## ----figGOF, eval=TRUE, echo=FALSE, message=FALSE, error=FALSE, warning=FALSE, fig.cap="Goodness of Fit diagram."---- knitr::include_graphics("./GOF_structure.svg") ## ----fitHURDLE_alone---------------------------------------------------------- example_HURDLE <- fitHURDLE( object = ps_stool_16S, scale = "median" ) head(example_HURDLE) ## ----observe_hurdle----------------------------------------------------------- observed_hurdle <- prepareObserved( object = ps_stool_16S, scale = "median") head(observed_hurdle) ## ----prepareObserved_normal--------------------------------------------------- head(prepareObserved(object = ps_stool_16S)) ## ----meanDifferences_hurdle--------------------------------------------------- head(meanDifferences( estimated = example_HURDLE, observed = observed_hurdle )) ## ----fitting------------------------------------------------------------------ GOF_stool_16S <- fitModels( object = ps_stool_16S, models = c("NB", "ZINB", "DM", "ZIG", "HURDLE"), scale_HURDLE = c("median", "default"), verbose = FALSE # TRUE is always suggested ) ## ----RMSE_MD------------------------------------------------------------------ plotRMSE(GOF_stool_16S, difference = "MD", plotIt = FALSE) ## ----RMSE_ZPD----------------------------------------------------------------- plotRMSE(GOF_stool_16S, difference = "ZPD", plotIt = FALSE) ## ----plotGOFMD, fig.width=15, fig.height=4, fig.cap="MD plot. Mean-difference (MD) between the estimated and observed count values for each distribution."---- plotMD( data = GOF_stool_16S, difference = "MD", split = TRUE ) ## ----NA_values, eval=FALSE---------------------------------------------------- # plyr::ldply(GOF_stool_16S, function(model) # c("Number of NAs" = sum(is.na(model))), # .id = "Distribution") ## ----plotGOFMDnoHurdleDefault, warning=FALSE, fig.width=15, fig.height=4, fig.cap="MD plot reduced. Mean-difference (MD) between the estimated and observed count values for the first 5 distributions."---- plotMD( data = GOF_stool_16S[1:5], difference = "MD", split = TRUE ) ## ----plotGOFZPD, fig.width=15, fig.height=4, fig.cap="ZPD plot. Mean-difference between the estimated probability to observe a zero and the observed proportion of zero values (ZPD) for the first 5 distributions."---- plotMD( data = GOF_stool_16S[1:5], difference = "ZPD", split = TRUE ) ## ----plotGOFMDcollapsed, warning=FALSE, fig.width=10, fig.height=5, fig.cap="MD and ZPD plots. MD and ZPD plotted together for the first 5 distributions."---- plot_grid(plotMD(data = GOF_stool_16S[1:5], difference = "MD", split = FALSE), plotMD(data = GOF_stool_16S[1:5], difference = "ZPD", split = FALSE), ncol = 2 ) ## ----plotGOFRMSE, fig.width=10,fig.height=7, fig.cap="RMSE plot. Root Mean Squared Errors (RMSE) for both the MD and ZPD values for all the distributions."---- plot_grid(plotRMSE(GOF_stool_16S, difference = "MD"), plotRMSE(GOF_stool_16S, difference = "ZPD"), ncol = 2 ) ## ----availableMethodsTable, echo=FALSE---------------------------------------- available_methods <- read.csv(file = "./benchdamic_methods.csv", sep = ";") kable(x = available_methods, caption = "DA methods available in benchdamic.", col.names = c("Method (package)", "Short description", "Test", "Normalization / Transformation", "Suggested input", "Output", "Application"), booktabs = TRUE) %>% kable_styling(latex_options = "scale_down") %>% row_spec(0, bold = TRUE, color = "black") %>% column_spec(c(1,5,6), width = "3cm", color = "black") %>% column_spec(2:4, width = "6cm", color = "black") %>% landscape() ## ----customExample, eval=FALSE------------------------------------------------ # DA_yourMethod <- function( # object, # assay_name = "counts", # param1, # param2, # verbose = TRUE) # { # if(verbose) # message("Reading data") # # Extract the data from phyloseq or TreeSummarizedExperiment # counts_metadata <- get_counts_metadata( # object = object, # assay_name = assay_name) # counts <- counts_metadata[[1]] # First position = counts # metadata <- counts_metadata[[2]] # Second position = metadata # # ### your method's code # # Many things here # if(verbose) # message("I'm doing this step.") # # Many other things here # ### end of your method's code # # if(verbose) # message("Extracting important statistics") # # contains the p-values # vector_of_pval <- NA # # contains the adjusted p-values # vector_of_adjusted_pval <- NA # # contains the OTU, or ASV, or other feature names. # # Usually extracted from the rownames of the count data # name_of_your_features <- NA # # contains the logFCs # vector_of_logFC <- NA # # contains other statistics # vector_of_statistics <- NA # # if(verbose) # message("Preparing the output") # pValMat <- data.frame("rawP" = vector_of_pval, # "adjP" = vector_of_adjusted_pval) # statInfo <- data.frame("logFC" = vector_of_logFC, # "statistics" = vector_of_statistics) # name <- "write.here.the.name" # # Be sure that the algorithm hasn't changed the order of the features. If it # # happens, re-establish the original order. # rownames(pValMat) <- rownames(statInfo) <- name_of_your_features # # # Return the output as a list # return(list("pValMat" = pValMat, "statInfo" = statInfo, "name" = name)) # } # END - function: DA_yourMethod ## ----customExampleInstances, eval=FALSE--------------------------------------- # my_custom_method <- list( # customMethod.1 = list( # First instance # method = "DA_yourMethod", # The name of the function to call # assay_name = "counts", # param1 = "A", # Its combination of parameters # param2 = "B"), # No need of verbose and object parameters # customMethod.2 = list( # Second instance # method = "DA_yourMethod", # assay_name = "counts", # param1 = "C", # param2 = "D") # # Other istances # ) ## ----figTIEC, eval=TRUE, echo=FALSE, message=FALSE, error=FALSE, warning=FALSE, fig.cap="Type I Error Control diagram."---- knitr::include_graphics("./TIEC_structure.svg") ## ----createMocks-------------------------------------------------------------- set.seed(123) my_mocks <- createMocks( nsamples = phyloseq::nsamples(ps_stool_16S), N = 3 ) # At least N = 1000 is suggested ## ----normalizationManual, eval=FALSE------------------------------------------ # ps_stool_16S <- norm_edgeR( # object = ps_stool_16S, # method = "TMM" # ) # ps_stool_16S <- norm_DESeq2( # object = ps_stool_16S, # method = "poscounts" # ) # ps_stool_16S <- norm_CSS( # object = ps_stool_16S, # method = "CSS" # ) ## ----setNormalization--------------------------------------------------------- my_normalizations <- setNormalizations( fun = c("norm_edgeR", "norm_DESeq2", "norm_CSS"), method = c("TMM", "poscounts", "CSS")) ps_stool_16S <- runNormalizations(normalization_list = my_normalizations, object = ps_stool_16S, verbose = TRUE) ## ----simple_filterTIEC, message=FALSE----------------------------------------- ps_stool_16S <- phyloseq::filter_taxa( physeq = ps_stool_16S, flist = function(x) sum(x > 0) >= 3, prune = TRUE) ps_stool_16S ## ----weights------------------------------------------------------------------ zinbweights <- weights_ZINB( object = ps_stool_16S, K = 0, design = "~ 1", ) ## ----set_Methods-------------------------------------------------------------- my_basic <- set_basic(pseudo_count = FALSE, contrast = c("group", "grp2", "grp1"), test = c("t", "wilcox"), paired = FALSE, expand = TRUE) my_edgeR <- set_edgeR( pseudo_count = FALSE, group_name = "group", design = ~ group, robust = FALSE, coef = 2, norm = "TMM", weights_logical = c(TRUE, FALSE), expand = TRUE) my_DESeq2 <- set_DESeq2( pseudo_count = FALSE, design = ~ group, contrast = c("group", "grp2", "grp1"), norm = "poscounts", weights_logical = c(TRUE, FALSE), alpha = 0.05, expand = TRUE) my_limma <- set_limma( pseudo_count = FALSE, design = ~ group, coef = 2, norm = "TMM", weights_logical = c(FALSE, TRUE), expand = TRUE) my_ALDEx2 <- set_ALDEx2( pseudo_count = FALSE, design = "group", mc.samples = 128, test = "wilcox", paired.test = FALSE, denom = c("all", "iqlr"), contrast = c("group", "grp2", "grp1"), expand = TRUE) my_metagenomeSeq <- set_metagenomeSeq( pseudo_count = FALSE, design = "~ group", coef = "groupgrp2", norm = "CSS", model = c("fitFeatureModel", "fitZig"), expand = TRUE) # Temporarily removed from CRAN # my_corncob <- set_corncob( # pseudo_count = FALSE, # formula = ~ group, # formula_null = ~ 1, # phi.formula = ~ group, # phi.formula_null = ~ group, # test = c("Wald", "LRT"), # boot = FALSE, # coefficient = "groupgrp2") my_MAST <- set_MAST( pseudo_count = FALSE, rescale = c("default", "median"), design = "~ 1 + group", coefficient = "groupgrp2", expand = TRUE) my_Seurat <- set_Seurat( pseudo_count = FALSE, test = c("t", "wilcox"), contrast = c("group", "grp2", "grp1"), norm = c("LogNormalize", "CLR"), scale.factor = 10^5, expand = TRUE ) my_ANCOM <- set_ANCOM( pseudo_count = FALSE, fix_formula = "group", contrast = c("group", "grp2", "grp1"), BC = TRUE, expand = TRUE ) my_dearseq <- set_dearseq( pseudo_count = FALSE,covariates = NULL, variables2test = "group", preprocessed = FALSE, test = "asymptotic", expand = TRUE) my_linda <- set_linda( formula = "~ group", contrast = c("group", "grp2", "grp1"), is.winsor = TRUE, zero.handling = "pseudo-count", alpha = 0.05, expand = TRUE) my_Maaslin2 <- set_Maaslin2( normalization = "TSS", transform = "LOG", analysis_method = "LM", fixed_effects = "group", contrast = c("group", "grp2", "grp1"), expand = TRUE) my_ZicoSeq <- set_ZicoSeq(contrast = c("group", "grp2", "grp1"), feature.dat.type = "count", is.winsor = TRUE, outlier.pct = 0.03, winsor.end = "top", is.post.sample = TRUE, post.sample.no = 25, perm.no = 99, ref.pct = 0.5, stage.no = 6, excl.pct = 0.2, link.func = list(function(x) sign(x) * (abs(x))^0.5)) my_methods <- c(my_basic, my_edgeR, my_DESeq2, my_limma, my_metagenomeSeq, # my_corncob, my_ALDEx2, my_MAST, my_Seurat, my_ANCOM, my_dearseq, my_linda, my_Maaslin2, my_ZicoSeq) ## ----runMocks----------------------------------------------------------------- bpparam <- BiocParallel::SerialParam() # Random grouping each time Stool_16S_mockDA <- runMocks( mocks = my_mocks, method_list = my_methods, object = ps_stool_16S, weights = zinbweights, verbose = FALSE, BPPARAM = bpparam) ## ----eval=FALSE--------------------------------------------------------------- # ancom_index <- which(grepl(pattern = "ANCOM", # names(my_methods))) # bpparam = BiocParallel::MulticoreParam() # Stool_16S_mockDA <- runMocks( # mocks = my_mocks, # method_list = my_methods[-ancom_index], # object = ps_stool_16S, # weights = zinbweights, # verbose = FALSE, # BPPARAM = bpparam) ## ----eval=FALSE--------------------------------------------------------------- # # Modify the n_cl parameter # my_ANCOM_parallel <- set_ANCOM( # pseudo_count = FALSE, # fix_formula = "group", # contrast = c("group", "grp2", "grp1"), # BC = TRUE, # n_cl = 2, # Set this number according to your machine # expand = TRUE # ) # # bpparam = BiocParallel::SerialParam() # Stool_16S_mockDA_ANCOM <- runMocks( # mocks = my_mocks, # method_list = my_ANCOM_parallel, # Only ANCOM # object = ps_stool_16S, # weights = zinbweights, # verbose = FALSE, # BPPARAM = bpparam) ## ----new_methods-------------------------------------------------------------- my_new_limma <- set_limma( pseudo_count = FALSE, design = ~ group, coef = 2, norm = "CSS", weights_logical = FALSE) ## ----runMocks_new_limma, message=FALSE---------------------------------------- Stool_16S_mockDA_new_limma <- runMocks( mocks = my_mocks, method_list = my_new_limma, object = ps_stool_16S, verbose = FALSE, BPPARAM = bpparam) ## ----merging_old_and_new------------------------------------------------------ Stool_16S_mockDA_merged <- mapply( Stool_16S_mockDA, # List of old results Stool_16S_mockDA_new_limma, # List of new results FUN = function(old, new){ c(old, new) # Concatenate the elements }, SIMPLIFY = FALSE) ## ----createTIEC, warning=FALSE------------------------------------------------ TIEC_summary <- createTIEC(Stool_16S_mockDA) ## ----FPRplot, fig.cap="FPR plot. Boxplots of the proportion of raw p-values lower than the commonly used thresholds for the nominal $\\alpha$ (i.e. the False Positive Rate) for each DA method.", fig.width=8, fig.height=5---- cols <- createColors(variable = levels(TIEC_summary$df_pval$Method)) plotFPR(df_FPR = TIEC_summary$df_FPR, cols = cols) ## ----FDRplot, fig.cap="FDR plot. Average nominal False Discovery Rate values for several commonly used thresholds for each DA method.", fig.width=8, fig.height=5---- plotFDR(df_FDR = TIEC_summary$df_FDR, cols = cols) ## ----QQplot, fig.cap="QQ plot. Quantile-quantile plot from 0 to 0.1 for each DA methods. Average curves are reported", fig.width=8, fig.height=7---- plotQQ(df_QQ = TIEC_summary$df_QQ, zoom = c(0, 0.1), cols = cols) + guides(colour = guide_legend(ncol = 1)) ## ----QQplotsplit, fig.cap="QQ plot. Quantile-quantile plots from 0 to 1 for each DA method are displayed separately. Average curves are reported", fig.height=12, fig.width=12---- plotQQ(df_QQ = TIEC_summary$df_QQ, zoom = c(0, 1), cols = cols, split = TRUE) ## ----KSplot, fig.cap="KS plot. Kolmogorov-Smirnov (KS) statistic boxplots for each DA methods where the raw p-value distribution is compared with a uniform distribution."---- plotKS(df_KS = TIEC_summary$df_KS, cols = cols) ## ----LogPplot, fig.cap="-log10(p-value) plot. Negative logarithm distribution of p-values. Red-shaded vertical bars represent the 90, 95, and 99 percentiles for the negative log distribution of p-values for each method. They should align with the dotted lines which represent the percentiles of the IDEAL distribution.", fig.width=8, fig.height=10---- plotLogP(df_pval = TIEC_summary$df_pval, cols = cols) ## ----AveLogPplot, fig.cap="-log10(average p-value) plot ('average' refers to the average p-value computed for each quantile across mocks comparisons). Negative logarithm distribution of average p-values. Red-shaded vertical bars represent the 90, 95, and 99 percentiles for the negative log distribution of average p-values for each method. They should align with the dotted lines which represent the percentiles of the IDEAL distribution.", fig.width=8, fig.height=10---- plotLogP(df_QQ = TIEC_summary$df_QQ, cols = cols) ## ----figconcordance, eval=TRUE, echo=FALSE, message=FALSE, error=FALSE, warning=FALSE, fig.cap="Concordance diagram."---- knitr::include_graphics("./concordance_structure.svg") ## ----createSplits------------------------------------------------------------- set.seed(123) # Make sure that groups and subject IDs are factors sample_data(ps_plaque_16S)$HMP_BODY_SUBSITE <- factor(sample_data(ps_plaque_16S)$HMP_BODY_SUBSITE) sample_data(ps_plaque_16S)$RSID <- factor(sample_data(ps_plaque_16S)$RSID) my_splits <- createSplits( object = ps_plaque_16S, varName = "HMP_BODY_SUBSITE", paired = "RSID", balanced = TRUE, N = 2 ) # At least 100 is suggested ## ----set_Methods_noweights---------------------------------------------------- my_edgeR_noWeights <- set_edgeR( group_name = "HMP_BODY_SUBSITE", design = ~ 1 + RSID + HMP_BODY_SUBSITE, coef = "HMP_BODY_SUBSITESupragingival Plaque", norm = "TMM") my_DESeq2_noWeights <- set_DESeq2( contrast = c("HMP_BODY_SUBSITE", "Supragingival Plaque", "Subgingival Plaque"), design = ~ 1 + RSID + HMP_BODY_SUBSITE, norm = "poscounts") my_limma_noWeights <- set_limma( design = ~ 1 + RSID + HMP_BODY_SUBSITE, coef = "HMP_BODY_SUBSITESupragingival Plaque", norm = "TMM") my_ALDEx2 <- set_ALDEx2( pseudo_count = FALSE, design = "HMP_BODY_SUBSITE", mc.samples = 128, test = "wilcox", paired.test = TRUE, denom = "all", contrast = c("HMP_BODY_SUBSITE", "Supragingival Plaque", "Subgingival Plaque")) my_MAST <- set_MAST( pseudo_count = FALSE, rescale = "median", design = "~ 1 + RSID + HMP_BODY_SUBSITE", coefficient = "HMP_BODY_SUBSITESupragingival Plaque") my_dearseq <- set_dearseq( pseudo_count = FALSE, covariates = NULL, variables2test = "HMP_BODY_SUBSITE", sample_group = "RSID", test = "asymptotic", preprocessed = FALSE) # Temporarily removed (v2.3.2 crashes R) # # Very time consuming # my_ANCOM <- set_ANCOM( # pseudo_count = FALSE, # adj_formula = NULL, # rand_formula = "(1|RSID)", # lme_control = lme4::lmerControl(), # contrast = c("HMP_BODY_SUBSITE", # "Supragingival Plaque", "Subgingival Plaque"), # BC = FALSE) my_linda <- set_linda( formula = "~ HMP_BODY_SUBSITE + (1|RSID)", contrast = c("HMP_BODY_SUBSITE", "Supragingival Plaque", "Subgingival Plaque"), is.winsor = TRUE, zero.handling = "pseudo-count", alpha = 0.05) my_Maaslin2 <- set_Maaslin2( normalization = "TSS", transform = "LOG", analysis_method = "LM", fixed_effects = "HMP_BODY_SUBSITE", random_effects = "RSID", contrast = c("HMP_BODY_SUBSITE", "Supragingival Plaque", "Subgingival Plaque")) # Temporarily removed due to issues with ID_variables # my_mixMC <- set_mixMC( # pseudo_count = 1, # ID_variable = "RSID", # contrast = c("HMP_BODY_SUBSITE", # "Supragingival Plaque", "Subgingival Plaque")) my_ZicoSeq <- set_ZicoSeq( contrast = c("HMP_BODY_SUBSITE", "Supragingival Plaque", "Subgingival Plaque"), strata = "RSID", feature.dat.type = "count", is.winsor = TRUE, outlier.pct = 0.03, winsor.end = "top", is.post.sample = TRUE, post.sample.no = 25, perm.no = 99, ref.pct = 0.5, stage.no = 6, excl.pct = 0.2, link.func = list(function(x) sign(x) * (abs(x))^0.5)) my_methods_noWeights <- c( my_edgeR_noWeights, my_DESeq2_noWeights, my_limma_noWeights, my_ALDEx2, my_MAST, my_dearseq, # my_ANCOM, # Temporary bug with random effects my_linda, my_Maaslin2, # my_mixMC, # Temporary bug with multilevel my_ZicoSeq) ## ----info_normalizations------------------------------------------------------ str(my_normalizations) ## ----runSplits---------------------------------------------------------------- # Set the parallel framework # Remember that ANCOMBC based methods are compatible only with SerialParam() bpparam <- BiocParallel::SerialParam() # Make sure the subject ID variable is a factor phyloseq::sample_data(ps_plaque_16S)[, "RSID"] <- as.factor( phyloseq::sample_data(ps_plaque_16S)[["RSID"]]) Plaque_16S_splitsDA <- runSplits( split_list = my_splits, method_list = my_methods_noWeights, normalization_list = my_normalizations, object = ps_plaque_16S, min_counts = 0, min_samples = 2, verbose = FALSE, BPPARAM = bpparam) ## ----second_round------------------------------------------------------------- my_basic <- set_basic( pseudo_count = FALSE, contrast = c("HMP_BODY_SUBSITE", "Supragingival Plaque", "Subgingival Plaque"), test = "wilcox", paired = TRUE) Plaque_16S_splitsDA_basic <- runSplits( split_list = my_splits, method_list = my_basic, normalization_list = NULL, object = ps_plaque_16S, min_counts = 0, min_samples = 2, verbose = FALSE) ## ----add_a_second_round------------------------------------------------------- Plaque_16S_splitsDA_all <- mapply( Plaque_16S_splitsDA, # List of old results Plaque_16S_splitsDA_basic, # List of new results FUN = function(subset_old, subset_new){ mapply( subset_old, subset_new, FUN = function(old, new){ return(c(old, new)) }, SIMPLIFY = FALSE) }, SIMPLIFY = FALSE) ## ----createConcordance-------------------------------------------------------- concordance <- createConcordance( object = Plaque_16S_splitsDA_all, slot = "pValMat", colName = "rawP", type = "pvalue" ) head(concordance) ## ----getNames----------------------------------------------------------------- names(Plaque_16S_splitsDA_all$Subset1$Comparison1) ## ----logFC_names-------------------------------------------------------------- cat("edgeR.TMM", "\n") names(Plaque_16S_splitsDA_all$Subset1$Comparison1$edgeR.TMM$statInfo) cat("DESeq2.poscounts", "\n") names(Plaque_16S_splitsDA_all$Subset1$Comparison1$DESeq2.poscounts$statInfo) cat("limma.TMM", "\n") names(Plaque_16S_splitsDA_all$Subset1$Comparison1$limma.TMM$statInfo) cat("ALDEx2.all.wilcox.paired", "\n") names(Plaque_16S_splitsDA_all$Subset1$Comparison1$ALDEx2.all.wilcox.paired$ statInfo) cat("MAST.median", "\n") names(Plaque_16S_splitsDA_all$Subset1$Comparison1$MAST.median$statInfo) cat("dearseq.repeated.asymptotic", "\n") names(Plaque_16S_splitsDA_all$Subset1$Comparison1$dearseq.repeated.asymptotic$ statInfo) # cat("ANCOM", "\n") # names(Plaque_16S_splitsDA_all$Subset1$Comparison1$ANCOM$statInfo) cat("linda", "\n") names(Plaque_16S_splitsDA_all$Subset1$Comparison1$linda.win0.03.pc0.5$statInfo) cat("Maaslin2", "\n") names(Plaque_16S_splitsDA_all$Subset1$Comparison1$Maaslin2.TSSnorm.LOGtrans.LM$statInfo) # cat("mixMC", "\n") # names(Plaque_16S_splitsDA_all$Subset1$Comparison1$mixMC.pc1$statInfo) cat("ZicoSeq", "\n") names(Plaque_16S_splitsDA_all$Subset1$Comparison1$ZicoSeq.winsor0.03top.ref0.5.excl0.2$statInfo) cat("basic.wilcox.paired", "\n") names(Plaque_16S_splitsDA_all$Subset1$Comparison1$basic.wilcox.paired$statInfo) ## ----alternativeConcordance, eval=FALSE--------------------------------------- # concordance_alternative <- createConcordance( # object = Plaque_16S_splitsDA_all, # slot = "statInfo", # colName = c("logFC", "log2FoldChange", "logFC", # "effect", "logFC", "rawP", # # "W", # ANCOM # "log2FoldChange", # "coef", # "importance", # mixMC # "effect", "logFC"), # type = c("logfc", "logfc", "logfc", "logfc", # "logfc", "pvalue", # # "logfc", # ANCOM # "logfc", "logfc", # #"logfc", # mixMC # "logfc", "logfc") # ) ## ----plotConcordance, fig.cap="BMC and WMC plot. Between-method concordance (BMC) and within-method concordance (WMC) (main diagonal) averaged values from rank 1 to 30.", fig.width=10, fig.height=8---- pC <- plotConcordance(concordance = concordance, threshold = 30) cowplot::plot_grid(plotlist = pC, ncol = 2, align = "h", axis = "tb", rel_widths = c(1, 3)) ## ----figenrichment, eval=TRUE, echo=FALSE, message=FALSE, error=FALSE, warning=FALSE, fig.cap="Enrichment analysis diagram."---- knitr::include_graphics("./enrichment_structure.svg") ## ----priorKnowledge----------------------------------------------------------- data("microbial_metabolism") head(microbial_metabolism) ## ----exampleOfIntegration----------------------------------------------------- # Extract genera from the phyloseq tax_table slot genera <- tax_table(ps_plaque_16S)[, "GENUS"] # Genera as rownames of microbial_metabolism data.frame rownames(microbial_metabolism) <- microbial_metabolism$Genus # Match OTUs to their metabolism priorInfo <- data.frame(genera, "Type" = microbial_metabolism[genera, "Type"]) unknown_metabolism <- is.na(priorInfo$Type) priorInfo[unknown_metabolism, "Type"] <- "Unknown" # Relabel 'F Anaerobic' to 'F_Anaerobic' to remove space priorInfo$Type <- factor(priorInfo$Type, levels = c("Aerobic","Anaerobic","F Anaerobic","Unknown"), labels = c("Aerobic","Anaerobic","F_Anaerobic","Unknown")) # Add a more informative names column priorInfo[, "newNames"] <- paste0(rownames(priorInfo), "|", priorInfo[, "GENUS"]) ## ----normalize_plaque--------------------------------------------------------- ps_plaque_16S <- runNormalizations(my_normalizations, object = ps_plaque_16S) ## ----filter_plauque----------------------------------------------------------- ps_plaque_16S <- phyloseq::filter_taxa(physeq = ps_plaque_16S, flist = function(x) sum(x > 0) >= 3, prune = TRUE) ps_plaque_16S ## ----plaque_weights----------------------------------------------------------- plaque_weights <- weights_ZINB(object = ps_plaque_16S, design = ~ 1, zeroinflation = TRUE) ## ----enrichment_methods------------------------------------------------------- my_edgeR <- set_edgeR( group_name = "HMP_BODY_SUBSITE", design = ~ 1 + RSID + HMP_BODY_SUBSITE, coef = "HMP_BODY_SUBSITESupragingival Plaque", norm = "TMM", weights_logical = TRUE) my_DESeq2 <- set_DESeq2( contrast = c("HMP_BODY_SUBSITE", "Supragingival Plaque", "Subgingival Plaque"), design = ~ 0 + RSID + HMP_BODY_SUBSITE, norm = "poscounts", weights_logical = TRUE) my_limma <- set_limma( design = ~ 1 + RSID + HMP_BODY_SUBSITE, coef = "HMP_BODY_SUBSITESupragingival Plaque", norm = "TMM", weights_logical = TRUE) my_methods <- c(my_methods_noWeights, my_edgeR, my_DESeq2, my_limma) ## ----runDA_enrichment, message=FALSE------------------------------------------ Plaque_16S_DA <- runDA(method_list = my_methods, object = ps_plaque_16S, weights = plaque_weights, verbose = FALSE) ## ----info_DA------------------------------------------------------------------ names(Plaque_16S_DA) ## ----createEnrichment--------------------------------------------------------- enrichment <- createEnrichment( object = Plaque_16S_DA[-c(6)], priorKnowledge = priorInfo, enrichmentCol = "Type", namesCol = "newNames", slot = "pValMat", colName = "adjP", type = "pvalue", direction = c( "logFC", # edgeR "log2FoldChange", # DEseq2 "logFC", # limma "effect", # ALDEx2 "logFC", # MAST "log2FoldChange", # linda "coef", # Maaslin2 # "importance", # mixMC "effect", # ZicoSeq "logFC", # edgeR with weights "log2FoldChange", # DESeq2 with weights "logFC"), # limma with weights threshold_pvalue = 0.1, threshold_logfc = 0, top = NULL, alternative = "greater", verbose = TRUE ) ## ----plotContingency, fig.cap="Contingency tables plot. Contingency tables for Aerobic and Anaerobic taxa found as differentially abundant by DESeq2.poscounts DA method. Fisher exact test has been performed on each contingency table. If the enrichment is signficantly present, the correspondent cell will be highlighted.", fig.height=4, fig.width=6---- plotContingency(enrichment = enrichment, levels_to_plot = c("Aerobic", "Anaerobic"), method = "DESeq2.poscounts") ## ----plotEnrichment, fig.width=5, fig.height=6, fig.cap="Enrichment plot. Number of differentially abundant features, colored by aerobic or anaerobic metabolism, and directed according to differential abundance direction (UP or DOWN abundant)."---- plotEnrichment(enrichment = enrichment, enrichmentCol = "Type", levels_to_plot = c("Aerobic", "Anaerobic")) ## ----plotMutualFindings, fig.width=6, fig.height=6, fig.cap="Mutual Findings plot. Number of differentially abundant features mutually found by 1 or more methods, colored by the differential abundance direction and separated by aerobic and anaerobic metabolism."---- plotMutualFindings(enrichment, enrichmentCol = "Type", levels_to_plot = c("Aerobic", "Anaerobic"), n_methods = 1) ## ----createPositives---------------------------------------------------------- positives <- createPositives( object = Plaque_16S_DA[-c(6)], priorKnowledge = priorInfo, enrichmentCol = "Type", namesCol = "newNames", slot = "pValMat", colName = "rawP", type = "pvalue", direction = c( "logFC", # edgeR "log2FoldChange", # DEseq2 "logFC", # limma "effect", # ALDEx2 "logFC", # MAST "log2FoldChange", # linda "coef", # Maaslin2 # "importance", # mixMC "effect", # ZicoSeq "logFC", # edgeR with weights "log2FoldChange", # DESeq2 with weights "logFC"), # limma with weights threshold_pvalue = 0.1, threshold_logfc = 0, top = seq.int(from = 0, to = 30, by = 3), alternative = "greater", verbose = FALSE, TP = list(c("DOWN Abundant", "Anaerobic"), c("UP Abundant", "Aerobic")), FP = list(c("DOWN Abundant", "Aerobic"), c("UP Abundant", "Anaerobic")) ) head(positives) ## ----plotPositives, fig.cap="TP, FP differences plot. Differences between the number of True Positives and False Positives for several thresholds of the top ranked raw p-values (the top 3 lowest p-values, the top 6, 9, ..., 30) for each method.", fig.height=5, fig.width=7---- plotPositives(positives) ## ----createEnrichment_nodir--------------------------------------------------- enrichment_nodir <- createEnrichment( object = Plaque_16S_DA, priorKnowledge = priorInfo, enrichmentCol = "Type", namesCol = "newNames", slot = "pValMat", colName = "adjP", type = "pvalue", threshold_pvalue = c( 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, # thresholds for methods # 0.4, # ANCOM threshold on 1-W/(ntaxa-1) 0.4 = liberal 0.1, 0.1, 0.1, 0.1, 0.1, 0.1), # other methods threshold_logfc = 0, top = NULL, alternative = "greater", verbose = FALSE ) ## ----plotEnrichmentnodir, fig.width=6, fig.height=5, fig.cap="Enrichment plot. Number of differentially abundant features, colored by metabolism."---- plotEnrichment(enrichment = enrichment_nodir, enrichmentCol = "Type") ## ----plotMutualFindingsnodir, fig.width=5, fig.height=6, fig.cap="Mutual Findings plot. Number of differentially abundant features mutually found by 1 or more methods, separated by aerobic and anaerobic metabolism."---- plotMutualFindings(enrichment_nodir, enrichmentCol = "Type", levels_to_plot = c("Aerobic", "Anaerobic"), n_methods = 1) ## ----simulate_plaques--------------------------------------------------------- data("ps_plaque_16S") counts_and_metadata <- get_counts_metadata(ps_plaque_16S) plaque_counts <- counts_and_metadata[["counts"]] plaque_metadata <- counts_and_metadata[["metadata"]] set.seed(123) sim_list <- SPsimSeq( n.sim = 1, s.data = plaque_counts, group = plaque_metadata[, "HMP_BODY_SUBSITE"], n.genes = 100, batch.config = 1, group.config = c(0.5, 0.5), tot.samples = 50, pDE = 0.2, lfc.thrld = 0.5, model.zero.prob = FALSE, result.format = "list") ## ----simulated_to_TSE--------------------------------------------------------- sim_obj <- TreeSummarizedExperiment::TreeSummarizedExperiment( assays = list("counts" = sim_list[[1]][["counts"]]), rowData = sim_list[[1]]["rowData"], colData = sim_list[[1]]["colData"], ) # Group as factor SummarizedExperiment::colData(sim_obj)[, "colData.Group"] <- as.factor( SummarizedExperiment::colData(sim_obj)[, "colData.Group"]) ## ----simulated_priorInfo------------------------------------------------------ priorInfo <- sim_list[[1]][["rowData"]] priorInfo$Reality <- ifelse(priorInfo[, "DE.ind"], "is DA", "is not DA") ## ----normalization_simulated_data--------------------------------------------- sim_obj <- runNormalizations( normalization_list = my_normalizations, object = sim_obj, verbose = TRUE) ## ----simulated_filter--------------------------------------------------------- taxa_to_keep <- apply(assays(sim_obj)[["counts"]], 1, function(x) sum(x > 0) >= 3 & sd(x) > 1) sim_obj <- sim_obj[taxa_to_keep, ] priorInfo <- priorInfo[taxa_to_keep, ] ## ----simulated_weights-------------------------------------------------------- sim_weights <- weights_ZINB( object = sim_obj, design = ~ 1, zeroinflation = TRUE) ## ----simulated_set------------------------------------------------------------ my_basic <- set_basic(pseudo_count = FALSE, contrast = c("colData.Group", "Supragingival Plaque", "Subgingival Plaque"), test = c("t", "wilcox"), paired = FALSE, expand = TRUE) my_edgeR <- set_edgeR( pseudo_count = FALSE, group_name = "colData.Group", design = ~ colData.Group, robust = FALSE, coef = 2, norm = "TMM", weights_logical = c(TRUE, FALSE), expand = TRUE) my_DESeq2 <- set_DESeq2( pseudo_count = FALSE, design = ~ colData.Group, contrast = c("colData.Group", "Supragingival Plaque", "Subgingival Plaque"), norm = "poscounts", weights_logical = c(TRUE, FALSE), alpha = 0.1, expand = TRUE) my_limma <- set_limma( pseudo_count = FALSE, design = ~ colData.Group, coef = 2, norm = "TMM", weights_logical = c(FALSE, TRUE), expand = TRUE) my_ALDEx2 <- set_ALDEx2( pseudo_count = FALSE, design = "colData.Group", mc.samples = 128, test = "wilcox", paired.test = FALSE, denom = c("all", "iqlr"), contrast = c("colData.Group", "Supragingival Plaque", "Subgingival Plaque"), expand = TRUE) my_metagenomeSeq <- set_metagenomeSeq( pseudo_count = FALSE, design = "~ colData.Group", coef = "colData.GroupSupragingival Plaque", norm = "CSS", model = "fitFeatureModel", expand = TRUE) # Temporarily removed from CRAN # my_corncob <- set_corncob( # pseudo_count = FALSE, # formula = ~ colData.Group, # formula_null = ~ 1, # phi.formula = ~ colData.Group, # phi.formula_null = ~ colData.Group, # test = c("Wald", "LRT"), # boot = FALSE, # coefficient = "colData.GroupSupragingival Plaque") my_MAST <- set_MAST( pseudo_count = FALSE, rescale = c("default", "median"), design = "~ 1 + colData.Group", coefficient = "colData.GroupSupragingival Plaque", expand = TRUE) my_Seurat <- set_Seurat( pseudo_count = FALSE, test = c("t", "wilcox"), contrast = c("colData.Group", "Supragingival Plaque", "Subgingival Plaque"), norm = c("LogNormalize", "CLR"), scale.factor = 10^5, expand = TRUE ) my_ANCOM <- set_ANCOM( pseudo_count = FALSE, fix_formula = "colData.Group", contrast = c("colData.Group", "Supragingival Plaque", "Subgingival Plaque"), BC = c(TRUE, FALSE), alpha = 0.1, expand = TRUE ) my_dearseq <- set_dearseq( pseudo_count = FALSE, covariates = NULL, variables2test = "colData.Group", preprocessed = FALSE, test = c("permutation", "asymptotic"), expand = TRUE) my_NOISeq <- set_NOISeq( pseudo_count = FALSE, contrast = c("colData.Group", "Supragingival Plaque", "Subgingival Plaque"), norm = c("rpkm", "tmm"), expand = TRUE) my_linda <- set_linda( formula = "~ colData.Group", contrast = c("colData.Group", "Supragingival Plaque", "Subgingival Plaque"), is.winsor = TRUE, zero.handling = "pseudo-count", alpha = 0.1, expand = TRUE) my_Maaslin2 <- set_Maaslin2( normalization = "TSS", transform = "LOG", analysis_method = "LM", fixed_effects = "colData.Group", contrast = c("colData.Group", "Supragingival Plaque", "Subgingival Plaque"), expand = TRUE) my_mixMC <- set_mixMC( pseudo_count = 1, contrast = c("colData.Group", "Supragingival Plaque", "Subgingival Plaque"), expand = TRUE ) my_ZicoSeq <- set_ZicoSeq( contrast = c("colData.Group", "Supragingival Plaque", "Subgingival Plaque"), feature.dat.type = "count", strata = NULL, is.winsor = TRUE, outlier.pct = 0.03, winsor.end = "top", is.post.sample = TRUE, post.sample.no = 25, perm.no = 99, ref.pct = 0.5, stage.no = 6, excl.pct = 0.2, link.func = list(function(x) sign(x) * (abs(x))^0.5)) my_methods <- c(my_basic, my_edgeR, my_DESeq2, my_limma, my_metagenomeSeq, #my_corncob, my_ALDEx2, my_MAST, my_Seurat, my_ANCOM, my_dearseq, my_NOISeq, my_linda, my_Maaslin2, my_mixMC, my_ZicoSeq) ## ----simulated_run------------------------------------------------------------ sim_DA <- runDA( method_list = my_methods, object = sim_obj, weights = sim_weights, verbose = FALSE) ## ----createEnrichment_nodir_sim----------------------------------------------- enrichment_nodir <- createEnrichment( object = sim_DA, priorKnowledge = priorInfo, enrichmentCol = "Reality", namesCol = NULL, slot = "pValMat", colName = "adjP", type = "pvalue", threshold_pvalue = c( rep(0.1,17), # adjP thresholds 0.4, # adjP threshold for ANCOM on 1-W/(ntaxa-1) rep(0.1,7), # adjP thresholds for other methods 0.9, # adjP threshold for mixMC 1 - stability 0.1), # ZicoSeq threshold_logfc = 0, top = NULL, alternative = "greater", verbose = FALSE ) ## ----plotEnrichmentnodirsim, fig.width=6, fig.height=6, fig.cap="Enrichment plot for simulated data. Number of differentially abundant features, colored by DA reality."---- plotEnrichment(enrichment = enrichment_nodir, enrichmentCol = "Reality") ## ----createPositives_sim------------------------------------------------------ positives_nodir <- createPositives( object = sim_DA, priorKnowledge = priorInfo, enrichmentCol = "Reality", namesCol = NULL, slot = "pValMat", colName = "adjP", type = "pvalue", threshold_pvalue = c( rep(0.1,19), # adjP thresholds 0.4, # adjP threshold for ANCOM on 1-W/(ntaxa-1) rep(0.1,7), # adjP thresholds for other methods 0.9, # adjP threshold for mixMC 1 - stability 0.1), # ZicoSeq threshold_logfc = 0, top = seq(5, 30, by = 5), alternative = "greater", verbose = FALSE, TP = list(c("DA", "is DA")), FP = list(c("DA", "is not DA")) ) ## ----plotPositivessim, fig.cap="TP, FP differences plot. Differences between the number of True Positives and False Positives for several thresholds of the top ranked adjusted p-values lower than 0.1 (the top 5 lowest adjusted p-values, the top 10, 15, ..., 30) for each method in simulated data. Red dotted line represents the total number of DA simulated features.", fig.width=10, fig.height=10---- plotPositives(positives = positives_nodir) + facet_wrap( ~ method) + theme(legend.position = "none") + geom_hline(aes(yintercept = 20), linetype = "dotted", color = "red") + geom_hline(aes(yintercept = 0), color = "black") + ylim(NA, 21) ## ----sessionInfo-------------------------------------------------------------- sessionInfo()