## ----options, include=FALSE, echo=FALSE---------------------------------------
knitr::opts_chunk$set(
    warning = FALSE, 
    error = FALSE, 
    message = FALSE)

## ----eval=FALSE---------------------------------------------------------------
# if (!require("BiocManager", quietly = TRUE))
#     install.packages("BiocManager")
# 
# BiocManager::install("benchdamic")

## ----eval=FALSE---------------------------------------------------------------
# if (!require("devtools", quietly = TRUE))
#     install.packages("devtools")
# 
# devtools::install_github("mcalgaro93/benchdamic")

## ----load_packs---------------------------------------------------------------
library(benchdamic)
# Parallel computation
library(BiocParallel)
# Generate simulated data
library(SPsimSeq)
# Data management
library(phyloseq)
library(SummarizedExperiment)
library(plyr)
# Graphics and tables
library(ggplot2)
library(cowplot)
library(kableExtra)

## ----dataloading_stool--------------------------------------------------------
data("ps_stool_16S")
ps_stool_16S

## ----dataloading_plaque-------------------------------------------------------
data("ps_plaque_16S")
ps_plaque_16S

## ----figGOF, eval=TRUE, echo=FALSE, message=FALSE, error=FALSE, warning=FALSE, fig.cap="Goodness of Fit diagram."----
knitr::include_graphics("./GOF_structure.svg")

## ----fitHURDLE_alone----------------------------------------------------------
example_HURDLE <- fitHURDLE(
    object = ps_stool_16S,
    scale = "median"
)
head(example_HURDLE)

## ----observe_hurdle-----------------------------------------------------------
observed_hurdle <- prepareObserved(
    object = ps_stool_16S, 
    scale = "median")
head(observed_hurdle)

## ----prepareObserved_normal---------------------------------------------------
head(prepareObserved(object = ps_stool_16S))

## ----meanDifferences_hurdle---------------------------------------------------
head(meanDifferences(
    estimated = example_HURDLE,
    observed = observed_hurdle
))

## ----fitting------------------------------------------------------------------
GOF_stool_16S <- fitModels(
    object = ps_stool_16S,
    models = c("NB", "ZINB", "DM", "ZIG", "HURDLE"),
    scale_HURDLE = c("median", "default"),
    verbose = FALSE # TRUE is always suggested
)

## ----RMSE_MD------------------------------------------------------------------
plotRMSE(GOF_stool_16S, difference = "MD", plotIt = FALSE)

## ----RMSE_ZPD-----------------------------------------------------------------
plotRMSE(GOF_stool_16S, difference = "ZPD", plotIt = FALSE)

## ----plotGOFMD, fig.width=15, fig.height=4, fig.cap="MD plot. Mean-difference (MD) between the estimated and observed count values for each distribution."----
plotMD(
    data = GOF_stool_16S,
    difference = "MD",
    split = TRUE
)

## ----NA_values, eval=FALSE----------------------------------------------------
# plyr::ldply(GOF_stool_16S, function(model)
#     c("Number of NAs" = sum(is.na(model))),
#     .id = "Distribution")

## ----plotGOFMDnoHurdleDefault, warning=FALSE, fig.width=15, fig.height=4, fig.cap="MD plot reduced. Mean-difference (MD) between the estimated and observed count values for the first 5 distributions."----
plotMD(
    data = GOF_stool_16S[1:5],
    difference = "MD",
    split = TRUE
)

## ----plotGOFZPD, fig.width=15, fig.height=4, fig.cap="ZPD plot. Mean-difference between the estimated probability to observe a zero and the observed proportion of zero values (ZPD) for the first 5 distributions."----
plotMD(
    data = GOF_stool_16S[1:5],
    difference = "ZPD",
    split = TRUE
)

## ----plotGOFMDcollapsed, warning=FALSE, fig.width=10, fig.height=5, fig.cap="MD and ZPD plots. MD and ZPD plotted together for the first 5 distributions."----
plot_grid(plotMD(data = GOF_stool_16S[1:5], difference = "MD", split = FALSE),
    plotMD(data = GOF_stool_16S[1:5], difference = "ZPD", split = FALSE),
    ncol = 2
)

## ----plotGOFRMSE, fig.width=10,fig.height=7, fig.cap="RMSE plot. Root Mean Squared Errors (RMSE) for both the MD and ZPD values for all the distributions."----
plot_grid(plotRMSE(GOF_stool_16S, difference = "MD"),
    plotRMSE(GOF_stool_16S, difference = "ZPD"),
    ncol = 2
)

## ----availableMethodsTable, echo=FALSE----------------------------------------
available_methods <- read.csv(file = "./benchdamic_methods.csv", sep = ";")
kable(x = available_methods, 
    caption = "DA methods available in benchdamic.",
    col.names = c("Method (package)", "Short description", "Test",
        "Normalization / Transformation", "Suggested input", 
        "Output", "Application"), booktabs = TRUE) %>%
    kable_styling(latex_options = "scale_down") %>%
    row_spec(0, bold = TRUE, color = "black") %>%
    column_spec(c(1,5,6), width = "3cm", color = "black") %>% 
    column_spec(2:4, width = "6cm", color = "black") %>%
    landscape()

## ----customExample, eval=FALSE------------------------------------------------
# DA_yourMethod <- function(
#         object,
#         assay_name = "counts",
#         param1,
#         param2,
#         verbose = TRUE)
# {
#     if(verbose)
#         message("Reading data")
#     # Extract the data from phyloseq or TreeSummarizedExperiment
#     counts_metadata <- get_counts_metadata(
#         object = object,
#         assay_name = assay_name)
#     counts <- counts_metadata[[1]] # First position = counts
#     metadata <- counts_metadata[[2]] # Second position = metadata
# 
#     ### your method's code
#     # Many things here
#     if(verbose)
#         message("I'm doing this step.")
#     # Many other things here
#     ### end of your method's code
# 
#     if(verbose)
#         message("Extracting important statistics")
#     # contains the p-values
#     vector_of_pval <- NA
#     # contains the adjusted p-values
#     vector_of_adjusted_pval <- NA
#     # contains the OTU, or ASV, or other feature names.
#     # Usually extracted from the rownames of the count data
#     name_of_your_features <- NA
#     # contains the logFCs
#     vector_of_logFC <- NA
#     # contains other statistics
#     vector_of_statistics <- NA
# 
#     if(verbose)
#         message("Preparing the output")
#     pValMat <- data.frame("rawP" = vector_of_pval,
#                           "adjP" = vector_of_adjusted_pval)
#     statInfo <- data.frame("logFC" = vector_of_logFC,
#                            "statistics" = vector_of_statistics)
#     name <- "write.here.the.name"
#     # Be sure that the algorithm hasn't changed the order of the features. If it
#     # happens, re-establish the original order.
#     rownames(pValMat) <- rownames(statInfo) <- name_of_your_features
# 
#     # Return the output as a list
#     return(list("pValMat" = pValMat, "statInfo" = statInfo, "name" = name))
# } # END - function: DA_yourMethod

## ----customExampleInstances, eval=FALSE---------------------------------------
# my_custom_method <- list(
#     customMethod.1 = list( # First instance
#         method = "DA_yourMethod", # The name of the function to call
#         assay_name = "counts",
#         param1 = "A", # Its combination of parameters
#         param2 = "B"), # No need of verbose and object parameters
#     customMethod.2 = list( # Second instance
#         method = "DA_yourMethod",
#         assay_name = "counts",
#         param1 = "C",
#         param2 = "D")
#     # Other istances
# )

## ----figTIEC, eval=TRUE, echo=FALSE, message=FALSE, error=FALSE, warning=FALSE, fig.cap="Type I Error Control diagram."----
knitr::include_graphics("./TIEC_structure.svg")

## ----createMocks--------------------------------------------------------------
set.seed(123)
my_mocks <- createMocks(
    nsamples = phyloseq::nsamples(ps_stool_16S),
    N = 3
) # At least N = 1000 is suggested

## ----normalizationManual, eval=FALSE------------------------------------------
# ps_stool_16S <- norm_edgeR(
#     object = ps_stool_16S,
#     method = "TMM"
# )
# ps_stool_16S <- norm_DESeq2(
#     object = ps_stool_16S,
#     method = "poscounts"
# )
# ps_stool_16S <- norm_CSS(
#     object = ps_stool_16S,
#     method = "CSS"
# )

## ----setNormalization---------------------------------------------------------
my_normalizations <- setNormalizations(
    fun = c("norm_edgeR", "norm_DESeq2", "norm_CSS"), 
    method = c("TMM", "poscounts", "CSS"))
ps_stool_16S <- runNormalizations(normalization_list = my_normalizations, 
    object = ps_stool_16S, verbose = TRUE)

## ----simple_filterTIEC, message=FALSE-----------------------------------------
ps_stool_16S <- phyloseq::filter_taxa(
    physeq = ps_stool_16S, 
    flist = function(x) sum(x > 0) >= 3, prune = TRUE)
ps_stool_16S

## ----weights------------------------------------------------------------------
zinbweights <- weights_ZINB(
    object = ps_stool_16S,
    K = 0,
    design = "~ 1",
)

## ----set_Methods--------------------------------------------------------------
my_basic <- set_basic(pseudo_count = FALSE,
    contrast = c("group", "grp2", "grp1"), 
    test = c("t", "wilcox"), 
    paired = FALSE, 
    expand = TRUE)

my_edgeR <- set_edgeR(
    pseudo_count = FALSE,
    group_name = "group",
    design = ~ group,
    robust = FALSE,
    coef = 2,
    norm = "TMM",
    weights_logical = c(TRUE, FALSE),
    expand = TRUE)

my_DESeq2 <- set_DESeq2(
    pseudo_count = FALSE,
    design = ~ group,
    contrast = c("group", "grp2", "grp1"),
    norm = "poscounts",
    weights_logical = c(TRUE, FALSE),
    alpha = 0.05,
    expand = TRUE)

my_limma <- set_limma(
    pseudo_count = FALSE,
    design = ~ group,
    coef = 2,
    norm = "TMM",
    weights_logical = c(FALSE, TRUE),
    expand = TRUE)

my_ALDEx2 <- set_ALDEx2(
    pseudo_count = FALSE,
    design = "group",
    mc.samples = 128,
    test = "wilcox",
    paired.test = FALSE, 
    denom = c("all", "iqlr"), 
    contrast = c("group", "grp2", "grp1"),
    expand = TRUE)

my_metagenomeSeq <- set_metagenomeSeq(
    pseudo_count = FALSE, 
    design = "~ group", 
    coef = "groupgrp2", 
    norm = "CSS", 
    model = c("fitFeatureModel", "fitZig"),
    expand = TRUE)

my_corncob <- set_corncob(
    pseudo_count = FALSE,
    formula = ~ group,
    formula_null = ~ 1,
    phi.formula = ~ group,
    phi.formula_null = ~ group,
    test = c("Wald", "LRT"),
    boot = FALSE,
    coefficient = "groupgrp2")

my_MAST <- set_MAST(
    pseudo_count = FALSE,
    rescale = c("default", "median"),
    design = "~ 1 + group",
    coefficient = "groupgrp2",
    expand = TRUE)

my_Seurat <- set_Seurat(
    pseudo_count = FALSE,
    test = c("t", "wilcox"),
    contrast = c("group", "grp2", "grp1"),
    norm = c("LogNormalize", "CLR"),
    scale.factor = 10^5,
    expand = TRUE
)

my_ANCOM <- set_ANCOM(
    pseudo_count = FALSE,
    fix_formula = "group", 
    contrast = c("group", "grp2", "grp1"),
    BC = TRUE, 
    expand = TRUE   
)

my_dearseq <- set_dearseq(
    pseudo_count = FALSE,covariates = NULL,
    variables2test = "group", 
    preprocessed = FALSE, 
    test = "asymptotic",
    expand = TRUE)

my_linda <- set_linda(
    formula = "~ group", 
    contrast = c("group", "grp2", "grp1"),
    is.winsor = TRUE,
    zero.handling = "pseudo-count",
    alpha = 0.05,
    expand = TRUE)

my_Maaslin2 <- set_Maaslin2(
    normalization = "TSS",
    transform = "LOG",
    analysis_method = "LM",
    fixed_effects = "group",
    contrast = c("group", "grp2", "grp1"),
    expand = TRUE)

my_maaslin3 <- set_maaslin3(
    normalization = "TSS",
    transform = "LOG",
    median_comparison_abundance = TRUE,
    stat_type = "abundance",
    pvalue_type = c("abundance", "joint"),
    formula = ~ group,
    contrast = c("group", "grp2", "grp1"),
    expand = TRUE)

my_ZicoSeq <- set_ZicoSeq(contrast = c("group", "grp2", "grp1"),
    feature.dat.type = "count", 
    is.winsor = TRUE, outlier.pct = 0.03, winsor.end = "top",
    is.post.sample = TRUE, post.sample.no = 25, perm.no = 99,
    ref.pct = 0.5, stage.no = 6, excl.pct = 0.2, 
    link.func = list(function(x) sign(x) * (abs(x))^0.5))

my_methods <- c(my_basic, my_edgeR, my_DESeq2, 
    my_limma, my_metagenomeSeq, my_corncob, 
    my_ALDEx2, my_MAST, my_Seurat, my_ANCOM, my_dearseq,
    my_linda, my_Maaslin2, my_maaslin3, my_ZicoSeq)

## ----runMocks-----------------------------------------------------------------
bpparam <- BiocParallel::SerialParam()
# Random grouping each time
Stool_16S_mockDA <- runMocks(
    mocks = my_mocks, 
    method_list = my_methods, 
    object = ps_stool_16S, 
    weights = zinbweights, 
    verbose = FALSE, 
    BPPARAM = bpparam)

## ----eval=FALSE---------------------------------------------------------------
# # Example of a run without ancom-based methods
# ancom_index <- which(grepl(pattern = "ANCOM",
#     names(my_methods)))
# bpparam = BiocParallel::MulticoreParam()
# Stool_16S_mockDA <- runMocks(
#     mocks = my_mocks,
#     method_list = my_methods[-ancom_index],
#     object = ps_stool_16S,
#     weights = zinbweights,
#     verbose = FALSE,
#     BPPARAM = bpparam)

## ----eval=FALSE---------------------------------------------------------------
# # Modify the n_cl parameter
# my_ANCOM_parallel <- set_ANCOM(
#     pseudo_count = FALSE,
#     fix_formula = "group",
#     contrast = c("group", "grp2", "grp1"),
#     BC = TRUE,
#     n_cl = 2, # Set this number according to your machine
#     expand = TRUE
# )
# 
# bpparam = BiocParallel::SerialParam()
# Stool_16S_mockDA_ANCOM <- runMocks(
#     mocks = my_mocks,
#     method_list = my_ANCOM_parallel, # Only ANCOM
#     object = ps_stool_16S,
#     weights = zinbweights,
#     verbose = FALSE,
#     BPPARAM = bpparam)

## ----new_methods--------------------------------------------------------------
my_new_limma <- set_limma(
    pseudo_count = FALSE,
    design = ~ group,
    coef = 2,
    norm = "CSS",
    weights_logical = FALSE)

## ----runMocks_new_limma, message=FALSE----------------------------------------
Stool_16S_mockDA_new_limma <- runMocks(
    mocks = my_mocks, 
    method_list = my_new_limma, 
    object = ps_stool_16S, 
    verbose = FALSE, BPPARAM = bpparam)

## ----merging_old_and_new------------------------------------------------------
Stool_16S_mockDA_merged <- mapply(
    Stool_16S_mockDA, # List of old results
    Stool_16S_mockDA_new_limma, # List of new results
    FUN = function(old, new){
        c(old, new) # Concatenate the elements
    }, SIMPLIFY = FALSE)

## ----createTIEC, warning=FALSE------------------------------------------------
TIEC_summary <- createTIEC(Stool_16S_mockDA)

## ----FPRplot, fig.cap="FPR plot. Boxplots of the proportion of raw p-values lower than the commonly used thresholds for the nominal $\\alpha$ (i.e. the False Positive Rate) for each DA method.", fig.width=8, fig.height=5----
cols <- createColors(variable = levels(TIEC_summary$df_pval$Method))
plotFPR(df_FPR = TIEC_summary$df_FPR, cols = cols)

## ----FDRplot, fig.cap="FDR plot. Average nominal False Discovery Rate values for several commonly used thresholds for each DA method.", fig.width=8, fig.height=5----
plotFDR(df_FDR = TIEC_summary$df_FDR, cols = cols)

## ----QQplot, fig.cap="QQ plot. Quantile-quantile plot from 0 to 0.1 for each DA methods. Average curves are reported", fig.width=8, fig.height=7----
plotQQ(df_QQ = TIEC_summary$df_QQ, zoom = c(0, 0.1), cols = cols) + 
    guides(colour = guide_legend(ncol = 1))

## ----QQplotsplit, fig.cap="QQ plot. Quantile-quantile plots from 0 to 1 for each DA method are displayed separately. Average curves are reported", fig.height=12, fig.width=12----
plotQQ(df_QQ = TIEC_summary$df_QQ, zoom = c(0, 1), cols = cols, split = TRUE)

## ----KSplot, fig.cap="KS plot. Kolmogorov-Smirnov (KS) statistic boxplots for each DA methods where the raw p-value distribution is compared with a uniform distribution."----
plotKS(df_KS = TIEC_summary$df_KS, cols = cols)

## ----LogPplot, fig.cap="-log10(p-value) plot. Negative logarithm distribution of p-values. Red-shaded vertical bars represent the 90, 95, and 99 percentiles for the negative log distribution of p-values for each method. They should align with the dotted lines which represent the percentiles of the IDEAL distribution.", fig.width=8, fig.height=10----
plotLogP(df_pval = TIEC_summary$df_pval, cols = cols)

## ----AveLogPplot, fig.cap="-log10(average p-value) plot ('average' refers to the average p-value computed for each quantile across mocks comparisons). Negative logarithm distribution of average p-values. Red-shaded vertical bars represent the 90, 95, and 99 percentiles for the negative log distribution of average p-values for each method. They should align with the dotted lines which represent the percentiles of the IDEAL distribution.", fig.width=8, fig.height=10----
plotLogP(df_QQ = TIEC_summary$df_QQ, cols = cols)

## ----figconcordance, eval=TRUE, echo=FALSE, message=FALSE, error=FALSE, warning=FALSE, fig.cap="Concordance diagram."----
knitr::include_graphics("./concordance_structure.svg")

## ----createSplits-------------------------------------------------------------
set.seed(123)

# Make sure that groups and subject IDs are factors
sample_data(ps_plaque_16S)$HMP_BODY_SUBSITE <- 
    factor(sample_data(ps_plaque_16S)$HMP_BODY_SUBSITE)
sample_data(ps_plaque_16S)$RSID <- 
    factor(sample_data(ps_plaque_16S)$RSID)

my_splits <- createSplits(
  object = ps_plaque_16S,
  varName = "HMP_BODY_SUBSITE",
  paired = "RSID",
  balanced = TRUE,
  N = 2
) # At least 100 is suggested

## ----set_Methods_noweights----------------------------------------------------
my_edgeR_noWeights <- set_edgeR(
    group_name = "HMP_BODY_SUBSITE", 
    design = ~ 1 + RSID + HMP_BODY_SUBSITE, 
    coef = "HMP_BODY_SUBSITESupragingival Plaque", 
    norm = "TMM")

my_DESeq2_noWeights <- set_DESeq2(
    contrast = c("HMP_BODY_SUBSITE",
    "Supragingival Plaque", "Subgingival Plaque"), 
    design = ~ 1 + RSID + HMP_BODY_SUBSITE, 
    norm = "poscounts")

my_limma_noWeights <- set_limma(
    design = ~ 1 + RSID + HMP_BODY_SUBSITE, 
    coef = "HMP_BODY_SUBSITESupragingival Plaque", 
    norm = "TMM")

my_ALDEx2 <- set_ALDEx2(
    pseudo_count = FALSE,
    design = "HMP_BODY_SUBSITE",
    mc.samples = 128,
    test = "wilcox",
    paired.test = TRUE, 
    denom = "all", 
    contrast = c("HMP_BODY_SUBSITE", "Supragingival Plaque", "Subgingival Plaque"))

my_MAST <- set_MAST(
    pseudo_count = FALSE,
    rescale = "median",
    design = "~ 1 + RSID + HMP_BODY_SUBSITE",
    coefficient = "HMP_BODY_SUBSITESupragingival Plaque")

my_dearseq <- set_dearseq(
    pseudo_count = FALSE, 
    covariates = NULL,
    variables2test = "HMP_BODY_SUBSITE",
    sample_group = "RSID", 
    test = "asymptotic", 
    preprocessed = FALSE)

my_ANCOM <- set_ANCOM(
    pseudo_count = FALSE,
    fix_formula = "HMP_BODY_SUBSITE",
    rand_formula = "(1|RSID)",
    lme_control = lme4::lmerControl(),
    contrast = c("HMP_BODY_SUBSITE",
    "Supragingival Plaque", "Subgingival Plaque"),
    BC = c(TRUE, FALSE))

my_linda <- set_linda(
    formula = "~ HMP_BODY_SUBSITE + (1|RSID)",
    contrast = c("HMP_BODY_SUBSITE",
    "Supragingival Plaque", "Subgingival Plaque"),
    is.winsor = TRUE, 
    zero.handling = "pseudo-count",
    alpha = 0.05)

my_Maaslin2 <- set_Maaslin2(
    normalization = "TSS",
    transform = "LOG",
    analysis_method = "LM",
    fixed_effects = "HMP_BODY_SUBSITE",
    random_effects = "RSID",
    contrast = c("HMP_BODY_SUBSITE",
    "Supragingival Plaque", "Subgingival Plaque"))

my_maaslin3 <- set_maaslin3(
    normalization = "TSS",
    transform = "LOG",
    median_comparison_abundance = TRUE,
    stat_type = "abundance", 
    pvalue_type = "joint",
    formula = "~ HMP_BODY_SUBSITE + (1 | RSID)",
    contrast = c("HMP_BODY_SUBSITE",
        "Supragingival Plaque", "Subgingival Plaque"),
    expand = TRUE)

my_mixMC <- set_mixMC(
    pseudo_count = 1,
    ID_variable = "RSID",
    contrast = c("HMP_BODY_SUBSITE",
    "Supragingival Plaque", "Subgingival Plaque"))

my_ZicoSeq <- set_ZicoSeq(
    contrast = c("HMP_BODY_SUBSITE",
        "Supragingival Plaque", "Subgingival Plaque"),
    strata = "RSID",
    feature.dat.type = "count", 
    is.winsor = TRUE, outlier.pct = 0.03, winsor.end = "top",
    is.post.sample = TRUE, post.sample.no = 25, perm.no = 99,
    ref.pct = 0.5, stage.no = 6, excl.pct = 0.2, 
    link.func = list(function(x) sign(x) * (abs(x))^0.5))

my_methods_noWeights <- c(
    my_edgeR_noWeights, 
    my_DESeq2_noWeights, 
    my_limma_noWeights,  
    my_ALDEx2,
    my_MAST,
    my_dearseq,
    my_ANCOM,
    my_linda,
    my_Maaslin2,
    my_maaslin3,
    my_mixMC,
    my_ZicoSeq)

## ----info_normalizations------------------------------------------------------
str(my_normalizations)

## ----runSplits----------------------------------------------------------------
# Set the parallel framework
# Remember that ANCOMBC based methods are compatible only with SerialParam()
bpparam <- BiocParallel::SerialParam()

# Make sure the subject ID variable is a factor
phyloseq::sample_data(ps_plaque_16S)[, "RSID"] <- as.factor(
    phyloseq::sample_data(ps_plaque_16S)[["RSID"]])

Plaque_16S_splitsDA <- runSplits(
    split_list = my_splits, 
    method_list = my_methods_noWeights, 
    normalization_list = my_normalizations, 
    object = ps_plaque_16S, 
    min_counts = 0, min_samples = 2, 
    verbose = FALSE,
    BPPARAM = bpparam)

## ----second_round-------------------------------------------------------------
my_basic <- set_basic(
    pseudo_count = FALSE, 
    contrast = c("HMP_BODY_SUBSITE",
    "Supragingival Plaque", "Subgingival Plaque"), 
    test = "wilcox", 
    paired = TRUE)

Plaque_16S_splitsDA_basic <- runSplits(
    split_list = my_splits, 
    method_list = my_basic, 
    normalization_list = NULL, 
    object = ps_plaque_16S,
    min_counts = 0, min_samples = 2, 
    verbose = FALSE)

## ----add_a_second_round-------------------------------------------------------
Plaque_16S_splitsDA_all <- mapply(
    Plaque_16S_splitsDA, # List of old results
    Plaque_16S_splitsDA_basic, # List of new results
    FUN = function(subset_old, subset_new){
        mapply( 
            subset_old, 
            subset_new,
            FUN = function(old, new){ 
                return(c(old, new))
        }, SIMPLIFY = FALSE)
    }, SIMPLIFY = FALSE)

## ----createConcordance--------------------------------------------------------
concordance <- createConcordance(
    object = Plaque_16S_splitsDA_all,
    slot = "pValMat",
    colName = "rawP",
    type = "pvalue"
)

head(concordance)

## ----getNames-----------------------------------------------------------------
names(Plaque_16S_splitsDA_all$Subset1$Comparison1)

## ----logFC_names--------------------------------------------------------------
cat("edgeR.TMM", "\n")
names(Plaque_16S_splitsDA_all$Subset1$Comparison1$edgeR.TMM$statInfo)
cat("DESeq2.poscounts", "\n")
names(Plaque_16S_splitsDA_all$Subset1$Comparison1$DESeq2.poscounts$statInfo)
cat("limma.TMM", "\n")
names(Plaque_16S_splitsDA_all$Subset1$Comparison1$limma.TMM$statInfo)
cat("ALDEx2.all.wilcox.paired", "\n")
names(Plaque_16S_splitsDA_all$Subset1$Comparison1$ALDEx2.all.wilcox.paired$
    statInfo)
cat("MAST.median", "\n")
names(Plaque_16S_splitsDA_all$Subset1$Comparison1$MAST.median$statInfo)
cat("dearseq.repeated.asymptotic", "\n")
names(Plaque_16S_splitsDA_all$Subset1$Comparison1$dearseq.repeated.asymptotic$
    statInfo)
cat("ANCOM", "\n")
names(Plaque_16S_splitsDA_all$Subset1$Comparison1$ANCOM$statInfo)
cat("ANCOMBC2", "\n")
names(Plaque_16S_splitsDA_all$Subset1$Comparison1$ANCOM.BC$statInfo)
cat("linda", "\n")
names(Plaque_16S_splitsDA_all$Subset1$Comparison1$linda.win0.03.pc0.5$statInfo)
cat("Maaslin2", "\n")
names(Plaque_16S_splitsDA_all$Subset1$Comparison1$Maaslin2.TSSnorm.LOGtrans.LM$statInfo)
cat("maaslin3", "\n")
names(Plaque_16S_splitsDA_all$Subset1$Comparison1$maaslin3.TSSnorm.LOGtrans.med.abS.jointP$statInfo)
cat("mixMC", "\n")
names(Plaque_16S_splitsDA_all$Subset1$Comparison1$mixMC.pc1$statInfo)
cat("ZicoSeq", "\n")
names(Plaque_16S_splitsDA_all$Subset1$Comparison1$ZicoSeq.winsor0.03top.ref0.5.excl0.2$statInfo)
cat("basic.wilcox.paired", "\n")
names(Plaque_16S_splitsDA_all$Subset1$Comparison1$basic.wilcox.paired$statInfo)

## ----alternativeConcordance, eval=FALSE---------------------------------------
# concordance_alternative <- createConcordance(
#     object = Plaque_16S_splitsDA_all,
#     slot = "statInfo",
#     colName = c("logFC", "log2FoldChange", "logFC",
#         "effect", "logFC", "rawP", "W",
#         "lfc_HMP_BODY_SUBSITESupragingival Plaque",
#         "log2FoldChange", "coef", "coef",
#         "importance",
#         "effect", "logFC"),
#     type = c("logfc", "logfc", "logfc", "logfc",
#         "logfc", "pvalue", "logfc",
#         "logfc", "logfc", "logfc", "logfc",
#         "logfc",
#         "logfc", "logfc")
# )

## ----plotConcordance, fig.cap="BMC and WMC plot. Between-method concordance (BMC) and within-method concordance (WMC) (main diagonal) averaged values from rank 1 to 30.", fig.width=10, fig.height=8----
pC <- plotConcordance(concordance = concordance, threshold = 30)
cowplot::plot_grid(plotlist = pC, ncol = 2, align = "h", axis = "tb",
    rel_widths = c(1, 3))

## ----figenrichment, eval=TRUE, echo=FALSE, message=FALSE, error=FALSE, warning=FALSE, fig.cap="Enrichment analysis diagram."----
knitr::include_graphics("./enrichment_structure.svg")

## ----priorKnowledge-----------------------------------------------------------
data("microbial_metabolism")
head(microbial_metabolism)

## ----exampleOfIntegration-----------------------------------------------------
# Extract genera from the phyloseq tax_table slot
genera <- tax_table(ps_plaque_16S)[, "GENUS"]

# Genera as rownames of microbial_metabolism data.frame
rownames(microbial_metabolism) <- microbial_metabolism$Genus

# Match OTUs to their metabolism
priorInfo <- data.frame(genera, "Type" =  microbial_metabolism[genera, "Type"])
unknown_metabolism <- is.na(priorInfo$Type)
priorInfo[unknown_metabolism, "Type"] <- "Unknown"

# Relabel 'F Anaerobic' to 'F_Anaerobic' to remove space
priorInfo$Type <- factor(priorInfo$Type, 
    levels = c("Aerobic","Anaerobic","F Anaerobic","Unknown"), 
    labels = c("Aerobic","Anaerobic","F_Anaerobic","Unknown"))

# Add a more informative names column
priorInfo[, "newNames"] <- paste0(rownames(priorInfo), "|",
    priorInfo[, "GENUS"])

## ----normalize_plaque---------------------------------------------------------
ps_plaque_16S <- runNormalizations(my_normalizations, object = ps_plaque_16S)

## ----filter_plauque-----------------------------------------------------------
ps_plaque_16S <- phyloseq::filter_taxa(physeq = ps_plaque_16S, 
    flist = function(x) sum(x > 0) >= 3, prune = TRUE)
ps_plaque_16S

## ----plaque_weights-----------------------------------------------------------
plaque_weights <- weights_ZINB(object = ps_plaque_16S, design = ~ 1,
    zeroinflation = TRUE)

## ----enrichment_methods-------------------------------------------------------
my_edgeR <- set_edgeR(
    group_name = "HMP_BODY_SUBSITE", 
    design = ~ 1 + RSID + HMP_BODY_SUBSITE, 
    coef = "HMP_BODY_SUBSITESupragingival Plaque", 
    norm = "TMM", 
    weights_logical = TRUE)

my_DESeq2 <- set_DESeq2(
    contrast = c("HMP_BODY_SUBSITE",
    "Supragingival Plaque", "Subgingival Plaque"), 
    design = ~ 0 + RSID + HMP_BODY_SUBSITE, 
    norm = "poscounts",
    weights_logical = TRUE)

my_limma <- set_limma(
    design = ~ 1 + RSID + HMP_BODY_SUBSITE, 
    coef = "HMP_BODY_SUBSITESupragingival Plaque", 
    norm = "TMM",
    weights_logical = TRUE)

my_methods <- c(my_methods_noWeights, my_edgeR, my_DESeq2, my_limma)

## ----runDA_enrichment, message=FALSE------------------------------------------
Plaque_16S_DA <- runDA(method_list = my_methods, 
    object = ps_plaque_16S, weights = plaque_weights, verbose = FALSE)

## ----info_DA------------------------------------------------------------------
names(Plaque_16S_DA)

## ----createEnrichment---------------------------------------------------------
enrichment <- createEnrichment(
    object = Plaque_16S_DA[-c(6)], 
    priorKnowledge = priorInfo, 
    enrichmentCol = "Type",
    namesCol = "newNames", 
    slot = "pValMat", 
    colName = "adjP", 
    type = "pvalue",
    direction = c(
        "logFC", # edgeR
        "log2FoldChange", # DEseq2
        "logFC", # limma
        "effect", # ALDEx2
        "logFC", # MAST
        "direction", # ANCOM
        "lfc_HMP_BODY_SUBSITESupragingival Plaque", # ANCOMBC2
        "log2FoldChange", # linda
        "coef", # Maaslin2
        "coef", # maaslin3
        "importance", # mixMC
        "effect", # ZicoSeq
        "logFC", # edgeR with weights
        "log2FoldChange", # DESeq2 with weights
        "logFC"), # limma with weights
    threshold_pvalue = c(0.1, 0.1, 0.1, 0.1, 0.1,
        0.4, # ANCOM threshold on 1-W/(ntaxa-1) 0.4 = liberal
        0.1, 0.1, 0.1, 0.1, 0.1, 
        0.1, 0.1, 0.1, 0.1), # other methods
    threshold_logfc = 0,
    top = NULL,
    alternative = "greater",
    verbose = TRUE
)

## ----plotContingency, fig.cap="Contingency tables plot. Contingency tables for Aerobic and Anaerobic taxa found as differentially abundant by DESeq2.poscounts DA method. Fisher exact test has been performed on each contingency table. If the enrichment is signficantly present, the correspondent cell will be highlighted.", fig.height=4, fig.width=6----
plotContingency(enrichment = enrichment, 
    levels_to_plot = c("Aerobic", "Anaerobic"), 
    method = "DESeq2.poscounts")

## ----plotEnrichment, fig.width=5, fig.height=6, fig.cap="Enrichment plot. Number of differentially abundant features, colored by aerobic or anaerobic metabolism, and directed according to differential abundance direction (UP or DOWN abundant)."----
plotEnrichment(enrichment = enrichment, enrichmentCol = "Type", 
    levels_to_plot = c("Aerobic", "Anaerobic"))

## ----plotMutualFindings, fig.width=6, fig.height=6, fig.cap="Mutual Findings plot. Number of differentially abundant features mutually found by 1 or more methods, colored by the differential abundance direction and separated by aerobic and anaerobic metabolism."----
plotMutualFindings(enrichment, enrichmentCol = "Type", 
    levels_to_plot = c("Aerobic", "Anaerobic"), n_methods = 1)

## ----createPositives----------------------------------------------------------
positives <- createPositives(
    object = Plaque_16S_DA[-c(6)], 
    priorKnowledge = priorInfo, 
    enrichmentCol = "Type",
    namesCol = "newNames", 
    slot = "pValMat", 
    colName = "rawP", 
    type = "pvalue",
    direction = c(
        "logFC", # edgeR
        "log2FoldChange", # DEseq2
        "logFC", # limma
        "effect", # ALDEx2
        "logFC", # MAST
        "direction", # ANCOM
        "lfc_HMP_BODY_SUBSITESupragingival Plaque", # ANCOMBC2
        "log2FoldChange", # linda
        "coef", # Maaslin2
        "coef", # maaslin3
        "importance", # mixMC
        "effect", # ZicoSeq
        "logFC", # edgeR with weights
        "log2FoldChange", # DESeq2 with weights
        "logFC"), # limma with weights
    threshold_pvalue = c(0.1, 0.1, 0.1, 0.1, 0.1,
        0.4, # ANCOM threshold on 1-W/(ntaxa-1) 0.4 = liberal
        0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, # other methods
        0.1), 
    threshold_logfc = 0,
    top = seq.int(from = 0, to = 30, by = 3),
    alternative = "greater",
    verbose = FALSE,
    TP = list(c("DOWN Abundant", "Anaerobic"), c("UP Abundant", "Aerobic")),
    FP = list(c("DOWN Abundant", "Aerobic"), c("UP Abundant", "Anaerobic"))
)
head(positives)

## ----plotPositives, fig.cap="TP, FP differences plot. Differences between the number of True Positives and False Positives for several thresholds of the top ranked raw p-values (the top 3 lowest p-values, the top 6, 9, ..., 30) for each method.", fig.height=5, fig.width=7----
plotPositives(positives)

## ----createEnrichment_nodir---------------------------------------------------
enrichment_nodir <- createEnrichment(
    object = Plaque_16S_DA, 
    priorKnowledge = priorInfo, 
    enrichmentCol = "Type",
    namesCol = "newNames", 
    slot = "pValMat", 
    colName = "adjP", 
    type = "pvalue",
    threshold_pvalue = c(0.1, 0.1, 0.1, 0.1, 0.1, 0.1,
        0.4, # ANCOM 
        0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1),
    threshold_logfc = 0,
    top = NULL,
    alternative = "greater",
    verbose = FALSE
)

## ----plotEnrichmentnodir, fig.width=6, fig.height=5, fig.cap="Enrichment plot. Number of differentially abundant features, colored by metabolism."----
plotEnrichment(enrichment = enrichment_nodir, enrichmentCol = "Type")

## ----plotMutualFindingsnodir, fig.width=5.5, fig.height=6.5, fig.cap="Mutual Findings plot. Number of differentially abundant features mutually found by 1 or more methods, separated by aerobic and anaerobic metabolism."----
plotMutualFindings(enrichment_nodir, enrichmentCol = "Type", 
    levels_to_plot = c("Aerobic", "Anaerobic"), n_methods = 1)

## ----simulate_plaques---------------------------------------------------------
data("ps_plaque_16S")
counts_and_metadata <- get_counts_metadata(ps_plaque_16S)
plaque_counts <- counts_and_metadata[["counts"]]
plaque_metadata <- counts_and_metadata[["metadata"]]

set.seed(123)

sim_list <- SPsimSeq(
    n.sim = 1, 
    s.data = plaque_counts,
    group = plaque_metadata[, "HMP_BODY_SUBSITE"],
    n.genes = 100, 
    batch.config = 1,
    group.config = c(0.5, 0.5), 
    tot.samples = 50, 
    pDE = 0.2, 
    lfc.thrld = 0.5, 
    model.zero.prob = FALSE,
    result.format = "list")

## ----simulated_to_TSE---------------------------------------------------------
sim_obj <- TreeSummarizedExperiment::TreeSummarizedExperiment(
    assays = list("counts" = sim_list[[1]][["counts"]]),
    rowData = sim_list[[1]]["rowData"],
    colData = sim_list[[1]]["colData"],
)
# Group as factor
SummarizedExperiment::colData(sim_obj)[, "colData.Group"] <- as.factor(
    SummarizedExperiment::colData(sim_obj)[, "colData.Group"])

## ----simulated_priorInfo------------------------------------------------------
priorInfo <- sim_list[[1]][["rowData"]]
priorInfo$Reality <- ifelse(priorInfo[, "DE.ind"], "is DA", "is not DA")

## ----normalization_simulated_data---------------------------------------------
sim_obj <- runNormalizations(
    normalization_list = my_normalizations, 
    object = sim_obj, 
    verbose = TRUE)

## ----simulated_filter---------------------------------------------------------
taxa_to_keep <- apply(assays(sim_obj)[["counts"]], 1, 
    function(x) sum(x > 0) >= 3 & sd(x) > 1)
sim_obj <- sim_obj[taxa_to_keep, ]
priorInfo <- priorInfo[taxa_to_keep, ]

## ----simulated_weights--------------------------------------------------------
sim_weights <- weights_ZINB(
    object = sim_obj, 
    design = ~ 1, 
    zeroinflation = TRUE)

## ----simulated_set------------------------------------------------------------
my_basic <- set_basic(pseudo_count = FALSE,
    contrast = c("colData.Group", "Supragingival Plaque", 
        "Subgingival Plaque"), 
    test = c("t", "wilcox"), 
    paired = FALSE, 
    expand = TRUE)

my_edgeR <- set_edgeR(
    pseudo_count = FALSE,
    group_name = "colData.Group",
    design = ~ colData.Group,
    robust = FALSE,
    coef = 2,
    norm = "TMM",
    weights_logical = c(TRUE, FALSE),
    expand = TRUE)

my_DESeq2 <- set_DESeq2(
    pseudo_count = FALSE,
    design = ~ colData.Group,
    contrast = c("colData.Group", "Supragingival Plaque", 
        "Subgingival Plaque"),
    norm = "poscounts",
    weights_logical = c(TRUE, FALSE),
    alpha = 0.1,
    expand = TRUE)

my_limma <- set_limma(
    pseudo_count = FALSE,
    design = ~ colData.Group,
    coef = 2,
    norm = "TMM",
    weights_logical = c(FALSE, TRUE),
    expand = TRUE)

my_ALDEx2 <- set_ALDEx2(
    pseudo_count = FALSE,
    design = "colData.Group",
    mc.samples = 128,
    test = "wilcox",
    paired.test = FALSE, 
    denom = c("all", "iqlr"), 
    contrast = c("colData.Group", "Supragingival Plaque", 
        "Subgingival Plaque"),
    expand = TRUE)

my_metagenomeSeq <- set_metagenomeSeq(
    pseudo_count = FALSE, 
    design = "~ colData.Group", 
    coef = "colData.GroupSupragingival Plaque", 
    norm = "CSS", 
    model = "fitFeatureModel",
    expand = TRUE)

my_corncob <- set_corncob(
    pseudo_count = FALSE,
    formula = ~ colData.Group,
    formula_null = ~ 1,
    phi.formula = ~ colData.Group,
    phi.formula_null = ~ colData.Group,
    test = c("Wald", "LRT"),
    boot = FALSE,
    coefficient = "colData.GroupSupragingival Plaque")

my_MAST <- set_MAST(
    pseudo_count = FALSE,
    rescale = c("default", "median"),
    design = "~ 1 + colData.Group",
    coefficient = "colData.GroupSupragingival Plaque",
    expand = TRUE)

my_Seurat <- set_Seurat(
    pseudo_count = FALSE,
    test = c("t", "wilcox"),
    contrast = c("colData.Group", "Supragingival Plaque", 
        "Subgingival Plaque"),
    norm = c("LogNormalize", "CLR"),
    scale.factor = 10^5,
    expand = TRUE
)

my_ANCOM <- set_ANCOM(
    pseudo_count = FALSE,
    fix_formula = "colData.Group", 
    contrast = c("colData.Group", "Supragingival Plaque", 
        "Subgingival Plaque"),
    BC = c(TRUE, FALSE), 
    alpha = 0.1,
    expand = TRUE   
)

my_dearseq <- set_dearseq(
    pseudo_count = FALSE,
    covariates = NULL,
    variables2test = "colData.Group", 
    preprocessed = FALSE, 
    test = c("permutation", "asymptotic"),
    expand = TRUE)

my_NOISeq <- set_NOISeq(
    pseudo_count = FALSE, 
    contrast = c("colData.Group", "Supragingival Plaque", 
        "Subgingival Plaque"),
    norm = c("rpkm", "tmm"), 
    expand = TRUE)

my_linda <- set_linda(
    formula = "~ colData.Group", 
    contrast = c("colData.Group", "Supragingival Plaque", 
        "Subgingival Plaque"),
    is.winsor = TRUE,
    zero.handling = "pseudo-count",
    alpha = 0.1,
    expand = TRUE)

my_Maaslin2 <- set_Maaslin2(
    normalization = "TSS",
    transform = "LOG",
    analysis_method = "LM",
    fixed_effects = "colData.Group", 
    contrast = c("colData.Group", "Supragingival Plaque", 
        "Subgingival Plaque"),
    expand = TRUE)

my_maaslin3 <- set_maaslin3(
    normalization = "TSS",
    transform = "LOG",
    median_comparison_abundance = TRUE,
    formula = "~ colData.Group", 
    contrast = c("colData.Group", "Supragingival Plaque", 
        "Subgingival Plaque"),
    expand = TRUE)

my_mixMC <- set_mixMC(
    pseudo_count = 1,
    contrast = c("colData.Group", "Supragingival Plaque",
        "Subgingival Plaque"),
    expand = TRUE
)

my_ZicoSeq <- set_ZicoSeq(
    contrast = c("colData.Group", "Supragingival Plaque", 
        "Subgingival Plaque"),
    feature.dat.type = "count", strata = NULL,
    is.winsor = TRUE, outlier.pct = 0.03, winsor.end = "top",
    is.post.sample = TRUE, post.sample.no = 25, perm.no = 99,
    ref.pct = 0.5, stage.no = 6, excl.pct = 0.2, 
    link.func = list(function(x) sign(x) * (abs(x))^0.5))

my_methods <- c(my_basic, my_edgeR, my_DESeq2, 
    my_limma, my_metagenomeSeq, my_corncob, 
    my_ALDEx2, my_MAST, my_Seurat, my_ANCOM, 
    my_dearseq, my_NOISeq, my_linda, my_Maaslin2, 
    my_maaslin3, my_mixMC, my_ZicoSeq)

## ----simulated_run------------------------------------------------------------
sim_DA <- runDA(
    method_list = my_methods, 
    object = sim_obj, 
    weights = sim_weights, 
    verbose = FALSE)

## ----createEnrichment_nodir_sim-----------------------------------------------
enrichment_nodir <- createEnrichment(
    object = sim_DA, 
    priorKnowledge = priorInfo, 
    enrichmentCol = "Reality",
    namesCol = NULL, 
    slot = "pValMat", 
    colName = "adjP", 
    type = "pvalue",
    threshold_pvalue = c(
        rep(0.1,19), # adjP thresholds 
        0.4, # adjP threshold for ANCOM on 1-W/(ntaxa-1)
        rep(0.1,8), # adjP thresholds for other methods
        0.9, # adjP threshold for mixMC 1 - stability 
        0.1), # ZicoSeq
    threshold_logfc = 0,
    top = NULL,
    alternative = "greater",
    verbose = FALSE
)

## ----plotEnrichmentnodirsim, fig.width=6, fig.height=6, fig.cap="Enrichment plot for simulated data. Number of differentially abundant features, colored by DA reality."----
plotEnrichment(enrichment = enrichment_nodir, enrichmentCol = "Reality")

## ----createPositives_sim------------------------------------------------------
positives_nodir <- createPositives(
    object = sim_DA, 
    priorKnowledge = priorInfo, 
    enrichmentCol = "Reality",
    namesCol = NULL, 
    slot = "pValMat", 
    colName = "adjP", 
    type = "pvalue",
    threshold_pvalue = c(
        rep(0.1,19), # adjP thresholds 
        0.4, # adjP threshold for ANCOM on 1-W/(ntaxa-1)
        rep(0.1,8), # adjP thresholds for other methods
        0.9, # adjP threshold for mixMC 1 - stability 
        0.1), # ZicoSeq
    threshold_logfc = 0,
    top = seq(5, 30, by = 5),
    alternative = "greater",
    verbose = FALSE,
    TP = list(c("DA", "is DA")),
    FP = list(c("DA", "is not DA"))
)

## ----plotPositivessim, fig.cap="TP, FP differences plot. Differences between the number of True Positives and False Positives for several thresholds of the top ranked adjusted p-values lower than 0.1 (the top 5 lowest adjusted p-values, the top 10, 15, ..., 30) for each method in simulated data. Red dotted line represents the total number of DA simulated features.", fig.width=10, fig.height=10----
plotPositives(positives = positives_nodir) +
    facet_wrap( ~ method) + 
    theme(legend.position = "none") +
    geom_hline(aes(yintercept = 20), linetype = "dotted", color = "red") +
    geom_hline(aes(yintercept = 0), color = "black") +
    ylim(NA, 21)

## ----sessionInfo--------------------------------------------------------------
sessionInfo()