## ----biocSetup, include = FALSE, results="hide", warning=FALSE, echo=FALSE---- ## Changing the YAML to the following changes the output to 'latex' ## output: ## BiocWorkflowTools::f1000_article ## More at: ## https://stackoverflow.com/questions/35144130/in-knitr-how-can-i-test-for-if-the-output-will-be-pdf-or-word ## add figures to current directory if Rmd is converted to LaTeX ## This simplifies processing on overleaf if (!is.null(knitr::opts_knit$get("rmarkdown.pandoc.to"))) { to_html <- knitr::opts_knit$get("rmarkdown.pandoc.to") != 'latex' if (!to_html) { knitr::opts_chunk$set(fig.path = "", out.width = '.8\\linewidth') } } ## here, pdf specific adapations can be made ## however, the option "rmarkdown.pandoc.to" is only set by rmarkdown::render, ## so it returns NULL if run interactively! ## ----testAlternativeToChunkAbove, eval=FALSE, include=FALSE, echo=FALSE------- # # this does not work as it will only test the output formats # # specified in the YAML header # any(grepl("f1000", rmarkdown::all_output_formats(knitr::current_input()))) ## ----knitrOptions, include=FALSE---------------------------------------------- library(BiocStyle) library(knitr) options(digits = 4, width = 80, timeout = 600) opts_chunk$set(echo = TRUE, tidy = FALSE, include = TRUE, dev = c('png'), fig.width = 6, fig.height = 3.5, comment = ' ', dpi = 300, cache = TRUE, fig.pos = "H") ## ----installBioc, eval=FALSE-------------------------------------------------- # if (!require("BiocManager")) # install.packages("BiocManager") # BiocManager::install("maEndToEnd") ## ----maEndToEndImport--------------------------------------------------------- suppressPackageStartupMessages({library("maEndToEnd")}) ## ----pkgList, results="hide"-------------------------------------------------- #General Bioconductor packages library(Biobase) library(oligoClasses) #Annotation and data import packages library(ArrayExpress) library(pd.hugene.1.0.st.v1) library(hugene10sttranscriptcluster.db) #Quality control and pre-processing packages library(oligo) library(arrayQualityMetrics) #Analysis and statistics packages library(limma) library(topGO) library(ReactomePA) library(clusterProfiler) #Plotting and color options packages library(gplots) library(ggplot2) library(geneplotter) library(RColorBrewer) library(pheatmap) library(enrichplot) #Formatting/documentation packages #library(rmarkdown) #library(BiocStyle) library(dplyr) library(tidyr) #Helpers: library(stringr) library(matrixStats) library(genefilter) library(openxlsx) #library(devtools) ## ----generateFolderForRawData, eval = TRUE------------------------------------ raw_data_dir <- tools::R_user_dir(package = "maEndToEnd", which = "cache") if (!dir.exists(raw_data_dir)) { dir.create(raw_data_dir) } ## ----getDataEBI, eval=TRUE, results='hide', message=FALSE--------------------- anno_AE <- getAE("E-MTAB-2967", path = raw_data_dir, type = "raw") download.file(url = "https://www.ebi.ac.uk/biostudies/files/E-MTAB-2967/E-MTAB-2967.sdrf.txt", destfile = file.path(raw_data_dir, "E-MTAB-2967.sdrf.txt"), mode = "wb") ## ----sumexp, fig.cap = "Structure of Bioconductor’s ExpressionSet class.", echo=FALSE, fig.show="asis", fig.width = 7, fig.height = 4.5---- par(mar=c(0,0,0,0)) plot(1,1,xlim=c(0,100),ylim=c(0, 150),bty="n", type="n",xlab="",ylab="",xaxt="n",yaxt="n") polygon(c(50,85,85,50),c(70,70,130,130),col="lightcoral",border=NA) polygon(c(55, 56, 56, 55),c(70,70,130,130),col=rgb(1,0,0,.5),border=NA) polygon(c(50,85,85,50),c(118,118,120,120),col=rgb(1,0,0,.5),border=NA) text(67.5,100,"assay(s)", cex = 1) text(67.5,90,"e.g. 'exprs'", cex = 1) polygon(c(45,49,49,45),c(70,70,130,130),col="honeydew2",border=NA) text(47,100, "microarray probes", srt=90, cex=1) polygon(c(50,85,85,50),c(131,131,140,140),col="darkseagreen3",border=NA) text(67.5,135.5,"sample IDs", cex = 1) polygon(c(20,40,40,20),c(70,70,130,130),col=rgb(0,0,1,.5),border=NA) polygon(c(20,40,40,20),c(118,118,120,120),col=rgb(0,0,1,.5),border=NA) text(30,100,"featureData", cex = 1) polygon(c(15,19,19,15),c(70,70,130,130),col="honeydew2",border=NA) text(17,100, "microarray probes", srt=90, cex=1) polygon(c(20,40,40,20),c(131,131,140,140),col="bisque3",border=NA) text(30,135.5,"features", cex = 1) polygon(c(50,65, 65, 50),c(55, 55, 0, 0),col=rgb(.5,0,.5,.5),border=NA) polygon(c(55, 56, 56, 55),c(55, 55, 0, 0),col=rgb(.5,0,.5,.5),border=NA) text(57.5, 30,"phenoData", cex = 1, srt=270) polygon(c(50,65, 65, 50),c(56,56,65,65),col="darkseagreen3",border=NA) text(57.5,60.5,"sample IDs", cex = 1) ## ----getSDRF------------------------------------------------------------------ sdrf_location <- file.path(raw_data_dir, "E-MTAB-2967.sdrf.txt") SDRF <- read.delim(sdrf_location) rownames(SDRF) <- SDRF$Array.Data.File SDRF <- AnnotatedDataFrame(SDRF) ## ----importCelfiles, results="hide", eval=TRUE, dependson="getSDRF", warning = FALSE---- raw_data <- oligo::read.celfiles(filenames = file.path(raw_data_dir, SDRF$Array.Data.File), verbose = FALSE, phenoData = SDRF) stopifnot(validObject(raw_data)) ## ----inspectPhenoData, eval=TRUE---------------------------------------------- head(Biobase::pData(raw_data)) ## ----reassignpData, eval=TRUE------------------------------------------------- Biobase::pData(raw_data) <- Biobase::pData(raw_data)[, c("Source.Name", "Characteristics.individual.", "Factor.Value.disease.", "Factor.Value.phenotype.")] ## ----inspectAssayData, eval=TRUE---------------------------------------------- Biobase::exprs(raw_data)[1:5, 1:5] ## ----qualityControlRawDataPCA, fig.cap="PCA plot of the log–transformed raw expression data."---- exp_raw <- log2(Biobase::exprs(raw_data)) PCA_raw <- prcomp(t(exp_raw), scale. = FALSE) percentVar <- round(100*PCA_raw$sdev^2/sum(PCA_raw$sdev^2),1) sd_ratio <- sqrt(percentVar[2] / percentVar[1]) dataGG <- data.frame(PC1 = PCA_raw$x[,1], PC2 = PCA_raw$x[,2], Disease = pData(raw_data)$Factor.Value.disease., Phenotype = pData(raw_data)$Factor.Value.phenotype., Individual = pData(raw_data)$Characteristics.individual.) ggplot(dataGG, aes(PC1, PC2)) + geom_point(aes(shape = Disease, colour = Phenotype)) + ggtitle("PCA plot of the log-transformed raw expression data") + xlab(paste0("PC1, VarExp: ", percentVar[1], "%")) + ylab(paste0("PC2, VarExp: ", percentVar[2], "%")) + theme(plot.title = element_text(hjust = 0.5))+ coord_fixed(ratio = sd_ratio) + scale_shape_manual(values = c(4,15)) + scale_color_manual(values = c("darkorange2", "dodgerblue4")) ## ----qualityControlRawDataBox, fig.cap="Intensity boxplots of the log2–transformed raw data."---- oligo::boxplot(raw_data, target = "core", main = "Boxplot of log2-intensitites for the raw data") ## ----arrayQualityMetricsRaw, eval = TRUE, warning=FALSE, message=FALSE-------- arrayQualityMetrics(expressionset = raw_data, outdir = tempdir(), force = TRUE, do.logtransform = TRUE, intgroup = c("Factor.Value.disease.", "Factor.Value.phenotype.")) ## ----annotationDataBaseContent, eval = TRUE----------------------------------- head(ls("package:hugene10sttranscriptcluster.db")) ## ----DifferenceBetweenExonAndGeneTypeArrays, fig.width=7, fig.height=6, echo=FALSE, fig.cap="Visualization of the difference between \"Exon\" type array (left) and \"Gene\" type array (right)."---- library(grid) par(mar=c(0,0,0,0)) plot(1,1,xlim=c(0,100),ylim=c(0, 20),bty="n", type="n",xlab="",ylab="",xaxt="n",yaxt="n") dat <- data.frame(x = rep(seq(0, 2.9, 0.1), 30), y = rep(seq(0, 2.9, 0.1), each = 30)) col <- c("blue", "darkslategray1", "goldenrod1", "yellow", "royalblue1", "coral") dat$colours<-sample(rep(col, 150)) dat$colours2 <- sample(c(rep("yellow", 2), rep("goldenrod1", 5), rep("coral",7), rep("blue", 5), rep ("darkslategray1",3), rep("royalblue1", 8), rep(gray(0:9/10), 87))) # opening the graphic device and # setting up a viewport with borders: vp1 <- viewport(x = 0.1, y = 0.1, w = 0.12, h = 0.12, just = c("left", "bottom"), name = "vp1") # plotting rectangles using x/y positions grid.rect(x=dat$x,y=dat$y, height=0.1, width=0.1, hjust=0,vjust=0,vp=vp1, gp=gpar(col=1, fill=as.character(dat$colours))) ########################## vp2 <- viewport(x = 0.5, y = 0.1, w = 0.12, h = 0.12, just = c("left", "bottom"), name = "vp1") # plotting rectangles using x/y positions grid.rect(x=dat$x,y=dat$y, height=0.1, width=0.1, hjust=0,vjust=0,vp=vp2, gp=gpar(col=1, fill=as.character(dat$colours2))) ## ----RMAcalibrationForRLE, eval=TRUE------------------------------------------ palmieri_eset <- oligo::rma(raw_data, target = "core", normalize = FALSE) ## ----boxplotDataForRLETidy, fig.cap="Boxplot for the RLE values", warning=FALSE---- row_medians_assayData <- Biobase::rowMedians(as.matrix(Biobase::exprs(palmieri_eset))) RLE_data <- sweep(Biobase::exprs(palmieri_eset), 1, row_medians_assayData) RLE_data <- as.data.frame(RLE_data) RLE_data_gathered <- tidyr::gather(RLE_data, patient_array, log2_expression_deviation) ggplot2::ggplot(RLE_data_gathered, aes(patient_array, log2_expression_deviation)) + geom_boxplot(outlier.shape = NA) + ylim(c(-2, 2)) + theme(axis.text.x = element_text(colour = "aquamarine4", angle = 60, size = 6.5, hjust = 1 , face = "bold")) ## ----RMAcalibrationWITHnormalization, eval=TRUE------------------------------- palmieri_eset_norm <- oligo::rma(raw_data, target = "core") ## ----PCAMetricsCalibrated, fig.cap = "PCA plot of the calibrated, summarized data.", eval = TRUE---- exp_palmieri <- Biobase::exprs(palmieri_eset_norm) PCA <- prcomp(t(exp_palmieri), scale = FALSE) percentVar <- round(100*PCA$sdev^2/sum(PCA$sdev^2),1) sd_ratio <- sqrt(percentVar[2] / percentVar[1]) dataGG <- data.frame(PC1 = PCA$x[,1], PC2 = PCA$x[,2], Disease = Biobase::pData(palmieri_eset_norm)$Factor.Value.disease., Phenotype = Biobase::pData(palmieri_eset_norm)$Factor.Value.phenotype.) ggplot(dataGG, aes(PC1, PC2)) + geom_point(aes(shape = Disease, colour = Phenotype)) + ggtitle("PCA plot of the calibrated, summarized data") + xlab(paste0("PC1, VarExp: ", percentVar[1], "%")) + ylab(paste0("PC2, VarExp: ", percentVar[2], "%")) + theme(plot.title = element_text(hjust = 0.5)) + coord_fixed(ratio = sd_ratio) + scale_shape_manual(values = c(4,15)) + scale_color_manual(values = c("darkorange2", "dodgerblue4")) ## ----rownamesForHeatmap, fig.height = 8.5, fig.width = 7, eval = TRUE, echo=TRUE---- phenotype_names <- ifelse(str_detect(pData (palmieri_eset_norm)$Factor.Value.phenotype., "non"), "non_infl.", "infl.") disease_names <- ifelse(str_detect(pData (palmieri_eset_norm)$Factor.Value.disease., "Crohn"), "CD", "UC") annotation_for_heatmap <- data.frame(Phenotype = phenotype_names, Disease = disease_names) row.names(annotation_for_heatmap) <- row.names(pData(palmieri_eset_norm)) ## ----HeatmapWithAnnotation, fig.height = 8.5, fig.width = 7, fig.cap="Heatmap of the sample-to-sample distances"---- dists <- as.matrix(dist(t(exp_palmieri), method = "manhattan")) rownames(dists) <- row.names(pData(palmieri_eset_norm)) hmcol <- rev(colorRampPalette(RColorBrewer::brewer.pal(9, "YlOrRd"))(255)) colnames(dists) <- NULL diag(dists) <- NA ann_colors <- list( Phenotype = c(non_infl. = "chartreuse4", infl. = "burlywood3"), Disease = c(CD = "blue4", UC = "cadetblue2") ) pheatmap(dists, col = (hmcol), annotation_row = annotation_for_heatmap, annotation_colors = ann_colors, legend = TRUE, treeheight_row = 0, legend_breaks = c(min(dists, na.rm = TRUE), max(dists, na.rm = TRUE)), legend_labels = (c("small distance", "large distance")), main = "Clustering heatmap for the calibrated samples") ## ----IntensityBasedManualFiltering, fig.cap="Histogram of the median intensities per gene"---- palmieri_medians <- rowMedians(Biobase::exprs(palmieri_eset_norm)) hist_res <- hist(palmieri_medians, 100, col = "cornsilk1", freq = FALSE, main = "Histogram of the median intensities", border = "antiquewhite4", xlab = "Median intensities") ## ----setManualThreshold, fig.cap="Histogram of the median intensities per gene with manual intensity filtering threshold (red line)."---- man_threshold <- 4 hist_res <- hist(palmieri_medians, 100, col = "cornsilk", freq = FALSE, main = "Histogram of the median intensities", border = "antiquewhite4", xlab = "Median intensities") abline(v = man_threshold, col = "coral4", lwd = 2) ## ----expGroups, dependson="PCAMetricsCalibrated"------------------------------ no_of_samples <- table(paste0(pData(palmieri_eset_norm)$Factor.Value.disease., "_", pData(palmieri_eset_norm)$Factor.Value.phenotype.)) no_of_samples ## ----filteringOfLowIntensity_transcripts-------------------------------------- samples_cutoff <- min(no_of_samples) idx_man_threshold <- apply(Biobase::exprs(palmieri_eset_norm), 1, function(x){ sum(x > man_threshold) >= samples_cutoff}) table(idx_man_threshold) palmieri_manfiltered <- subset(palmieri_eset_norm, idx_man_threshold) ## ----annotateData, eval=TRUE, dependson="intensityBasedFiltering", message = FALSE---- anno_palmieri <- AnnotationDbi::select(hugene10sttranscriptcluster.db, keys = (featureNames(palmieri_manfiltered)), columns = c("SYMBOL", "GENENAME"), keytype = "PROBEID") anno_palmieri <- subset(anno_palmieri, !is.na(SYMBOL)) ## ----multipleMappings, dependson="annotateData"------------------------------- anno_grouped <- group_by(anno_palmieri, PROBEID) anno_summarized <- dplyr::summarize(anno_grouped, no_of_matches = n_distinct(SYMBOL)) head(anno_summarized) anno_filtered <- filter(anno_summarized, no_of_matches > 1) head(anno_filtered) probe_stats <- anno_filtered nrow(probe_stats) ## ----excludeMultipleMappingsFromAssayData, dependson="multipleMappings"------- ids_to_exlude <- (featureNames(palmieri_manfiltered) %in% probe_stats$PROBEID) table(ids_to_exlude) palmieri_final <- subset(palmieri_manfiltered, !ids_to_exlude) validObject(palmieri_final) ## ----recallAnnoPalmieri------------------------------------------------------- head(anno_palmieri) ## ----excludeMultipleMappingsFromFeatureData----------------------------------- fData(palmieri_final)$PROBEID <- rownames(fData(palmieri_final)) ## ----excludeMultipleMappingsFromFeatureData2---------------------------------- fData(palmieri_final) <- left_join(fData(palmieri_final), anno_palmieri) # restore rownames after left_join rownames(fData(palmieri_final)) <- fData(palmieri_final)$PROBEID validObject(palmieri_final) ## ----introductionOfAbbreviations---------------------------------------------- individual <- as.character(Biobase::pData(palmieri_final)$Characteristics.individual.) tissue <- str_replace_all(Biobase::pData(palmieri_final)$Factor.Value.phenotype., " ", "_") tissue <- ifelse(tissue == "non-inflamed_colonic_mucosa", "nI", "I") disease <- str_replace_all(Biobase::pData(palmieri_final)$Factor.Value.disease., " ", "_") disease <- ifelse(str_detect(Biobase::pData(palmieri_final)$Factor.Value.disease., "Crohn"), "CD", "UC") ## ----createDesign, eval=TRUE, dependson="excludeMultipleMappings"------------- i_CD <- individual[disease == "CD"] design_palmieri_CD <- model.matrix(~ 0 + tissue[disease == "CD"] + i_CD) colnames(design_palmieri_CD)[1:2] <- c("I", "nI") rownames(design_palmieri_CD) <- i_CD i_UC <- individual[disease == "UC"] design_palmieri_UC <- model.matrix(~ 0 + tissue[disease == "UC"] + i_UC ) colnames(design_palmieri_UC)[1:2] <- c("I", "nI") rownames(design_palmieri_UC) <- i_UC ## ----inspectDesignMatrix, eval = TRUE, dependson="createDesign"--------------- head(design_palmieri_CD[, 1:6]) head(design_palmieri_UC[, 1:6]) ## ----visualizeExpressionChanges, fig.cap="Visualization of expression changes"---- tissue_CD <- tissue[disease == "CD"] crat_expr <- Biobase::exprs(palmieri_final)["8164535", disease == "CD"] crat_data <- as.data.frame(crat_expr) colnames(crat_data)[1] <- "org_value" crat_data <- mutate(crat_data, individual = i_CD, tissue_CD) crat_data$tissue_CD <- factor(crat_data$tissue_CD, levels = c("nI", "I")) ggplot(data = crat_data, aes(x = tissue_CD, y = org_value, group = individual, color = individual)) + geom_line() + ggtitle("Expression changes for the CRAT gene") ## ----obtainFitCRAT------------------------------------------------------------ crat_coef <- lmFit(palmieri_final[,disease == "CD"], design = design_palmieri_CD)$coefficients["8164535",] crat_coef ## ----2obtainFitCRAT----------------------------------------------------------- crat_fitted <- design_palmieri_CD %*% crat_coef rownames(crat_fitted) <- names(crat_expr) colnames(crat_fitted) <- "fitted_value" crat_fitted ## ----3obtainFitCRAT, fig.cap="Expression changes for the CRAT gene"----------- crat_data$fitted_value <- crat_fitted ggplot(data = crat_data, aes(x = tissue_CD, y = fitted_value, group = individual, color = individual)) + geom_line() + ggtitle("Fitted expression changes for the CRAT gene") ## ----tTest-------------------------------------------------------------------- crat_noninflamed <- na.exclude(crat_data$org_value[tissue == "nI"]) crat_inflamed <- na.exclude(crat_data$org_value[tissue == "I"]) res_t <- t.test(crat_noninflamed ,crat_inflamed , paired = TRUE) res_t ## ----createContrastMatrixAndFitModel, eval=TRUE, dependson="createDesign"----- contrast_matrix_CD <- makeContrasts(I-nI, levels = design_palmieri_CD) palmieri_fit_CD <- eBayes(contrasts.fit(lmFit(palmieri_final[,disease == "CD"], design = design_palmieri_CD), contrast_matrix_CD)) contrast_matrix_UC <- makeContrasts(I-nI, levels = design_palmieri_UC) palmieri_fit_UC <- eBayes(contrasts.fit(lmFit(palmieri_final[,disease == "UC"], design = design_palmieri_UC), contrast_matrix_UC)) ## ----extractResultsCD, eval = TRUE, dependson="createContrastMatrixAndFitModel", message=FALSE, fig.cap="Histogram of the p–values for Crohn’s disease."---- table_CD <- topTable(palmieri_fit_CD, number = Inf) head(table_CD) hist(table_CD$P.Value, col = brewer.pal(3, name = "Set2")[1], main = "inflamed vs non-inflamed - Crohn's disease", xlab = "p-values") ## ----extractResultsUC, eval = TRUE, dependson="createContrastMatrixAndFitModel", message=FALSE, fig.cap="Histogram of the p–values for ulcerative colitis."---- table_UC <- topTable(palmieri_fit_UC, number = Inf) head(table_UC) hist(table_UC$P.Value, col = brewer.pal(3, name = "Set2")[2], main = "inflamed vs non-inflamed - Ulcerative colitis", xlab = "p-values") ## ----pCutForCD---------------------------------------------------------------- nrow(subset(table_UC, P.Value < 0.001)) ## ----FDRforUC----------------------------------------------------------------- tail(subset(table_UC, P.Value < 0.001)) ## ----compareDEgenes, dependson=c("extractResultsUC", "extractResultsCD")------ fpath <- system.file("extdata", "palmieri_DE_res.xlsx", package = "maEndToEnd") palmieri_DE_res <- sapply(1:4, function(i) read.xlsx(cols = 1, fpath, sheet = i, startRow = 4)) names(palmieri_DE_res) <- c("CD_UP", "CD_DOWN", "UC_UP", "UC_DOWN") palmieri_DE_res <- lapply(palmieri_DE_res, as.character) paper_DE_genes_CD <- Reduce("c", palmieri_DE_res[1:2]) paper_DE_genes_UC <- Reduce("c", palmieri_DE_res[3:4]) overlap_CD <- length(intersect(subset(table_CD, P.Value < 0.001)$SYMBOL, paper_DE_genes_CD)) / length(paper_DE_genes_CD) overlap_UC <- length(intersect(subset(table_UC, P.Value < 0.001)$SYMBOL, paper_DE_genes_UC)) / length(paper_DE_genes_UC) overlap_CD overlap_UC total_genenumber_CD <- length(subset(table_CD, P.Value < 0.001)$SYMBOL) total_genenumber_UC <- length(subset(table_UC, P.Value < 0.001)$SYMBOL) total_genenumber_CD total_genenumber_UC ## ----VolcanoPlot, fig.cap="Volcano plot of the DE-genes", fig.height=8, fig.width=7---- volcano_names <- ifelse(abs(palmieri_fit_CD$coefficients)>=1, palmieri_fit_CD$genes$SYMBOL, NA) volcanoplot(palmieri_fit_CD, coef = 1L, style = "p-value", highlight = 100, names = volcano_names, xlab = "Log2 Fold Change", ylab = NULL, pch=16, cex=0.35) ## ----FDRcontrolledDEgenes, dependson=c("extractResultsUC", "extractResultsCD"), eval=TRUE---- DE_genes_CD <- subset(table_CD, adj.P.Val < 0.1)$PROBEID ## ----GOAnalysisCreateBackgrounds, eval=TRUE, warning=FALSE, message=FALSE----- back_genes_idx <- genefilter::genefinder(palmieri_final, as.character(DE_genes_CD), method = "manhattan", scale = "none") ## ----GOAnalysisCreateBackgrounds2, eval=TRUE, warning=FALSE, message=FALSE---- back_genes_idx <- sapply(back_genes_idx, function(x)x$indices) ## ----GOAnalysisCreateBackgrounds3, eval=TRUE, warning=FALSE, message=FALSE---- back_genes <- featureNames(palmieri_final)[back_genes_idx] back_genes <- setdiff(back_genes, DE_genes_CD) intersect(back_genes, DE_genes_CD) length(back_genes) ## ----multidensityPlot, fig.cap="Selecting a background set of genes for the gene ontology analysis."---- multidensity(list( all = table_CD[,"AveExpr"] , fore = table_CD[DE_genes_CD , "AveExpr"], back = table_CD[rownames(table_CD) %in% back_genes, "AveExpr"]), col = c("#e46981", "#ae7ee2", "#a7ad4a"), xlab = "mean expression", main = "DE genes for CD-background-matching") ## ----createFactorOfInterestingGenes, dependson="GOAnalysisCreateBackgrounds", eval=TRUE---- gene_IDs <- rownames(table_CD) in_universe <- gene_IDs %in% c(DE_genes_CD, back_genes) in_selection <- gene_IDs %in% DE_genes_CD all_genes <- in_selection[in_universe] all_genes <- factor(as.integer(in_selection[in_universe])) names(all_genes) <- gene_IDs[in_universe] ## ----createTopGODataSet, dependson="createFactorOfInterestingGenes", eval=TRUE, message = FALSE, warning=FALSE---- top_GO_data <- new("topGOdata", ontology = "BP", allGenes = all_genes, nodeSize = 10, annot = annFUN.db, affyLib = "hugene10sttranscriptcluster.db") ## ----runtopGOTests, results='hide', eval=TRUE, dependson = "createTopGODataSet", message = FALSE---- result_top_GO_elim <- runTest(top_GO_data, algorithm = "elim", statistic = "Fisher") result_top_GO_classic <- runTest(top_GO_data, algorithm = "classic", statistic = "Fisher") ## ----processtopGOResults, eval=TRUE, dependson="runtopGOTests"---------------- res_top_GO <- GenTable(top_GO_data, Fisher.elim = result_top_GO_elim, Fisher.classic = result_top_GO_classic, orderBy = "Fisher.elim" , topNodes = 100) genes_top_GO <- printGenes(top_GO_data, whichTerms = res_top_GO$GO.ID, chip = "hugene10sttranscriptcluster.db", geneCutOff = 1000) res_top_GO$sig_genes <- sapply(genes_top_GO, function(x){ str_c(paste0(x[x$'raw p-value' == 2, "Symbol.id"],";"), collapse = "") }) head(res_top_GO[,1:8], 20) ## ----graphOfResults, fig.height = 6, eval=TRUE, results='hide', dpi=600, fig.cap="Significantly enriched GO nodes in the GO hierarchy"---- # showSigOfNodes(top_GO_data, score(result_top_GO_elim), firstSigNodes = 3, # useInfo = 'def') ## ----mapIDsToEntrez, dependson="createFactorOfInterestingGenes", message = FALSE---- entrez_ids <- mapIds(hugene10sttranscriptcluster.db, keys = rownames(table_CD), keytype = "PROBEID", column = "ENTREZID") ## ----runReactomeEnrichment, fig.cap="Enriched Reactome pathways and their p–values as a bar chart.", eval = TRUE, warning=FALSE---- reactome_enrich <- enrichPathway(gene = entrez_ids[DE_genes_CD], universe = entrez_ids[c(DE_genes_CD, back_genes)], organism = "human", pvalueCutoff = 0.05, qvalueCutoff = 0.9, readable = TRUE) reactome_enrich@result$Description <- paste0(str_sub( reactome_enrich@result$Description, 1, 20), "...") head(as.data.frame(reactome_enrich))[1:6] ## ----reactomeBar, dependson="runReactomeEnrichment", fig.cap="Enriched Reactome pathways and their p–values as a bar chart."---- barplot(reactome_enrich) ## ----emapplot, dependson="runReactomeEnrichment", fig.width=6, fig.height = 7, fig.cap="Enriched Reactome pathways enrichment results as a graph."---- reactome_enrich <- pairwise_termsim(reactome_enrich) emapplot(reactome_enrich, showCategory = 10) ## ----------------------------------------------------------------------------- gc() length(getLoadedDLLs()) sessionInfo()