## ----setup, include = FALSE--------------------------------------------------- knitr::opts_chunk$set( collapse = TRUE, comment = "#>", fig.width = 7, fig.height = 5, out.width = "80%", fig.align = "center", crop = NULL # suppress "The magick package is required to crop" issue ) library(BiocStyle) ## ----libs, message=FALSE------------------------------------------------------ library(monaLisa) library(JASPAR2020) library(TFBSTools) library(BSgenome.Mmusculus.UCSC.mm10) library(Biostrings) library(SummarizedExperiment) library(ComplexHeatmap) library(circlize) ## ----loadData----------------------------------------------------------------- # load GRanges object with logFC and peaks gr_path <- system.file("extdata", "atac_liver_vs_lung.rds", package = "monaLisa") gr <- readRDS(gr_path) ## ----predictorMatrix, warning=FALSE------------------------------------------- # get PFMs (vertebrate TFs from Jaspar) pfms <- getMatrixSet(JASPAR2020, list(matrixtype = "PFM", tax_group = "vertebrates")) # randomly sample 300 PFMs for illustration purposes (for quick runtime) set.seed(4563) pfms <- pfms[sample(length(pfms), size = 300)] # convert PFMs to PWMs pwms <- toPWM(pfms) # get TFBS on given GRanges (peaks) # suppress warnings generated by matchPWM due to the presence of Ns # in the sequences peakSeq <- getSeq(BSgenome.Mmusculus.UCSC.mm10, gr) suppressWarnings({ hits <- findMotifHits(query = pwms, subject = peakSeq, min.score = 10.0, BPPARAM = BiocParallel::SerialParam()) }) # get TFBS matrix TFBSmatrix <- unclass(table(factor(seqnames(hits), levels = seqlevels(hits)), factor(hits$pwmname, levels = name(pwms)))) TFBSmatrix[1:6, 1:6] # remove TF motifs with 0 binding sites in all regions zero_TF <- colSums(TFBSmatrix) == 0 sum(zero_TF) TFBSmatrix <- TFBSmatrix[, !zero_TF] # calculate G+C and CpG obs/expected fMono <- oligonucleotideFrequency(peakSeq, width = 1L, as.prob = TRUE) fDi <- oligonucleotideFrequency(peakSeq, width = 2L, as.prob = TRUE) fracGC <- fMono[, "G"] + fMono[, "C"] oeCpG <- (fDi[, "CG"] + 0.01) / (fMono[, "G"] * fMono[, "C"] + 0.01) # add GC and oeCpG to predictor matrix TFBSmatrix <- cbind(fracGC, oeCpG, TFBSmatrix) TFBSmatrix[1:6, 1:6] ## ----stabSelTFs--------------------------------------------------------------- ## randLassoStabSel() is stochastic, so if we run with parallelization ## (`mc.cores` argument), we must select a random number generator that can ## provide multiple streams of random numbers used in the `parallel` package ## and set its seed for reproducible results # RNGkind("L'Ecuyer-CMRG") # set.seed(123) # se <- randLassoStabSel(x = TFBSmatrix, y = gr$logFC_liver_vs_lung, # cutoff = 0.8, mc.preschedule = TRUE, # mc.set.seed = TRUE, mc.cores = 2L) # if not running in parallel mode, it is enough to use set.seed() before # the call to ensure reproducibility (`mc.sores = 1`) set.seed(123) se <- randLassoStabSel(x = TFBSmatrix, y = gr$logFC_liver_vs_lung, cutoff = 0.8) se # selected TFs colnames(se)[se$selected] ## ----plotStabilityPaths------------------------------------------------------- plotStabilityPaths(se) ## ----plotSelProbs, fig.width=8, fig.height=5---------------------------------- plotSelectionProb(se, directional = TRUE) ## ----TFBScor_selected, fig.width=10, fig.height=7----------------------------- # subset the selected TFs sel <- colnames(se)[se$selected] se_sub <- se[, sel] # exclude oeCpG and fracGC excl <- colnames(se_sub) %in% c("oeCpG", "fracGC") se_sub <- se_sub[, !excl] # correlation matrix TFBSmatrixCorSel <- cor(TFBSmatrix[, colnames(se_sub)], method = "pearson") # heatmap pfmsSel <- pfms[match(colnames(TFBSmatrixCorSel), name(pfms))] maxwidth <- max(sapply(TFBSTools::Matrix(pfmsSel), ncol)) seqlogoGrobs <- lapply(pfmsSel, seqLogoGrob, xmax = maxwidth) hmSeqlogo <- rowAnnotation(logo = annoSeqlogo(seqlogoGrobs, which = "row"), annotation_width = unit(2, "inch"), show_annotation_name = FALSE ) colAnn <- HeatmapAnnotation(AUC = se_sub$selAUC, selProb = se_sub$selProb, show_legend = TRUE, show_annotation_name = TRUE, col = list( AUC = colorRamp2(c(0, 1), c("white", "brown")), selProb = colorRamp2(c(0, 1), c("white", "steelblue"))) ) Heatmap(TFBSmatrixCorSel, show_row_names = TRUE, show_column_names = TRUE, name = "Pear. Cor.", column_title = "Selected TFs", col = colorRamp2(c(-1, 0, 1), c("blue", "white", "red")), right_annotation = hmSeqlogo, top_annotation = colAnn) ## ----topPeaks----------------------------------------------------------------- TF <- sel[2] TF i <- which(assay(se, "x")[, TF] > 0) # peaks that contain TF hits... nm <- names(sort(abs(gr$logFC_liver_vs_lung[i]), decreasing = TRUE)) # ... order by |logFC| gr[nm] ## ----session------------------------------------------------------------------ sessionInfo()