## ----set-options, echo=FALSE, cache=FALSE--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- options(width = 400) knitr::opts_chunk$set(warnings = FALSE, message = FALSE) ## ----eval = FALSE--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- # # if (!requireNamespace("BiocManager", quietly=TRUE)) # # install.packages("BiocManager") # BiocManager::install("preciseTAD") # # For the latest version install from GitHub # # BiocManager::install("dozmorovlab/preciseTAD") ## ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- library(knitr) library(e1071) library(preciseTAD) ## ----eval=FALSE----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- # arrowhead -c 1 \ #chromosome to call TADs on # -r 5000 \ #HiC data resolution # ~/GSE63525_GM12878_insitu_primary.hic \ #location of the .HIC file # ~/arrowhead_output #location to store the output ## ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- data("arrowhead_gm12878_5kb") head(arrowhead_gm12878_5kb) ## ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- bounds <- extractBoundaries(domains.mat = arrowhead_gm12878_5kb, filter = FALSE, CHR = c("CHR1", "CHR22"), resolution = 5000) # View unique boundaries bounds ## ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- # path <- "./path/to/BEDfiles" # tfbsList <- bedToGRangesList(filepath=path, bedList=NULL, bedNames=NULL, pattern = "*.bed", signal=4) data("tfbsList") names(tfbsList) tfbsList ## ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- set.seed(123) tadData <- createTADdata(bounds.GR = bounds, resolution = 5000, genomicElements.GR = tfbsList, featureType = "distance", resampling = "rus", trainCHR = "CHR1", predictCHR = "CHR22") # View subset of training data tadData[[1]][1:5,1:4] # Check it is balanced table(tadData[[1]]$y) # View subset of testing data tadData[[2]][1:5,1:4] ## ----results='hide'------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- set.seed(123) rfe_res <- TADrfe(trainData = tadData[[1]], tuneParams = list(ntree = 500, nodesize = 1), cvFolds = 5, cvMetric = "Accuracy", verbose = TRUE) ## ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- # View RFE performances rfe_res[[1]] # View the variable importance among top n features across each CV fold head(rfe_res[[2]]) ## ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- # Restrict the data matrix to include only SMC3, RAD21, CTCF, and ZNF143 tfbsList_filt <- tfbsList[names(tfbsList) %in% c("Gm12878-Ctcf-Broad", "Gm12878-Rad21-Haib", "Gm12878-Smc3-Sydh", "Gm12878-Znf143-Sydh")] set.seed(123) tadData <- createTADdata(bounds.GR = bounds, resolution = 5000, genomicElements.GR = tfbsList_filt, featureType = "distance", resampling = "rus", trainCHR = "CHR1", predictCHR = "CHR22") # Run RF set.seed(123) tadModel <- TADrandomForest(trainData = tadData[[1]], testData = tadData[[2]], tuneParams = list(mtry = 2, ntree = 500, nodesize = 1), cvFolds = 3, cvMetric = "Accuracy", verbose = FALSE, model = TRUE, importances = TRUE, impMeasure = "MDA", performances = TRUE) ## ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- # View model performances performances <- tadModel[[3]] performances$Performance <- round(performances$Performance, digits = 2) rownames(performances) <- performances$Metric kable(t(performances), caption = "List of model performances when validating an RF built on CHR1 on CHR22 test data.") ## ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- svmModel <- svm(y ~ ., data = tadData[[1]], kernel = "radial", cost = 1, gamma = 0.5) svmPreds <- predict(svmModel, tadData[[2]][, -1], positive = "Yes") # View confusion matrix table(svmPreds, tadData[[2]][, 1]) ## ----message=FALSE-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- # Run preciseTAD set.seed(123) pt <- preciseTAD(genomicElements.GR = tfbsList_filt, featureType = "distance", CHR = "CHR22", chromCoords = list(17000000, 18000000), tadModel = tadModel[[1]], threshold = 1, verbose = TRUE, parallel = NULL, DBSCAN_params = list(eps = c(30000), # c(10000, 30000, 50000) MinPts = c(100)), # c(3, 10, 100) slope = 5000, genome = "hg19", BaseProbs = TRUE) # What is getting returned? PTBRs - precide TAD boundary regions, and PTBPs - boundary points names(pt) # View the results # pt ## ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- plot(pt$Summaries$BaseProbs[1:100000]) ## ----eval=FALSE----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- # # Transform # pt_juice <- juicer_func(pt$PTBP) ## ----eval=FALSE----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- # filepath = "~/path/to/store/ptbps" # write.table(pt_juice, # file.path(filepath, "pt_juice.bed"), # quote = FALSE, # col.names = FALSE, # row.names = FALSE, # sep = "\t") ## ----eval=FALSE----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- # # Read in RF model built on chr1-chr21 # tadModel <- readRDS("~/Downloads/CHR22_GM12878_5kb_Peakachu.rds") ## ----eval=FALSE----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- # # Establishing Hela-specific genomic annotations # # Reading in CTCF, RAD21, SMC3, and ZNF143 from repository and storing as list # ctcf <- as.data.frame(read.delim("http://raw.githubusercontent.com/stilianoudakis/preciseTAD_supplementary/master/data/bed/hela/ctcf.bed", sep = "\t", header = FALSE)) # rad21 <- as.data.frame(read.delim("http://raw.githubusercontent.com/stilianoudakis/preciseTAD_supplementary/master/data/bed/hela/rad21.bed", sep = "\t", header = FALSE)) # smc3 <- as.data.frame(read.delim("http://raw.githubusercontent.com/stilianoudakis/preciseTAD_supplementary/master/data/bed/hela/smc3.bed", sep = "\t", header = FALSE)) # znf143 <- as.data.frame(read.delim("http://raw.githubusercontent.com/stilianoudakis/preciseTAD_supplementary/master/data/bed/hela/znf143.bed", sep = "\t", header = FALSE)) # helaList <- list(ctcf, rad21, smc3, znf143) # # hela.GR <- bedToGRangesList(bedList=helaList, bedNames=c("Ctcf", "Rad21", "Smc3", "Znf143"), signal=5) ## ----eval=FALSE----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- # # Run preciseTAD # set.seed(123) # pt <- preciseTAD(genomicElements.GR = hela.GR, # featureType = "distance", # CHR = "CHR22", # chromCoords = list(25500000, 27500000), # tadModel = tadModel[[1]], # threshold = 1.0, # verbose = FALSE, # parallel = NULL, # DBSCAN_params = list(30000, 100), # slope = 5000, # BaseProbs = FALSE, # savetobed = FALSE) #