## ----style, echo=FALSE, results='asis'----------------------------------- BiocStyle::markdown() ## ----setup, echo=FALSE, results='hide'----------------------------------- library(knitr) opts_chunk$set(cache=TRUE, error=FALSE) ## ----vectorize----------------------------------------------------------- x <- 1:10 log(x) ## NOT for (i in seq_along) x[i] <- log(x[i]) ## ----pre-allocate-------------------------------------------------------- result <- numeric(10) result[1] <- runif(1) for (i in 2:length(result)) result[i] <- runif(1) * result[i - 1] result ## ----inefficient--------------------------------------------------------- f0 <- function(n, a=2) { ## stopifnot(is.integer(n) && (length(n) == 1) && ## !is.na(n) && (n > 0)) result <- numeric() for (i in seq_len(n)) result[[i]] <- a * log(i) result } ## ----system-time--------------------------------------------------------- system.time(f0(10000)) n <- 1000 * seq(1, 20, 2) t <- sapply(n, function(i) system.time(f0(i))[[3]]) plot(t ~ n, type="b") ## ----correct-init-------------------------------------------------------- n <- 10000 system.time(expected <- f0(n)) head(expected) ## ----hoist--------------------------------------------------------------- f1 <- function(n, a=2) { result <- numeric() for (i in seq_len(n)) result[[i]] <- log(i) a * result } identical(expected, f1(n)) library(microbenchmark) microbenchmark(f0(n), f1(n), times=5) ## ----preallocate-and-fill------------------------------------------------ f2 <- function(n, a=2) { result <- numeric(n) for (i in seq_len(n)) result[[i]] <- log(i) a * result } identical(expected, f2(n)) microbenchmark(f0(n), f2(n), times=5) ## ----use-apply----------------------------------------------------------- f3 <- function(n, a=2) a * sapply(seq_len(n), log) identical(expected, f3(n)) microbenchmark(f0(n), f2(n), f3(n), times=10) ## ----use-vectorize------------------------------------------------------- f4 <- function(n, a=2) a * log(seq_len(n)) identical(expected, f4(n)) microbenchmark(f0(n), f3(n), f4(n), times=10) ## ----vectorized-scale---------------------------------------------------- n <- 10 ^ (5:8) # 100x larger than f0 t <- sapply(n, function(i) system.time(f4(i))[[3]]) plot(t ~ n, log="xy", type="b") ## ----reduceByYield-setup------------------------------------------------- suppressPackageStartupMessages({ library(GenomicFiles) library(GenomicAlignments) library(Rsamtools) library(TxDb.Hsapiens.UCSC.hg19.knownGene) }) yield <- # how to input the next chunk of data function(X, ...) { readGAlignments(X) } map <- # what to do to each chunk function(VALUE, ..., roi) { olaps <- findOverlaps(VALUE, roi, type="within", ignore.strand=TRUE) count <- tabulate(subjectHits(olaps), subjectLength(olaps)) setNames(count, names(roi)) } reduce <- `+` # how to combine mapped chunks ## ----yieldFactory-------------------------------------------------------- yieldFactory <- # return a function with local state function() { n_records <- 0L function(X, ...) { aln <- readGAlignments(X) n_records <<- n_records + length(aln) message(n_records) aln } } ## ----count-overlaps-roi, eval=FALSE-------------------------------------- # exByTx <- exonsBy(TxDb.Hsapiens.UCSC.hg19.knownGene, "tx") # # fl <- "/home/ubuntu/data/vobencha/LargeData/srarchive/hg19_alias.tab" # map0 <- read.delim(fl, header=FALSE, stringsAsFactors=FALSE) # seqlevels(exByTx, force=TRUE) <- setNames(map0$V1, map0$V2) ## ----count-overlaps, eval=FALSE------------------------------------------ # count1 <- function(filename, roi) { # message(filename) # ## Create and open BAM file # bf <- BamFile(filename, yieldSize=1000000) # reduceByYield(bf, yieldFactory(), map, reduce, roi=roi) # } ## ----count-overlaps-doit, eval=FALSE------------------------------------- # bam <- "/home/ubuntu/data/vobencha/LargeData/srarchive/SRR1039508_sorted.bam" # count <- count1(bam, exByTx) ## ----rtracklayer-file-classes-------------------------------------------- ## rtracklayer menagerie suppressPackageStartupMessages(library(rtracklayer)) names(getClass("RTLFile")@subclasses) ## ----parallel-sleep------------------------------------------------------ library(BiocParallel) fun <- function(i) { Sys.sleep(1) i } ## serial f0 <- function(n) lapply(seq_len(n), fun) ## parallel f1 <- function(n) bplapply(seq_len(n), fun) ## ----parallel-bpredo-param----------------------------------------------- param <- MulticoreParam(workers = 3) ## ----parallel-bpredo-bplapply, error=TRUE-------------------------------- X <- list(1, "2", 3) res <- bplapply(X, sqrt, BPPARAM = param) ## ----parallel-bptry------------------------------------------------------ res <- bptry(bplapply(X, sqrt, BPPARAM=param)) res ## ----parallel-bpredo----------------------------------------------------- X.redo <- list(1, 2, 3) bplapply(X.redo, sqrt, BPREDO = res) ## ----parallel-debug, eval=FALSE------------------------------------------ # > fun = function(i) { browser(); sqrt(i) } # > bplapply(X, fun, BPREDO=res, BPPARAM=SerialParam()) # resuming previous calculation ... # Called from: FUN(...) # Browse[1]> # debug at #1: sqrt(i) # Browse[2]> i # [1] "2" # Browse[2]> i = 2 # Browse[2]> c # [[1]] # [1] 1 # # [[2]] # [1] 1.414214 # # [[3]] # [1] 1.732051 ## ----logging, eval=FALSE------------------------------------------------- # FUN <- function(i) { # flog.debug(paste0("value of 'i': ", i)) # # if (!length(i)) { # flog.warn("'i' is missing") # NA # } else if (!is(i, "numeric")) { # flog.info("coercing to numeric") # as.numeric(i) # } else { # i # } # } ## ----logging-WARN, eval=FALSE-------------------------------------------- # param <- SnowParam(3, log = TRUE, threshold = "WARN") # bplapply(list(1, "2", integer()), FUN, BPPARAM = param) ## ----timeout_constructor------------------------------------------------- param <- SnowParam(timeout = 20) param ## ----timeout_setter------------------------------------------------------ bptimeout(param) <- 2 param ## ----timeout_bplapply---------------------------------------------------- fun <- function(i) { Sys.sleep(i) i } ## ----co-setup------------------------------------------------------------ suppressPackageStartupMessages({ library(BiocParallel) library(GenomicFiles) library(GenomicAlignments) library(Rsamtools) }) ## ----co-param------------------------------------------------------------ param <- MulticoreParam(4, log = TRUE, timeout = 60) ## ----co-param-snow, eval=FALSE------------------------------------------- # param <- SnowParam(4, log = TRUE, timeout = 60) ## ----co-bams------------------------------------------------------------- fls <- dir("/home/ubuntu/data/vobencha/LargeData/copynumber", ".bam$", full=TRUE) names(fls) <- basename(fls) bfl <- BamFileList(fls) ## ----co-GRanges---------------------------------------------------------- ranges <- GRanges("chr6", IRanges(c(28477797, 29527797, 32448354), c(29477797, 30527797, 33448354))) ## ----co-map, eval=FALSE-------------------------------------------------- # MAP <- function(range, file, ...) { # library(GenomicAlignments) ## readGAlignments(), ScanBamParam() # param = ScanBamParam(which=range) ## restriction # gal = readGAlignments(file, param=param) # ## log messages # flog.info(paste0("file: ", basename(file))) # flog.debug(paste0("records: ", length(gal))) # ## overlaps # olaps <- findOverlaps(gal, range, type="within", ignore.strand=TRUE) # tabulate(subjectHits(olaps), subjectLength(olaps)) # } ## ----co-doit, eval=FALSE------------------------------------------------- # cts <- reduceByFile(ranges, fls, MAP, BPPARAM = param) ## ----co-length, eval=FALSE----------------------------------------------- # length(cts) ## ----co-elementlengths, eval=FALSE--------------------------------------- # elementLengths(cts) ## ----co-tables, eval=FALSE----------------------------------------------- # cts[[1]]