## ----style-knitr, eval=TRUE, echo=FALSE, results="asis"------------------ BiocStyle::latex() knitr::opts_chunk$set(cache=FALSE) ## ----setup, echo=FALSE, warning=FALSE, message=FALSE, cache=FALSE-------- library(GenomicRanges); library(ShortRead) library(TxDb.Dmelanogaster.UCSC.dm3.ensGene) library(parallel) ## ----rbioc-pdata, eval=FALSE--------------------------------------------- ## pdataFile <- "/home/martin/RBiocForEveryone/pData.csv" ## ----rbioc-pdata-build, echo=FALSE--------------------------------------- library(RBiocForEveryone) pdataFile <- system.file(package="RBiocForEveryone", "extdata", "pData.csv") ## ----rbioc-pdata-csv----------------------------------------------------- pdata <- read.table(pdataFile) dim(pdata) names(pdata) ## ----rbioc-pdata-subset-------------------------------------------------- head(pdata$sex) # same as pdata[,"sex"], pdata[["sex"]] sapply(pdata, class) ## ----rbioc-pdata-sextab-------------------------------------------------- table(pdata$sex, useNA="ifany") ## ----rbioc-pdata-molbiol------------------------------------------------- with(pdata, table(mol.biol, useNA="ifany")) ## ----rbbioc-pdata-bcrabl------------------------------------------------- ridx <- pdata$mol.biol %in% c("BCR/ABL", "NEG") ## ----rbioc-pdata-molbiol-selected---------------------------------------- table(ridx) sum(ridx) ## ----rbioc-pdata-subset-ridx--------------------------------------------- pdata1 <- pdata[ridx,] ## ----rbioc-pdata-subset-levels------------------------------------------- levels(pdata$mol.biol) ## ----rbioc-pdata-subset-recode------------------------------------------- pdata1$mol.biol <- factor(pdata1$mol.biol) table(pdata1$mol.biol) ## ----rbioc-pdata-age-molbiol--------------------------------------------- with(pdata1, t.test(age ~ mol.biol)) ## ----rbioc-pdata-boxplot, eval=FALSE------------------------------------- ## ## not evaluated ## boxplot(age ~ mol.biol, pdata1) ## ----setup-pasilla------------------------------------------------------- library(pasillaBamSubset) library(GenomicRanges) # readGAlignments library(ShortRead) # alphabetByCycle ## ----param--------------------------------------------------------------- flag <- scanBamFlag(isMinusStrand=FALSE) param <- ScanBamParam(what=c("seq", "qual"), flag=flag) ## ----query--------------------------------------------------------------- fl <- untreated1_chr4() res <- readGAlignments(fl, param=param) ## ----abcplot, include=FALSE---------------------------------------------- abc <- alphabetByCycle(mcols(res)$seq) matplot(t(abc[1:4,]), type="l", lty=1, lwd=2, xlab="Cycle", ylab="Count") legend("topright", legend=rownames(abc)[1:4], lty=1, lwd=2, col=1:4) ## ----qualplot, include=FALSE--------------------------------------------- qual <- as(mcols(res)$qual, "matrix") boxplot(qual ~ col(qual), outline=FALSE, xlab="Cycle", ylab="Quality") ## ----annotations--------------------------------------------------------- library(TxDb.Dmelanogaster.UCSC.dm3.ensGene) # genome coordinates exByGn <- exonsBy(TxDb.Dmelanogaster.UCSC.dm3.ensGene, "gene") chr4 <- exByGn[ all(seqnames(exByGn) == "chr4") ] ## ----files--------------------------------------------------------------- fls <- c(untreated1_chr4(), untreated3_chr4()) names(fls) <- sub("_chr4.bam", "", basename(fls)) bfl <- BamFileList(fls) ## ----counts-------------------------------------------------------------- counts <- summarizeOverlaps(chr4, bfl, ignore.strand=TRUE) head(assay(counts)) ## ----countsplot, include=FALSE------------------------------------------- plot(asinh(assay(counts)), asp=1, main="asinh(counts), chr4") abline(0, 1, lty=2) ## ----gff, eval=FALSE----------------------------------------------------- ## library(rtracklayer) # import gff ## fl <- paste0("ftp://ftp.ensembl.org/pub/release-62/", "gtf/drosophila_melanogaster/", ## "Drosophila_melanogaster.BDGP5.25.62.gtf.gz") ## gffFile <- file.path(tempdir(), basename(fl)) ## download.file(fl, gffFile) ## gff0 <- import(gffFile) ## idx <- gff0$source == "protein_coding" & gff0$type == "exon" & seqnames(gff0) == "4" ## gff1 <- gff0[idx] ## chr4.gff <- split(gff1, mcols(gff1)$gene_id) ## ----counter------------------------------------------------------------- counter <- function(aln, roi) { strand(aln) <- "*" # strand-neutral protocol hits <- findOverlaps(aln, roi) keep <- which(countQueryHits(hits) == 1) cnts <- countSubjectHits(hits[queryHits(hits) %in% keep]) setNames(cnts, names(roi)) } ## ----counter-sapply------------------------------------------------------ countFile <- function(fl, roi) { open(fl); on.exit(close(fl)) aln <- readGAlignments(fl) counter(aln, roi) } count0 <- sapply(bfl, countFile, chr4) head(count0) ## ----counter-chunks------------------------------------------------------ countInChunks <- function(fl, roi) { yieldSize(fl) <- 1000000 # chunks of size 1 million open(fl); on.exit(close(fl)) count <- integer(length(range)) # initial count vector while (length(aln <- readGAlignments(fl))) count <- count + counter(aln, roi) count } count1 <- sapply(bfl, countInChunks, chr4) identical(count0, count1) ## ----counter-parallel---------------------------------------------------- library(parallel) options(mc.cores=detectCores()) # use all cores for parallel evaluation mcsapply <- function(...) simplify2array(mclapply(...)) count2 <- mcsapply(bfl, countInChunks, chr4) identical(count0, count2)