## ----style, eval=TRUE, echo=FALSE, results="asis"------------------------ BiocStyle::latex() library(knitr) opts_chunk$set(cache=TRUE, tidy=FALSE) ## ----packages, eval=TRUE, echo=FALSE, warning=FALSE, message=FALSE------- suppressPackageStartupMessages({ library(ShortRead) library(VariantAnnotation) library(BiocParallel) library(ggplot2) library(RNAseqData.HNRNPC.bam.chr14) library(TxDb.Hsapiens.UCSC.hg19.knownGene) library(BSgenome.Hsapiens.UCSC.hg19) library(AnnotationHub) library(rtracklayer) library(BiocIntro) }) ## ----fastq-format, echo=FALSE-------------------------------------------- bigdata <- "~/bigdata" fl <- "~/bigdata/fastq/ERR127302_2.fastq.gz" cat(noquote(tail(readLines(fl, 800), 4)), sep="\n") fq <- FastqStreamer(fl, 100000) enc <- yield(fq) close(fq) ## ----ascii, echo=FALSE--------------------------------------------------- encoding(quality(enc)) ## ----ShortRead----------------------------------------------------------- library(ShortRead) library(BiocParallel) ## ----getreads------------------------------------------------------------ dirPath <- "~/bigdata/fastq" sampler <- FastqSampler(file.path(dirPath, "ERR127302_1.fastq.gz"), 1000000) reads <- yield(sampler) ## ----id-qual------------------------------------------------------------- # outputs read ids as a list as BStringSet head( id(reads) ) # outputs read sequences as a list as DNAStringSet head(sread(reads) ) # outputs list of quality scores as BStringSet head(quality(reads)) ## ----fastq-3------------------------------------------------------------- abc <- alphabetByCycle(sread(reads)) abc[1:4, 1:8] matplot(t(abc[c("A","G","T","C"),]), type="l") ## ----fastq-4------------------------------------------------------------- alf0 <- alphabetFrequency(sread(reads), as.prob=TRUE) hist(alf0[,c("G", "C")] , main = "Histogram of gc Content", xlab="individual reads" ) ## ----qa------------------------------------------------------------------ qa <- qa(dirPath, "ERR*", type="fastq") ## ----browse-qa, eval=FALSE----------------------------------------------- ## browseURL(report(qa)) ## ----qa_all-------------------------------------------------------------- (load(file.path(dirPath, "E-MTAB-1147-qa_report.Rda"))) ## ----browse-qa_all, eval=FALSE------------------------------------------- ## browseURL(report(qa)) ## ----BiocIntro----------------------------------------------------------- library(BiocIntro) ## ----sampler, fig.height=3----------------------------------------------- fl <- file.path(dirPath, "ERR127302_1.fastq.gz") fq <- FastqSampler(fl, 100000) srq <- yield(fq) srq plotByCycle(quality(srq)) ## ----trim, fig.height=3-------------------------------------------------- trimmed <- trimTails(srq, 3, "5") plotByCycle(quality(trimmed)) ## ----trim-file-names----------------------------------------------------- (fls <- dir(dirPath, pattern="fastq.gz", full=TRUE)) ## ----trim-dest-names----------------------------------------------------- (destinations <- sub("fastq.gz", "trimmed.fastq", fls)) ## ----trim-files, eval=FALSE---------------------------------------------- ## trimTails(fls, 2, "5", destinations=destinations) ## ----SAM----------------------------------------------------------------- fl <- system.file("extdata", "ex1.sam", package="Rsamtools") strsplit(readLines(fl, 1), "\t")[[1]] ## ----readGAlignments----------------------------------------------------- library(GenomicAlignments) library(RNAseqData.HNRNPC.bam.chr14) fls <- RNAseqData.HNRNPC.bam.chr14_BAMFILES basename(fls) aln <- readGAlignments(fls[1]) length(aln) head(aln, 3) ## ----GappedAlignments-accessors------------------------------------------ table(strand(aln)) range(width(aln)) head(sort(table(cigar(aln)), decreasing=TRUE)) ## ----GappedAlignments-reads---------------------------------------------- param <- ScanBamParam(what="seq") aln <- readGAlignments(fls[1], param=param) ## ----GappedAlignments-iter----------------------------------------------- bf <- open(BamFile(fls[1], yieldSize=200000)) repeat { aln <- readGAlignments(bf) if (length(aln) == 0) break # no more records ## do work message(length(aln)) } close(bf) ## ----summarizeOverlaps--------------------------------------------------- ## library(BiocParallel) library(TxDb.Hsapiens.UCSC.hg19.knownGene) ex <- exonsBy(TxDb.Hsapiens.UCSC.hg19.knownGene, "gene") counts <- summarizeOverlaps(ex, fls) colSums(assay(counts)) sum(rowSums(assay(counts)) != 0) ## ----seqlevels_rename---------------------------------------------------- library(VariantAnnotation) library(TxDb.Hsapiens.UCSC.hg19.knownGene) txdb <- TxDb.Hsapiens.UCSC.hg19.knownGene fl <- system.file("extdata", "chr22.vcf.gz", package="VariantAnnotation") vcf <- readVcf(fl, "hg19") vcf <- renameSeqlevels(vcf, c("22"="chr22")) ## ----locateVariants_find------------------------------------------------- rd <- rowData(vcf) loc <- locateVariants(rd, txdb, CodingVariants()) head(loc, 3) ## ----locateVariants_example---------------------------------------------- ## Did any coding variants match more than one gene? splt <- split(loc$GENEID, loc$QUERYID) table(sapply(splt, function(x) length(unique(x)) > 1)) ## Summarize the number of coding variants by gene ID splt <- split(loc$QUERYID, loc$GENEID) head(sapply(splt, function(x) length(unique(x))), 3) ## ----predictCoding------------------------------------------------------- library(BSgenome.Hsapiens.UCSC.hg19) coding <- predictCoding(vcf, txdb, seqSource=Hsapiens) coding[5:9] ## ----predictCoding_frameshift-------------------------------------------- coding[coding$CONSEQUENCE == "frameshift"]