############################################################################## ## Exercise 1: ############## library(Biostrings) dna <- DNAString(paste(sample(DNA_BASES, 2000, replace=TRUE), collapse="")) dna v <- Views(dna, start=c(550, 1222, -5, 45), width=c(100, 20)) v gaps(v) # the views are the regions not covered by the views in 'v' alphabetFrequency(dna) alphabetFrequency(v) alphabetFrequency(gaps(v)) ## Sanity check: alphabetFrequency(v, collapse=TRUE) # same as 'colSums(alphabetFrequency(v))' alphabetFrequency(gaps(v), collapse=TRUE) ## We should not expect the sum of the 2 above vectors to be equal to ## 'alphabetFrequency(dna)' because some views in 'v' are overlapping. ## However, summing the frequencies in 'gaps(v)' with those in 'gaps(gaps(v))' ## should work: af1 <- alphabetFrequency(gaps(v), collapse=TRUE) af2 <- alphabetFrequency(gaps(gaps(v)), collapse=TRUE) all(af1 + af2 == alphabetFrequency(dna)) # TRUE ############################################################################## ## Exercise 2: ############## library(hgu95av2probe) probes <- DNAStringSet(hgu95av2probe) probes[-(1:10)] ## Index of probes with more than 16 A's: idx <- which(alphabetFrequency(probes, baseOnly=TRUE)[ , "A"] >= 16) ## Probes with more than 16 A's: probes[idx] reverseComplement(probes) subseq(probes, start=3, end=-3) mm_probes <- probes subseq(mm_probes, start=13, width=1) <- reverseComplement(subseq(probes, start=13, width=1)) ## Probes with more than 9 consecutive A's: idx <- which(vcountPattern("AAAAAAAAA", probes) != 0) probes[idx] ############################################################################## ## Exercise 3: ############## library(BSgenome.Hsapiens.UCSC.hg19) pdict <- PDict(probes) chr22 <- Hsapiens[["chr22"]] chr22_nhit <- countPDict(pdict, chr22) length(chr22_nhit) # 1 integer per probe head(chr22_nhit) idx <- which(chr22_nhit >= 2000) probes[idx] ############################################################################## ## Exercise 4: ############## library(BSgenome.Hsapiens.UCSC.hg19) library(TxDb.Hsapiens.UCSC.hg19.knownGene) ## It takes between 30 sec and 1 min to extract this transcriptome. It also ## requires 1 GB of memory: txseqs <- extractTranscriptsFromGenome(Hsapiens, Hsapiens_UCSC_hg19_knownGene_TxDb) ## It takes between 3 and 5 min to count all the hits: nhit_per_tx <- vcountPDict(pdict, txseqs, collapse=2) names(nhit_per_tx) <- names(txseqs) head(nhit_per_tx) table(nhit_per_tx) # more than half of the transcripts have no hits!