############################################################################## ### ### Exercise 1 ### ========== ### ### 1. Create some arbitrary BString, DNAString and AAString instances. ### library(Biostrings) b <- BString("I am a BString object") b d <- DNAString("TTGAAAA-CTC-N") d a <- AAString("MARKSLEMSIR*") a ### ### 2. Apply nchar() and alphabet() on them. ### nchar(b) alphabet(b) nchar(d) alphabet(d) nchar(a) alphabet(a) ### ### 3. Apply alphabetFrequency() and reverseComplement() on the DNAString objects. ### alphabetFrequency(d) reverseComplement(d) ### ### 4. Extract substrings by using the subsetting operator [ and by using the ### subBString() function. ### b[16] b[nchar(b):1] subBString(b, 3, 12) ### See ?BString, ?DNAString, ?AAString, ?alphabetFrequency, ?reverseComplement ### and ?subBString for more information. ############################################################################## ### ### Exercise 2 ### ========== ### ### 1. Display several chromosomes of C. elegans. Do they contain IUPAC ### extended letters? ### library(BSgenome.Celegans.UCSC.ce2) Celegans Celegans[["chrI"]] alphabetFrequency(Celegans[["chrI"]]) # no extended letter Celegans[["chrII"]] alphabetFrequency(Celegans[["chrII"]]) # contains extended letter N ### To answer the question programmatically hasExtendedLetters <- function(seq) { alphabetFrequency(seq, baseOnly=TRUE)[["other"]] != 0 } ### To answer the question for all chromosomes programmatically sapply(seqnames(Celegans), function(seqname) hasExtendedLetters(Celegans[[seqname]])) ### See ?IUPAC_CODE_MAP, ?alphabetFrequency, ?sapply and ?seqnames ### for more information. ### ### 2. Load BSgenome.Dmelanogaster.FlyBase.r51 (fruit fly genome). ### Do the chromosomes contain IUPAC extended letters? ### library(BSgenome.Dmelanogaster.FlyBase.r51) sapply(seqnames(Dmelanogaster), function(seqname) hasExtendedLetters(Dmelanogaster[[seqname]])) ### ### 3. Extract the last 10M bases from fruit fly chromosome 2L. ### Use the system.time() function to compare the performance ### of subBString() vs the subsetting operator [. ### chr2L <- Dmelanogaster[["2L"]] system.time(x1 <- subBString(chr2L, , nchar(chr2L), length=10**6)) system.time(x2 <- chr2L[(nchar(chr2L)-10**6+1):nchar(chr2L)]) x1 == x2 # the 2 methods produce the same result but subBString is much faster ### See ?subBString for more information. ############################################################################## ### ### Exercise 3 ### ========== ### ### 1. Compare nchar() vs width() on v. ### v <- views(DNAString("TAATAATG"), 3:0, 5:8) nchar(v) width(v) nchar(v) == width(v) # FALSE for the last view ("out of limits" view) ### ### 2. Find a programatical way to remove "out of limits" views from ### a BStringViews object. ### v[nchar(v) == width(v)] ### See ?`BStringViews-class` for more information and many examples. ############################################################################## ### ### Exercise 4 ### ========== ### ### 1. Use the mask() function to mask the Ns in fruit fly chromosome 2L. ### Apply \Rfunction{mask} again to the result (and without the second ### argument) to see the location of the Ns. Do we have isolated Ns ### or N-blocks? ### library(BSgenome.Dmelanogaster.FlyBase.r51) noN_chr2L <- mask(Dmelanogaster[["2L"]], "N") mask(noN_chr2L) # looks like we have 2 N-blocks of length 100 ### See ?mask for more information. ### ### 2. Find the location of the Ns in C. elegans chromosome III and in ### H. sapiens chromosome 1. ### library(BSgenome.Celegans.UCSC.ce2) Celegans mask(mask(Celegans[["chrIII"]], "N")) # 5 isolated Ns library(BSgenome.Hsapiens.UCSC.hg18) Hsapiens Nlocs <- mask(mask(Hsapiens[["chr1"]], "N")) Nlocs # 39 N-blocks of length 50000? width(Nlocs) # no, some of them are even bigger! ### ### 3. What's the longest A-block in fruit fly chromosome 2L? ### ### Unlike with the Ns, masking the As by using the "A" pattern in the ### call to mask() would be very inefficient to answer this question. ### It's better to use a longer pattern: Ablocks <- mask(mask(Dmelanogaster[["2L"]], "AAAAAAAA")) Ablocks # shows all A-blocks with a length of at least 8 max(width(Ablocks)) # length of the longest A-block Ablocks[max(width(Ablocks)) == width(Ablocks)] # keep this view only ############################################################################## ### ### Exercise 5 ### ========== ### ### 1. Extract the start and end position of the exons belonging to gene ### FBgn0025803 (chromosome 3R). ### library(ann.Dmelanogaster.FlyBase.r51) data(EXON2GENE) ex_ids <- names(EXON2GENE)[EXON2GENE == "FBgn0025803"] ex_ids # shows the exons belonging to this gene ex_starts <- getAnnExons("3R")[ex_ids, "start"] # their starts ex_ends <- getAnnExons("3R")[ex_ids, "end"] # their ends ### ### 2. Load BSgenome.Dmelanogaster.FlyBase.r51 and create a BStringViews object ### where the subject is chromosome 3R and the views are the exon locations ### found previously. ### library(BSgenome.Dmelanogaster.FlyBase.r51) ex_FBgn0025803 <- views(Dmelanogaster[["3R"]], ex_starts, ex_ends) ### See ?views for more information. ### ### 3. Apply the mask() function on this \Rclass{BStringViews} object. Is this ### new BStringViews object representing the introns sequences for gene ### FBgn0025803? What needs to be adjusted? ### in_FBgn0025803 <- mask(ex_FBgn0025803) in_FBgn0025803 ### The first and the last views are not introns so we remove them: in_FBgn0025803 <- in_FBgn0025803[-c(1,length(in_FBgn0025803))] ### ### 4. Are we discovering something? ### in_FBgn0025803 ### Looks like we've just rediscovered the "GT-AG rule"! ### http://biotech.icmb.utexas.edu/search/dict-search.html ############################################################################## ### ### Exercise 6 ### ========== ### ### 1. Find all the matches of an arbitrary nucleotide sequence in fruit fly ### chromosome 2L. ### library(BSgenome.Dmelanogaster.FlyBase.r51) pattern <- DNAString("ACCCCCTTTTTG") # an arbitrary sequence matchPattern(pattern, Dmelanogaster[["2L"]]) ### See ?matchPattern for more information. ### ### 2. In fact, if we don't take any special action, we only get the hits in the ### plus strand of the chromosome. Find the matches in the minus strand too. ### IMPORTANT: You should always avoid to reverseComplement an entire ### chromosome sequence (this is very inefficient). matchPattern(reverseComplement(pattern), Dmelanogaster[["2L"]]) ############################################################################## ### ### Exercise 7 ### ========== ### ### 1. Use hgu95av2 to find the chromosome location of the gene targetted ### by probeset 1138_at. ### library(hgu95av2) ls(2) hgu95av2CHRLOC[["1138_at"]] # probe set 1138_at is targetting a gene in chr 2 # that starts at position 113119997 ### ### 2. Use hgu95av2probe to find all the probe sequences for this probeset ### id (put the result in a character vector called 'probes'). ### library(hgu95av2probe) ls(2) as.data.frame(hgu95av2probe)[1:4, ] probes <- hgu95av2probe$sequence[hgu95av2probe$Probe.Set.Name == "1138_at"] probes # 16 probes in probeset 1138_at ### ### 3. Use matchPattern() to find the corresponding hit for each probe in ### the corresponding chromosome (use a 'sapply(probes, function(probe) ...)' ### construction to put all the results in a single list, the returned list ### will have one element per probe in 'probes'). Do we have a hit for every probe? ### Compare with hgu95av2CHRLOC[["1138_at"]]. Do you see something wrong? ### library(BSgenome.Hsapiens.UCSC.hg18) probe_hits <- sapply(probes, function(probe) matchPattern(probe, Hsapiens[["chr2"]])) probe_hits sapply(probe_hits, length) # exactly 1 hit for each probe except for probe 15 hit_starts <- unlist(sapply(probe_hits, start)) hit_starts - hgu95av2CHRLOC[["1138_at"]] # it looks like we hit rather far away # from the starting position of the gene! ### Note that if we expect all our probes in 'probes' to hit the positive strand ### of chromosome 2, we also expect them to NOT hit the human genome anywhere else ### i.e. we don't expect them to hit the minus strand of chromosome 2 neither do ### we expect them to hit any other strand of any other chromosome: probe_hits <- sapply(probes, function(probe) matchPattern(reverseComplement(DNAString(probe)), Hsapiens[["chr2"]])) sapply(probe_hits, length) # all zeros as expected probe_hits <- sapply(probes, function(probe) matchPattern(probe, Hsapiens[["chr3"]])) sapply(probe_hits, length) # all zeros as expected ### etc... ### ### 4. Try again with \Robject{mismatch=1}. Do we have a hit for every probe? ### probe_hits <- sapply(probes, function(probe) matchPattern(probe, Hsapiens[["chr2"]], mismatch=1)) sapply(probe_hits, length) # now every probe has exactly 1 hit ### ### 5. In fact, according to hgu95av2probe, probeset 1138_at should hit a gene ### in the antistrand. Does this look correct? ### strand <- hgu95av2probe$Target.Strandedness[hgu95av2probe$Probe.Set.Name == "1138_at"] strand # all probes are reported to target the minus strand which seems to be incorrect! ############################################################################## ### ### Exercise 8 ### ========== ### ### 1. Search pattern GAACTTTGCCACTC in fruit fly chromosome 2L. ### library(BSgenome.Dmelanogaster.FlyBase.r51) pattern0 <- DNAString("GAACTTTGCCACTC") matchPattern(pattern0, Dmelanogaster[["2L"]]) # no match ### ### 2. Repeat but this time allow the 2nd T in the pattern (6th letter) ### to match anything. Anything wrong? ### ### We need to use wildcard N instead of T in the pattern pattern1 <- DNAString("GAACTNTGCCACTC") ### Also we need to use 'fixed=FALSE' in our call to matchPattern so the N can ### actually match anything (not only another N). matchPattern(pattern1, Dmelanogaster[["2L"]], fixed=FALSE) ### mmmh, we get a bunch of unwanted matches, this is because with 'fixed=FALSE' ### Ns in the subject also match anything in the pattern ### ### 3. You can use the mask() function or 'fixed="subject"' to work around the ### previous problem. Try and compare. ### matchPattern(pattern1, mask(Dmelanogaster[["2L"]], "N"), fixed=FALSE) matchPattern(pattern1, Dmelanogaster[["2L"]], fixed="subject") ### Both give the expected result. ############################################################################## ### ### Exercise 9 ### ========== ### ### Find all Fprobe0 and Rprobe0 hits in chromosome 3R. ### Remember that nothing prevents a probe to hit the strand that it was ### not designed for! ### Put the start positions of all the "plus hits" (i.e. the hits of Fprobe0 ### and Rprobe0 in the plus strand) in the Phits object (integer vector). Put ### the end positions of all the "minus hits" in the Mhits object (integer vector). ### Fprobe0 <- DNAString("AGCTCCGAGTT") Rprobe0 <- DNAString("CGTTGTTCACA") library(BSgenome.Dmelanogaster.FlyBase.r51) Phits <- c(start(matchPattern(Fprobe0, Dmelanogaster[["3R"]])), start(matchPattern(Rprobe0, Dmelanogaster[["3R"]]))) Mhits <- c(end(matchPattern(reverseComplement(Fprobe0), Dmelanogaster[["3R"]])), start(matchPattern(reverseComplement(Rprobe0), Dmelanogaster[["3R"]]))) ### ### 2. Ideally Phits and Mhits should be of length 1 but it is not always the ### case. However the fact that we get several "plus hits" and/or "minus hits" ### doesn't necessarily mean that our original primer pair has a bad design. ### In fact, some of these hits don't prevent the PCR experiment to ### work properly: this is the case for any "plus hit" that is followed ### (from left to right) by another "plus hit" with no "minus hit" between. ### And similarly, it is the case for a "minus hit" that is preceded by ### another "minus hit" with no "plus hit" between. ### All these hits can be ignored. ### Write a function that takes 2 arguments, Phits and Mhits, ### that removes all the above hits. The function will return a data frame ### with 2 columns, Phits and Mhits. Note that using a data frame now makes ### sense because after this removal, Phits and Mhits will always have the ### same length. reduceProbePairMatches <- function(Phits, Mhits) { start <- sort(unique(Phits)) end <- sort(unique(Mhits)) nstart <- length(start) nend <- length(end) i <- j <- 1 start0 <- end0 <- integer(0) while (i <= nstart && j <= nend) { if (end[j] < start[i]) { j <- j + 1 next } while (i < nstart && start[i + 1] <= end[j]) i <- i + 1 start0 <- c(start0, start[i]) end0 <- c(end0, end[j]) i <- i + 1 j <- j + 1 } data.frame(start=start0, end=end0) } ### Some testing reduceProbePairMatches(c(1, 5, 10, 20, 25), c(4, 18, 19)) ### ### 3. Use the previous function to "clean" Phits and Mhits. ### Compare your result with the "theoretical amplicons" returned by ### matchProbePair(Fprobe0, Rprobe0, Dmelanogaster[["3R"]]) ### From now we will use the matchProbePair() function to achieve this. ### Are all these "theoretical amplicons" realistic (PCR amplification is ### supposed to work for regions up to 5k-10k bases). ### Would you still consider the Fprobe0/Rprobe0 pair good for amplifying ### the region it was designed for? ### pp0_hits <- reduceProbePairMatches(Phits, Mhits) # hits for probe pair Fprobe0/Rprobe0 ### matchProbePair() is provided to make all this easier: pp_3R_hits <- matchProbePair(Fprobe0, Rprobe0, Dmelanogaster[["3R"]]) width(pp_3R_hits) pp_3R_hits[width(pp_3R_hits) <= 10000] # only 1 valid amplicon, which is OK ### ### 4. Another problem that can prevent the PCR amplification from working ### properly is when a given primer pair produces more that 1 valid amplicon ### (e.g. <= 20k) over the whole genome. ### What about Fprobe0/Rprobe0? ### validHits <- function(Fprobe0, Rprobe0, seqname) { hits <- matchProbePair(Fprobe0, Rprobe0, Dmelanogaster[[seqname]]) hits[width(hits) <= 20000] } pp_all_hits <- sapply(seqnames(Dmelanogaster), function(seqname) validHits(Fprobe0, Rprobe0, seqname)) sapply(pp_all_hits, length) # only 1 valid amplicon over the whole genome, which is OK ### ### 5. Write a function that takes our probes library (the 'probes' data frame) ### as input and produces a data frame similar to 'probes' but with 3 ### additional columns "chr", "start" and "end" that contain the chromosome ### name/start/end of the amplicon found for each primer pair or NAs if 0 or ### more than 1 amplicon were found. Having NAs in a row means that the ### design of the primer pair is bad. ### PCR <- function(probes) { chr <- rep(as.character(NA), nrow(probes)) start <- end <- rep(as.integer(NA), nrow(probes)) for (i in seq_len(nrow(probes))) { Fprobe <- probes[i, "Fprobe"] Rprobe <- probes[i, "Rprobe"] seq2amps <- sapply(seqnames(Dmelanogaster), function(seqname) validHits(Fprobe, Rprobe, seqname)) seq2namps <- sapply(seq2amps, length) if (sum(seq2namps) != 1) next chr[i] <- names(seq2namps)[seq2namps == 1] start[i] <- start(seq2amps[[chr[i]]]) end[i] <- end(seq2amps[[chr[i]]]) } probes$chr <- chr probes$start <- start probes$end <- end probes } probes1 <- PCR(probes) # will take 2-10 minutes depending on your machine