##############################################################################
###
### Exercise 1
### ==========

###
### 1. Create some arbitrary BString, DNAString and AAString instances.
###
library(Biostrings)
b <- BString("I am a BString object")
b
d <- DNAString("TTGAAAA-CTC-N")
d
a <- AAString("MARKSLEMSIR*")
a

###
### 2. Apply nchar() and alphabet() on them.
###
nchar(b)
alphabet(b)
nchar(d)
alphabet(d)
nchar(a)
alphabet(a)

###
### 3. Apply alphabetFrequency() and reverseComplement() on the DNAString objects.
###
alphabetFrequency(d)
reverseComplement(d)

###
### 4. Extract substrings by using the subsetting operator [ and by using the
###    subBString() function.
###
b[16]
b[nchar(b):1]
subBString(b, 3, 12)

### See ?BString, ?DNAString, ?AAString, ?alphabetFrequency, ?reverseComplement
### and ?subBString for more information.


##############################################################################
###
### Exercise 2
### ==========

###
### 1. Display several chromosomes of C. elegans. Do they contain IUPAC
###    extended letters?
###
library(BSgenome.Celegans.UCSC.ce2)
Celegans
Celegans[["chrI"]]
alphabetFrequency(Celegans[["chrI"]]) # no extended letter
Celegans[["chrII"]]
alphabetFrequency(Celegans[["chrII"]]) # contains extended letter N

### To answer the question programmatically
hasExtendedLetters <- function(seq)
{
    alphabetFrequency(seq, baseOnly=TRUE)[["other"]] != 0
}
### To answer the question for all chromosomes programmatically
sapply(seqnames(Celegans), function(seqname) hasExtendedLetters(Celegans[[seqname]]))

### See ?IUPAC_CODE_MAP, ?alphabetFrequency, ?sapply and ?seqnames 
### for more information.

###
### 2. Load BSgenome.Dmelanogaster.FlyBase.r51 (fruit fly genome).
###    Do the chromosomes contain IUPAC extended letters?
###
library(BSgenome.Dmelanogaster.FlyBase.r51)
sapply(seqnames(Dmelanogaster), function(seqname) hasExtendedLetters(Dmelanogaster[[seqname]]))

###
### 3. Extract the last 10M bases from fruit fly chromosome 2L.
###    Use the system.time() function to compare the performance
###    of subBString() vs the subsetting operator [.
###
chr2L <- Dmelanogaster[["2L"]]
system.time(x1 <- subBString(chr2L, , nchar(chr2L), length=10**6))
system.time(x2 <- chr2L[(nchar(chr2L)-10**6+1):nchar(chr2L)])
x1 == x2 # the 2 methods produce the same result but subBString is much faster

### See ?subBString for more information.


##############################################################################
###
### Exercise 3
### ==========

###
### 1. Compare nchar() vs width() on v.
###
v <- views(DNAString("TAATAATG"), 3:0, 5:8)
nchar(v)
width(v)
nchar(v) == width(v) # FALSE for the last view ("out of limits" view)

###
### 2. Find a programatical way to remove "out of limits" views from
###    a BStringViews object.
###
v[nchar(v) == width(v)]

### See ?`BStringViews-class` for more information and many examples.


##############################################################################
###
### Exercise 4
### ==========

###
### 1. Use the mask() function to mask the Ns in fruit fly chromosome 2L.
###    Apply \Rfunction{mask} again to the result (and without the second
###    argument) to see the location of the Ns. Do we have isolated Ns
###    or N-blocks?
###
library(BSgenome.Dmelanogaster.FlyBase.r51)
noN_chr2L <- mask(Dmelanogaster[["2L"]], "N")
mask(noN_chr2L) # looks like we have 2 N-blocks of length 100

### See ?mask for more information.

###
### 2. Find the location of the Ns in C. elegans chromosome III and in
###    H. sapiens chromosome 1.
###
library(BSgenome.Celegans.UCSC.ce2)
Celegans
mask(mask(Celegans[["chrIII"]], "N")) # 5 isolated Ns

library(BSgenome.Hsapiens.UCSC.hg18)
Hsapiens
Nlocs <- mask(mask(Hsapiens[["chr1"]], "N"))
Nlocs # 39 N-blocks of length 50000?
width(Nlocs) # no, some of them are even bigger!

###
### 3. What's the longest A-block in fruit fly chromosome 2L?
###

### Unlike with the Ns, masking the As by using the "A" pattern in the
### call to mask() would be very inefficient to answer this question.
### It's better to use a longer pattern:
Ablocks <- mask(mask(Dmelanogaster[["2L"]], "AAAAAAAA"))
Ablocks # shows all A-blocks with a length of at least 8
max(width(Ablocks)) # length of the longest A-block
Ablocks[max(width(Ablocks)) == width(Ablocks)] # keep this view only


##############################################################################
###
### Exercise 5
### ==========

###
### 1. Extract the start and end position of the exons belonging to gene
###    FBgn0025803 (chromosome 3R).
###
library(ann.Dmelanogaster.FlyBase.r51)
data(EXON2GENE)
ex_ids <- names(EXON2GENE)[EXON2GENE == "FBgn0025803"]
ex_ids # shows the exons belonging to this gene
ex_starts <- getAnnExons("3R")[ex_ids, "start"] # their starts
ex_ends <- getAnnExons("3R")[ex_ids, "end"] # their ends

###
### 2. Load BSgenome.Dmelanogaster.FlyBase.r51 and create a BStringViews object
###    where the subject is chromosome 3R and the views are the exon locations
###    found previously.
###
library(BSgenome.Dmelanogaster.FlyBase.r51)
ex_FBgn0025803 <- views(Dmelanogaster[["3R"]], ex_starts, ex_ends)

### See ?views for more information.

###
### 3. Apply the mask() function on this \Rclass{BStringViews} object. Is this
###    new BStringViews object representing the introns sequences for gene
###    FBgn0025803? What needs to be adjusted?
###
in_FBgn0025803 <- mask(ex_FBgn0025803)
in_FBgn0025803
### The first and the last views are not introns so we remove them:
in_FBgn0025803 <- in_FBgn0025803[-c(1,length(in_FBgn0025803))]

###
### 4. Are we discovering something?
###
in_FBgn0025803

### Looks like we've just rediscovered the "GT-AG rule"!
###   http://biotech.icmb.utexas.edu/search/dict-search.html


##############################################################################
###
### Exercise 6
### ==========

###
### 1. Find all the matches of an arbitrary nucleotide sequence in fruit fly
###    chromosome 2L.
###
library(BSgenome.Dmelanogaster.FlyBase.r51)
pattern <- DNAString("ACCCCCTTTTTG") # an arbitrary sequence
matchPattern(pattern, Dmelanogaster[["2L"]])

### See ?matchPattern for more information.

###
### 2. In fact, if we don't take any special action, we only get the hits in the
###    plus strand of the chromosome. Find the matches in the minus strand too.
###    IMPORTANT: You should always avoid to reverseComplement an entire
###    chromosome sequence (this is very inefficient).
matchPattern(reverseComplement(pattern), Dmelanogaster[["2L"]])


##############################################################################
###
### Exercise 7
### ==========

###
### 1. Use hgu95av2 to find the chromosome location of the gene targetted
###    by probeset 1138_at.
###
library(hgu95av2)
ls(2)
hgu95av2CHRLOC[["1138_at"]] # probe set 1138_at is targetting a gene in chr 2
                            # that starts at position 113119997

###
### 2. Use hgu95av2probe to find all the probe sequences for this probeset
###    id (put the result in a character vector called 'probes').
###
library(hgu95av2probe)
ls(2)
as.data.frame(hgu95av2probe)[1:4, ]
probes <- hgu95av2probe$sequence[hgu95av2probe$Probe.Set.Name == "1138_at"]
probes # 16 probes in probeset 1138_at

###
### 3. Use matchPattern() to find the corresponding hit for each probe in
###    the corresponding chromosome (use a 'sapply(probes, function(probe) ...)'
###    construction to put all the results in a single list, the returned list
###    will have one element per probe in 'probes'). Do we have a hit for every probe?
###    Compare with hgu95av2CHRLOC[["1138_at"]]. Do you see something wrong?
###
library(BSgenome.Hsapiens.UCSC.hg18)
probe_hits <- sapply(probes, function(probe) matchPattern(probe, Hsapiens[["chr2"]]))
probe_hits
sapply(probe_hits, length) # exactly 1 hit for each probe except for probe 15
hit_starts <- unlist(sapply(probe_hits, start))

hit_starts - hgu95av2CHRLOC[["1138_at"]] # it looks like we hit rather far away
                                         # from the starting position of the gene!

### Note that if we expect all our probes in 'probes' to hit the positive strand
### of chromosome 2, we also expect them to NOT hit the human genome anywhere else
### i.e. we don't expect them to hit the minus strand of chromosome 2 neither do
### we expect them to hit any other strand of any other chromosome:
probe_hits <- sapply(probes, function(probe)
                     matchPattern(reverseComplement(DNAString(probe)), Hsapiens[["chr2"]]))
sapply(probe_hits, length) # all zeros as expected

probe_hits <- sapply(probes, function(probe) matchPattern(probe, Hsapiens[["chr3"]]))
sapply(probe_hits, length) # all zeros as expected

### etc...

###
### 4. Try again with \Robject{mismatch=1}. Do we have a hit for every probe?
###
probe_hits <- sapply(probes, function(probe) matchPattern(probe, Hsapiens[["chr2"]], mismatch=1))
sapply(probe_hits, length) # now every probe has exactly 1 hit

###
### 5. In fact, according to hgu95av2probe, probeset 1138_at should hit a gene
###    in the antistrand. Does this look correct?
###
strand <- hgu95av2probe$Target.Strandedness[hgu95av2probe$Probe.Set.Name == "1138_at"]
strand # all probes are reported to target the minus strand which seems to be incorrect!


##############################################################################
###
### Exercise 8
### ==========

###
### 1. Search pattern GAACTTTGCCACTC in fruit fly chromosome 2L.
###
library(BSgenome.Dmelanogaster.FlyBase.r51)
pattern0 <- DNAString("GAACTTTGCCACTC")
matchPattern(pattern0, Dmelanogaster[["2L"]]) # no match

###
### 2. Repeat but this time allow the 2nd T in the pattern (6th letter)
###    to match anything. Anything wrong?
###

### We need to use wildcard N instead of T in the pattern
pattern1 <- DNAString("GAACTNTGCCACTC")
### Also we need to use 'fixed=FALSE' in our call to matchPattern so the N can
### actually match anything (not only another N).
matchPattern(pattern1, Dmelanogaster[["2L"]], fixed=FALSE)
### mmmh, we get a bunch of unwanted matches, this is because with 'fixed=FALSE'
### Ns in the subject also match anything in the pattern

###
### 3. You can use the mask() function or 'fixed="subject"' to work around the
###    previous problem. Try and compare.
###
matchPattern(pattern1, mask(Dmelanogaster[["2L"]], "N"), fixed=FALSE)
matchPattern(pattern1, Dmelanogaster[["2L"]], fixed="subject")

### Both give the expected result.


##############################################################################
###
### Exercise 9
### ==========

###
### Find all Fprobe0 and Rprobe0 hits in chromosome 3R.
### Remember that nothing prevents a probe to hit the strand that it was
### not designed for!
### Put the start positions of all the "plus hits" (i.e. the hits of Fprobe0
### and Rprobe0 in the plus strand) in the Phits object (integer vector). Put
### the end positions of all the "minus hits" in the Mhits object (integer vector).
###
Fprobe0 <- DNAString("AGCTCCGAGTT")
Rprobe0 <- DNAString("CGTTGTTCACA")
library(BSgenome.Dmelanogaster.FlyBase.r51)
Phits <- c(start(matchPattern(Fprobe0, Dmelanogaster[["3R"]])),
           start(matchPattern(Rprobe0, Dmelanogaster[["3R"]])))
Mhits <- c(end(matchPattern(reverseComplement(Fprobe0), Dmelanogaster[["3R"]])),
           start(matchPattern(reverseComplement(Rprobe0), Dmelanogaster[["3R"]])))

###
### 2. Ideally Phits and Mhits should be of length 1 but it is not always the
###    case. However the fact that we get several "plus hits" and/or "minus hits"
###    doesn't necessarily mean that our original primer pair has a bad design.
###    In fact, some of these hits don't prevent the PCR experiment to
###    work properly: this is the case for any "plus hit" that is followed
###    (from left to right) by another "plus hit" with no "minus hit" between.
###    And similarly, it is the case for a "minus hit" that is preceded by
###    another "minus hit" with no "plus hit" between.
###    All these hits can be ignored.
###    Write a function that takes 2 arguments, Phits and Mhits,
###    that removes all the above hits. The function will return a data frame
###    with 2 columns, Phits and Mhits. Note that using a data frame now makes
###    sense because after this removal, Phits and Mhits will always have the
###    same length.
reduceProbePairMatches <- function(Phits, Mhits)
{
    start <- sort(unique(Phits))
    end <- sort(unique(Mhits))
    nstart <- length(start)
    nend <- length(end)
    i <- j <- 1
    start0 <- end0 <- integer(0)
    while (i <= nstart && j <= nend) {
        if (end[j] < start[i]) {
            j <- j + 1
            next
        }
        while (i < nstart && start[i + 1] <= end[j])
            i <- i + 1
        start0 <- c(start0, start[i])
        end0 <- c(end0, end[j])
        i <- i + 1
        j <- j + 1
    }
    data.frame(start=start0, end=end0)
}

### Some testing
reduceProbePairMatches(c(1, 5, 10, 20, 25), c(4, 18, 19))

###
### 3. Use the previous function to "clean" Phits and Mhits.
###    Compare your result with the "theoretical amplicons" returned by
###    matchProbePair(Fprobe0, Rprobe0, Dmelanogaster[["3R"]])
###    From now we will use the matchProbePair() function to achieve this.
###    Are all these "theoretical amplicons" realistic (PCR amplification is
###    supposed to work for regions up to 5k-10k bases).
###    Would you still consider the Fprobe0/Rprobe0 pair good for amplifying
###    the region it was designed for?
###

pp0_hits <- reduceProbePairMatches(Phits, Mhits) # hits for probe pair Fprobe0/Rprobe0

### matchProbePair() is provided to make all this easier:
pp_3R_hits <- matchProbePair(Fprobe0, Rprobe0, Dmelanogaster[["3R"]])
width(pp_3R_hits)
pp_3R_hits[width(pp_3R_hits) <= 10000] # only 1 valid amplicon, which is OK

###
### 4. Another problem that can prevent the PCR amplification from working
###    properly is when a given primer pair produces more that 1 valid amplicon
###    (e.g. <= 20k) over the whole genome.
###    What about Fprobe0/Rprobe0?
###
validHits <- function(Fprobe0, Rprobe0, seqname)
{
    hits <- matchProbePair(Fprobe0, Rprobe0, Dmelanogaster[[seqname]])
    hits[width(hits) <= 20000]
}

pp_all_hits <- sapply(seqnames(Dmelanogaster),
                      function(seqname) validHits(Fprobe0, Rprobe0, seqname))
sapply(pp_all_hits, length) # only 1 valid amplicon over the whole genome, which is OK

###
### 5. Write a function that takes our probes library (the 'probes' data frame)
###    as input and produces a data frame similar to 'probes' but with 3
###    additional columns "chr", "start" and "end" that contain the chromosome
###    name/start/end of the amplicon found for each primer pair or NAs if 0 or
###    more than 1 amplicon were found. Having NAs in a row means that the
###    design of the primer pair is bad.
###

PCR <- function(probes)
{
    chr <- rep(as.character(NA), nrow(probes))
    start <- end <- rep(as.integer(NA), nrow(probes))
    for (i in seq_len(nrow(probes))) {
        Fprobe <- probes[i, "Fprobe"]
        Rprobe <- probes[i, "Rprobe"]
        seq2amps <- sapply(seqnames(Dmelanogaster),
                           function(seqname) validHits(Fprobe, Rprobe, seqname))
        seq2namps <- sapply(seq2amps, length)
        if (sum(seq2namps) != 1)
            next
        chr[i] <- names(seq2namps)[seq2namps == 1]
        start[i] <- start(seq2amps[[chr[i]]])
        end[i] <- end(seq2amps[[chr[i]]])
    }
    probes$chr <- chr
    probes$start <- start
    probes$end <- end
    probes
}

probes1 <- PCR(probes) # will take 2-10 minutes depending on your machine