###################################################
### chunk number 1: setup
###################################################
options(width=80)


###################################################
### chunk number 2: PackageLoads
###################################################
suppressMessages(library("ShortRead"))
library("BSgenome.Mmusculus.UCSC.mm9")


###################################################
### chunk number 3: PackageLoads
###################################################
packageDescription("BSgenome")$Version
packageDescription("Biostrings")$Version
packageDescription("IRanges")$Version


###################################################
### chunk number 4: loadSeqBasicsTutorial
###################################################
library(SeqBasicsTutorial)


###################################################
### chunk number 5: ctcfMotifWrite eval=FALSE
###################################################
## write.table(motifCounts, file = "CTCFMotifCounts.csv", sep = ",")
## library(rtracklayer)
## export(motifLocs, con = "CTCFMotifLocs.bed")


###################################################
### chunk number 6: FindTopReads eval=FALSE
###################################################
## # Code that was used to create the top reads
## # This can be modified for use with your own experiments
## library(ShortRead)
## sp <-
##   list("experiment1" = SolexaPath(file.path("path", "to", "experiment1")),
##        "experiment2" = SolexaPath(file.path("path", "to", "experiment2")))
## patSeq <- paste("s_", 1:8, "_.*_seq.txt", sep = "")
## names(patSeq) <- paste("lane", 1:8, sep = "")
## topReads <-
##   lapply(structure(seq_len(length(sp)), names = names(sp)),
##          function(i) {
##              print(experimentPath(sp[[i]]))
##              do.call(SplitDataFrameList,
##                      lapply(structure(seq_len(length(patSeq)),
##                                       names = names(patSeq)),
##                      function(j, n = 1000) {
##                          cat("Reading", patSeq[[j]], "...")
##                          x <-
##                            tables(readXStringColumns(baseCallPath(sp[[i]]),
##                                                      pattern = patSeq[[j]],
##                                                      colClasses =
##                                                      c(rep(list(NULL), 4),
##                                                      list("DNAString")))[[1]],
##                                   n = n)[["top"]]
##                          names(x) <- chartr("-", "N", names(x))
##                          cat("done.\n")
##                          DataFrame(read = DNAStringSet(names(x)),
##                                    count = unname(x))
##                      }))
##          })


###################################################
### chunk number 7: LoadTopReads
###################################################
data(topReads)


###################################################
### chunk number 8: TopReadsMetadata
###################################################
class(topReads)


###################################################
### chunk number 9: TopElement
###################################################
topReads[["experiment1"]][["lane1"]]


###################################################
### chunk number 10: TopElement
###################################################
sapply(topReads, lapply, function(x) as.character(x[["read"]])[1])


###################################################
### chunk number 11: FindPolyNs
###################################################
lane1.1TopReads <- topReads[["experiment1"]][["lane1"]]
alphabetCounts <- alphabetFrequency(lane1.1TopReads[["read"]], baseOnly = TRUE)
lane1.1MaxLetter <-
  pmax(alphabetCounts[,"A"], alphabetCounts[,"C"], alphabetCounts[,"G"], alphabetCounts[,"T"])
lane1.1PolySingles <- lane1.1TopReads[["read"]][lane1.1MaxLetter >= 34]
length(lane1.1PolySingles)
head(lane1.1PolySingles)


###################################################
### chunk number 12: MatchAdapterToGenome
###################################################
adapter <- DNAString("GATCGGAAGAGCTCGTATGCCGTCTTCTGCTTG")
adapterCounts <- vcountPattern(adapter, Mmusculus)
head(adapterCounts)
sum(adapterCounts[,"count"])


###################################################
### chunk number 13: AdapterReads
###################################################
distinctReads <-
  DNAStringSet(sort(unique(unlist(lapply(topReads, lapply,
    function(x) as.character(x[["read"]])), use.names = FALSE))))
whichAdapters <-
  isMatchingAt(adapter, distinctReads, max.mismatch = 4, with.indels = TRUE)
adapterReads <- distinctReads[whichAdapters]
length(adapterReads)
head(adapterReads)


###################################################
### chunk number 14: TopAdapterReads
###################################################
topAdapterReads <-
  lapply(topReads, lapply, function(x) x[x[["read"]] %in% adapterReads, ])
sapply(topAdapterReads, sapply, nrow)
sapply(topAdapterReads, sapply, function(x) sum(x[["count"]]))


###################################################
### chunk number 15: TopAdapterReads1-1
###################################################
lane1.1AdapterCounts <- topAdapterReads[["experiment1"]][["lane1"]][["count"]]
lane1.1AdapterReads <- topAdapterReads[["experiment1"]][["lane1"]][["read"]]
length(lane1.1AdapterReads)
head(lane1.1AdapterReads)


###################################################
### chunk number 16: TopAdapterAligns1-1
###################################################
lane1.1AdapterAligns <-
  pairwiseAlignment(lane1.1AdapterReads, adapter, type = "local-global")
summary(lane1.1AdapterAligns, weight = lane1.1AdapterCounts)


###################################################
### chunk number 17: TopReads2-1
###################################################
lane2.1TopCounts <- topReads[["experiment2"]][["lane1"]][["count"]]
lane2.1TopReads <- topReads[["experiment2"]][["lane1"]][["read"]]
length(lane2.1TopReads)
head(lane2.1TopReads)


###################################################
### chunk number 18: TopReadsClust2-1
###################################################
lane2.1Clust <- hclust(stringDist(lane2.1TopReads), method = "single")
lane2.1Groups <- cutree(lane2.1Clust, h = 2)
head(sort(table(lane2.1Groups), decreasing = TRUE))


###################################################
### chunk number 19: UnknownSeqs1
###################################################
head(reverseComplement(lane2.1TopReads[lane2.1Groups == 9]))
head(lane2.1TopReads[lane2.1Groups == 8])
unknownSeqs <-
  intersect(reverseComplement(lane2.1TopReads[lane2.1Groups == 9]),
            lane2.1TopReads[lane2.1Groups == 8])
length(unknownSeqs)
head(unknownSeqs)


###################################################
### chunk number 20: UnknownSeqs2
###################################################
unknownCounts <-
  lane2.1TopCounts[match(unknownSeqs, lane2.1TopReads)] + 
    lane2.1TopCounts[match(reverseComplement(unknownSeqs),
                           lane2.1TopReads)]
unknownSeqs <- unknownSeqs[order(unknownCounts, decreasing = TRUE)]
unknownCounts <- unknownCounts[order(unknownCounts, decreasing = TRUE)]
length(unknownCounts)
head(unknownCounts)


###################################################
### chunk number 21: StarterSeq
###################################################
submat <- nucleotideSubstitutionMatrix(match = 1, mismatch = -Inf)
whichStarter <-
  which.max(apply(as.matrix(stringDist(unknownSeqs,
                                       method = "substitutionMatrix",
                                       substitutionMatrix = submat,
                                       gapExtension = -Inf,
                                       type = "overlap")),
                  1, function(x) sum(x >= 24)))
starterSeq <- unknownSeqs[[whichStarter]]
starterSeq


###################################################
### chunk number 22: StarterSeqAlign
###################################################
starterAlign <-
  pairwiseAlignment(unknownSeqs, starterSeq, substitutionMatrix = submat,
                    gapExtension = -Inf, type = "overlap")


###################################################
### chunk number 23: UnknownPrefix
###################################################
whichInPrefix <- (score(starterAlign) >= 10 & start(subject(starterAlign)) == 1 & start(pattern(starterAlign)) != 1)
prefix <- narrow(unknownSeqs[whichInPrefix], 1, start(pattern(starterAlign[whichInPrefix])) - 1)
prefix <-
 DNAStringSet(paste(sapply(max(nchar(prefix)) - nchar(prefix), polyn, nucleotides="-"), as.character(prefix), sep = ""))
consensusMatrix(prefix, baseOnly = TRUE)
prefixString <- consensusString(consensusMatrix(prefix, baseOnly = TRUE)[-5,])
prefixString


###################################################
### chunk number 24: UnknownSuffix
###################################################
whichInSuffix <- (score(starterAlign) >= 10 & end(subject(starterAlign)) == 36 & end(pattern(starterAlign)) != 36)
suffix <- narrow(unknownSeqs[whichInSuffix], end(pattern(starterAlign[whichInSuffix])) + 1, 36)
suffix <-
  DNAStringSet(paste(as.character(suffix), sapply(max(nchar(suffix)) - nchar(suffix), polyn, nucleotides="-"), sep = ""))
consensusMatrix(suffix, baseOnly = TRUE)
suffixString <- consensusString(consensusMatrix(suffix, baseOnly = TRUE)[-5,])
suffixString


###################################################
### chunk number 25: ExtendedUnknown
###################################################
extendedUnknown <-
  DNAString(paste(prefixString, as.character(starterSeq), suffixString, sep = ""))
extendedUnknown


###################################################
### chunk number 26: ExtendedUnknownAlign
###################################################
unknownAlign <-
  pairwiseAlignment(unknownSeqs, extendedUnknown, substitutionMatrix = submat, gapExtension = -Inf, type = "overlap")
table(score(unknownAlign))


###################################################
### chunk number 27: SumOfUnknowns
###################################################
sapply(topReads, lapply,
       function(x) {
           if (nrow(x) > 0) {
           whichNoNs <- (alphabetFrequency(x[["read"]])[,"N"] == 0)
           x <- x[whichNoNs,]
           pdict <- PDict(x[["read"]])
           whichMapped <- (countPDict(pdict, extendedUnknown) + countPDict(pdict, reverseComplement(extendedUnknown))) > 0
           sum(x[whichMapped, "count"])
           }
       })


###################################################
### chunk number 28: UnknownMapping
###################################################
unknownPatternCount <- vcountPattern(extendedUnknown, Mmusculus)
tapply(unknownPatternCount[["count"]], unknownPatternCount[["seqname"]], sum)


###################################################
### chunk number 29: UnknownLocation
###################################################
mm9Chr2 <- Mmusculus[["chr2"]]
mm9Ch2View <- matchPattern(extendedUnknown, mm9Chr2)
mm9Ch2View


###################################################
### chunk number 30: PhageReads
###################################################
sp <- SolexaPath(system.file("extdata", "ELAND", "080828_HWI-EAS88_0003", package="SeqBasicsTutorial"))
phageReads <- readAligned(analysisPath(sp), "s_5_1_export.txt", "SolexaExport")


###################################################
### chunk number 31: UniquePhageReads
###################################################
phageReadTable <- tables(sread(phageReads), n = Inf)[["top"]]


###################################################
### chunk number 32: CleanPhageReads
###################################################
whichNotClean <- grep("N", names(phageReadTable))
head(phageReadTable[whichNotClean])
cleanReadTable <- phageReadTable[- whichNotClean]
head(cleanReadTable)


###################################################
### chunk number 33: PhageGenome
###################################################
data(phiX174Phage)
names(phiX174Phage)
nebPhage <- phiX174Phage[[which(names(phiX174Phage) == "NEB03")]]
nebPhage <-
  DNAString(paste(as.character(nebPhage), as.character(substr(nebPhage, 1, 34)),
                  sep = ""))
nebPhage


###################################################
### chunk number 34: MapPhageData
###################################################
posPDict <- PDict(DNAStringSet(names(cleanReadTable)), max.mismatch = 2)
negPDict <-
  PDict(reverseComplement(DNAStringSet(names(cleanReadTable))), max.mismatch = 2)
whichAlign <- rep(FALSE, length(phageReadTable))
whichAlign[- whichNotClean] <-
  (countPDict(posPDict, nebPhage, max.mismatch = 2) +
   countPDict(negPDict, nebPhage, max.mismatch = 2) > 0)


###################################################
### chunk number 35: SummarizePhageMapping
###################################################
table(whichAlign)
round(sapply(split(phageReadTable, whichAlign), sum) / sum(phageReadTable), 2)


###################################################
### chunk number 36: HooverDeconstructed
###################################################
print(
histogram(~ log10(phageReadTable[phageReadTable > 1]) | whichAlign[phageReadTable > 1],
          xlab = "log10(Read Counts)", main = "Read Counts by IS(Aligned to Phage)")
)


###################################################
### chunk number 37: sessionInfo
###################################################
toLatex(sessionInfo())