################################################################################
#Variant calling in R, some background and an Introduction to SNVsOmuC and gmapR
################################################################################

#Introduction

##This doc outlines the basic usages of the SNVsOmuC and gmapR packages as well
##as some basic usage of  other BioC packages for the manipulation of short-read alignments
##and variant data.
##I cover the general workflow for loading data, calling single sample variants and
##tumor-specific somatic mutations or other sample-specific variant types (eg RNA editing).

###manipulating bam files
source("~/.Rprofile")
download.file("http://bioconductor.org/help/course-materials/2012/BioC2012/jeremiah_tutorial.R", destfile="jeremiah.R")
library(SNVsOmuC)
library(gmapR)
library(Rsamtools)
library(VariantAnnotation)
library(BSgenome.Hsapiens.UCSC.hg19)

bamfile = file.path(system.file(package = "SNVsOmuC", mustWork=TRUE),
    "extdata/merged/SRC111111.TESTPREPENDSTR.merged.uniques.bam")

############################################################
#A quick overview of some tools for the summary of BAM files
############################################################

##Reading bam files using Rsamtools
nrows=100
bf <- BamFile(file = bamfile, yieldSize=nrows)
bam <- scanBam(bf)
bam

## checking the quality type in the bam file
nrows = 100
bf <- BamFile(file = bamfile, yieldSize=nrows)
?ScanBamParam
param <- ScanBamParam(what="qual")
bam <- scanBam(bf, param=param)
bam
quals <- as.character(bam[[1]]$qual)
length <- length(quals)
max_vec <- array()
min_vec <- array()
for(i in seq_along(length)){
    max_vec[i]<-max(as.integer(charToRaw(quals[i])))
      min_vec[i]<-min(as.integer(charToRaw(quals[i])))
  }
min <- min(min_vec)
max <- max(max_vec)
max
min

qual <- detectQuality(bamfile)
qual

##getting the total coverage
?readBamGappedAlignments
chr_ga <-  readBamGappedAlignments(bamfile)
pos <- chr_ga[strand(chr_ga)=="+"]
neg <- chr_ga[strand(chr_ga)=="-"]
pos_grl <- grglist(pos, drop.D.ranges=TRUE)
neg_grl <- grglist(neg, drop.D.ranges=TRUE)
pos_cov <- coverage(pos_grl)
neg_cov <- coverage(neg_grl)
cov <- pos_cov+neg_cov
plot(x=1, pch = NA, ylim = c(-120, 200), xlim = c(0,1200), ylab = "stranded covergage", xlab = "Position")
points(as.numeric(pos_cov$chr12[6646350:6647650]), type = 'l', col = "red")
points(-as.numeric(neg_cov$chr12[6646350:6647650]), type = 'l', col = "blue")
points(as.numeric(cov$chr12[6646350:6647650]), type = 'l', col = "grey")
abline(h=0)

##creating pileup data

?applyPileups
?PileupParam

which <- GRanges(seqnames = "chr12", IRanges(start =1, width = 6647650))
source("/home/jeremiah/pileup_to_gr.R")
bu_gr <- pileupAsGRanges(bamfile, regions = which)

bases <- t(as.matrix(values(bu_gr)[1:4]))
bases[,800:810]
barplot(bases[,1:1272], col=c("red", "blue", "green", "black"), border=NA)
points(as.numeric(cov$chr12[6646350:6647650]), type = 'l', col = "grey")

###############################################
#Intro to the new packages gmapR and SNVsOmuC
###############################################

##GSNAP alignments in R
dnaStringSet <-Hsapiens$chr4
makeTestFasta <- function(seqStarts, pairNum) {
    seqEnds <- seqStarts + 74
      nucleotides <- sapply(seq_along(seqStarts),
                                                    function(i) as.character(subseq(dnaStringSet, seqStarts[i], seqEnds[i]))
                                                    )
      fastaHeaders <- paste0(">", "fakeRead", seq_along(nucleotides), "#0/", pairNum)
      fastaLines <- paste(fastaHeaders, nucleotides, sep="\n")
      tmpFasta <- paste0(tempfile(), ".", pairNum, ".fa")
      writeLines(fastaLines, con=tmpFasta)
      tmpFasta
  }
seqStarts <- seq(1, 100000, by = 10000) + length(dnaStringSet) * 2 / 3
tmpFasta1 <- makeTestFasta(seqStarts, '1')
tmpFasta2 <- makeTestFasta(seqStarts + 125, '2')

library(GmapGenome.Hsapiens.UCSC.hg19)

genome <- GmapGenome.Hsapiens.UCSC.hg19

?GsnapParam

gsnapParam <- GsnapParam(genome = genome,
                                                  unique_only = FALSE,
                                                  max_mismatches = NULL,
                                                  suboptimal_levels = 0L, mode = "standard",
                                                  npaths = 10L,
                                                  novelsplicing = FALSE, splicing = NULL,
                                                  nthreads = 1L,
                                                  batch = "2")

gsnapOutput <- gsnap(input_a=tmpFasta1,
                                          input_b=tmpFasta2,
                                          params=gsnapParam)


path <- path(gsnapOutput)
##bam <- paste(path, dir(path)[8], sep = '/')
chr_ga <-  readBamGappedAlignments(bam)
grl <- grglist(chr_ga, drop.D.ranges=TRUE)
cov <- coverage(grl)

##Calling single-sample vairants

##The gmapR package has a function which converts the tally format from gmap
##(similar to samtools pileup) into a variant GRanges object. This object contains
##all of the information needed to call variants. The convience function below combines
##the gmapR tally function with the variant calling functionality here to simultaneously
##tally the BAM file and call the variants.

##genome <- library(GmapGenome.Hsapiens.UCSC.hg19)

?BamTallyParam

param <- BamTallyParam(cycle_breaks = c(0L, 15L, 60L, 75L),
                                              high_quality_cutoff = as.integer(56),
                                              minimum_mapq = as.integer(13),
                                              concordant_only = FALSE, unique_only = FALSE,
                                              primary_only = FALSE,
                                              min_depth = 0L, variant_strand = 1L,
                                              ignore_query_Ns = TRUE,
                                              indels = FALSE)
tallies_gr <-bam_tally(bamfile,
                                              genome,
                                              param)
tallies_gr

##Calling variants on a subset of the genome
which <- GRanges(seqnames = "chr12", IRanges(start =6647109, width = 6647453))
param <- BamTallyParam(which=which, cycle_breaks = c(0L, 15L, 60L, 75L),
                                              high_quality_cutoff = as.integer(56),
                                              minimum_mapq = as.integer(13),
                                              concordant_only = FALSE, unique_only = FALSE,
                                              primary_only = FALSE,
                                              min_depth = 0L, variant_strand = 1L,
                                              ignore_query_Ns = TRUE,
                                              indels = FALSE)
tallies_gr <-bam_tally(bamfile,
                                              genome,
                                              param)
tallies_gr

####################################################################
#Provide a method for filtering the pileup data and calling variants
####################################################################

var <- variantFilter(tallies_gr, useQual=TRUE)
var$filtered_granges

##Convenience method for variant calling

##A convenience method is provided to call variants across the entire genome
##given a BAM file as input. This section assumes that you have installed gmapR
##and built a genome iit file.

##variantsGR <- callVariantsP(bam=bamfile, genome=genome(genome),
##                             genome_dir=path(directory(genome)),mapq=13, force_qual=TRUE)
##variantsGR$filtered_granges

##A real example...
variantsGR<- get(load("/home/jeremiah/SRR305173/results/SRR305173.filtered_variants_granges.RData"))
chr2X <- variantsGR[seqnames(variantsGR) %in% c("20", "21", "22")]
chr2X <- chr2X[values(chr2X)$count.total>15]

plot(log10(values(chr2X)$count), log10(values(chr2X)$count.total), pch = 19, cex= 0.01)

pval <- (pbinom(q=values(chr2X)$count, size=values(chr2X)$count.total,prob= .5))
good <- chr2X[pval>0.001]
plot(log10(values(good)$count), log10(values(good)$count.total), pch = 19, cex= 0.01)

hist(values(good)$count/values(good)$count.total,
          breaks=50, main="SRR305173 SNPs on Chrom 20-22")

##Two sample comparison methods
tumor <- variantsGR
tumor <- tumor[seqnames(tumor) %in% c("20", "21", "22")]
normal <- get(load("/home/jeremiah/SRR305174/results/SRR305174.filtered_variants_granges.RData"))
normal <- normal[seqnames(normal) %in% c("20", "21", "22")]
raw <-get(load("/home/jeremiah/raw_chr_subset_gr.RData"))
cov<- get(load("/home/jeremiah/SRR305174/SRR305174.coverage.RData"))


##Calling sample-specific mutations

##next we want to compare the tumor and normal samples to find the tumor-specific
##mutations...

##however, before doing so we need to know that the two samples are actually from
##the same patient

##Checking that we have data from the same person

concord <- variantConcordance(tumor, normal)
concord


##now that we know we have tumor and normal data from the same sample we need one
##more bit of data before we get the tumor-specific variants

##Actually calling the sample-specific variants

TS<- tumorNormalCompare(tumor_gr=tumor,
                        normal_gr=normal, normal_raw=raw,
                        normal_cov=cov)


##Writing a vcf file

##Next we want to write out our tumor-specific varints out to a vcf file

cgpGr2vcf(TS, file = "~/temp_ts.vcf", sample_id="Test_sam", project = "SNVsOmuC_Vignette")

##annotaing variants and loading VCF files
library(VariantAnnotation)
fl <- "/home/jeremiah/temp_ts.vcf.gz"
vcf <- readVcf(fl, "hg19")
vcf
rowData(vcf)

#######################################################################
## We can also use VariantAnnotation to identify potentially functional
## using the GRanges objects generated by gmapR and SNVsOmuC directly
#######################################################################

seqlevels(TS) <- paste("chr", seqlevels(TS), sep = "")
genome(TS) <- "hg19"
seqlevels(TS)<-seqlevels(TS)[1:24]
strand(TS)<-"*"
library(TxDb.Hsapiens.UCSC.hg19.knownGene)
txdb <- TxDb.Hsapiens.UCSC.hg19.knownGene

coding <- predictCoding(TS, varAllele=DNAStringSet(values(TS)$alt), txdb, seqSource=Hsapiens)
coding


##Making some plots

##Finally we want make some plots to check out our data.

##The first plot shows us the mutation transition/transversion rate matrix

plotTitv(variantsGR, main = "Single-sample SNVs")
plotTitv(TS, main = "tumor specific mutations")


##and finally we want to plot our variants on the genome

seqlevels(tumor) <- paste("chr", seqlevels(tumor), sep = "")
genome(tumor) <- "hg19"
seqlevels(tumor)<-seqlevels(tumor)[1:24]

plotTumor(tumor,TS)

\end{document}