
############ Analysis of 't Hoen's Tag-seq data in edgeR #############

setwd("tHoen_Tag-seq_Data")
library(edgeR)
targets <- read.delim(file='targets.txt',stringsAsFactors=FALSE)
targets
d <- readDGE(targets,skip=5,comment.char='!')
d

## Filter out rows with fewer than 10 counts in total. These tags
## cannot have a p-value for DE less than 0.01 and dispersion estimation
## is better if we filter out these tags.
d <- d[rowSums(d$counts)>9,]
d

## Do TMM normalization
norm.fact <- calcNormFactors(d$counts, refColumn=1)
norm.fact.GM <- norm.fact/prod(norm.fact)^(1/length(norm.fact))
norm.fact.GM
lib.size.norm <- round(d$samples$lib.size*norm.fact.GM)
lib.size.norm
d$samples$lib.size <- lib.size.norm  # use "effective" library size


## Look at MDS plot to see if any of the samples are obvious outliers
mds <- plotMDS.dge(d, xlim=c(-1,2))
d$samples 

v <- cmdscale( as.dist(mds) )
plot(v, col=c("blue","black")[d$samples$group])
## Note: Sample GSM272322 is a long way from the other samples in the 
## first dimension, so this is a potential outlier. We may consider 
## dropping this sample from our analysis.

## If we want to remove GSM272322 we can do this easily by subsetting 
## our DGEList object as so:
rem <- match("GSM272322",colnames(d$counts))
d <- d[,-rem]
d

## In order to carry out a DE analysis we need to estimate the expression 
## levels and common dispersion for the NB model. 
d <- estimateCommonDisp(d)

## What is our estimate of the common dispersion?
d$common.dispersion
## What is the coefficient of variation for these data? 
## (i.e. proportion by which true expression levels may vary between 
## replicate samples.
sqrt(d$common.dispersion)

## With the common dispersion computed we can look at a mean-variance plot 
## for the raw counts to get a feel for how variable the data are.

pooled.var <- function(y,group) {
## Function to calculate the pooled variance for a vector of data with factor 
## indicating group supplied
   numerator <- 0
   denominator <- 0
   for(i in 1:nlevels(group) ) {
       choosei <- as.logical(match(group,levels(group)[i]))
       choosei[is.na(choosei)] <- FALSE
       ni <- sum(choosei)
       numerator <- numerator + (ni-1)*var(y[choosei])
       denominator <- denominator + ni - 1
   }
   numerator/denominator
}

## Compute the pooled variance for each tag
vars <- apply(round(d$pseudo.alt),1,pooled.var,group=d$samples$group)
## Compute the mean count for each tag
means <- rowMeans(round(d$pseudo.alt))
## Calculate the variance function for the NB model using the common dispersion
lmu <- seq(-1,17,length.out=1000)
nb.var <- 10^lmu + (10^lmu)^2*d$common.dispersion


plot(log2(means),log2(vars),xlab="Gene means (pooled, quantile-adjusted counts)",
     ylab="Gene variances (pooled, quantile-adjusted counts)",
     main="Mean-variance plot for WT vs Gene K/O mice (log2 scale)")
abline(0,1,col="red",lwd=3)

lines(lmu,log10(nb.var),col="dodgerblue3",lwd=3)
legend(12,3,legend=c("Poisson","NB"),col=c("red","dodgerblue3"),lty=1,lwd=3)



## We can now carry out statistical testing for DE using 
## the common dispersion NB model
de.common <- exactTest(d, pair=c("WT","DCLK"))


######### Tagwise Dispersion Analysis ###########
## When computing the tagwise dispersions, we need to decide how much to 
## 'squeeze' the individual tagwise dispersions towards the common dispersion
## value. Choosing a prior.n of 10 gives the common likelihood the weight of 
## 10 tags, which gives a good of amount of stabilization of the tagwise dispersions.

d <- estimateTagwiseDisp(d,prior.n=10)
d
summary(d$tagwise.dispersion)
d$common.dispersion

## We can carry out the exact testing as we did above but this time with the
## tagwise dispersions
de.tagwise <- exactTest(d, pair=c("WT","DCLK"), common.disp=FALSE)
names(de.tagwise)

## Look at the toptags
topTags(de.tagwise)
## We notice substantially larger p-values when using the tagwise dispersions
topTags(de.common)

detags.tgw <- rownames(topTags(de.tagwise)$table)
detags.com <- rownames(topTags(de.common)$table)

o <- order(d$samples$group)
## Look at the actual raw tag counts for these DE genes
d$counts[detags.tgw,o]
d$counts[detags.com,o]

## Or, pseudocounts these DE genes (or you could linearly adjust)
round(d$pseudo.alt[detags.com,o])
round(d$pseudo.alt[detags.tgw,o])

## correspondence in p-value between using common and tagwise dispersion
plot( de.common$table$p.value, de.tagwise$table$p.value, log="xy", pch="." )
abline(0,1,col="blue",lwd=4)


topids.tgw <- rownames(topTags(de.tagwise)$table)
topids.com <- rownames(topTags(de.common)$table)

round(d$pseudo.alt)[intersect(topids.tgw, topids.com),o]
round(d$pseudo.alt)[setdiff(topids.tgw, topids.com),o]
round(d$pseudo.alt)[setdiff(topids.com, topids.tgw),o]

## Using tagwise dispersions downweights those tags with extremely variable 
## counts, e.g. tags that have one very large count in one library, which 
## dominates the other samples


## write results to text file
top.tgw <- topTags(de.tagwise, n=nrow(d$counts))$table
top.com <- topTags(de.common, n=nrow(d$counts))$table

top.tgw$id <- rownames(top.tgw)
top.com$id <- rownames(top.com)


## Number of DE up/down-regulated?
table(up = top.tgw$table$logFC > 0, de = top.tgw$table$FDR < 0.05)


normData <- as.data.frame(round(d$pseudo.alt[,o]))
normData$id <- rownames(normData)

m <- merge(top.tgw, top.com, by="id", suffixes=c(".tgw",".com"))
m <- merge(m, normData, by="id")

write.table( m, "results.xls", sep="\t", row.names=FALSE, quote=FALSE )

## another consideration -- "intensity"-dependent dispersions

dd <- estimateCommonDisp(d)

binnedDispersion <- function(d, nBins=1500) {
  conc <- d$conc$conc.common
  uquans <- unique(quantile(conc, p=(0:nBins)/nBins))
  bins <- cut(conc, breaks=uquans, include.lowest=TRUE)

  inds <- split(1:nrow(d$counts), bins)
  disps <- sapply(inds, function(u) estimateCommonDisp(d[u,])$common.dispersion)

  data.frame(conc=(uquans[-1]+uquans[-length(uquans)])/2, disp=disps)
}

bd <- binnedDispersion(d)

plot(bd, log="xy")
abline(h=d$common.dispersion)

## a few approaches are being evaluated on how to best address this
## e.g. weighted likelihood has the nice flexibility that a common 
## dispersion could be taken locally


## some more comparisons (with DESeq)

library(DESeq)
cds <- newCountDataSet( d$counts, d$samples$group )
cds <- estimateSizeFactors( cds )
cds <- estimateVarianceFunctions( cds )
res <- nbinomTest( cds, "WT", "DCLK")

mm1 <- match(res$id, rownames(top.tgw))
mm2 <- match(res$id, rownames(top.com))

par(mfrow=c(1,2))
plot( res$pval, top.tgw$PValue[mm1], log="xy", pch=19, cex=.2)
abline(0,1,col="blue",lwd=4)
plot( res$pval, top.com$PValue[mm2], log="xy", pch=19, cex=.2)
abline(0,1,col="blue",lwd=4)

w <- which(res$pval < 1e-20 & top.tgw$PValue[mm1] > .01 )
round(d$pseudo.alt[w,o])
w <- which(res$pval < 1e-20 & top.com$PValue[mm2] > .01 )
round(d$pseudo.alt[w,o])


##> residualsEcdfPlot(cds,"WT")
##> residualsEcdfPlot(cds,"DCLK")


##> w <- which(res$pval < 1e-20 & top.tgw$PValue[mm1] > .01 )
##> round(d$pseudo.alt[w,o])
##                  GSM272105 GSM272318 GSM272320 GSM272322 GSM272106 GSM272319 GSM272321 GSM272323
##TCGGACTCGTATGCCGT        95       383       422        48        14        12       477         7
##TCGGACTGTAGAATCGT       123       301      3047        37        64        40       808        26
##TCGGACTGTAGAACTTC        68       147      1560        21        17         3       571        14
##TCGGACTGTATCGTATG        86       336       994        40        17        33       496        13
##TCGGACTGTAGATCGTA        43       110       861        24        13         8       269         9

##> res[w,]
##                     id baseMean baseMeanA baseMeanB foldChange log2FoldChange     pval     padj resVarA resVarB
##19028 TCGGACTCGTATGCCGT      194     157.3       230       1.47          0.551 6.44e-22 4.11e-18   19.50    8.89
##30779 TCGGACTGTAGAATCGT      580     272.7       888       3.25          1.703 3.62e-30 6.92e-26    7.27   56.08
##32875 TCGGACTGTAGAACTTC      319     184.3       454       2.46          1.300 1.62e-29 2.48e-25   11.87   51.15
##49745 TCGGACTGTATCGTATG      265     168.4       362       2.15          1.102 8.00e-48 3.06e-43   12.01   26.45
##53707 TCGGACTGTAGATCGTA      179      96.3       261       2.71          1.441 5.91e-23 4.52e-19    7.85   49.16



