###################################################
### chunk number 1: Setup
###################################################
library("Biobase")
library("annotate")
library("Category")
library("hgu95av2")
library("genefilter")


###################################################
### chunk number 2: Example-subset
###################################################
library("ALL")
data(ALL)
Bcell <- grep("^B", as.character(ALL$BT))
bcrAblOrNegIdx <- which(as.character(ALL$mol.biol) %in% c("NEG", "BCR/ABL"))
eset <- ALL[, intersect(Bcell, bcrAblOrNegIdx)]
eset$mol.biol <- factor(eset$mol.biol)
numBN <- length(eset$mol.biol)


###################################################
### chunk number 3: noEntrez
###################################################
entrezIds <- mget(geneNames(eset), envir=hgu95av2LOCUSID)
haveEntrezId <- names(entrezIds)[sapply(entrezIds, function(x) !is.na(x))]
numNoEntrezId <- length(geneNames(eset)) - length(haveEntrezId)
eset <- eset[haveEntrezId, ]


###################################################
### chunk number 4: simplefiltering
###################################################
## Non-specific filtering based on IQR
lowQ = rowQ(eset, floor(0.25 * numBN))
upQ = rowQ(eset, ceiling(0.75 * numBN))
iqrs = upQ - lowQ

selected <- iqrs > 0.5

nsFiltered <- eset[selected, ]


###################################################
### chunk number 5: reduceto1to1
###################################################
## Reduce to unique probe <--> gene mapping by keeping largest IQR
## This gives us "unique genes" in the non-specific filtered gene
## set which simplifies further calculations.
nsFilteredIqr <- iqrs[selected]
uniqGenes <- findLargest(geneNames(nsFiltered), nsFilteredIqr, "hgu95av2")
nsFiltered <- nsFiltered[uniqGenes, ]

## basic stats on our non-specific filter result
numSelected <- length(geneNames(nsFiltered))
numBcrAbl <- sum(nsFiltered$mol.biol == "BCR/ABL")
numNeg <- sum(nsFiltered$mol.biol == "NEG")


###################################################
### chunk number 6: noKEGG
###################################################
## Remove genes with no PATH mapping
havePATH <- sapply(mget(geneNames(nsFiltered), hgu95av2PATH),
                 function(x) if (length(x) == 1 && is.na(x)) FALSE else TRUE)
numNoPATH<- sum(!havePATH)
nsF <- nsFiltered[havePATH, ]


###################################################
### chunk number 7: compAmat
###################################################

 Am = PWAmat("hgu95av2")
 egN = unlist(mget(geneNames(nsF), hgu95av2LOCUSID))

 sub1 = match(egN, row.names(Am))
 
 Am = Am[sub1,]
 dim(Am)
 table(colSums(Am))


###################################################
### chunk number 8: compttests
###################################################

rtt = rowttests(nsF, "mol.biol")
rttStat = rtt$statistic


###################################################
### chunk number 9: reducetoInt
###################################################
 Amat = t(Am)
 rs = rowSums(Amat)
 Amat2 = Amat[rs>10,]
 rs2 = rs[rs>10]
 nCats = length(rs2)


###################################################
### chunk number 10: pctests
###################################################

tA = as.vector(Amat2 %*% rttStat)
tAadj = tA/sqrt(rs2)

names(tA) = names(tAadj) = row.names(Amat2)


###################################################
### chunk number 11: qqplot
###################################################
 qqnorm(tAadj)


###################################################
### chunk number 12: findSmPW
###################################################

smPW = tAadj[tAadj < -5]
pwName = KEGGPATHID2NAME[[names(smPW)]]
pwName


###################################################
### chunk number 13: mnplot
###################################################
 KEGGmnplot(names(smPW), nsF, "hgu95av2", nsF$"mol.biol")


###################################################
### chunk number 14: heatmap
###################################################
 KEGG2heatmap(names(smPW), nsF, "hgu95av2")


###################################################
### chunk number 15: ttperm
###################################################

NPERM = 100
ttp = ttperm(exprs(nsF), nsF$mol.biol, NPERM)

permDm = do.call("cbind", lapply(ttp$perms, function(x) x$statistic))

permD = Amat2 %*% permDm


pvals = matrix(NA, nr=nCats, ncol=2)
dimnames(pvals) = list(row.names(Amat2), c("Lower", "Upper"))

for(i in 1:nCats) {
    pvals[i,1] = sum(permD[i,] < tA[i])/NPERM
    pvals[i,2] = sum(permD[i,] > tA[i])/NPERM
}

ord1 = order(pvals[,1])
lowC = (row.names(pvals)[ord1])[pvals[ord1,1]< 0.05]

highC = row.names(pvals)[pvals[,2] < 0.05]

 getPathNames(lowC)

 getPathNames(highC)

 lnhC = length(highC)


###################################################
### chunk number 16: findAmap
###################################################

AmChr = MAPAmat("hgu95av2", minCount=5)


###################################################
### chunk number 17: sub2ourData
###################################################

 subC = row.names(AmChr) %in% egN
 
 AmChr = AmChr[subC,]
 dim(AmChr)
 table(colSums(AmChr))


###################################################
### chunk number 18: 
###################################################
sessionInfo()