## R commands to do many HWE tests, and interpret them
## Anything after a hash (#) is a quote

## load the data into current R session;

setwd("C:/Documents and Settings/kenrice/Desktop/BiocWorkshop")

load("amd1.Rdata")
ls()
names(amd1)

# we have SNP data for 96 Age-related Macular Degeneration cases, 
# and 50 controls

dim(amd1)
# lots of SNPs! ...9202 of them, i.e. Chromosome 1

##
## Someone else coded the Exact HWE test...
##
install.packages("hwde")
library(hwde)
?hwexact # we're about to use this function a lot

##
## a little function
## it will take row 'i', do the HWE test in the controls
##

my.little.function <- function(i){
	snps <- amd1[i, 5:100]
	count.AA <- sum(snps==1)
	count.Aa <- sum(snps==2)
	count.aa <- sum(snps==3)
	hwexact( count.AA, count.Aa, count.aa )
}

system.time(all.hwe.p <- sapply(1:9202, my.little.function))
alarm()

## Interpreting the results

# 1. A very naive thing to do
table(all.hwe.p < 0.05)
9202*0.05

# 2. A smarter thing to do
sapply(1:6, function(r){sum(all.hwe.p< 10^(-r)) } )

# How many 'hits' should we expect?
rbind(
Obs =sapply(1:6, function(r){sum(all.hwe.p< 10^(-r)) } ),
Exp =round( 9202 * 10^-(1:6), 5)
)
# ... so roughly what's happening?

# 3. A plot tells us the first part of the story;

hist(all.hwe.p)
# The exact HWE test has a big spike at 1 (e.g. 0,0,n )

# BUT that's not the interesting part!
# Recall we were getting excessive *small* p-values;

plot( ppoints(9202), sort(all.hwe.p), xlab="ordered p  [Expected]",ylab="ordered p  [Observed]" )
title("Ordered p-values")
abline(h=c(0.05, 10/9202, 1/9202, 0.05/9202), lwd=4:1, col="red" )
abline(0,1, lty=2, col="blue", lwd=2)
legend("bottomright", legend=c(expression(alpha==0.05),expression(alpha==10/9202),expression(alpha==1/9202),expression(alpha==0.05/9202)) , lwd=4:1, col="red", bg="white")
legend("topleft", "nothing interesting\n going on", lty=2, lwd=2, col="blue", bty="n")

plot( -log10(ppoints(9202)), -log10(sort(all.hwe.p)), xlab="ordered -log10(p)  [Expected]",ylab="ordered -log10(p)  [Observed]" )
title("Ordered p-values, -log10 transform")
abline(h=-log10(c(0.05, 10/9202, 1/9202, 0.05/9202)), lwd=4:1, col="red" )
abline(0,1, lty=2, col="blue", lwd=2)
legend("left", legend=c(expression(alpha==0.05),expression(alpha==10/9202),expression(alpha==1/9202),expression(alpha==0.05/9202)) , lwd=4:1, col="red", bty="n")
legend("topleft", "nothing interesting\n going on", lty=2, lwd=2, col="blue", bty="n")

# How much variation would we expect by chance?

lo.int <- qbeta(0.005, 1:9202, 9202:1)
hi.int <- qbeta(0.995, 1:9202, 9202:1)

plot( -log10(ppoints(9202)), -log10(sort(all.hwe.p)), xlab="ordered -log10(p)  [Expected]",ylab="ordered -log10(p)  [Observed]" , ylim=c(0,8), type="n")
title("Ordered p-values, -log10 transform, truncated")
polygon( -log10(c(ppoints(9202), rev(ppoints(9202)))), -log10(c(lo.int, rev(hi.int))), density=NA, col="cyan" )
points( -log10(ppoints(9202)), pmin(8, -log10(sort(all.hwe.p))), pch=ifelse( -log10(sort(all.hwe.p))>8, 24, 21)  )
abline(0,1, lty=2, col="blue", lwd=2)
legend("topleft", c("pointwise 99% interval", "nothing interesting\n going on"), lty=1:2, lwd=c(10,2), col=c("cyan", "blue"), bty="n")


# A crude (but okay) estimate of FDR;

q <- sort(all.hwe.p)*9202/1:9202
plot(x=sort(all.hwe.p), y=q, xlab="ordered p-values", main="Highest FDR at which each SNP rejected, by BH method")

plot(x=sort(all.hwe.p), y=q, xlab="ordered p-values", log="x", main="Highest FDR at which each SNP rejected, by BH method")

# so who are the worst offenders?

amd1$hwe.p <- signif(all.hwe.p, 3)
amd1$q <- rep(NA, 9202)
amd1$q[order(all.hwe.p)] <- signif(q,3)
amd1$nAA <- rowSums(amd1[5:150]==1)
amd1$nAa <- rowSums(amd1[5:150]==2)
amd1$naa <- rowSums(amd1[5:150]==3)

subset(amd1[,c("rsID","nAA","nAa","naa","q", "hwe.p")], amd1$q<0.01)

