eQTL tutorial, Bioc 2014

VJ Carey stvjc at channing dot harvard dot edu ```{r extracode,echo=FALSE,error=FALSE,message=FALSE} suppressMessages({ exs2se = function (x, assayname = "exprs", fngetter = function(z) rownames(exprs(z)), annoDb, probekeytype = "PROBEID", duphandler = function(z) { if (any(isd <- duplicated(z[, "ENTREZID"]))) return(z[!isd, , drop = FALSE]) z }, signIsStrand = TRUE, ucsdChrnames = TRUE) { annopk = annoDb stopifnot(is(annopk, "ChipDb") | is(annopk, "OrgDb")) fn = fngetter(x) locd = duphandler(fulls <- select(annopk, keytype = probekeytype, keys = fn, columns = c("CHR", "CHRLOC", "CHRLOCEND"))) nfulls = na.omit(fulls) nmultiaddr = nrow(nfulls) - length(fn) rownames(locd) = locd[, probekeytype] locd = na.omit(locd) dropped = setdiff(fn, rownames(locd)) if (length(dropped) > 0) warning(paste("there were", length(dropped), "addresses dropped owing to missing address information in bioc annotation")) locd = locd[intersect(rownames(locd), fn), ] strand = rep("*", nrow(locd)) if (signIsStrand) strand = ifelse(locd[, "CHRLOC"] > 0, "+", "-") chpref = "" if (ucsdChrnames) chpref = "chr" rowd = GRanges(paste0(chpref, locd[, "CHR"]), IRanges(abs(locd[, "CHRLOC"]), abs(locd[, "CHRLOCEND"])), strand = strand) names(rowd) = rownames(locd) metadata(rowd)$dropped = dropped metadata(rowd)$nmultiaddr = nmultiaddr hasrowd = match(names(rowd), fn, nomatch = 0) ex = x[hasrowd, ] stopifnot(nrow(exprs(ex)) == length(rowd)) ed = SimpleList(initExptData = experimentData(ex)) SummarizedExperiment(assays = SimpleList(exprs = exprs(ex)), rowData = rowd, colData = DataFrame(pData(ex)), exptData = ed) } genemodel = function (sym, genome = "hg19", force = FALSE) { stopifnot(genome == "hg19") if (!exists("txdb") || force) txlib() require(org.Hs.eg.db) num = get(sym, revmap(org.Hs.egSYMBOL)) exonsBy(txdb, by = "gene")[[num]] } txlib = function () { library(TxDb.Hsapiens.UCSC.hg19.knownGene) txdb <<- TxDb.Hsapiens.UCSC.hg19.knownGene message("txdb assigned") invisible(NULL) } rangeFromToksViaEntrez = function (toks, genome = "hg19", force = FALSE, takeRange=TRUE, orgDb=Homo.sapiens, typesToTry=c("SYMBOL", "GOID", "PATH", "TERM"), routeSink=TRUE) { # # convert tokens to GRanges after translation to Entrez ids # takeRange if true reduces answer to 'range' of exons # open question whether we also want to cache specific models # require(stashR) stopifnot(genome == "hg19") # # cache exonsBy of TxDb # require(deparse(substitute(orgDb)), character.only=TRUE) modcon = new("localDB", dir="~/.SEresolvers", name="rowResolvers") if (!dbExists(modcon, "hg19.exbygene")) { message("one time cache creation for TxDb...") txlib() # special function to create txdb in .GlobalEnv dbInsert( modcon, "hg19.exbygene", exonsBy(txdb, by="gene") ) } allex = dbFetch(modcon, "hg19.exbygene") tf = file(tempfile(), open="wt") if (routeSink) message("resolving row selection tokens... (messages diverted)") if (routeSink) sink(tf, type="message") for (curty in typesToTry) { num = try(select(orgDb, keytype=curty, keys=toks, columns="ENTREZID")[,"ENTREZID"]) if (!inherits(num, "try-error")) break } if (routeSink) sink(NULL, type="message") if (routeSink) message("messages restored") if (inherits(num, "try-error")) stop("can't resolve toks") inds = match(num, names(allex)) filt = force if (takeRange) filt = range ans = do.call(c, lapply(1:length(inds), function(x) try({ tmp = filt(allex[[inds[x]]]) tmp$ENTREZID=num[x] tmp}))) syms = select(Homo.sapiens, keys=ans$ENTREZID, keytype="ENTREZID", columns="SYMBOL") if (length(unique(syms[,1]))==nrow(syms)) ans$SYMBOL = syms[,2] else { ssyms = split(syms[,2], syms[,1])[ syms[,1] ] # retain order s1 = sapply(ssyms, "[", 1) ans$SYMBOL = s1 } ans } hasRowResolver = function(x) !is.null(exptData(x)$rowResolverHook) getRowResolverTypes = function(x) exptData(x)$rowResolverHook() tryRowResolution = function(x,i) { ress = getRowResolverTypes(x) rngs = rangeFromToksViaEntrez(toks=i, typesToTry=ress) ov = findOverlaps( rowData(x), rngs ) queryHits(ov) } # based on devel 1.17.26 GRanges 1.17.26 setMethod("[", c("SummarizedExperiment", "ANY", "ANY", "ANY"), function (x, i, j, ..., drop = TRUE) { clone = GenomicRanges:::clone .SummarizedExperiment.charbound = GenomicRanges:::.SummarizedExperiment.charbound .SummarizedExperiment.assays.subset = GenomicRanges:::.SummarizedExperiment.assays.subset if (1L != length(drop) || (!missing(drop) && drop)) warning("'drop' ignored '[,", class(x), ",ANY,ANY-method'") if (missing(i) && missing(j)) return(x) if (!missing(i) && is.character(i)) { if (hasRowResolver(x)) i = tryRowResolution(x, i) # will be vector of integers else { fmt <- paste0("<", class(x), ">[i,] index out of bounds: %s") i <- .SummarizedExperiment.charbound(i, rownames(x), fmt) } } if (!missing(j) && is.character(j)) { fmt <- paste0("<", class(x), ">[,j] index out of bounds: %s") j <- .SummarizedExperiment.charbound(j, colnames(x), fmt) } if (!missing(i) && !missing(j)) { ii <- as.vector(i) jj <- as.vector(j) x <- clone(x, ..., rowData = rowData(x)[i], colData = colData(x)[j, , drop = FALSE], assays = .SummarizedExperiment.assays.subset(x, ii, jj)) } else if (missing(i)) { jj <- as.vector(j) x <- clone(x, ..., colData = colData(x)[j, , drop = FALSE], assays = .SummarizedExperiment.assays.subset(x, j = jj)) } else { ii <- as.vector(i) x <- clone(x, ..., rowData = rowData(x)[i], assays = .SummarizedExperiment.assays.subset(x, ii)) } x }) library(knitcitations) library(bibtex) allbib = read.bibtex("/data/eQTL2014/eq.bib") }) ```

Basic data from RECOUNT

We will use transcript profiles obtained through RNA-seq applied to HapMap cell lines (`r citet(allbib[["Montgomery2010"]])`). The reported gene-level count data have been rendered as an ExpressionSet (montpick\_eset.RData) in the RECOUNT project (`r citet(allbib[["Frazee2011"]])`) using the ENSEMBL nomenclature for genes. These counts were filtered to exclude genes with uniform zero value over all samples, subjected to rlog transformation (DESeq2), and then filtered to include only those with MAD at the 40th percentile of the overall distribution of genewise MADs. Characteristic code for the transformation to SummarizedExperiment is given here. You do not need to execute this as the transformed data are available for loading as noted below. (exs2se() is a bit of custom code available in the source of this vignette.) ```{r trans1,eval=FALSE} library(Biobase) load("/data/eQTL2014/montpick_eset.RData") mp = montpick.eset annotation(mp) = "org.Hs.eg.db" tmp = select(org.Hs.eg.db, keytype="ENSEMBL", columns="ENTREZID", keys=featureNames(mp)) ezs = split(tmp[,2], tmp[,1]) ezs = sapply(ezs, "[", 1) drop = which(duplicated(ezs) | is.na(ezs)) if (length(drop)>0) ezs = ezs[-drop] mp = mp[names(ezs)] featureNames(mp) = as.character(ezs[featureNames(mp)]) mpSE = exs2se(mp, annoDb = org.Hs.eg.db, probekeytype="ENTREZID") ```

Transformed expression data in SummarizedExperiment

We obtain the basic image of transformed count data as follows. ```{r getd} load("/data/eQTL2014/rmpDEPF.rda") # rlogged Montgomery Pickrell DESeqDataSet, some positive, filtered rmpDEPF ```

Genotype data in an Amazon S3 bucket

All the data generated in the 1000 genomes project is available in a S3 (simple storage service) container. We have a URL that can be used to access the genotypes from chr22 in VCF format. ```{r lkgset, warning=FALSE, message=FALSE} load("/data/eQTL2014/s3url.rda") s3url library(VariantAnnotation) ```{r lkg, warning=FALSE, cache=TRUE, message=FALSE} head = scanVcfHeader(s3url) str(head, max.level=4) ```

A naive analysis

The GGtools package includes a function, cisAssoc(), that can test additive genetic associations in cis between SNP genotypes housed in tabix-indexed VCF and expression data housed in a SummarizedExperiment. The test is the score test for a generalized linear model fit to independent samples with a continuous response, with response variance approximately independent of response mean. These assumptions are generally not met for count data from RNA-seq experiments, but they may hold reasonably approximately after the rlog transformation. We will check on the severity of the violations after performing some tests. Here is code that computes, for 200 genes on chr17, association test statistics and versions thereof under permutation of expression against genotype. Do not perform these calculations, they will take too long for the tutorial. The answer is provided in the tutorial data, see below. ```{r getli} library(GGtools) ```{r doit,eval=FALSE} library(foreach) library(doParallel) registerDoSEQ() r17 = rmpDEPF[ which(seqnames(rmpDEPF)=="chr17"),] set.seed(1234) s3urlb = sub("22", "17", s3url) c17 = cisAssoc(r17, TabixFile(s3urlb), cisradius=100000, lbmaf=.025) c17d = addmindist(c17) ``` ```{r dod,echo=FALSE} load("/data/eQTL2014/c17d.rda") ``` Here is an extract from the output. We have manually added a column 'mindist' with the number of nucleotides from SNP to nearest end of gene body. 'mindist' is zero for SNP lying within the gene body. ```{r lkres} c17d[1:3,1:5] names(mcols(c17d)) ```

Computing the plug-in FDR

The following algorithm is from The Elements of Statistical Learning by Hastie, Tibshirani and Friedman (Springer 2001). We can obtain estimated FDR for the gene-SNP pairs tested in the example above, as follows, using the pifdr() function in GGtools. ```{r dofd} fdr = pifdr( c17d$chisq, c(c17d$permScore_1, c17d$permScore_2, c17d$permScore_3) ) sum(fdr == 0) sum(fdr <= .05) ```

Sensitivity analysis, and "what to count?"

The following computations generate a sensitivity analysis display, confronting the enumeration of significantly associated SNP-gene pairs. ```{r dosens1} library(data.table) library(foreach) registerDoSEQ() dt1 = data.table(as.data.frame(mcols(c17d))) ```{r runsen,cache=TRUE} s1 = eqsens_dt( dt1 ) ``` ```{r dopl,fig=TRUE} plotsens(s1, ylab="Count of significant SNP-gene pairs at given FDR") ``` If instead of counting SNP-gene pairs, we wish to count genes that show evidence of association with some genotype, the plug-in FDR procedure can be used with maximal association scores per gene, for observed and permuted tests. ```{r rnsen2,cache=TRUE} s2 = eqsens_dt( dt1, by="probes" ) ``` ```{r pl2,fig=TRUE} plotsens(s2, ylab="Count of genes with evidence of eQTL at given FDR") ``` There is an indication here that we get a larger yield if we filter more sharply on MAF and distance than we did in the initial run.

Exercise 1.

How can we compute the FDR for the gene-wise hypotheses H0g: Mean expression of gene g is independent of SNP content for all SNP cis to g, under the restrictions cis radius less than 50kb and MAF greater than 5 percent?

Visualization

The following code creates a visualization of genotype-expression association for a selected SNP-gene pair. ```{r doonePlot, fig=TRUE} plotOne = function (summex, vcf.tf, whicha, whichv, genome="hg19", ... ) { sampidsInSumm = colnames(summex) sampidsInVCF = sampsInVCF(vcf.tf) oksamp = intersect(sampidsInSumm, sampidsInVCF) stopifnot(length(oksamp) > 0) summex = summex[, oksamp] vp = ScanVcfParam(fixed = "ALT", info = NA, geno = "GT", samples = oksamp, which = whichv) vdata = readVcf(vcf.tf, genome = genome, param = vp) rdd = rowData(vdata) vdata = GGtools:::snvsOnly(vdata) gtdata = factor(geno(vdata)[[1]]) plot( as.numeric(assays(summex)[[1]][whicha,]) ~ gtdata , ...) } r17 = rmpDEPF[ which(as.character(seqnames(rmpDEPF))=="chr17"),] s3urlb = sub("22", "17", s3url) plotOne(r17, TabixFile(s3urlb), 1, c17d[1] ) ```

Exercise 2.

What are the names of the SNP and gene depicted here? Replot, annotating with this information (improve the x and y labels for maximal clarity of interpretation).

Exercise 3.

What is the FDR for the association shown in the display? We can show four relationships at once fairly simply. ```{r dofff,fig=TRUE} opar = par(no.readonly=TRUE) par(mfrow=c(2,2)) for (i in 2:5) plotOne(r17, TabixFile(s3urlb), i, c17d[i] ) par(opar) ```

Exercise 4.

Depict the associations for four gene-SNP pairs with very small FDR. The four genes should be different. Hint: ```{r lktops,eval=FALSE} order(fdr)[1:10] c17d[7659] args(plotOne) plotOne(r17, TabixFile(s3urlb), "114881", c17d[7659] ) ```

Exercise 5: Master regulators.

Find SNP whose FDR for association with multiple distinct genes is found to be less than one percent. Visualize the relationships.

Connection with "GWAS hits"

We can acquire an image of the current NHGRI GWAS catalog as follows. ```{r lkgw, message=FALSE, warning=FALSE, eval=FALSE} library(gwascat) curgw = makeCurrentGwascat() curgw ``` HOWEVER -- as of 28 July 2014, the coordinates used for the current catalog are GRCh38. We used rtracklayer's liftOver with the appropriate chain file to obtain an hg19 addressing scheme, losing a small number of nonmappable loci. ```{r getgw} load("/data/eQTL2014/curgwHG19.rda") curgwHG19[1:3,1:5] ```

Exercise 6

How frequent are exact coincidences between SNP exhibiting eQTL association at FDR five percent or less, and GWAS hits? Compare this frequency as computed for SNP exhibiting association at FDR 95 percent or greater.

Further exercises

Exercise 7: Population of origin.

We have disregarded the population of origin in this analysis. Use visualization to assess whether the association between gene OSBPL7 and SNP rs17774008 has similar forms in subjects from CEU and YRI populations. You can use s3url to get access to genotype data for chr22. It takes 5 minutes or so to run cisAssoc on chr22 genes and SNP. Obtain analyses for CEU and YRI groups separately. What is the concordance on significance of SNP-gene pairs at FDR five percent or so?

Exercise 8: Efficient analysis with counted expression values.

/data/eQTL2014/montpick_eset.RData has the raw counts. (Note, probe identifiers are in ENSEMBL vocabulary.) Use DESeq2 or edgeR to obtain an FDR for a selected SNP-gene pair. Write a function that does this for any given selection. In principle the mean-variance relationship needs to be modeled separately for each gene-SNP pair, so obtaining a statistically optimal solution for a general search may be laborious. It would be nice to understand the costs of using rlog with blind and fast set to TRUE before doing a Gaussian-like analysis of eQTL, as opposed to pair-specific optimally modeled negative binomial testing for eQTL.

Bibliography

```{r results='asis',echo=FALSE} bibliography() #style="markdown") ```