## ----extracode,echo=FALSE,error=FALSE,message=FALSE---------------------- suppressMessages({ exs2se = function (x, assayname = "exprs", fngetter = function(z) rownames(exprs(z)), annoDb, probekeytype = "PROBEID", duphandler = function(z) { if (any(isd <- duplicated(z[, "ENTREZID"]))) return(z[!isd, , drop = FALSE]) z }, signIsStrand = TRUE, ucsdChrnames = TRUE) { annopk = annoDb stopifnot(is(annopk, "ChipDb") | is(annopk, "OrgDb")) fn = fngetter(x) locd = duphandler(fulls <- select(annopk, keytype = probekeytype, keys = fn, columns = c("CHR", "CHRLOC", "CHRLOCEND"))) nfulls = na.omit(fulls) nmultiaddr = nrow(nfulls) - length(fn) rownames(locd) = locd[, probekeytype] locd = na.omit(locd) dropped = setdiff(fn, rownames(locd)) if (length(dropped) > 0) warning(paste("there were", length(dropped), "addresses dropped owing to missing address information in bioc annotation")) locd = locd[intersect(rownames(locd), fn), ] strand = rep("*", nrow(locd)) if (signIsStrand) strand = ifelse(locd[, "CHRLOC"] > 0, "+", "-") chpref = "" if (ucsdChrnames) chpref = "chr" rowd = GRanges(paste0(chpref, locd[, "CHR"]), IRanges(abs(locd[, "CHRLOC"]), abs(locd[, "CHRLOCEND"])), strand = strand) names(rowd) = rownames(locd) metadata(rowd)$dropped = dropped metadata(rowd)$nmultiaddr = nmultiaddr hasrowd = match(names(rowd), fn, nomatch = 0) ex = x[hasrowd, ] stopifnot(nrow(exprs(ex)) == length(rowd)) ed = SimpleList(initExptData = experimentData(ex)) SummarizedExperiment(assays = SimpleList(exprs = exprs(ex)), rowData = rowd, colData = DataFrame(pData(ex)), exptData = ed) } genemodel = function (sym, genome = "hg19", force = FALSE) { stopifnot(genome == "hg19") if (!exists("txdb") || force) txlib() require(org.Hs.eg.db) num = get(sym, revmap(org.Hs.egSYMBOL)) exonsBy(txdb, by = "gene")[[num]] } txlib = function () { library(TxDb.Hsapiens.UCSC.hg19.knownGene) txdb <<- TxDb.Hsapiens.UCSC.hg19.knownGene message("txdb assigned") invisible(NULL) } rangeFromToksViaEntrez = function (toks, genome = "hg19", force = FALSE, takeRange=TRUE, orgDb=Homo.sapiens, typesToTry=c("SYMBOL", "GOID", "PATH", "TERM"), routeSink=TRUE) { # # convert tokens to GRanges after translation to Entrez ids # takeRange if true reduces answer to 'range' of exons # open question whether we also want to cache specific models # require(stashR) stopifnot(genome == "hg19") # # cache exonsBy of TxDb # require(deparse(substitute(orgDb)), character.only=TRUE) modcon = new("localDB", dir="~/.SEresolvers", name="rowResolvers") if (!dbExists(modcon, "hg19.exbygene")) { message("one time cache creation for TxDb...") txlib() # special function to create txdb in .GlobalEnv dbInsert( modcon, "hg19.exbygene", exonsBy(txdb, by="gene") ) } allex = dbFetch(modcon, "hg19.exbygene") tf = file(tempfile(), open="wt") if (routeSink) message("resolving row selection tokens... (messages diverted)") if (routeSink) sink(tf, type="message") for (curty in typesToTry) { num = try(select(orgDb, keytype=curty, keys=toks, columns="ENTREZID")[,"ENTREZID"]) if (!inherits(num, "try-error")) break } if (routeSink) sink(NULL, type="message") if (routeSink) message("messages restored") if (inherits(num, "try-error")) stop("can't resolve toks") inds = match(num, names(allex)) filt = force if (takeRange) filt = range ans = do.call(c, lapply(1:length(inds), function(x) try({ tmp = filt(allex[[inds[x]]]) tmp$ENTREZID=num[x] tmp}))) syms = select(Homo.sapiens, keys=ans$ENTREZID, keytype="ENTREZID", columns="SYMBOL") if (length(unique(syms[,1]))==nrow(syms)) ans$SYMBOL = syms[,2] else { ssyms = split(syms[,2], syms[,1])[ syms[,1] ] # retain order s1 = sapply(ssyms, "[", 1) ans$SYMBOL = s1 } ans } hasRowResolver = function(x) !is.null(exptData(x)$rowResolverHook) getRowResolverTypes = function(x) exptData(x)$rowResolverHook() tryRowResolution = function(x,i) { ress = getRowResolverTypes(x) rngs = rangeFromToksViaEntrez(toks=i, typesToTry=ress) ov = findOverlaps( rowData(x), rngs ) queryHits(ov) } # based on devel 1.17.26 GRanges 1.17.26 setMethod("[", c("SummarizedExperiment", "ANY", "ANY", "ANY"), function (x, i, j, ..., drop = TRUE) { clone = GenomicRanges:::clone .SummarizedExperiment.charbound = GenomicRanges:::.SummarizedExperiment.charbound .SummarizedExperiment.assays.subset = GenomicRanges:::.SummarizedExperiment.assays.subset if (1L != length(drop) || (!missing(drop) && drop)) warning("'drop' ignored '[,", class(x), ",ANY,ANY-method'") if (missing(i) && missing(j)) return(x) if (!missing(i) && is.character(i)) { if (hasRowResolver(x)) i = tryRowResolution(x, i) # will be vector of integers else { fmt <- paste0("<", class(x), ">[i,] index out of bounds: %s") i <- .SummarizedExperiment.charbound(i, rownames(x), fmt) } } if (!missing(j) && is.character(j)) { fmt <- paste0("<", class(x), ">[,j] index out of bounds: %s") j <- .SummarizedExperiment.charbound(j, colnames(x), fmt) } if (!missing(i) && !missing(j)) { ii <- as.vector(i) jj <- as.vector(j) x <- clone(x, ..., rowData = rowData(x)[i], colData = colData(x)[j, , drop = FALSE], assays = .SummarizedExperiment.assays.subset(x, ii, jj)) } else if (missing(i)) { jj <- as.vector(j) x <- clone(x, ..., colData = colData(x)[j, , drop = FALSE], assays = .SummarizedExperiment.assays.subset(x, j = jj)) } else { ii <- as.vector(i) x <- clone(x, ..., rowData = rowData(x)[i], assays = .SummarizedExperiment.assays.subset(x, ii)) } x }) library(knitcitations) library(bibtex) allbib = read.bibtex("/data/eQTL2014/eq.bib") }) ## ----trans1,eval=FALSE--------------------------------------------------- ## library(Biobase) ## load("/data/eQTL2014/montpick_eset.RData") ## mp = montpick.eset ## annotation(mp) = "org.Hs.eg.db" ## tmp = select(org.Hs.eg.db, keytype="ENSEMBL", columns="ENTREZID", ## keys=featureNames(mp)) ## ezs = split(tmp[,2], tmp[,1]) ## ezs = sapply(ezs, "[", 1) ## drop = which(duplicated(ezs) | is.na(ezs)) ## if (length(drop)>0) ezs = ezs[-drop] ## mp = mp[names(ezs)] ## featureNames(mp) = as.character(ezs[featureNames(mp)]) ## mpSE = exs2se(mp, annoDb = org.Hs.eg.db, probekeytype="ENTREZID") ## ----getd---------------------------------------------------------------- load("/data/eQTL2014/rmpDEPF.rda") # rlogged Montgomery Pickrell DESeqDataSet, some positive, filtered rmpDEPF ## ----lkgset, warning=FALSE, message=FALSE-------------------------------- load("/data/eQTL2014/s3url.rda") s3url library(VariantAnnotation) ## ----lkg, warning=FALSE, cache=TRUE, message=FALSE----------------------- head = scanVcfHeader(s3url) str(head, max.level=4) ## ----getli--------------------------------------------------------------- library(GGtools) ## ----doit,eval=FALSE----------------------------------------------------- ## library(foreach) ## library(doParallel) ## registerDoSEQ() ## r17 = rmpDEPF[ which(seqnames(rmpDEPF)=="chr17"),] ## set.seed(1234) ## s3urlb = sub("22", "17", s3url) ## c17 = cisAssoc(r17, TabixFile(s3urlb), cisradius=100000, lbmaf=.025) ## c17d = addmindist(c17) ## ----dod,echo=FALSE------------------------------------------------------ load("/data/eQTL2014/c17d.rda") ## ----lkres--------------------------------------------------------------- c17d[1:3,1:5] names(mcols(c17d)) ## ----dofd---------------------------------------------------------------- fdr = pifdr( c17d$chisq, c(c17d$permScore_1, c17d$permScore_2, c17d$permScore_3) ) sum(fdr == 0) sum(fdr <= .05) ## ----dosens1------------------------------------------------------------- library(data.table) library(foreach) registerDoSEQ() dt1 = data.table(as.data.frame(mcols(c17d))) ## ----runsen,cache=TRUE--------------------------------------------------- s1 = eqsens_dt( dt1 ) ## ----dopl,fig=TRUE------------------------------------------------------- plotsens(s1, ylab="Count of significant SNP-gene pairs at given FDR") ## ----rnsen2,cache=TRUE--------------------------------------------------- s2 = eqsens_dt( dt1, by="probes" ) ## ----pl2,fig=TRUE-------------------------------------------------------- plotsens(s2, ylab="Count of genes with evidence of eQTL at given FDR") ## ----doonePlot, fig=TRUE------------------------------------------------- plotOne = function (summex, vcf.tf, whicha, whichv, genome="hg19", ... ) { sampidsInSumm = colnames(summex) sampidsInVCF = sampsInVCF(vcf.tf) oksamp = intersect(sampidsInSumm, sampidsInVCF) stopifnot(length(oksamp) > 0) summex = summex[, oksamp] vp = ScanVcfParam(fixed = "ALT", info = NA, geno = "GT", samples = oksamp, which = whichv) vdata = readVcf(vcf.tf, genome = genome, param = vp) rdd = rowData(vdata) vdata = GGtools:::snvsOnly(vdata) gtdata = factor(geno(vdata)[[1]]) plot( as.numeric(assays(summex)[[1]][whicha,]) ~ gtdata , ...) } r17 = rmpDEPF[ which(as.character(seqnames(rmpDEPF))=="chr17"),] s3urlb = sub("22", "17", s3url) plotOne(r17, TabixFile(s3urlb), 1, c17d[1] ) ## ----dofff,fig=TRUE------------------------------------------------------ opar = par(no.readonly=TRUE) par(mfrow=c(2,2)) for (i in 2:5) plotOne(r17, TabixFile(s3urlb), i, c17d[i] ) par(opar) ## ----lktops,eval=FALSE--------------------------------------------------- ## order(fdr)[1:10] ## c17d[7659] ## args(plotOne) ## plotOne(r17, TabixFile(s3urlb), "114881", c17d[7659] ) ## ----lkgw, message=FALSE, warning=FALSE, eval=FALSE---------------------- ## library(gwascat) ## curgw = makeCurrentGwascat() ## curgw ## ----getgw--------------------------------------------------------------- load("/data/eQTL2014/curgwHG19.rda") curgwHG19[1:3,1:5] ## ----results='asis',echo=FALSE------------------------------------------- bibliography() #style="markdown")