1 Introduction

This document provides an introduction of the ELMER.data, which contains supporting data for ELMER (Yao, L., Shen, H., Laird, P. W., Farnham, P. J., & Berman, B. P. 2015). ELMER is package using DNA methylation to identify enhancers, and correlates enhancer state with expression of nearby genes to identify one or more transcriptional targets. Transcription factor (TF) binding site analysis of enhancers is coupled with expression analysis of all TFs to infer upstream regulators. ELMER.data provide 3 necessary data for ELMER analysis:

  1. Probes information: files with DNA methylation platforms metadata retrieved from http://zwdzwd.github.io/InfiniumAnnotation (Zhou, Wanding and Laird, Peter W and Shen, Hui 2016).
  2. Probes.motif: motif occurences within \(\pm 250bp\) of probe sites on HM450K/EPIC array aligned against hg19/hg38.

1.1 Installing and loading ELMER.data

To install this package, start R and enter

devtools::install_github(repo = "tiagochst/ELMER.data")
library("ELMER.data")
library("GenomicRanges")

2 Contents

2.1 ENSEMBL gene and TSS information

Data from GRCh38.p12 and GRCh37.p13 accessed via biomart.

getTranscripts <- function(genome = "hg38"){
  
  tries <- 0L
  msg <- character()
  while (tries < 3L) {
    tss <- tryCatch({
      host <- ifelse(genome == "hg19",  "grch37.ensembl.org","www.ensembl.org")
      message("Accessing ", host, " to get TSS information")
      
      ensembl <- tryCatch({
        useEnsembl("ensembl", dataset = "hsapiens_gene_ensembl", host =  host)
      },  error = function(e) {
        message(e)
        for(mirror in c("asia","useast","uswest")){
          x <- useEnsembl("ensembl",
                          dataset = "hsapiens_gene_ensembl",
                          mirror = mirror,
                          host =  host)
          if(class(x) == "Mart") {
            return(x)
          }
        }
        return(NULL)
      })
      
      if(is.null(host)) {
        message("Problems accessing ensembl database")
        return(NULL)
      }
      attributes <- c("chromosome_name",
                      "start_position",
                      "end_position", "strand",
                      "ensembl_gene_id", 
                      "transcription_start_site",
                      "transcript_start",
                      "ensembl_transcript_id",
                      "transcript_end",
                      "external_gene_name")
      chrom <- c(1:22, "X", "Y","M","*")
      db.datasets <- listDatasets(ensembl)
      description <- db.datasets[db.datasets$dataset=="hsapiens_gene_ensembl",]$description
      message(paste0("Downloading transcripts information from ", ensembl@host, ". Using: ", description))
      
      filename <-  paste0(gsub("[[:punct:]]| ", "_",description),"_tss.rda")
      tss <- getBM(attributes = attributes, filters = c("chromosome_name"), values = list(chrom), mart = ensembl)
      tss <- tss[!duplicated(tss$ensembl_transcript_id),]
      save(tss, file = filename, compress = "xz")
    })
  }
  return(tss)
}

getGenes <- function (genome = "hg19"){
  tries <- 0L
  msg <- character()
  while (tries < 3L) {
    gene.location <- tryCatch({
      host <- ifelse(genome == "hg19", "grch37.ensembl.org", 
                     "www.ensembl.org")
      message("Accessing ", host, " to get gene information")
      ensembl <- tryCatch({
        useEnsembl("ensembl", dataset = "hsapiens_gene_ensembl", 
                   host = host)
      }, error = function(e) {
        message(e)
        for (mirror in c("asia", "useast", "uswest")) {
          x <- useEnsembl("ensembl", dataset = "hsapiens_gene_ensembl", 
                          mirror = mirror, host = host)
          if (class(x) == "Mart") {
            return(x)
          }
        }
        return(NULL)
      })
      if (is.null(host)) {
        message("Problems accessing ensembl database")
        return(NULL)
      }
      attributes <- c("chromosome_name", "start_position", 
                      "end_position", "strand", "ensembl_gene_id", 
                      "entrezgene", "external_gene_name")
      db.datasets <- listDatasets(ensembl)
      description <- db.datasets[db.datasets$dataset == 
                                   "hsapiens_gene_ensembl", ]$description
      message(paste0("Downloading genome information (try:", 
                     tries, ") Using: ", description))
      filename <- paste0(gsub("[[:punct:]]| ", "_", description), 
                         ".rda")
      if (!file.exists(filename)) {
        chrom <- c(1:22, "X", "Y")
        gene.location <- getBM(attributes = attributes, 
                               filters = c("chromosome_name"), values = list(chrom), 
                               mart = ensembl)
      }
      gene.location
    }, error = function(e) {
      msg <<- conditionMessage(e)
      tries <<- tries + 1L
    })
    if (!is.null(gene.location)) break
  }
  if (tries == 3L) 
    stop("failed to get URL after 3 tries:", "\n  error: ", msg)
  
  return(gene.location)
}

Human_genes__GRCh37_p13__tss <- getTranscripts(genome = "hg19")
Human_genes__GRCh38_p12__tss <- getTranscripts(genome = "hg38")
Human_genes__GRCh37_p13 <- getGenes("hg19")
Human_genes__GRCh38_p12 <- getGenes("hg38")
save(Human_genes__GRCh37_p13__tss,
     file = "Human_genes__GRCh37_p13__tss.rda", 
     compress = "xz")
     
save(Human_genes__GRCh38_p12,
     file = "Human_genes__GRCh38_p12.rda", 
     compress = "xz")
     
save(Human_genes__GRCh38_p12__tss,
     file = "Human_genes__GRCh38_p12__tss.rda", 
     compress = "xz")
     
save(Human_genes__GRCh37_p13,
     file = "Human_genes__GRCh37_p13.rda", 
     compress = "xz")     

2.2 Probes information

Probes information were retrieved from http://zwdzwd.github.io/InfiniumAnnotation (Zhou, Wanding and Laird, Peter W and Shen, Hui 2016).

for(plat in c("450K","EPIC")) {
  for(genome in c("hg38","hg19")) {
    base <- "http://zwdzwd.io/InfiniumAnnotation/current/"
    path <- file.path(base,plat,paste(plat,"hg19.manifest.rds", sep ="."))
    if (grepl("hg38", genome)) path <- gsub("hg19","hg38",path)
    if(plat == "EPIC") {
      annotation <- paste0(base,"EPIC/EPIC.hg19.manifest.rds")
    } else {
      annotation <- paste0(base,"hm450/hm450.hg19.manifest.rds")
    }
    if(grepl("hg38", genome)) annotation <- gsub("hg19","hg38",annotation)
    if(!file.exists(basename(annotation))) {
      if(Sys.info()["sysname"] == "Windows") mode <- "wb" else  mode <- "w"
      downloader::download(annotation, basename(annotation), mode = mode)
    }
  }
}

devtools::use_data(EPIC.hg19.manifest,overwrite = T,compress = "xz")
devtools::use_data(EPIC.hg38.manifest,overwrite = T,compress = "xz")
devtools::use_data(hm450.hg19.manifest,overwrite = T,compress = "xz")
devtools::use_data(hm450.hg38.manifest,overwrite = T,compress = "xz")
data("EPIC.hg19.manifest")
as.data.frame(EPIC.hg19.manifest)[1:5,] %>% datatable(options = list(scrollX = TRUE,pageLength = 5)) 
data("EPIC.hg38.manifest")
as.data.frame(EPIC.hg38.manifest)[1:5,] %>% datatable(options = list(scrollX = TRUE,pageLength = 5)) 
data("hm450.hg19.manifest")
as.data.frame(hm450.hg19.manifest)[1:5,] %>% datatable(options = list(scrollX = TRUE,pageLength = 5)) 
data("hm450.hg38.manifest")
as.data.frame(hm450.hg38.manifest)[1:5,] %>% datatable(options = list(scrollX = TRUE,pageLength = 5)) 

2.3 TF family and subfamily classifications

ELMER uses the TFClass (Wingender, E., Schoeps, T., Haubrock, M., & Dönitz, J 2014), a classification of eukaryotic transcription factors based on the characteristics of their DNA-binding domains, to identify which are the TF that might be binding to the same region. For example, if a FOXA1 motif is found in a region, there is FOXA2 would also be able to bind to that region. For that ELMER uses two classifications, Family and sub-family. TFClass schema is shown below.

TFClass schema is below:

Level Rank denomination Definition Example
1 Superclass General topology of the DNA-binding domain Zinc-coordinating DNA-binding domains (Superclass 2)
2 Class Structural blueprint of the DNA-binding domain Nuclear receptors with C4 zinc fingers (Class 2.1)
3 Family Sequence & functional similarity Thyroid hormone receptor-related factors (NR1) (Family 2.1.2)
4 Subfamily Sequence-based subgroupings Retinoic acid receptors (NR1B) (Subfamily 2.1.2.1)
5 Genus Transcription factor gene RAR-α (Genus 2.1.2.1.1)
4 Species TF polypeptide RAR-α1 (Species 2.1.2.1.1.1)

The following code was used to create the objects:

library(xml2)
library(httr)
library(dplyr)
library(rvest)
createMotifRelevantTfs <- function(classification = "family"){
  
  message("Accessing hocomoco to get last version of TFs ", classification)
  file <- paste0(classification,".motif.relevant.TFs.rda")
  
  # Download from http://hocomoco.autosome.ru/human/mono
  tf.family <- "http://hocomoco11.autosome.ru/human/mono?full=true" %>% read_html()  %>%  html_table()
  tf.family <- tf.family[[1]]
  # Split TF for each family, this will help us map for each motif which are the some ones in the family
  # basicaly: for a TF get its family then get all TF in that family
  col <- ifelse(classification == "family", "TF family","TF subfamily")
  family <- split(tf.family,f = tf.family[[col]])
  
  motif.relevant.TFs <- plyr::alply(tf.family,1, function(x){  
    f <- x[[col]]
    if(f == "") return(x$`Transcription factor`) # Case without family, we will get only the same object
    return(unique(family[as.character(f)][[1]]$`Transcription factor`))
  },.progress = "text")
  #names(motif.relevant.TFs) <- tf.family$`Transcription factor`
  names(motif.relevant.TFs) <- tf.family$Model
  # Cleaning object
  attr(motif.relevant.TFs,which="split_type") <- NULL
  attr(motif.relevant.TFs,which="split_labels") <- NULL
  
  return(motif.relevant.TFs)
}

updateTFClassList <- function(tf.list, classification = "family"){
  col <- ifelse(classification == "family","family.name","subfamily.name")
  TFclass <- getTFClass()
  # Hocomoco
  tf.family <- "http://hocomoco11.autosome.ru/human/mono?full=true" %>% read_html()  %>%  html_table()
  tf.family <- tf.family[[1]]
  
  tf.members <- plyr::alply(unique(TFclass