## ----Figure1, out.width="100%", fig.align="center", fig.cap = "Overview of the data available in recount2. Reads (pink boxes) aligned to the reference genome can be used to compute a base-pair coverage curve and identify exon-exon junctions (split reads). Gene and exon count matrices are generated using annotation information providing the gene (green boxes) and exon (blue boxes) coordinates together with the base-level coverage curve. The reads spanning exon-exon junctions (jx) are used to compute a third count matrix that might include un-annotated junctions (jx 3 and 4). Without using annotation information, expressed regions (orange box) can be determined from the base-level coverage curve to then construct data-driven count matrices.", echo = FALSE----
knitr::include_graphics("Figure1.png")

## ----Figure2, out.width="100%", fig.align="center", fig.cap = "recount2 provides coverage count matrices in RangedSummarizedExperiment (rse) objects. Once the rse object has been downloaded and loaded into R, the feature information is accessed with rowRanges(rse) (blue box), the counts with assays(rse)\\$counts (pink box) and the sample metadata with colData(rse) (green box). The sample metadata can be expanded using add\\_predictions(rse) (orange box) or with custom code (brown box) matching by a unique sample identifier such as the SRA Run id. The rse object is inside the purple box and matching data is highlighted in each box.", echo = FALSE----
knitr::include_graphics("Figure2.png")

## ----"install", eval = FALSE-----------------------------------------------
#  ## Install packages from Bioconductor
#  source("https://bioconductor.org/biocLite.R")
#  biocLite(c("recount", "GenomicRanges", "DESeq2", "ideal", "regionReport",
#      "clusterProfiler", "org.Hs.eg.db", "gplots", "derfinder",
#      "rtracklayer", "GenomicFeatures", "bumphunter", "derfinderPlot",
#      "devtools"))

## ----"load libraries", message = FALSE, warning = FALSE--------------------
library("recount")
library("GenomicRanges")
library("DESeq2")
library("ideal")
library("regionReport")
library("clusterProfiler")
library("org.Hs.eg.db")
library("gplots")
library("derfinder")
library("rtracklayer")
library("GenomicFeatures")
library("bumphunter")
library("derfinderPlot")
library("devtools")

## ----Figure3, out.width="100%", fig.align="center", fig.cap = "RNA-seq starting data. 16 RNA-seq un-aligned RNA-seq reads 3 base-pairs long are shown (pink boxes) along a reference genome 16 base-pairs long (white box).", echo = FALSE----
knitr::include_graphics("Figure3.png")

## ----Figure4, out.width="100%", fig.align="center", fig.cap = "Aligned RNA-seq reads. Spice-aware RNA-seq aligners such as Rail-RNA are able to find the coordinates to which the reads map, even if they span exon-exon junctions (connected boxes). Rail-RNA soft clips some reads (purple boxes with rough edges) such that a portion of these reads align to the reference genome.", echo = FALSE----
knitr::include_graphics("Figure4.png")

## ----Figure5, out.width="100%", fig.align="center", fig.cap = "Gene annotation. A single gene with two isoforms composed by three distinct exons (blue boxes) is illustrated. Exons 1 and 3 share the first five base-pairs while exon 2 is common to both isoforms.", echo = FALSE----
knitr::include_graphics("Figure5.png")

## ----"disjoin", message = FALSE--------------------------------------------
library("GenomicRanges")
exons <- GRanges("seq", IRanges(start = c(1, 1, 13), end = c(5, 8, 15)))
exons
disjoin(exons)

## ----Figure6, out.width="100%", fig.align="center", fig.cap = "Disjoint exons. Windows of distinct exonic sequence for the example gene. Disjoint exons 1 and 2 form exon 1.", echo = FALSE----
knitr::include_graphics("Figure6.png")

## ----Figure7, out.width="100%", fig.align="center", fig.cap = "Base-pair coverage counting for exonic base-pairs. At each exonic base-pair we compute the number of reads overlapping that given base-pair. The first base (orange arrow) has 3 reads overlapping that base-pair. Base-pair 11 has a coverage of 3 but does not overlap known exonic sequence, so that information is not used for the gene and exon count matrices (grey arrow). If a read partially overlaps exonic sequence, only the portion that overlaps is used in the computation (see right most read).", echo = FALSE----
knitr::include_graphics("Figure7.png")

## ----Figure8, out.width="100%", fig.align="center", fig.cap = "Exon and gene coverage counts. The coverage counts for each disjoint exon are the sum of the base-pair coverage. The gene coverage count is the sum of the disjoint exons coverage counts.", echo = FALSE----
knitr::include_graphics("Figure8.png")

## ----"coverage"------------------------------------------------------------
## Take the example and translate it to R code
library("GenomicRanges")
reads <- GRanges("seq", IRanges(
    start = rep(
        c(1, 2, 3, 4, 5, 7, 8, 9, 10, 13, 14), 
        c(3, 1, 2, 1, 2, 1, 2, 1, 2, 4, 1)
    ), width = rep(
        c(1, 3, 2, 3, 1, 2, 1, 3, 2, 3, 2, 1, 3),
        c(1, 4, 1, 2, 1, 1, 2, 2, 1, 1, 2, 1, 1)
    )
))
## Get the base-level genome coverage curve
cov <- as.integer(coverage(reads)$seq)

## AUC
sum(cov)

## ----"coverage-reproduce", eval = FALSE------------------------------------
#  ## Code for reproducing the bottom portion of Figure 8.
#  pdf("base_pair_coverage.pdf", width = 20)
#  par(mar = c(5, 6, 4, 2) + 0.1)
#  plot(cov, type = "o", col = "violetred1", lwd = 10, ylim = c(0, 5),
#       xlab = "Genome", ylab = "Coverage", cex.axis = 2, cex.lab = 3,
#       bty = "n")
#  polygon(c(1, seq_len(length(cov)), length(cov)), c(0, cov, 0),
#          border = NA, density = -1, col = "light blue")
#  points(seq_len(length(cov)), cov, col = "violetred1", type = "o",
#         lwd = 10)
#  dev.off()

## ----Figure9, out.width="100%", fig.align="center", fig.cap = "Area under coverage (AUC). The area under coverage is the sum of the base-pair coverage for all positions in the genome regardless of the annotation. It is the area under the base-level coverage curve shown as the light blue area under the pink curve.", echo = FALSE----
knitr::include_graphics("Figure9.png")

## ----"example_scaled", message = FALSE-------------------------------------
## Check that the number of reads is less than or equal to 40 million
## after scaling.
library("recount")
rse_scaled <- scale_counts(rse_gene_SRP009615, round = FALSE)
summary(colSums(assays(rse_scaled)$counts)) / 1e6

## ----Figure10, out.width="100%", fig.align="center", fig.cap = "Exon-exon junctions go beyond the annotation. Reads spanning exon-exon junctions are highlighted and compared against the annotation. Three of them match the annotated junctions, but one (blue and orange read) spans an un-annotated exon-exon junction with the left end matching the annotation and the right end hinting at a possible new isoform for this gene (blue and orange isoform).", echo = FALSE----
knitr::include_graphics("Figure10.png")

## ----Figure11, out.width="100%", fig.align="center", fig.cap = "Intron retention events. Some reads might align with known intronic segments of the genome and provide information for exploring intron retention events (pink read). Some might support an intron retention event or a new isoform when coupled with exon-exon junction data (orange read).", echo = FALSE----
knitr::include_graphics("Figure11.png")

## ----Figure12, out.width="100%", fig.align="center", fig.cap = "Exon boundaries. Reads that go beyond the known exon boundaries can inform us of whether the annotated boundaries are correct or if there was a run-off transcription event.", echo = FALSE----
knitr::include_graphics("Figure12.png")

## ----"localPath"-----------------------------------------------------------
## Locate path with the data
library("recountWorkshop")
local_path <- system.file("extdata", "SRP056604", package = "recountWorkshop")
dir(local_path)

## ----"testing locally", echo = FALSE, eval = FALSE-------------------------
#  ## For testing
#  if(FALSE) {
#      local_path <- "/Users/lcollado/Dropbox/Code/recountWorkshop/inst/extdata/SRP056604"
#  }

## ----"download gene"-------------------------------------------------------
library("recount")

## Find the project id by searching abstracts of studies
abstract_search("hippocampi of 4")

## Download the data if it is not there
if(!file.exists(file.path(local_path, "rse_gene.Rdata"))) {
    ## In case you decide to download the data instead of using the
    ## pre-installed data
    local_path <- "SRP056604"
    download_study("SRP056604", type = "rse-gene")
}

## Check that the file was downloaded
file.exists(file.path(local_path, "rse_gene.Rdata"))

## Load the data
load(file.path(local_path, "rse_gene.Rdata"))

## ----colData---------------------------------------------------------------
## One row per sample, one column per phenotype variable
dim(colData(rse_gene))

## Mostly technical variables are included
colnames(colData(rse_gene))

## ----"explore colData"-----------------------------------------------------
## Input reads: number reported by SRA might be larger than number
## of reads Rail-RNA downloaded
colData(rse_gene)[, c("read_count_as_reported_by_sra", "reads_downloaded")]
summary(colData(rse_gene)$proportion_of_reads_reported_by_sra_downloaded)

## AUC information used by scale_counts() by default
head(colData(rse_gene)$auc)

## Alternatively, scale_scounts() can use the number of mapped reads
## and other information
colData(rse_gene)[, c("mapped_read_count", "paired_end", "avg_read_length")]

## ----sharq-----------------------------------------------------------------
## SHARQ tissue predictions: not present for all studies
head(colData(rse_gene)$sharq_beta_tissue)
head(colData(rse_gene_SRP009615)$sharq_beta_tissue)

## ----characteristics-------------------------------------------------------
## GEO information was absent for the SRP056604 data set
colData(rse_gene)[, c("geo_accession", "title", "characteristics")]

## GEO information for the SRP009615 data set
head(colData(rse_gene_SRP009615)$geo_accession)
head(colData(rse_gene_SRP009615)$title, 2)
head(colData(rse_gene_SRP009615)$characteristics, 2)

## Similar but not exactly the same wording used for two different samples
colData(rse_gene_SRP009615)$characteristics[[1]]
colData(rse_gene_SRP009615)$characteristics[[11]]

## For study SRP056604 we have characteristics information
## Note the > symbol in the age for the second sample
colData(rse_gene)$characteristics[[1]]
colData(rse_gene)$characteristics[[2]]

## ----characteristics_process-----------------------------------------------
## Get the case status: either LOAD or control
colData(rse_gene)$case <- factor(
    sapply(colData(rse_gene)$characteristics, function(x)
        ifelse(any(grepl("LOAD", x)), "control", "LOAD"))
)

## Get the sex
colData(rse_gene)$sex <- factor(
    sapply(colData(rse_gene)$characteristics, function(x)
        ifelse(any(grepl("female", x)),
    "female", "male"))
)

## Extract the age. Note that one of them is a bit more complicated.
colData(rse_gene)$age <- sapply(colData(rse_gene)$characteristics,
    function(x) {
        y <- x[grep("age.*(yrs)", x)]
        as.integer(gsub("age.*\\(yrs\\): |>", "", y))
    }
)

## ----"explore case sex age"------------------------------------------------
table(colData(rse_gene)$case, colData(rse_gene)$sex)
table(colData(rse_gene)$case, colData(rse_gene)$age)

## ----"add_predictions"-----------------------------------------------------
## Before adding predictions
dim(colData(rse_gene))

## Add the predictions
rse_gene <- add_predictions(rse_gene)

## After adding the predictions
dim(colData(rse_gene))

## Explore the variables
colData(rse_gene)[, 25:ncol(colData(rse_gene))]

## ----"sex predictions"-----------------------------------------------------
table("Observed" = colData(rse_gene)$sex,
    "Predicted" = colData(rse_gene)$predicted_sex)

## ----"matching sex", out.width="100%", fig.align="center"------------------
## Is the sex matching? TRUE for yes.
colData(rse_gene)$matching_sex <- as.character(colData(rse_gene)$sex) ==
    as.character(colData(rse_gene)$predicted_sex)

## Matching sex vs other variables
table(colData(rse_gene)$matching_sex, colData(rse_gene)$case)
table(colData(rse_gene)$matching_sex, colData(rse_gene)$age)
boxplot(colData(rse_gene)$mapped_read_count ~ colData(rse_gene)$matching_sex,
    ylab = "Mapped read count", xlab = "Matching sex")

## ----"sra_run_table"-------------------------------------------------------
## Save the information from 
## https://trace.ncbi.nlm.nih.gov/Traces/study/?acc=SRP056604
## to a table. We saved the file as SRP056604/SraRunTable.txt.
file.exists(file.path(local_path, "SraRunTable.txt"))

## Read the table
sra <- read.table(file.path(local_path, "SraRunTable.txt"),
    header = TRUE, sep = "\t")

## Explore it
head(sra)

## We will remove some trailing '_s' from the variable names
colnames(sra) <- gsub("_s$", "", colnames(sra))

## Choose some variables we want to add
sra_vars <- c("braak_stage", "gender", "tissue")

## Re-organize the SRA table based on the SRA Run ids we have
sra <- sra[match(colData(rse_gene)$run, sra$Run), ]

## Double check the order
identical(colData(rse_gene)$run, as.character(sra$Run))

## Append the variables of interest
colData(rse_gene) <- cbind(colData(rse_gene), sra[, sra_vars])

## Final dimensions
dim(colData(rse_gene))

## Explore result
colData(rse_gene)[, 38:ncol(colData(rse_gene))]

## The sex information from the 'characteristics' and the
## SRA Run Selector match
table(colData(rse_gene)$gender, colData(rse_gene)$sex)

## ----"scale_counts"--------------------------------------------------------
## Scale counts
rse_gene_scaled <- scale_counts(rse_gene)

## To highlight that we scaled the counts
rm(rse_gene)

## ----"deseq2_de", out.width="100%", fig.align="center"---------------------
library("DESeq2")

## Specify design and switch to DESeq2 format
dds <- DESeqDataSet(rse_gene_scaled, ~ sex + age + case)

## Perform DE analysis
dds <- DESeq(dds, test = "LRT", reduced = ~ sex + age, fitType = "local")
res <- results(dds, alpha = 0.01)

## Explore results
plotMA(res, main="DESeq2 results for SRP056604")

## ----"ideal volcano"-------------------------------------------------------
## Make a volcano plot
library("ideal")
plot_volcano(res, FDR = 0.01)

## ----"create_report", message = FALSE, warning = FALSE, results = "hide", eval = FALSE----
#  ## Make a report with the results
#  library("regionReport")
#  DESeq2Report(dds, res = res, project = "SRP056604",
#      intgroup = c("sex", "case"), outdir = ".",
#      output = "SRP056604_main-results")

## ----"browse_report", eval = FALSE-----------------------------------------
#  browseURL("SRP056604_main-results.html")

## ----"go_analysis", out.width="100%", fig.align="center"-------------------
library("clusterProfiler")
library("org.Hs.eg.db")

## Remember that dds had ENSEMBL ids for the genes
ensembl <- gsub("\\..*", "", rownames(dds))
head(ensembl)

## Not all genes have a p-value
table(!is.na(res$padj))

## Perform enrichment analysis for Biological Process (BP)
## Note that the argument is keytype instead of keyType in Bioconductor 3.5
enrich_go <- enrichGO(gene = ensembl[which(res$padj < 0.05)],
    OrgDb = org.Hs.eg.db, keyType = "ENSEMBL", ont = "BP",
    pAdjustMethod = "BH", pvalueCutoff = 0.01, qvalueCutoff = 0.05,
    universe = ensembl[!is.na(res$padj)])

## Visualize enrichment results
dotplot(enrich_go)

## ----"secondary analysis", out.width="100%", fig.align="center"------------
## DE analysis checking what genes are different by matching sex
dds2 <- DESeqDataSet(rse_gene_scaled, ~ matching_sex)
dds2 <- DESeq(dds2, test = "LRT", reduced = ~ 1, fitType = "local")
res2 <- results(dds2, alpha = 0.01)

## Visually inspect results
plotMA(res2, main="DESeq2 results for SRP056604 - sex predictions")

## Lets add gene symbols to the volcano plot, they are stored in
## the rowRanges slot of the rse object
rowRanges(rse_gene_scaled)
res2$symbol <- sapply(rowRanges(rse_gene_scaled)$symbol, "[[", 1)

## Select some DE genes
intgenes <- res2$symbol[which(res2$padj < 0.0005)]
intgenes <- intgenes[!is.na(intgenes)]

## Make the volcano plot
plot_volcano(res2, FDR = 0.01, intgenes = intgenes)

## ----"create_report2", message = FALSE, warning = FALSE, results = "hide", eval = FALSE----
#  DESeq2Report(dds2, res = res2, project = "SRP056604 - matching sex",
#      intgroup = c("sex", "predicted_sex"), outdir = ".",
#      output = "SRP056604_sex-predictions")

## ----"matching sex de genes chr"-------------------------------------------
sort(table(seqnames(rowRanges(rse_gene_scaled))[which(res2$padj < 0.01)]))

## ----"exon_de_analysis", out.width="100%", fig.align="center"--------------
## Download the data if it is not there
if(!file.exists(file.path(local_path, "rse_exon.Rdata"))) {
    ## In case you decide to download the data instead of using the
    ## pre-installed data
    local_path <- "SRP056604"
    download_study("SRP056604", type = "rse-exon")
}

## Load the data
load(file.path(local_path, "rse_exon.Rdata"))

## Scale and add the metadata (it is in the same order)
identical(colData(rse_exon)$run, colData(rse_gene_scaled)$run)
colData(rse_exon) <- colData(rse_gene_scaled)
rse_exon_scaled <- scale_counts(rse_exon)
## To highlight that we scaled the counts
rm(rse_exon)

## Filter lowly expressed exons: reduces the object size
## and run time
filter_exon <- rowMeans(assays(rse_exon_scaled)$counts) > 5
round(table(filter_exon) / length(filter_exon) * 100, 2)

## Perform the filtering and change default names
rse_e <- rse_exon_scaled[filter_exon, ]
rowRanges(rse_e)$gene_id <- rownames(rse_e)
rownames(rse_e) <- paste0("exon_", seq_len(nrow(rse_e)))

## Create DESeq2 object for the exon data
dds_exon <- DESeqDataSet(rse_e, ~ sex + age + case)

## Perform DE analysis
dds_exon <- DESeq(dds_exon, test = "LRT", reduced = ~ sex + age,
    fitType = "local")
res_exon <- results(dds_exon, alpha = 0.01)

## Explore results
plotMA(res_exon, main="DESeq2 results for SRP056604 - exon level")
plot_volcano(res_exon, FDR = 0.01)

## ----"gene_exon", out.width="100%", fig.align="center"---------------------
## Get the gene ids for genes that are DE at the gene level or that have at
## least one exon with DE signal.
genes_w_de_exon <- unique(rowRanges(rse_e)$gene_id[which(res_exon$padj < 0.01)])
genes_de <- rownames(rse_gene_scaled)[which(res$padj < 0.01)]

## Make a venn diagram
library("gplots")
vinfo <- venn(list("genes" = genes_de, "exons" = genes_w_de_exon),
    names = c("genes", "exons"), show.plot = FALSE) 
plot(vinfo) +
    title("Genes with DE signal: at the gene and exon levels")

## ----"gene_exon_match", out.width="100%", fig.align="center"---------------
## Keep only the DE exons that are from a gene that is also DE
top_exon_de <- res_exon[intersect(which(res_exon$padj < 0.01),
    which(rowRanges(rse_e)$gene_id %in% 
    attr(vinfo, "intersections")[["genes:exons"]])), ]
## Add the gene id
top_exon_de$gene_id <- rowRanges(rse_e)$gene_id[match(rownames(top_exon_de),
    rownames(rse_e))]
    
## Find the fold change that is the most extreme among the DE exons of a gene
exon_max_fc <- tapply(top_exon_de$log2FoldChange, top_exon_de$gene_id,
    function(x) { x[which.max(abs(x))] })

## Keep only the DE genes that match the previous selection
top_gene_de <- res[match(names(exon_max_fc), rownames(res)), ]

## Make the plot
plot(top_gene_de$log2FoldChange, exon_max_fc, pch = 20,
    col = adjustcolor("black", 1/2),
    ylab = "Most extreme log FC at the exon level among DE exons",
    xlab = "Log fold change (FC) at the gene level",
    main = "DE genes with at least one DE exon")
abline(a = 0, b = 1, col = "red")
abline(h = 0, col = "grey80")
abline(v = 0, col = "grey80")

## ----"de_genes_by_chr"-----------------------------------------------------
sort(table(seqnames(rowRanges(rse_gene_scaled)[which(res$padj < 0.01)])),
    decreasing = TRUE)

## ----"identify regions", eval = .Platform$OS.type != "windows"-------------
## Define expressed regions for study SRP056604, only for chromosome 12
regions <- expressed_regions("SRP056604", "chr12", cutoff = 5L,
    maxClusterGap = 3000L, outdir = local_path)
    
## Explore the resulting expressed regions
regions
summary(width(regions))
table(width(regions) >= 60)

## Keep only the ones that are at least 60 bp long
regions <- regions[width(regions) >= 60]
length(regions)

## ----"build_rse_ER", eval = .Platform$OS.type != "windows"-----------------
## Compute coverage matrix for study SRP056604, only for chromosome 12
## Takes about 45 seconds with local data
## and about 70 seconds with data from the web
system.time(rse_er <- coverage_matrix("SRP056604", "chr12", regions,
    chunksize = length(regions), outdir = local_path, round = TRUE))

## Use the expanded metadata we built for the gene model
colData(rse_er) <- colData(rse_gene_scaled)

## ----"er_de_analysis", eval = .Platform$OS.type != "windows", out.width="100%", fig.align="center"----
## Define DESeq2 object
dds_er <- DESeqDataSet(rse_er, ~ sex + age + case)
dds_er <- DESeq(dds_er, test = "LRT", reduced = ~ sex + age, fitType = "local")
res_er <- results(dds_er, alpha = 0.05)

## Visually inspect results
plotMA(res_er, main="DESeq2 results for SRP056604 - DERs")
plot_volcano(res_er, FDR = 0.05)

## ----"create_report3", message = FALSE, warning = FALSE, results = "hide", eval = FALSE----
#  DESeq2Report(dds_er, res = res_er, project = "SRP056604 - DERs",
#      intgroup = c("sex", "case"), outdir = ".",
#      output = "SRP056604_DERs")

## ----"sort_qvalue", eval = .Platform$OS.type != "windows"------------------
## Sort regions by q-value
regions_by_padj <- regions[order(res_er$padj, decreasing = FALSE)]

## Look at the top 10
regions_by_padj[1:10]
width(regions_by_padj[1:10])

## ----"find_bws", eval = .Platform$OS.type != "windows"---------------------
## Construct the list of BigWig URLs
## They have the following form:
## http://duffel.rail.bio/recount/
## project id
## /bw/
## sample run id
## .bw
bws_web <- paste0("http://duffel.rail.bio/recount/SRP056604/bw/",
    colData(rse_er)$bigwig_file)

## Note that they are also present in the recount_url data.frame
bws_url <- recount_url$url[match(colData(rse_er)$bigwig_file,
    recount_url$file_name)]
identical(bws_web, bws_url)

## Local bigwigs
bws <- file.path(local_path, "bw", colData(rse_er)$bigwig_file)
all(file.exists(bws))

## Use the sample run ids as the sample names
names(bws) <- colData(rse_er)$run

## ----"add_padding", eval = .Platform$OS.type != "windows"------------------
## Add 100 bp padding on each side
regions_resized <- resize(regions_by_padj[1:10],
    width(regions_by_padj[1:10]) + 200, fix = "center")

## ----"regionCov", eval = .Platform$OS.type != "windows"--------------------
## Get the bp coverage data for the plots
library("derfinder")
regionCov <- getRegionCoverage(regions = regions_resized, files = bws,
    targetSize = 40 * 1e6 * 100, totalMapped = colData(rse_er)$auc,
    verbose = FALSE)

## ----"gencode_txdb", eval = .Platform$OS.type != "windows"-----------------
## Import the Gencode v25 hg38 gene annotation
library("rtracklayer")

## If accessing from the web
if(FALSE) {
    gencode_v25_hg38 <- import(paste0(
        "ftp://ftp.sanger.ac.uk/pub/gencode/Gencode_human/release_25/",
        "gencode.v25.annotation.gtf.gz"))  
}

## Using the file locally, included in the workshop package
gencode_v25_hg38 <- import(system.file("extdata",
    "gencode.v25.annotation.gtf.gz", package = "recountWorkshop"))
            
## Keep only the chr12 info
gencode_v25_hg38 <- keepSeqlevels(gencode_v25_hg38, "chr12",
    pruning.mode = "coarse")

## Get the chromosome information for hg38
library("GenomicFeatures")
chrInfo <- getChromInfoFromUCSC("hg38")
chrInfo$chrom <- as.character(chrInfo$chrom)
chrInfo <- chrInfo[chrInfo$chrom %in% seqlevels(regions), ]
chrInfo$isCircular <- FALSE

## Assign the chromosome information to the object we will use to
## create the txdb object
si <- with(chrInfo, Seqinfo(as.character(chrom), length, isCircular,
    genome = "hg38"))
seqinfo(gencode_v25_hg38) <- si

## Switch from Gencode gene ids to Ensembl gene ids
gencode_v25_hg38$gene_id <- gsub("\\..*", "", gencode_v25_hg38$gene_id)

## Create the TxDb object
gencode_v25_hg38_txdb <- makeTxDbFromGRanges(gencode_v25_hg38)

## Explore the TxDb object
gencode_v25_hg38_txdb

## ----"bump_ann", eval = .Platform$OS.type != "windows"---------------------
library("bumphunter")
## Annotate all transcripts for gencode v25 based on the TxDb object
## we built previously.
ann_gencode_v25_hg38 <- annotateTranscripts(gencode_v25_hg38_txdb,
    annotationPackage = "org.Hs.eg.db",
    mappingInfo = list("column" = "ENTREZID", "keytype" = "ENSEMBL",
    "multiVals" = "first"))
    
## Annotate the regions of interest
## Note that we are using the original regions, not the resized ones
nearest_ann <- matchGenes(regions_by_padj[1:10], ann_gencode_v25_hg38)

## ----"make_gs", eval = .Platform$OS.type != "windows"----------------------
## Create the genomic state object using the gencode TxDb object
gs_gencode_v25_hg38 <- makeGenomicState(gencode_v25_hg38_txdb,
    chrs = seqlevels(regions))
    
## Annotate the original regions
regions_ann <- annotateRegions(regions_resized,
    gs_gencode_v25_hg38$fullGenome)

## ----"region_plots", eval = .Platform$OS.type != "windows", out.width="100%", fig.align="center"----
library("derfinderPlot")
plotRegionCoverage(regions = regions_resized, regionCoverage = regionCov, 
   groupInfo = colData(rse_er)$case,
   nearestAnnotation = nearest_ann, 
   annotatedRegions = regions_ann,
   txdb = gencode_v25_hg38_txdb,
   scalefac = 1, ylab = "Coverage (RP40M, 100bp)",
   ask = FALSE, verbose = FALSE)

## ----sessionInfo----------------------------------------------------------------------------------
## Pandoc information
rmarkdown::pandoc_version()

## Time for reproducing this workflow, in minutes
round(proc.time()[3] / 60, 1)

options(width = 100)
library("devtools")
session_info()