## ----style-knitr, eval=TRUE, echo=FALSE, results="asis"-------------------- BiocStyle::latex() ## ----options, results='hide', message=FALSE, eval=TRUE, echo=FALSE------------------------------------------ library(chromstaR) options(width=110) ## ----univariate_narrow_library, results='hide', message=FALSE, eval=TRUE------------------------------------ library(chromstaR) ## ----univariate_narrow_binning, results='markup', message=FALSE, eval=TRUE---------------------------------- ## === Step 1: Binning === # Get an example BAM file file <- system.file("extdata","euratrans","lv-H3K4me3-BN-male-bio2-tech1.bam", package="chromstaRData") # Get the chromosome lengths (see ?GenomeInfoDb::fetchExtendedChromInfoFromUCSC) # This is only necessary for BED files. BAM files are handled automatically. data(rn4_chrominfo) head(rn4_chrominfo) # We use bin size 1000bp and chromosome 12 to keep the example quick binned.data <- binReads(file, assembly=rn4_chrominfo, binsizes=1000, stepsizes=500, chromosomes='chr12') print(binned.data) ## ----univariate_narrow_peak_calling, results='markup', eval=TRUE, message=FALSE----------------------------- ## === Step 2: Peak calling === model <- callPeaksUnivariate(binned.data, verbosity=0) ## ----univariate_narrow_plotting, fig.width=6, fig.height=4, out.width='0.5\\textwidth'---------------------- ## === Step 3: Checking the fit === # For a narrow modification, the fit should look something like this, # with the 'modified'-component near the bottom of the figure plotHistogram(model) + ggtitle('H3K4me3') ## ----univariate_narrow_plotting2, fig.width=10, fig.height=2, out.width='0.7\\textwidth'-------------------- # We can also check a browser snapshot of the data plotGenomeBrowser(model, chr='chr12', start=1, end=1e6)[[1]] ## ----univariate_narrow_posterior, results='markup', message=FALSE, eval=TRUE-------------------------------- ## === Step 4: Working with peaks === # Get the number and average size of peaks length(model$peaks); mean(width(model$peaks)) # Adjust the sensitivity and get number of peaks model <- changeMaxPostCutoff(model, maxPost.cutoff=0.9999) length(model$peaks); mean(width(model$peaks)) ## ----univariate_narrow_export, results='hide', message=FALSE, eval=TRUE------------------------------------- ## === Step 5: Export to genome browser === # We can export peak calls and binned read counts with exportPeaks(model, filename=tempfile()) exportCounts(model, filename=tempfile()) ## ----univariate_broad_library, results='hide', message=FALSE, eval=TRUE------------------------------------- library(chromstaR) ## ----univariate_broad_binning, results='hide', message=FALSE, eval=TRUE------------------------------------- ## === Step 1: Binning === # Get an example BAM file file <- system.file("extdata","euratrans","lv-H3K27me3-BN-male-bio2-tech1.bam", package="chromstaRData") # Get the chromosome lengths (see ?GenomeInfoDb::fetchExtendedChromInfoFromUCSC) # This is only necessary for BED files. BAM files are handled automatically. data(rn4_chrominfo) head(rn4_chrominfo) # We use bin size 1000bp and chromosome 12 to keep the example quick binned.data <- binReads(file, assembly=rn4_chrominfo, binsizes=1000, stepsizes=500, chromosomes='chr12') ## ----univariate_broad_peak_calling, results='markup', eval=TRUE, message=FALSE------------------------------ ## === Step 2: Peak calling === model <- callPeaksUnivariate(binned.data, verbosity=0) ## ----univariate_broad_plotting, fig.width=6, fig.height=4, out.width='0.5\\textwidth'----------------------- ## === Step 3: Checking the fit === # For a broad modification, the fit should look something like this, # with a 'modified'-component that fits the thick tail of the distribution. plotHistogram(model) + ggtitle('H3K27me3') ## ----univariate_broad_plotting2, fig.width=10, fig.height=2, out.width='0.7\\textwidth'--------------------- plotGenomeBrowser(model, chr='chr12', start=1, end=1e6)[[1]] ## ----univariate_broad_posterior, results='markup', message=FALSE, eval=TRUE--------------------------------- ## === Step 4: Working with peaks === peaks <- model$peaks length(peaks); mean(width(peaks)) # Adjust the sensitivity and get number of peaks model <- changeMaxPostCutoff(model, maxPost.cutoff=0.9999) peaks <- model$peaks length(peaks); mean(width(peaks)) ## ----univariate_broad_export, results='hide', message=FALSE, eval=TRUE-------------------------------------- ## === Step 5: Export to genome browser === # We can export peak calls and binned read counts with exportPeaks(model, filename=tempfile()) exportCounts(model, filename=tempfile()) ## ----univariate_broad_H4K20me1, echo=TRUE, results='hide', message=FALSE, fig.width=6, fig.height=4, out.width='0.5\\textwidth'---- ## === Step 1-3: Another example for mark H4K20me1 === file <- system.file("extdata","euratrans","lv-H4K20me1-BN-male-bio1-tech1.bam", package="chromstaRData") data(rn4_chrominfo) binned.data <- binReads(file, assembly=rn4_chrominfo, binsizes=1000, stepsizes=500, chromosomes='chr12') model <- callPeaksUnivariate(binned.data, max.time=60, verbosity=0) plotHistogram(model) + ggtitle('H4K20me1') ## ----univariate_braod_H4K20me1.2, fig.width=10, fig.height=2, out.width='0.7\\textwidth'-------------------- # We can also check a browser snapshot of the data plotGenomeBrowser(model, chr='chr12', start=1, end=1e6)[[1]] ## ----multivariate_replicate_library, results='hide', message=FALSE, eval=TRUE------------------------------- library(chromstaR) ## ----multivariate_replicate_preparation, results='markup', message=FALSE, eval=TRUE------------------------- #=== Step 1: Preparation === # Let's get some example data with 3 replicates in spontaneous hypertensive rat (SHR) file.path <- system.file("extdata","euratrans", package='chromstaRData') files.good <- list.files(file.path, pattern="H3K27me3.*SHR.*bam$", full.names=TRUE)[1:3] # We fake a replicate with poor quality by taking a different mark entirely files.poor <- list.files(file.path, pattern="H4K20me1.*SHR.*bam$", full.names=TRUE)[1] files <- c(files.good, files.poor) # Obtain chromosome lengths. This is only necessary for BED files. BAM files are # handled automatically. data(rn4_chrominfo) head(rn4_chrominfo) # Define experiment structure exp <- data.frame(file=files, mark='H3K27me3', condition='SHR', replicate=1:4, pairedEndReads=FALSE, controlFiles=NA) # Peaks could now be called with # Chromstar(inputfolder=file.path, experiment.table=exp, outputfolder=tempdir(), # mode = 'separate') # However, to get more information on the replicates we will choose # a more detailed workflow. ## ----multivariate_replicate_binning, results='hide', message=FALSE, eval=TRUE------------------------------- ## === Step 2: Binning === # We use bin size 1000bp and chromosome 12 to keep the example quick binned.data <- list() for (file in files) { binned.data[[basename(file)]] <- binReads(file, binsize=1000, stepsizes=500, assembly=rn4_chrominfo, chromosomes='chr12', experiment.table=exp) } ## ----multivariate_replicate_univariate, results='hide', message=FALSE, eval=TRUE---------------------------- ## === Step 3: Univariate peak calling === # The univariate fit is obtained for each replicate models <- list() for (i1 in 1:length(binned.data)) { models[[i1]] <- callPeaksUnivariate(binned.data[[i1]], max.time=60) plotHistogram(models[[i1]]) } ## ----multivariate_replicate_peak_calling, results='markup', message=FALSE, eval=TRUE------------------------ ## === Step 4: Check replicate correlation === # We run a multivariate peak calling on all 4 replicates # A warning is issued because replicate 4 is very different from the others multi.model <- callPeaksReplicates(models, max.time=60, eps=1) # Checking the correlation confirms that Rep4 is very different from the others print(multi.model$replicateInfo$correlation) ## ----multivariate_replicate_peak_calling2, results='hide', message=FALSE, eval=TRUE------------------------- ## === Step 5: Peak calling with replicates === # We redo the previous step without the "bad" replicate # Also, we force all replicates to agree in their peak calls multi.model <- callPeaksReplicates(models[1:3], force.equal=TRUE, max.time=60) ## ----multivariate_replicate_export, results='hide', message=FALSE, eval=TRUE-------------------------------- ## === Step 6: Export to genome browser === # Finally, we can export the results as BED file exportPeaks(multi.model, filename=tempfile()) exportCounts(multi.model, filename=tempfile()) ## ----multivariate_differential_library, results='hide', message=FALSE, eval=TRUE---------------------------- library(chromstaR) ## ----multivariate_differential_preparation, results='markup', message=FALSE, eval=TRUE---------------------- #=== Step 1: Preparation === ## Prepare the file paths. Exchange this with your input and output directories. inputfolder <- system.file("extdata","euratrans", package="chromstaRData") outputfolder <- file.path(tempdir(), 'H4K20me1-example') ## Define experiment structure, put NA if you don't have controls data(experiment_table_H4K20me1) print(experiment_table_H4K20me1) ## Define assembly # This is only necessary if you have BED files, BAM files are handled automatically. # For common assemblies you can also specify them as 'hg19' for example. data(rn4_chrominfo) head(rn4_chrominfo) ## ----multivariate_differential_Chromstar, results='hide', message=FALSE, eval=TRUE-------------------------- #=== Step 2: Run Chromstar === ## Run ChromstaR Chromstar(inputfolder, experiment.table=experiment_table_H4K20me1, outputfolder=outputfolder, numCPU=4, binsize=1000, stepsize=500, assembly=rn4_chrominfo, prefit.on.chr='chr12', chromosomes='chr12', mode='differential') ## ----multivariate_differential_listfiles, results='markup', message=FALSE, eval=TRUE------------------------ ## Results are stored in 'outputfolder' and can be loaded for further processing list.files(outputfolder) model <- get(load(file.path(outputfolder,'multivariate', 'multivariate_mode-differential_mark-H4K20me1_binsize1000_stepsize500.RData'))) ## ----multivariate_differential_stateBrewer, results='markup', message=TRUE, eval=TRUE----------------------- ## === Step 3: Construct differential and common states === diff.states <- stateBrewer(experiment_table_H4K20me1, mode='differential', differential.states=TRUE) print(diff.states) common.states <- stateBrewer(experiment_table_H4K20me1, mode='differential', common.states=TRUE) print(common.states) ## ----multivariate_differential_export, results='hide', message=FALSE, eval=TRUE----------------------------- ## === Step 5: Export to genome browser === # Export only differential states exportPeaks(model, filename=tempfile()) exportCounts(model, filename=tempfile()) exportCombinations(model, filename=tempfile(), include.states=diff.states) ## ----multivariate_combinatorial_library, results='hide', message=FALSE, eval=TRUE--------------------------- library(chromstaR) ## ----multivariate_combinatorial_preparation, results='markup', message=FALSE, eval=TRUE--------------------- #=== Step 1: Preparation === ## Prepare the file paths. Exchange this with your input and output directories. inputfolder <- system.file("extdata","euratrans", package="chromstaRData") outputfolder <- file.path(tempdir(), 'SHR-example') ## Define experiment structure, put NA if you don't have controls # (SHR = spontaneous hypertensive rat) data(experiment_table_SHR) print(experiment_table_SHR) ## Define assembly # This is only necessary if you have BED files, BAM files are handled automatically. # For common assemblies you can also specify them as 'hg19' for example. data(rn4_chrominfo) head(rn4_chrominfo) ## ----multivariate_combinatorial_Chromstar, results='hide', message=FALSE, eval=TRUE------------------------- #=== Step 2: Run Chromstar === ## Run ChromstaR Chromstar(inputfolder, experiment.table=experiment_table_SHR, outputfolder=outputfolder, numCPU=4, binsize=1000, stepsize=500, assembly=rn4_chrominfo, prefit.on.chr='chr12', chromosomes='chr12', mode='combinatorial') ## ----multivariate_combinatorial_listfiles, results='markup', message=FALSE, eval=TRUE, fig.width=4, fig.height=3, out.width='0.5\\textwidth'---- ## Results are stored in 'outputfolder' and can be loaded for further processing list.files(outputfolder) model <- get(load(file.path(outputfolder,'multivariate', 'multivariate_mode-combinatorial_condition-SHR_binsize1000_stepsize500.RData'))) # Obtain genomic frequencies for combinatorial states genomicFrequencies(model) # Plot transition probabilities and read count correlation heatmapTransitionProbs(model) + ggtitle('Transition probabilities') heatmapCountCorrelation(model) + ggtitle('Read count correlation') ## ----multivariate_combinatorial_enrichment, results='markup', message=FALSE, eval=TRUE---------------------- ## === Step 3: Enrichment analysis === # Annotations can easily be obtained with the biomaRt package. Of course, you can # also load them from file if you already have annotation files available. library(biomaRt) ensembl <- useEnsembl(biomart='ENSEMBL_MART_ENSEMBL', dataset='rnorvegicus_gene_ensembl') genes <- getBM(attributes=c('ensembl_gene_id', 'chromosome_name', 'start_position', 'end_position', 'strand', 'external_gene_name', 'gene_biotype'), mart=ensembl) # Transform to GRanges for easier handling genes <- GRanges(seqnames=paste0('chr',genes$chromosome_name), ranges=IRanges(start=genes$start, end=genes$end), strand=genes$strand, name=genes$external_gene_name, biotype=genes$gene_biotype) # Rename chrMT to chrM to avoid warnings seqlevels(genes)[seqlevels(genes)=='chrMT'] <- 'chrM' # Select only chr12 to avoid warnings genes <- keepSeqlevels(genes, 'chr12', pruning.mode = 'coarse') print(genes) ## ----multivariate_combinatorial_enrichment_plot1, results='markup', message=FALSE, eval=TRUE, fig.width=8, fig.height=4, out.width='0.8\\textwidth', fig.align='center'---- # We expect promoter [H3K4me3] and bivalent-promoter signatures [H3K4me3+H3K27me3] # to be enriched at transcription start sites. plotEnrichment(hmm = model, annotation = genes, bp.around.annotation = 15000) + ggtitle('Fold enrichment around genes') + xlab('distance from gene body') # Plot enrichment only at TSS. We make use of the fact that TSS is the start of a gene. plotEnrichment(model, genes, region = 'start') + ggtitle('Fold enrichment around TSS') + xlab('distance from TSS in [bp]') # Note: If you want to facet the plot because you have many combinatorial states you # can do that with plotEnrichment(model, genes, region = 'start') + facet_wrap(~ combination) + ggtitle('Fold enrichment around TSS') ## ----multivariate_combinatorial_enrichment_plot2, results='markup', message=FALSE, eval=TRUE, fig.width=8, fig.height=4, out.width='0.8\\textwidth', fig.align='center'---- # Another form of visualization that shows every TSS in a heatmap tss <- resize(genes, width = 3, fix = 'start') plotEnrichCountHeatmap(model, tss, bp.around.annotation = 15000) + theme(strip.text.x = element_text(size=6)) + scale_x_continuous(breaks=c(-10000,0,10000)) + ggtitle('Read count around TSS') ## ----multivariate_combinatorial_enrichment_plot3, results='markup', message=FALSE, eval=TRUE, fig.width=8, fig.height=4, out.width='0.8\\textwidth', fig.align='center'---- # Fold enrichment with different biotypes, showing that protein coding genes are # enriched with (bivalent) promoter combinations [H3K4me3] and [H3K4me3+H3K27me3], # while rRNA is enriched with the empty [] combination. biotypes <- split(tss, tss$biotype) plotFoldEnrichHeatmap(model, annotations=biotypes) + coord_flip() + ggtitle('Fold enrichment with different biotypes') ## ----multivariate_combinatorial_expression, results='markup', message=FALSE, eval=TRUE, fig.width=8, fig.height=4, out.width='0.5\\textwidth', fig.align='center'---- # === Step 4: Expression analysis === # Suppose you have expression data as well for your experiment. The following code # shows you how to get the expression values for each combinatorial state. data(expression_lv) head(expression_lv) # We need to get coordinates for each of the genes library(biomaRt) ensembl <- useEnsembl(biomart='ENSEMBL_MART_ENSEMBL', dataset='rnorvegicus_gene_ensembl') genes <- getBM(attributes=c('ensembl_gene_id', 'chromosome_name', 'start_position', 'end_position', 'strand', 'external_gene_name', 'gene_biotype'), mart=ensembl) expr <- merge(genes, expression_lv, by='ensembl_gene_id') # Transform to GRanges expression.SHR <- GRanges(seqnames=paste0('chr',expr$chromosome_name), ranges=IRanges(start=expr$start, end=expr$end), strand=expr$strand, name=expr$external_gene_name, biotype=expr$gene_biotype, expression=expr$expression_SHR) # Rename chrMT to chrM to avoid warnings seqlevels(expression.SHR)[seqlevels(expression.SHR)=='chrMT'] <- 'chrM' # We apply an asinh transformation to reduce the effect of outliers expression.SHR$expression <- asinh(expression.SHR$expression) ## Plot plotExpression(model, expression.SHR) + theme(axis.text.x=element_text(angle=0, hjust=0.5)) + ggtitle('Expression of genes overlapping combinatorial states') plotExpression(model, expression.SHR, return.marks=TRUE) + ggtitle('Expression of marks overlapping combinatorial states') ## ----combined_library, results='hide', message=FALSE, eval=TRUE--------------------------------------------- library(chromstaR) ## ----combined_preparation, results='markup', message=FALSE, eval=TRUE--------------------------------------- #=== Step 1: Preparation === ## Prepare the file paths. Exchange this with your input and output directories. inputfolder <- system.file("extdata","euratrans", package="chromstaRData") outputfolder <- file.path(tempdir(), 'SHR-BN-example') ## Define experiment structure, put NA if you don't have controls data(experiment_table) print(experiment_table) ## Define assembly # This is only necessary if you have BED files, BAM files are handled automatically. # For common assemblies you can also specify them as 'hg19' for example. data(rn4_chrominfo) head(rn4_chrominfo) ## ----combined_Chromstar, results='hide', message=FALSE, eval=TRUE------------------------------------------- #=== Step 2: Run Chromstar === ## Run ChromstaR Chromstar(inputfolder, experiment.table=experiment_table, outputfolder=outputfolder, numCPU=4, binsize=1000, stepsize=500, assembly=rn4_chrominfo, prefit.on.chr='chr12', chromosomes='chr12', mode='differential') ## ----combined_listfiles, results='markup', message=FALSE, eval=TRUE----------------------------------------- ## Results are stored in 'outputfolder' and can be loaded for further processing list.files(outputfolder) model <- get(load(file.path(outputfolder,'combined', 'combined_mode-differential_binsize1000_stepsize500.RData'))) ## ----combined_analysis, results='markup', message=FALSE, eval=TRUE------------------------------------------ #=== Step 3: Analysis and export === ## Obtain all genomic regions where the two tissues have different states segments <- model$segments diff.segments <- segments[segments$combination.SHR != segments$combination.BN] # Let's be strict with the differences and get only those where both marks are different diff.segments <- diff.segments[diff.segments$differential.score >= 1.9] exportGRangesAsBedFile(diff.segments, trackname='differential_chromatin_states', filename=tempfile(), scorecol='differential.score') ## Obtain all genomic regions where we find a bivalent promoter in BN but not in SHR bivalent.BN <- segments[segments$combination.BN == '[H3K27me3+H3K4me3]' & segments$combination.SHR != '[H3K27me3+H3K4me3]'] ## Obtain all genomic regions where BN and SHR have promoter signatures promoter.BN <- segments[segments$transition.group == 'constant [H3K4me3]'] ## Get transition frequencies print(model$frequencies) ## ----combined_enrichment, results='markup', message=FALSE, eval=TRUE---------------------------------------- ## === Step 4: Enrichment analysis === # Annotations can easily be obtained with the biomaRt package. Of course, you can # also load them from file if you already have annotation files available. library(biomaRt) ensembl <- useEnsembl(biomart='ENSEMBL_MART_ENSEMBL', dataset='rnorvegicus_gene_ensembl') genes <- getBM(attributes=c('ensembl_gene_id', 'chromosome_name', 'start_position', 'end_position', 'strand', 'external_gene_name', 'gene_biotype'), mart=ensembl) # Transform to GRanges for easier handling genes <- GRanges(seqnames=paste0('chr',genes$chromosome_name), ranges=IRanges(start=genes$start, end=genes$end), strand=genes$strand, name=genes$external_gene_name, biotype=genes$gene_biotype) # Rename chrMT to chrM to avoid warnings seqlevels(genes)[seqlevels(genes)=='chrMT'] <- 'chrM' # Select only chr12 to avoid warnings genes <- keepSeqlevels(genes, 'chr12', pruning.mode = 'coarse') print(genes) ## ----combined_enrichment_plot1, results='markup', message=FALSE, eval=TRUE, fig.width=8, fig.height=4, out.width='0.8\\textwidth', fig.align='center'---- # We expect promoter [H3K4me3] and bivalent-promoter signatures [H3K4me3+H3K27me3] # to be enriched at transcription start sites. plots <- plotEnrichment(hmm=model, annotation=genes, region='start') plots[['BN']] + facet_wrap(~ combination) + ggtitle('Fold enrichment around TSS') + xlab('distance from TSS') plots <- plotEnrichment(hmm=model, annotation=genes, region='start', what='peaks') plots[['BN']] + facet_wrap(~ mark) + ggtitle('Fold enrichment around TSS') + xlab('distance from TSS') ## ----combined_enrichment_plot3, results='markup', message=FALSE, eval=TRUE, fig.width=8, fig.height=4, out.width='0.8\\textwidth', fig.align='center'---- # Fold enrichment with different biotypes, showing that protein coding genes are # enriched with (bivalent) promoter combinations [H3K4me3] and [H3K4me3+H3K27me3], # while rRNA is enriched with the empty [] combination. tss <- resize(genes, width = 3, fix = 'start') biotypes <- split(tss, tss$biotype) plots <- plotFoldEnrichHeatmap(model, annotations=biotypes) plots[['BN']] + coord_flip() + ggtitle('Fold enrichment with different biotypes') ## ----faq_postcutoff, results='markup', message=FALSE, eval=TRUE, fig.width=10, fig.height=2, out.width='0.7\\textwidth'---- model <- get(load(file.path(outputfolder,'combined', 'combined_mode-differential_binsize1000_stepsize500.RData'))) # Try a strict cutoff close to 1 model2 <- changeMaxPostCutoff(model, maxPost.cutoff=0.99999) model3 <- changePostCutoff(model, post.cutoff=0.99999) # Check the peaks before and after adjustment plots <- plotGenomeBrowser(model, chr='chr12', start=1, end=3e5) plots2 <- plotGenomeBrowser(model2, chr='chr12', start=1, end=3e5) plots3 <- plotGenomeBrowser(model3, chr='chr12', start=1, end=3e5) plots$`H3K27me3-BN-rep1` + ggtitle('H3K27me3 original') plots2$`H3K27me3-BN-rep1` + ggtitle('H3K27me3 maxPost.cutoff=0.99999') plots3$`H3K27me3-BN-rep1` + ggtitle('H3K27me3 post.cutoff=0.99999') plots$`H3K4me3-BN-rep1` + ggtitle('H3K4me3 original') plots2$`H3K4me3-BN-rep1` + ggtitle('H3K4me3 maxPost.cutoff=0.99999') plots3$`H3K4me3-BN-rep1` + ggtitle('H3K4me3 post.cutoff=0.99999') ## ----faq_postcutoff_single, results='markup', message=FALSE, eval=TRUE-------------------------------------- # Set a stricter cutoff for H3K4me3 than for H3K27me3 cutoffs <- c(0.9, 0.9, 0.9, 0.9, 0.99, 0.99, 0.99, 0.99) names(cutoffs) <- model$info$ID print(cutoffs) model2 <- changeMaxPostCutoff(model, maxPost.cutoff=cutoffs) ## ----faq_heatmapCombinations, results='markup', message=FALSE, eval=TRUE, fig.width=8, fig.height=4, fig.align='center', out.width='0.8\\textwidth'---- heatmapCombinations(marks=c('H3K4me3', 'H3K27me3', 'H3K36me3', 'H3K27Ac')) ## ----sessionInfo, eval=TRUE, results="asis"----------------------------------------------------------------- toLatex(sessionInfo())