Lifecycle:maturing

In this article we show some examples of the differences in coding between tidybulk/tidyverse and base R. We noted a decrease > 10x of assignments and a decrease of > 2x of line numbers.

Create tidybulk tibble.

tt = se_mini

Aggregate duplicated transcripts

Tidy transcriptomics

rowData(tt)$gene_name = rownames(tt)
tt.aggr = tt %>% aggregate_duplicates(.transcript = gene_name)
Base R
temp = data.frame(
	symbol = dge_list$genes$symbol,
	dge_list$counts
)
dge_list.nr <- by(temp,	temp$symbol,
	function(df)
		if(length(df[1,1])>0)
			matrixStats:::colSums(as.matrix(df[,-1]))
)
dge_list.nr <- do.call("rbind", dge_list.nr)
colnames(dge_list.nr) <- colnames(dge_list)

Scale counts

Tidy transcriptomics
tt.norm = tt.aggr %>% identify_abundant(factor_of_interest = condition) %>% scale_abundance()
Base R
library(edgeR)

dgList <- DGEList(count_m=x,group=group)
keep <- filterByExpr(dgList)
dgList <- dgList[keep,,keep.lib.sizes=FALSE]
[...]
dgList <- calcNormFactors(dgList, method="TMM")
norm_counts.table <- cpm(dgList)

Filter variable transcripts

We may want to identify and filter variable transcripts.

Tidy transcriptomics
tt.norm.variable = tt.norm %>% keep_variable()
Base R
library(edgeR)

x = norm_counts.table

s <- rowMeans((x-rowMeans(x))^2)
o <- order(s,decreasing=TRUE)
x <- x[o[1L:top],,drop=FALSE]

norm_counts.table = norm_counts.table[rownames(x)]

norm_counts.table$cell_type = tibble_counts[
	match(
		tibble_counts$sample,
		rownames(norm_counts.table)
	),
	"Cell type"
]

Reduce dimensions

Tidy transcriptomics
tt.norm.MDS =
  tt.norm %>%
  reduce_dimensions(method="MDS", .dims = 2)
Base R
library(limma)

count_m_log = log(count_m + 1)
cmds = limma::plotMDS(ndim = .dims, plot = FALSE)

cmds = cmds %$%	
	cmdscale.out %>%
	setNames(sprintf("Dim%s", 1:6))

cmds$cell_type = tibble_counts[
	match(tibble_counts$sample, rownames(cmds)),
	"Cell type"
]

PCA

Tidy transcriptomics
tt.norm.PCA =
  tt.norm %>%
  reduce_dimensions(method="PCA", .dims = 2)
Base R
count_m_log = log(count_m + 1)
pc = count_m_log %>% prcomp(scale = TRUE)
variance = pc$sdev^2
variance = (variance / sum(variance))[1:6]
pc$cell_type = counts[
	match(counts$sample, rownames(pc)),
	"Cell type"
]

tSNE

Tidy transcriptomics
tt.norm.tSNE =
	breast_tcga_mini_SE %>%
	tidybulk(		sample, ens, count_scaled) %>%
	identify_abundant() %>%
	reduce_dimensions(
		method = "tSNE",
		perplexity=10,
		pca_scale =TRUE
	)
Base R
count_m_log = log(count_m + 1)

tsne = Rtsne::Rtsne(
	t(count_m_log),
	perplexity=10,
		pca_scale =TRUE
)$Y
tsne$cell_type = tibble_counts[
	match(tibble_counts$sample, rownames(tsne)),
	"Cell type"
]

Rotate dimensions

Tidy transcriptomics
tt.norm.MDS.rotated =
  tt.norm.MDS %>%
	rotate_dimensions(`Dim1`, `Dim2`, rotation_degrees = 45, action="get")
Base R
rotation = function(m, d) {
	r = d * pi / 180
	((bind_rows(
		c(`1` = cos(r), `2` = -sin(r)),
		c(`1` = sin(r), `2` = cos(r))
	) %>% as_matrix) %*% m)
}
mds_r = pca %>% rotation(rotation_degrees)
mds_r$cell_type = counts[
	match(counts$sample, rownames(mds_r)),
	"Cell type"
]

Test differential abundance

Tidy transcriptomics
tt.de =
	tt %>%
	test_differential_abundance( ~ condition, action="get")
tt.de
Base R
library(edgeR)

dgList <- DGEList(counts=counts_m,group=group)
keep <- filterByExpr(dgList)
dgList <- dgList[keep,,keep.lib.sizes=FALSE]
dgList <- calcNormFactors(dgList)
design <- model.matrix(~group)
dgList <- estimateDisp(dgList,design)
fit <- glmQLFit(dgList,design)
qlf <- glmQLFTest(fit,coef=2)
topTags(qlf, n=Inf)

Adjust counts

Tidy transcriptomics
tt.norm.adj =
	tt.norm %>% adjust_abundance(	~ condition + time)
Base R
library(sva)

count_m_log = log(count_m + 1)

design =
		model.matrix(
			object = ~ condition + time,
			data = annotation
		)

count_m_log.sva =
	ComBat(
			batch =	design[,2],
			mod = design,
			...
		)

count_m_log.sva = ceiling(exp(count_m_log.sva) -1)
count_m_log.sva$cell_type = counts[
	match(counts$sample, rownames(count_m_log.sva)),
	"Cell type"
]

Deconvolve Cell type composition

Tidy transcriptomics
tt.cibersort =
	tt %>%
	deconvolve_cellularity(action="get", cores=1)
Base R
source(‘CIBERSORT.R’)
count_m %>% write.table("mixture_file.txt")
results <- CIBERSORT(
	"sig_matrix_file.txt",
	"mixture_file.txt",
	perm=100, QN=TRUE
)
results$cell_type = tibble_counts[
	match(tibble_counts$sample, rownames(results)),
	"Cell type"
]

Cluster samples

k-means

Tidy transcriptomics
tt.norm.cluster = tt.norm.MDS %>%
  cluster_elements(method="kmeans",	centers = 2, action="get" )
Base R
count_m_log = log(count_m + 1)

k = kmeans(count_m_log, iter.max = 1000, ...)
cluster = k$cluster

cluster$cell_type = tibble_counts[
	match(tibble_counts$sample, rownames(cluster)),
	c("Cell type", "Dim1", "Dim2")
]

SNN

Matrix package (v1.3-3) causes an error with Seurat::FindNeighbors used in this method. We are trying to solve this issue. At the moment this option in unaviable.

Tidy transcriptomics
tt.norm.SNN =
	tt.norm.tSNE %>%
	cluster_elements(method = "SNN")
Base R
library(Seurat)

snn = CreateSeuratObject(count_m)
snn = ScaleData(
	snn, display.progress = TRUE,
	num.cores=4, do.par = TRUE
)
snn = FindVariableFeatures(snn, selection.method = "vst")
snn = FindVariableFeatures(snn, selection.method = "vst")
snn = RunPCA(snn, npcs = 30)
snn = FindNeighbors(snn)
snn = FindClusters(snn, method = "igraph", ...)
snn = snn[["seurat_clusters"]]

snn$cell_type = tibble_counts[
	match(tibble_counts$sample, rownames(snn)),
	c("Cell type", "Dim1", "Dim2")
]

Drop redundant transcripts

Tidy transcriptomics
tt.norm.non_redundant =
	tt.norm.MDS %>%
  remove_redundancy(	method = "correlation" )
Base R
library(widyr)

.data.correlated =
	pairwise_cor(
		counts,
		sample,
		transcript,
		rc,
		sort = TRUE,
		diag = FALSE,
		upper = FALSE
	) %>%
	filter(correlation > correlation_threshold) %>%
	distinct(item1) %>%
	rename(!!.element := item1)

# Return non redundant data frame
counts %>% anti_join(.data.correlated) %>%
	spread(sample, rc, - transcript) %>%
	left_join(annotation)

Draw heatmap

tidytranscriptomics
tt.norm.MDS %>%

  # filter lowly abundant
  keep_abundant() %>%

  # extract 500 most variable genes
  keep_variable( .abundance = count_scaled, top = 500) %>%

  # create heatmap
  heatmap(sample, transcript, count_scaled, transform = log1p) %>%
	add_tile(`Cell type`) 
Base R
# Example taken from airway dataset from BioC2020 workshop. 
dgList <- SE2DGEList(airway)
group <- factor(dgList$samples$`Cell type`)
keep.exprs <- filterByExpr(dgList, group=group)
dgList <- dgList[keep.exprs,, keep.lib.sizes=FALSE]
dgList <- calcNormFactors(dgList)
logcounts <- cpm(dgList, log=TRUE)
var_genes <- apply(logcounts, 1, var)
select_var <- names(sort(var_genes, decreasing=TRUE))[1:500]
highly_variable_lcpm <- logcounts[select_var,]
colours <- c("#440154FF", "#21908CFF", "#fefada" )
col.group <- c("red","grey")[group]
gplots::heatmap.2(highly_variable_lcpm, col=colours, trace="none", ColSideColors=col.group, scale="row")

Draw density plot

tidytranscriptomics
# Example taken from airway dataset from BioC2020 workshop. 
airway %>%
    tidybulk() %>%
	  identify_abundant() %>%
    scale_abundance() %>%
    pivot_longer(cols = starts_with("counts"), names_to = "source", values_to = "abundance") %>%
    filter(!lowly_abundant) %>%
    ggplot(aes(x=abundance + 1, color=sample)) +
    geom_density() +
    facet_wrap(~source) +
    scale_x_log10() 
Base R
# Example taken from airway dataset from BioC2020 workshop. 
dgList <- SE2DGEList(airway)
group <- factor(dgList$samples$dex)
keep.exprs <- filterByExpr(dgList, group=group)
dgList <- dgList[keep.exprs,, keep.lib.sizes=FALSE]
dgList <- calcNormFactors(dgList)
logcounts <- cpm(dgList, log=TRUE)
var_genes <- apply(logcounts, 1, var)
select_var <- names(sort(var_genes, decreasing=TRUE))[1:500]
highly_variable_lcpm <- logcounts[select_var,]
colours <- c("#440154FF", "#21908CFF", "#fefada" )
col.group <- c("red","grey")[group]
gplots::heatmap.2(highly_variable_lcpm, col=colours, trace="none", ColSideColors=col.group, scale="row")

Appendix

sessionInfo()
## R Under development (unstable) (2022-10-25 r83175)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 22.04.1 LTS
## 
## Matrix products: default
## BLAS:   /home/biocbuild/bbs-3.17-bioc/R/lib/libRblas.so
## LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.10.0
## 
## locale:
##  [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
##  [3] LC_TIME=en_GB              LC_COLLATE=C              
##  [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8   
##  [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                 
##  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       
## 
## attached base packages:
## [1] stats4    stats     graphics  grDevices utils     datasets  methods  
## [8] base     
## 
## other attached packages:
##  [1] tidySummarizedExperiment_1.9.2 SummarizedExperiment_1.29.1   
##  [3] Biobase_2.59.0                 GenomicRanges_1.51.1          
##  [5] GenomeInfoDb_1.35.5            IRanges_2.33.0                
##  [7] S4Vectors_0.37.0               BiocGenerics_0.45.0           
##  [9] MatrixGenerics_1.11.0          matrixStats_0.63.0            
## [11] tidybulk_1.11.1                ggrepel_0.9.2                 
## [13] ggplot2_3.4.0                  magrittr_2.0.3                
## [15] tibble_3.1.8                   tidyr_1.2.1                   
## [17] dplyr_1.0.10                   knitr_1.41                    
## 
## loaded via a namespace (and not attached):
##  [1] DBI_1.1.3              bitops_1.0-7           widyr_0.1.5           
##  [4] rlang_1.0.6            tidytext_0.3.4         compiler_4.3.0        
##  [7] RSQLite_2.2.18         mgcv_1.8-41            reshape2_1.4.4        
## [10] png_0.1-7              vctrs_0.5.1            sva_3.47.0            
## [13] stringr_1.4.1          pkgconfig_2.0.3        crayon_1.5.2          
## [16] fastmap_1.1.0          backports_1.4.1        XVector_0.39.0        
## [19] ellipsis_0.3.2         utf8_1.2.2             tzdb_0.3.0            
## [22] preprocessCore_1.61.0  purrr_0.3.5            bit_4.0.5             
## [25] xfun_0.35              zlibbioc_1.45.0        cachem_1.0.6          
## [28] jsonlite_1.8.3         blob_1.2.3             SnowballC_0.7.0       
## [31] DelayedArray_0.25.0    BiocParallel_1.33.6    broom_1.0.1           
## [34] parallel_4.3.0         R6_2.5.1               stringi_1.7.8         
## [37] limma_3.55.0           genefilter_1.81.0      Rcpp_1.0.9            
## [40] assertthat_0.2.1       readr_2.1.3            Matrix_1.5-3          
## [43] splines_4.3.0          tidyselect_1.2.0       codetools_0.2-18      
## [46] plyr_1.8.8             lattice_0.20-45        withr_2.5.0           
## [49] KEGGREST_1.39.0        evaluate_0.18          Rtsne_0.16            
## [52] survival_3.4-0         Biostrings_2.67.0      pillar_1.8.1          
## [55] janeaustenr_1.0.0      plotly_4.10.1          generics_0.1.3        
## [58] RCurl_1.98-1.9         hms_1.1.2              munsell_0.5.0         
## [61] scales_1.2.1           xtable_1.8-4           glue_1.6.2            
## [64] lazyeval_0.2.2         tools_4.3.0            tokenizers_0.2.3      
## [67] data.table_1.14.6      annotate_1.77.0        locfit_1.5-9.6        
## [70] XML_3.99-0.12          grid_4.3.0             AnnotationDbi_1.61.0  
## [73] edgeR_3.41.1           colorspace_2.0-3       nlme_3.1-160          
## [76] GenomeInfoDbData_1.2.9 cli_3.4.1              fansi_1.0.3           
## [79] viridisLite_0.4.1      gtable_0.3.1           digest_0.6.30         
## [82] htmlwidgets_1.5.4      memoise_2.0.1          htmltools_0.5.3       
## [85] lifecycle_1.0.3        httr_1.4.4             bit64_4.0.5