1 clustifyrdatahub

clustifyrdatahub provides external reference data sets for cell-type assignment with clustifyr.

1.1 Installation

## Install BiocManager first when it is not already available,
## then use it to install clustifyrdatahub.
if (!requireNamespace("BiocManager", quietly = TRUE)) {
  install.packages("BiocManager")
}

BiocManager::install("clustifyrdatahub")

1.2 Available references include

## Read the metadata table shipped with the package, keep columns
## 1, 9 and 2-7 (in that order), and render the result as a table.
system.file("extdata", "metadata.csv", package = "clustifyrdatahub") |>
  read.csv() |>
  dplyr::select(c(1, 9, 2:7)) |>
  knitr::kable()
| Title | Species | Description | RDataPath | BiocVersion | Genome | SourceType | SourceUrl |
|---|---|---|---|---|---|---|---|
| ref_MCA | Mus musculus | Mouse Cell Atlas | clustifyrdatahub/ref_MCA.rda | 3.12 | mm10 | Zip | https://ndownloader.figshare.com/files/10756795 |
| ref_tabula_muris_drop | Mus musculus | Tabula Muris (10X) | clustifyrdatahub/ref_tabula_muris_drop.rda | 3.12 | mm10 | Zip | https://ndownloader.figshare.com/articles/5821263 |
| ref_tabula_muris_facs | Mus musculus | Tabula Muris (SmartSeq2) | clustifyrdatahub/ref_tabula_muris_facs.rda | 3.12 | mm10 | Zip | https://ndownloader.figshare.com/articles/5821263 |
| ref_mouse.rnaseq | Mus musculus | Mouse RNA-seq from 28 cell types | clustifyrdatahub/ref_mouse.rnaseq.rda | 3.12 | mm10 | RDA | https://github.com/dviraran/SingleR/tree/master/data |
| ref_moca_main | Mus musculus | Mouse Organogenesis Cell Atlas (main cell types) | clustifyrdatahub/ref_moca_main.rda | 3.12 | mm10 | RDA | https://oncoscape.v3.sttrcancer.org/atlas.gs.washington.edu.mouse.rna/downloads |
| ref_immgen | Mus musculus | Mouse sorted immune cells | clustifyrdatahub/ref_immgen.rda | 3.12 | mm10 | RDA | https://github.com/dviraran/SingleR/tree/master/data |
| ref_hema_microarray | Homo sapiens | Human hematopoietic cell microarray | clustifyrdatahub/ref_hema_microarray.rda | 3.12 | hg38 | TXT | https://ftp.ncbi.nlm.nih.gov/geo/series/GSE24nnn/GSE24759/matrix/GSE24759_series_matrix.txt.gz |
| ref_cortex_dev | Homo sapiens | Human cortex development scRNA-seq | clustifyrdatahub/ref_cortex_dev.rda | 3.12 | hg38 | TSV | https://cells.ucsc.edu/cortex-dev/exprMatrix.tsv.gz |
| ref_pan_indrop | Homo sapiens | Human pancreatic cell scRNA-seq (inDrop) | clustifyrdatahub/ref_pan_indrop.rda | 3.12 | hg38 | RDA | https://scrnaseq-public-datasets.s3.amazonaws.com/scater-objects/baron-human.rds |
| ref_pan_smartseq2 | Homo sapiens | Human pancreatic cell scRNA-seq (SmartSeq2) | clustifyrdatahub/ref_pan_smartseq2.rda | 3.12 | hg38 | RDA | https://scrnaseq-public-datasets.s3.amazonaws.com/scater-objects/segerstolpe.rds |
| ref_mouse_atlas | Mus musculus | Mouse Atlas scRNA-seq from 321 cell types | clustifyrdatahub/ref_mouse_atlas.rda | 3.12 | mm10 | RDA | https://github.com/rnabioco/scRNA-seq-Cell-Ref-Matrix/blob/master/atlas/musMusculus/MouseAtlas.rda |

1.3 To use clustifyrdatahub

## attach ExperimentHub and create the hub object `eh`
## (`eh` is reused by the listResources()/loadResources() calls further down)
library(ExperimentHub)
eh <- ExperimentHub()

## query the hub for records matching "clustifyrdatahub" and display them
refs <- query(eh, "clustifyrdatahub")
refs
#> ExperimentHub with 11 records
#> # snapshotDate(): 2023-10-24
#> # $dataprovider: figshare, S3, GitHub, GEO, washington.edu, UCSC
#> # $species: Mus musculus, Homo sapiens
#> # $rdataclass: data.frame
#> # additional mcols(): taxonomyid, genome, description,
#> #   coordinate_1_based, maintainer, rdatadateadded, preparerclass, tags,
#> #   rdatapath, sourceurl, sourcetype 
#> # retrieve records with, e.g., 'object[["EH3444"]]' 
#> 
#>            title                
#>   EH3444 | ref_MCA              
#>   EH3445 | ref_tabula_muris_drop
#>   EH3446 | ref_tabula_muris_facs
#>   EH3447 | ref_mouse.rnaseq     
#>   EH3448 | ref_moca_main        
#>   ...      ...                  
#>   EH3450 | ref_hema_microarray  
#>   EH3451 | ref_cortex_dev       
#>   EH3452 | ref_pan_indrop       
#>   EH3453 | ref_pan_smartseq2    
#>   EH3779 | ref_mouse_atlas
## load a reference from the query result, either by index or by id
ref_hema_microarray <- refs[[7]]         ## load by position: the 7th record in the list (EH3450)
ref_hema_microarray <- refs[["EH3450"]]  ## load by EH id

## or list and load by package name and resource title
## (note: this reassigns `refs`, replacing the query result above)
refs <- listResources(eh, "clustifyrdatahub")
## loadResources() is filtered by the title "ref_hema_microarray";
## `[[1]]` extracts the first element of what it returns
ref_hema_microarray <- loadResources(
    eh, 
    "clustifyrdatahub",
    "ref_hema_microarray"
    )[[1]]

## use for classification of cell types:
## the reference loaded above is passed as `ref_mat`; the input matrix,
## metadata and query genes are objects taken from the clustifyr namespace
res <- clustifyr::clustify(
    input = clustifyr::pbmc_matrix_small,
    metadata = clustifyr::pbmc_meta$classified,
    ref_mat = ref_hema_microarray,
    query_genes = clustifyr::pbmc_vargenes
)
## or load refs by function name (after loading hub library)
## (the data object named `ref_hema_microarray` assigned earlier does not get in
## the way: when resolving a call, R skips objects that are not functions)
library(clustifyrdatahub)
ref_hema_microarray()[1:5, 1:5]           ## data are loaded; show the first 5 rows and 5 columns
#>        Basophils CD4+ Central Memory CD4+ Effector Memory CD8+ Central Memory
#> DDR1    6.084244            5.967502             5.933039            6.005278
#> RFC2    6.280044            6.028615             6.047005            5.992979
#> HSPA6   6.535444            5.811475             5.746326            5.928349
#> PAX8    6.669153            5.896401             6.118577            6.270870
#> GUCA1A  5.239230            5.232116             5.206960            5.227415
#>        CD8+ Effector Memory
#> DDR1               5.895926
#> RFC2               5.942426
#> HSPA6              5.942670
#> PAX8               6.323922
#> GUCA1A             5.090882
ref_hema_microarray(metadata = TRUE)      ## only metadata: shows the ExperimentHub record instead of the data
#> ExperimentHub with 1 record
#> # snapshotDate(): 2023-10-24
#> # names(): EH3450
#> # package(): clustifyrdatahub
#> # $dataprovider: GEO
#> # $species: Homo sapiens
#> # $rdataclass: data.frame
#> # $rdatadateadded: 2020-05-14
#> # $title: ref_hema_microarray
#> # $description: Human hematopoietic cell microarray
#> # $taxonomyid: 9606
#> # $genome: hg38
#> # $sourcetype: TXT
#> # $sourceurl: https://ftp.ncbi.nlm.nih.gov/geo/series/GSE24nnn/GSE24759/matr...
#> # $sourcesize: NA
#> # $tags: c("SingleCellData", "SequencingData", "MicroarrayData",
#> #   "ExperimentHub") 
#> # retrieve record with 'object[["EH3450"]]'

2 Session info

## print the R version, platform, locale and package versions of this session
sessionInfo()
#> R version 4.3.1 (2023-06-16)
#> Platform: x86_64-pc-linux-gnu (64-bit)
#> Running under: Ubuntu 22.04.3 LTS
#> 
#> Matrix products: default
#> BLAS:   /home/biocbuild/bbs-3.18-bioc/R/lib/libRblas.so 
#> LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.10.0
#> 
#> locale:
#>  [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
#>  [3] LC_TIME=en_GB              LC_COLLATE=C              
#>  [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8   
#>  [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                 
#>  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
#> [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       
#> 
#> time zone: America/New_York
#> tzcode source: system (glibc)
#> 
#> attached base packages:
#> [1] stats     graphics  grDevices utils     datasets  methods   base     
#> 
#> other attached packages:
#> [1] clustifyrdatahub_1.12.0 ExperimentHub_2.10.0    AnnotationHub_3.10.0   
#> [4] BiocFileCache_2.10.0    dbplyr_2.3.4            BiocGenerics_0.48.0    
#> [7] BiocStyle_2.30.0       
#> 
#> loaded via a namespace (and not attached):
#>  [1] tidyselect_1.2.0              dplyr_1.1.3                  
#>  [3] blob_1.2.4                    filelock_1.0.2               
#>  [5] Biostrings_2.70.1             bitops_1.0-7                 
#>  [7] SingleCellExperiment_1.24.0   fastmap_1.1.1                
#>  [9] RCurl_1.98-1.12               promises_1.2.1               
#> [11] digest_0.6.33                 mime_0.12                    
#> [13] lifecycle_1.0.3               ellipsis_0.3.2               
#> [15] KEGGREST_1.42.0               interactiveDisplayBase_1.40.0
#> [17] RSQLite_2.3.1                 magrittr_2.0.3               
#> [19] clustifyr_1.14.0              compiler_4.3.1               
#> [21] rlang_1.1.1                   sass_0.4.7                   
#> [23] tools_4.3.1                   utf8_1.2.4                   
#> [25] yaml_2.3.7                    data.table_1.14.8            
#> [27] knitr_1.44                    S4Arrays_1.2.0               
#> [29] bit_4.0.5                     curl_5.1.0                   
#> [31] DelayedArray_0.28.0           BiocParallel_1.36.0          
#> [33] abind_1.4-5                   withr_2.5.1                  
#> [35] purrr_1.0.2                   grid_4.3.1                   
#> [37] stats4_4.3.1                  fansi_1.0.5                  
#> [39] xtable_1.8-4                  colorspace_2.1-0             
#> [41] ggplot2_3.4.4                 scales_1.2.1                 
#> [43] SummarizedExperiment_1.32.0   cli_3.6.1                    
#> [45] rmarkdown_2.25                crayon_1.5.2                 
#> [47] generics_0.1.3                httr_1.4.7                   
#> [49] DBI_1.1.3                     cachem_1.0.8                 
#> [51] zlibbioc_1.48.0               parallel_4.3.1               
#> [53] AnnotationDbi_1.64.0          BiocManager_1.30.22          
#> [55] XVector_0.42.0                matrixStats_1.0.0            
#> [57] vctrs_0.6.4                   Matrix_1.6-1.1               
#> [59] jsonlite_1.8.7                bookdown_0.36                
#> [61] IRanges_2.36.0                S4Vectors_0.40.0             
#> [63] bit64_4.0.5                   tidyr_1.3.0                  
#> [65] jquerylib_0.1.4               glue_1.6.2                   
#> [67] codetools_0.2-19              cowplot_1.1.1                
#> [69] gtable_0.3.4                  BiocVersion_3.18.0           
#> [71] later_1.3.1                   GenomeInfoDb_1.38.0          
#> [73] GenomicRanges_1.54.0          munsell_0.5.0                
#> [75] tibble_3.2.1                  pillar_1.9.0                 
#> [77] rappdirs_0.3.3                htmltools_0.5.6.1            
#> [79] fgsea_1.28.0                  entropy_1.3.1                
#> [81] GenomeInfoDbData_1.2.11       R6_2.5.1                     
#> [83] evaluate_0.22                 shiny_1.7.5.1                
#> [85] Biobase_2.62.0                lattice_0.22-5               
#> [87] png_0.1-8                     memoise_2.0.1                
#> [89] httpuv_1.6.12                 bslib_0.5.1                  
#> [91] fastmatch_1.1-4               Rcpp_1.0.11                  
#> [93] SparseArray_1.2.0             xfun_0.40                    
#> [95] MatrixGenerics_1.14.0         pkgconfig_2.0.3