1 Introduction

TileDB implements a framework for local and remote storage of dense and sparse arrays. We can use this as a DelayedArray backend to provide an array-level abstraction, thus allowing the data to be used in many places where an ordinary array or matrix might be used. The TileDBArray package implements the necessary wrappers around TileDB-R to support read/write operations on TileDB arrays within the DelayedArray framework.

2 Creating a TileDBArray

Creating a TileDBArray is as easy as:

X <- matrix(rnorm(1000), ncol=10)
library(TileDBArray)
writeTileDBArray(X)
## <100 x 10> TileDBMatrix object of type "double":
##              [,1]       [,2]       [,3] ...        [,9]       [,10]
##   [1,] -2.5634091  0.8744546  0.9860739   . -1.25009232 -0.03044110
##   [2,] -0.3396823  1.3510598 -0.8298500   . -1.05542092  1.64764480
##   [3,]  0.4024526  0.6694622  1.5606203   .  0.19354001  0.35103773
##   [4,] -1.7019909 -0.4417821 -1.0550905   .  1.29041926  0.07767379
##   [5,]  1.8369010  1.1559940 -0.4676432   . -0.42041393 -1.00957990
##    ...          .          .          .   .           .           .
##  [96,]  0.2429128 -0.5413027  0.2352831   .  -0.9930017  -0.1135778
##  [97,] -0.4139537  0.3308705 -1.0188687   .   0.4604300   0.8798608
##  [98,] -1.3889043  1.8554911  0.8005370   .  -0.2347394  -0.3150729
##  [99,]  0.3917084 -0.4374012 -1.4310260   .  -0.3540704   1.2979328
## [100,] -0.9188678 -0.1028133 -0.8016942   .   0.2433746  -1.5822773

Alternatively, we can use coercion methods:

as(X, "TileDBArray")
## <100 x 10> TileDBMatrix object of type "double":
##              [,1]       [,2]       [,3] ...        [,9]       [,10]
##   [1,] -2.5634091  0.8744546  0.9860739   . -1.25009232 -0.03044110
##   [2,] -0.3396823  1.3510598 -0.8298500   . -1.05542092  1.64764480
##   [3,]  0.4024526  0.6694622  1.5606203   .  0.19354001  0.35103773
##   [4,] -1.7019909 -0.4417821 -1.0550905   .  1.29041926  0.07767379
##   [5,]  1.8369010  1.1559940 -0.4676432   . -0.42041393 -1.00957990
##    ...          .          .          .   .           .           .
##  [96,]  0.2429128 -0.5413027  0.2352831   .  -0.9930017  -0.1135778
##  [97,] -0.4139537  0.3308705 -1.0188687   .   0.4604300   0.8798608
##  [98,] -1.3889043  1.8554911  0.8005370   .  -0.2347394  -0.3150729
##  [99,]  0.3917084 -0.4374012 -1.4310260   .  -0.3540704   1.2979328
## [100,] -0.9188678 -0.1028133 -0.8016942   .   0.2433746  -1.5822773

This process also works for sparse matrices:

Y <- Matrix::rsparsematrix(1000, 1000, density=0.01)
writeTileDBArray(Y)
## <1000 x 1000> sparse TileDBMatrix object of type "double":
##            [,1]    [,2]    [,3] ...  [,999] [,1000]
##    [1,]       0       0       0   .       0       0
##    [2,]       0       0       0   .       0       0
##    [3,]       0       0       0   .       0       0
##    [4,]       0       0       0   .       0       0
##    [5,]       0       0       0   .       0       0
##     ...       .       .       .   .       .       .
##  [996,]       0       0       0   .       0       0
##  [997,]       0       0       0   .       0       0
##  [998,]       0       0       0   .       0       0
##  [999,]       0       0       0   .       0       0
## [1000,]       0       0       0   .       0       0

Logical and integer matrices are supported:

writeTileDBArray(Y > 0)
## <1000 x 1000> sparse TileDBMatrix object of type "logical":
##            [,1]    [,2]    [,3] ...  [,999] [,1000]
##    [1,]   FALSE   FALSE   FALSE   .   FALSE   FALSE
##    [2,]   FALSE   FALSE   FALSE   .   FALSE   FALSE
##    [3,]   FALSE   FALSE   FALSE   .   FALSE   FALSE
##    [4,]   FALSE   FALSE   FALSE   .   FALSE   FALSE
##    [5,]   FALSE   FALSE   FALSE   .   FALSE   FALSE
##     ...       .       .       .   .       .       .
##  [996,]   FALSE   FALSE   FALSE   .   FALSE   FALSE
##  [997,]   FALSE   FALSE   FALSE   .   FALSE   FALSE
##  [998,]   FALSE   FALSE   FALSE   .   FALSE   FALSE
##  [999,]   FALSE   FALSE   FALSE   .   FALSE   FALSE
## [1000,]   FALSE   FALSE   FALSE   .   FALSE   FALSE

Matrices with dimension names are also supported:

rownames(X) <- sprintf("GENE_%i", seq_len(nrow(X)))
colnames(X) <- sprintf("SAMP_%i", seq_len(ncol(X)))
writeTileDBArray(X)
## <100 x 10> TileDBMatrix object of type "double":
##              SAMP_1     SAMP_2     SAMP_3 ...      SAMP_9     SAMP_10
##   GENE_1 -2.5634091  0.8744546  0.9860739   . -1.25009232 -0.03044110
##   GENE_2 -0.3396823  1.3510598 -0.8298500   . -1.05542092  1.64764480
##   GENE_3  0.4024526  0.6694622  1.5606203   .  0.19354001  0.35103773
##   GENE_4 -1.7019909 -0.4417821 -1.0550905   .  1.29041926  0.07767379
##   GENE_5  1.8369010  1.1559940 -0.4676432   . -0.42041393 -1.00957990
##      ...          .          .          .   .           .           .
##  GENE_96  0.2429128 -0.5413027  0.2352831   .  -0.9930017  -0.1135778
##  GENE_97 -0.4139537  0.3308705 -1.0188687   .   0.4604300   0.8798608
##  GENE_98 -1.3889043  1.8554911  0.8005370   .  -0.2347394  -0.3150729
##  GENE_99  0.3917084 -0.4374012 -1.4310260   .  -0.3540704   1.2979328
## GENE_100 -0.9188678 -0.1028133 -0.8016942   .   0.2433746  -1.5822773

3 Manipulating TileDBArrays

TileDBArrays are simply DelayedArray objects and can be manipulated as such. The usual conventions for extracting data from matrix-like objects work as expected:

out <- as(X, "TileDBArray")
dim(out)
## [1] 100  10
head(rownames(out))
## [1] "GENE_1" "GENE_2" "GENE_3" "GENE_4" "GENE_5" "GENE_6"
head(out[,1])
##       GENE_1       GENE_2       GENE_3       GENE_4       GENE_5       GENE_6 
## -2.563409091 -0.339682326  0.402452648 -1.701990899  1.836901004  0.001805847

We can also perform manipulations like subsetting and arithmetic. Note that these operations do not affect the data in the TileDB backend; rather, they are delayed until the values are explicitly required, hence the creation of the DelayedMatrix object.

out[1:5,1:5] 
## <5 x 5> DelayedMatrix object of type "double":
##             SAMP_1      SAMP_2      SAMP_3      SAMP_4      SAMP_5
## GENE_1 -2.56340909  0.87445458  0.98607393  0.07181744  0.66577775
## GENE_2 -0.33968233  1.35105978 -0.82984998  0.35339552  0.12505141
## GENE_3  0.40245265  0.66946223  1.56062031 -0.01478452 -1.50158860
## GENE_4 -1.70199090 -0.44178210 -1.05509045  0.72900112 -0.85357556
## GENE_5  1.83690100  1.15599402 -0.46764322  0.49465298  0.77643887
out * 2
## <100 x 10> DelayedMatrix object of type "double":
##              SAMP_1     SAMP_2     SAMP_3 ...     SAMP_9    SAMP_10
##   GENE_1 -5.1268182  1.7489092  1.9721479   . -2.5001846 -0.0608822
##   GENE_2 -0.6793647  2.7021196 -1.6597000   . -2.1108418  3.2952896
##   GENE_3  0.8049053  1.3389245  3.1212406   .  0.3870800  0.7020755
##   GENE_4 -3.4039818 -0.8835642 -2.1101809   .  2.5808385  0.1553476
##   GENE_5  3.6738020  2.3119880 -0.9352864   . -0.8408279 -2.0191598
##      ...          .          .          .   .          .          .
##  GENE_96  0.4858256 -1.0826053  0.4705662   . -1.9860034 -0.2271556
##  GENE_97 -0.8279074  0.6617411 -2.0377373   .  0.9208601  1.7597217
##  GENE_98 -2.7778086  3.7109823  1.6010740   . -0.4694787 -0.6301458
##  GENE_99  0.7834168 -0.8748025 -2.8620520   . -0.7081409  2.5958655
## GENE_100 -1.8377355 -0.2056266 -1.6033884   .  0.4867493 -3.1645546

We can also do more complex matrix operations that are supported by DelayedArray:

colSums(out)
##     SAMP_1     SAMP_2     SAMP_3     SAMP_4     SAMP_5     SAMP_6     SAMP_7 
##  15.789110  12.307266 -19.375159   2.622866   5.176531   2.747817  16.588069 
##     SAMP_8     SAMP_9    SAMP_10 
## -16.699902  14.577084   3.499956
out %*% runif(ncol(out))
##                 [,1]
## GENE_1   -1.83267322
## GENE_2    1.17468142
## GENE_3    0.98501707
## GENE_4   -1.06907444
## GENE_5    1.05454539
## GENE_6    2.08568056
## GENE_7    0.91028066
## GENE_8   -0.00135150
## GENE_9   -1.01986676
## GENE_10  -0.07425205
## GENE_11  -1.12623658
## GENE_12  -0.24229414
## GENE_13   1.58818582
## GENE_14  -0.33561280
## GENE_15   1.43961971
## GENE_16   0.86720955
## GENE_17   3.48702360
## GENE_18   1.75273223
## GENE_19   0.17732212
## GENE_20   0.41126259
## GENE_21  -1.91122766
## GENE_22  -0.82739130
## GENE_23  -0.48954978
## GENE_24   0.34941802
## GENE_25   2.39692918
## GENE_26  -1.15516200
## GENE_27  -2.75501360
## GENE_28   0.38646687
## GENE_29   1.62242623
## GENE_30   0.77916759
## GENE_31  -0.16420555
## GENE_32  -1.77784690
## GENE_33  -1.74404666
## GENE_34   1.33390267
## GENE_35   1.18849171
## GENE_36   2.31219981
## GENE_37   1.22683391
## GENE_38   0.65651480
## GENE_39  -1.09942607
## GENE_40  -2.16686772
## GENE_41   2.24156753
## GENE_42  -2.41890018
## GENE_43   0.56309206
## GENE_44  -0.87443214
## GENE_45   0.30314480
## GENE_46  -0.89323772
## GENE_47  -1.14651612
## GENE_48   0.28819710
## GENE_49   0.45071798
## GENE_50  -0.04298327
## GENE_51  -1.10387778
## GENE_52  -0.83373757
## GENE_53   0.90866576
## GENE_54   0.42056032
## GENE_55  -1.98842228
## GENE_56   0.71399608
## GENE_57  -2.70456333
## GENE_58   1.89276705
## GENE_59   1.14425029
## GENE_60  -0.06071243
## GENE_61   0.47060497
## GENE_62  -1.58931994
## GENE_63  -2.64009221
## GENE_64  -0.20816852
## GENE_65  -1.81830490
## GENE_66   0.47959553
## GENE_67  -0.37742432
## GENE_68  -0.05559477
## GENE_69   1.36990455
## GENE_70   0.36587089
## GENE_71   2.69590138
## GENE_72  -2.98535761
## GENE_73  -1.46895125
## GENE_74  -1.56712777
## GENE_75   0.14313183
## GENE_76  -1.26172865
## GENE_77  -1.23307303
## GENE_78  -0.39830856
## GENE_79   1.92906310
## GENE_80   3.61252936
## GENE_81  -1.48405931
## GENE_82  -1.86216588
## GENE_83   1.06181528
## GENE_84  -0.55804303
## GENE_85  -0.77869007
## GENE_86  -2.78719206
## GENE_87   0.83760381
## GENE_88   4.69889863
## GENE_89  -0.74362426
## GENE_90   1.37938181
## GENE_91  -0.50864725
## GENE_92  -1.31184463
## GENE_93   0.86137812
## GENE_94  -0.27389404
## GENE_95   0.61777316
## GENE_96  -0.82821761
## GENE_97  -1.48159750
## GENE_98   0.31363874
## GENE_99   1.33785450
## GENE_100 -2.44867526

4 Controlling backend creation

We can adjust some parameters for creating the backend with appropriate arguments to writeTileDBArray(). For example, the code below controls the path to the backend as well as the name of the attribute containing the data.

X <- matrix(rnorm(1000), ncol=10)
path <- tempfile()
writeTileDBArray(X, path=path, attr="WHEE")
## <100 x 10> TileDBMatrix object of type "double":
##               [,1]        [,2]        [,3] ...       [,9]      [,10]
##   [1,]  1.06504839  1.43897083  1.47968640   . -0.9024547 -0.2420091
##   [2,]  1.83086325 -0.01856940 -0.52470436   .  0.4411536  0.0413871
##   [3,] -0.76630520 -1.22072129 -0.37133527   .  0.6314237 -0.1761234
##   [4,]  0.08583011 -0.50142857  0.01650638   .  0.2281899  0.5051250
##   [5,]  1.44290957  0.18962483  0.28741259   . -1.2979466  0.1159794
##    ...           .           .           .   .          .          .
##  [96,]   0.4699386  -1.0979907  -0.8443384   . -2.0603201  1.0528272
##  [97,]  -0.1449866  -1.4037982  -0.2464315   . -0.7822831  0.8906309
##  [98,]   0.4326231  -0.1597281   1.1746462   .  0.1733751  0.5343873
##  [99,]   2.7892061  -0.9950843   1.2131812   . -0.9457024 -0.5510885
## [100,]   0.1905879  -1.4087611   1.4067222   . -1.2565528 -1.8693152

As these arguments cannot be passed during coercion, we instead provide global variables that can be set or unset to affect the outcome.

path2 <- tempfile()
setTileDBPath(path2)
as(X, "TileDBArray") # uses path2 to store the backend.
## <100 x 10> TileDBMatrix object of type "double":
##               [,1]        [,2]        [,3] ...       [,9]      [,10]
##   [1,]  1.06504839  1.43897083  1.47968640   . -0.9024547 -0.2420091
##   [2,]  1.83086325 -0.01856940 -0.52470436   .  0.4411536  0.0413871
##   [3,] -0.76630520 -1.22072129 -0.37133527   .  0.6314237 -0.1761234
##   [4,]  0.08583011 -0.50142857  0.01650638   .  0.2281899  0.5051250
##   [5,]  1.44290957  0.18962483  0.28741259   . -1.2979466  0.1159794
##    ...           .           .           .   .          .          .
##  [96,]   0.4699386  -1.0979907  -0.8443384   . -2.0603201  1.0528272
##  [97,]  -0.1449866  -1.4037982  -0.2464315   . -0.7822831  0.8906309
##  [98,]   0.4326231  -0.1597281   1.1746462   .  0.1733751  0.5343873
##  [99,]   2.7892061  -0.9950843   1.2131812   . -0.9457024 -0.5510885
## [100,]   0.1905879  -1.4087611   1.4067222   . -1.2565528 -1.8693152

5 Session information

sessionInfo()
## R version 4.4.0 beta (2024-04-15 r86425)
## Platform: x86_64-pc-linux-gnu
## Running under: Ubuntu 22.04.4 LTS
## 
## Matrix products: default
## BLAS:   /home/biocbuild/bbs-3.19-bioc/R/lib/libRblas.so 
## LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.10.0
## 
## locale:
##  [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
##  [3] LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8    
##  [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8   
##  [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                 
##  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       
## 
## time zone: America/New_York
## tzcode source: system (glibc)
## 
## attached base packages:
## [1] stats4    stats     graphics  grDevices utils     datasets  methods  
## [8] base     
## 
## other attached packages:
##  [1] RcppSpdlog_0.0.17     TileDBArray_1.14.0    DelayedArray_0.30.0  
##  [4] SparseArray_1.4.0     S4Arrays_1.4.0        abind_1.4-5          
##  [7] IRanges_2.38.0        S4Vectors_0.42.0      MatrixGenerics_1.16.0
## [10] matrixStats_1.3.0     BiocGenerics_0.50.0   Matrix_1.7-0         
## [13] BiocStyle_2.32.0     
## 
## loaded via a namespace (and not attached):
##  [1] bit_4.0.5           jsonlite_1.8.8      compiler_4.4.0     
##  [4] BiocManager_1.30.22 crayon_1.5.2        Rcpp_1.0.12        
##  [7] nanoarrow_0.4.0.1   jquerylib_0.1.4     yaml_2.3.8         
## [10] fastmap_1.1.1       lattice_0.22-6      R6_2.5.1           
## [13] RcppCCTZ_0.2.12     XVector_0.44.0      tiledb_0.26.0      
## [16] knitr_1.46          bookdown_0.39       bslib_0.7.0        
## [19] rlang_1.1.3         cachem_1.0.8        xfun_0.43          
## [22] sass_0.4.9          bit64_4.0.5         cli_3.6.2          
## [25] zlibbioc_1.50.0     spdl_0.0.5          digest_0.6.35      
## [28] grid_4.4.0          lifecycle_1.0.4     data.table_1.15.4  
## [31] evaluate_0.23       nanotime_0.3.7      zoo_1.8-12         
## [34] rmarkdown_2.26      tools_4.4.0         htmltools_0.5.8.1