Contents

1 Overview

The alabaster.vcf package implements methods to save VCF objects to file artifacts and load them back into R. This refers specifically to the SummarizedExperiment subclasses (i.e., CollapsedVCF and ExpandedVCF) that are used to represent the variant calling data in an R session, which may contain additional modifications that cannot be easily stored inside the original VCF file. Check out the alabaster.base for more details on the motivation and concepts of the alabaster framework.

2 Quick start

Given a VCF object, we can use stageObject() to save it inside a staging directory:

library(VariantAnnotation)
fl <- system.file("extdata", "structural.vcf", package="VariantAnnotation")
vcf <- readVcf(fl, genome="hg19")
vcf
## class: CollapsedVCF 
## dim: 7 1 
## rowRanges(vcf):
##   GRanges with 5 metadata columns: paramRangeID, REF, ALT, QUAL, FILTER
## info(vcf):
##   DataFrame with 10 columns: BKPTID, CIEND, CIPOS, END, HOMLEN, HOMSEQ, IMPR...
## info(header(vcf)):
##              Number Type    Description                                        
##    BKPTID    .      String  ID of the assembled alternate allele in the asse...
##    CIEND     2      Integer Confidence interval around END for imprecise var...
##    CIPOS     2      Integer Confidence interval around POS for imprecise var...
##    END       1      Integer End position of the variant described in this re...
##    HOMLEN    .      Integer Length of base pair identical micro-homology at ...
##    HOMSEQ    .      String  Sequence of base pair identical micro-homology a...
##    IMPRECISE 0      Flag    Imprecise structural variation                     
##    MEINFO    4      String  Mobile element info of the form NAME,START,END,P...
##    SVLEN     .      Integer Difference in length between REF and ALT alleles   
##    SVTYPE    1      String  Type of structural variant                         
## geno(vcf):
##   List of length 4: GT, GQ, CN, CNQ
## geno(header(vcf)):
##        Number Type    Description                                      
##    GT  1      String  Genotype                                         
##    GQ  1      Float   Genotype quality                                 
##    CN  1      Integer Copy number genotype for imprecise events        
##    CNQ 1      Float   Copy number genotype quality for imprecise events
library(alabaster.vcf)
tmp <- tempfile()
dir.create(tmp)
meta <- stageObject(vcf, tmp, "vcf")
.writeMetadata(meta, tmp)
## $type
## [1] "local"
## 
## $path
## [1] "vcf/experiment.json"
list.files(tmp, recursive=TRUE)
##  [1] "vcf/assay-1/array.h5"                             
##  [2] "vcf/assay-1/array.h5.json"                        
##  [3] "vcf/assay-2/array.h5"                             
##  [4] "vcf/assay-2/array.h5.json"                        
##  [5] "vcf/assay-3/array.h5"                             
##  [6] "vcf/assay-3/array.h5.json"                        
##  [7] "vcf/assay-4/array.h5"                             
##  [8] "vcf/assay-4/array.h5.json"                        
##  [9] "vcf/coldata/simple.csv.gz"                        
## [10] "vcf/coldata/simple.csv.gz.json"                   
## [11] "vcf/experiment.json"                              
## [12] "vcf/fixed/column1/sequence.fa.gz"                 
## [13] "vcf/fixed/column1/sequence.fa.gz.json"            
## [14] "vcf/fixed/column1/set.json"                       
## [15] "vcf/fixed/column2/concatenated/simple.csv.gz"     
## [16] "vcf/fixed/column2/concatenated/simple.csv.gz.json"
## [17] "vcf/fixed/column2/grouping.csv.gz"                
## [18] "vcf/fixed/column2/grouping.csv.gz.json"           
## [19] "vcf/fixed/simple.csv.gz"                          
## [20] "vcf/fixed/simple.csv.gz.json"                     
## [21] "vcf/header/header.bcf"                            
## [22] "vcf/header/header.bcf.json"                       
## [23] "vcf/info/column1/concatenated/simple.csv.gz"      
## [24] "vcf/info/column1/concatenated/simple.csv.gz.json" 
## [25] "vcf/info/column1/grouping.csv.gz"                 
## [26] "vcf/info/column1/grouping.csv.gz.json"            
## [27] "vcf/info/column2/concatenated/simple.csv.gz"      
## [28] "vcf/info/column2/concatenated/simple.csv.gz.json" 
## [29] "vcf/info/column2/grouping.csv.gz"                 
## [30] "vcf/info/column2/grouping.csv.gz.json"            
## [31] "vcf/info/column3/concatenated/simple.csv.gz"      
## [32] "vcf/info/column3/concatenated/simple.csv.gz.json" 
## [33] "vcf/info/column3/grouping.csv.gz"                 
## [34] "vcf/info/column3/grouping.csv.gz.json"            
## [35] "vcf/info/column5/concatenated/simple.csv.gz"      
## [36] "vcf/info/column5/concatenated/simple.csv.gz.json" 
## [37] "vcf/info/column5/grouping.csv.gz"                 
## [38] "vcf/info/column5/grouping.csv.gz.json"            
## [39] "vcf/info/column6/concatenated/simple.csv.gz"      
## [40] "vcf/info/column6/concatenated/simple.csv.gz.json" 
## [41] "vcf/info/column6/grouping.csv.gz"                 
## [42] "vcf/info/column6/grouping.csv.gz.json"            
## [43] "vcf/info/column8/concatenated/simple.csv.gz"      
## [44] "vcf/info/column8/concatenated/simple.csv.gz.json" 
## [45] "vcf/info/column8/grouping.csv.gz"                 
## [46] "vcf/info/column8/grouping.csv.gz.json"            
## [47] "vcf/info/column9/concatenated/simple.csv.gz"      
## [48] "vcf/info/column9/concatenated/simple.csv.gz.json" 
## [49] "vcf/info/column9/grouping.csv.gz"                 
## [50] "vcf/info/column9/grouping.csv.gz.json"            
## [51] "vcf/info/simple.csv.gz"                           
## [52] "vcf/info/simple.csv.gz.json"                      
## [53] "vcf/rowdata/column1/simple.txt.gz"                
## [54] "vcf/rowdata/column1/simple.txt.gz.json"           
## [55] "vcf/rowdata/simple.csv.gz"                        
## [56] "vcf/rowdata/simple.csv.gz.json"                   
## [57] "vcf/rowranges/ranges.csv.gz"                      
## [58] "vcf/rowranges/ranges.csv.gz.json"                 
## [59] "vcf/rowranges/seqinfo/simple.csv.gz"              
## [60] "vcf/rowranges/seqinfo/simple.csv.gz.json"

We can then load it back into the session with loadObject().

meta <- acquireMetadata(tmp, "vcf/experiment.json")
roundtrip <- loadObject(meta, tmp)
class(roundtrip)
## [1] "CollapsedVCF"
## attr(,"package")
## [1] "VariantAnnotation"

More details on the metadata and on-disk layout are provided in the schema.

3 Further comments

We do not use VCF itself as our file format as the VCF may be decorated with more information (e.g., in the rowData or colData) that may not be easily stored in a VCF file. The VCF file is not amenable to random access of data, either for individual variants or for different aspects of the dataset, e.g., just the row annotations. Finally, it allow interpretation of the data as the SummarizedExperiment base class.

The last point is worth some elaboration. Downstream consumers do not necessarily need to know anything about the VCF data structure to read the files, as long as they understand how to interpret the base summarized_experiment schema:

library(alabaster.se)
loadSummarizedExperiment(meta, tmp)
## class: RangedSummarizedExperiment 
## dim: 7 1 
## metadata(0):
## assays(4): GT GQ CN CNQ
## rownames(7): 1:13220_T/<DEL>
##   1:2827693_CCGTGGATGCGGGGACCCGCATCCCCTCTCCCTTCACAGCTGAGTGACCCACATCCCCTCTCCCCTCGCA/C
##   ... 3:12665100_A/<DUP> 4:18665128_T/<DUP:TANDEM>
## rowData names(1): paramRangeID
## colnames(1): NA00001
## colData names(1): Samples

Session information

sessionInfo()
## R version 4.3.1 (2023-06-16)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 22.04.3 LTS
## 
## Matrix products: default
## BLAS:   /home/biocbuild/bbs-3.18-bioc/R/lib/libRblas.so 
## LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.10.0
## 
## locale:
##  [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
##  [3] LC_TIME=en_GB              LC_COLLATE=C              
##  [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8   
##  [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                 
##  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       
## 
## time zone: America/New_York
## tzcode source: system (glibc)
## 
## attached base packages:
## [1] stats4    stats     graphics  grDevices utils     datasets  methods  
## [8] base     
## 
## other attached packages:
##  [1] alabaster.se_1.2.0          alabaster.vcf_1.2.0        
##  [3] alabaster.base_1.2.0        VariantAnnotation_1.48.0   
##  [5] Rsamtools_2.18.0            Biostrings_2.70.0          
##  [7] XVector_0.42.0              SummarizedExperiment_1.32.0
##  [9] Biobase_2.62.0              GenomicRanges_1.54.0       
## [11] GenomeInfoDb_1.38.0         IRanges_2.36.0             
## [13] S4Vectors_0.40.0            MatrixGenerics_1.14.0      
## [15] matrixStats_1.0.0           BiocGenerics_0.48.0        
## [17] BiocStyle_2.30.0           
## 
## loaded via a namespace (and not attached):
##  [1] tidyselect_1.2.0         jsonvalidate_1.3.2       dplyr_1.1.3             
##  [4] blob_1.2.4               filelock_1.0.2           bitops_1.0-7            
##  [7] fastmap_1.1.1            RCurl_1.98-1.12          BiocFileCache_2.10.0    
## [10] GenomicAlignments_1.38.0 XML_3.99-0.14            digest_0.6.33           
## [13] lifecycle_1.0.3          alabaster.matrix_1.2.0   KEGGREST_1.42.0         
## [16] RSQLite_2.3.1            magrittr_2.0.3           compiler_4.3.1          
## [19] rlang_1.1.1              sass_0.4.7               progress_1.2.2          
## [22] tools_4.3.1              utf8_1.2.4               yaml_2.3.7              
## [25] rtracklayer_1.62.0       knitr_1.44               prettyunits_1.2.0       
## [28] S4Arrays_1.2.0           bit_4.0.5                curl_5.1.0              
## [31] DelayedArray_0.28.0      xml2_1.3.5               abind_1.4-5             
## [34] BiocParallel_1.36.0      HDF5Array_1.30.0         grid_4.3.1              
## [37] fansi_1.0.5              Rhdf5lib_1.24.0          biomaRt_2.58.0          
## [40] cli_3.6.1                rmarkdown_2.25           crayon_1.5.2            
## [43] generics_0.1.3           httr_1.4.7               rjson_0.2.21            
## [46] rhdf5_2.46.0             DBI_1.1.3                cachem_1.0.8            
## [49] stringr_1.5.0            zlibbioc_1.48.0          parallel_4.3.1          
## [52] AnnotationDbi_1.64.0     BiocManager_1.30.22      restfulr_0.0.15         
## [55] alabaster.schemas_1.2.0  vctrs_0.6.4              V8_4.4.0                
## [58] Matrix_1.6-1.1           jsonlite_1.8.7           bookdown_0.36           
## [61] hms_1.1.3                bit64_4.0.5              alabaster.ranges_1.2.0  
## [64] GenomicFeatures_1.54.0   jquerylib_0.1.4          glue_1.6.2              
## [67] codetools_0.2-19         stringi_1.7.12           BiocIO_1.12.0           
## [70] tibble_3.2.1             pillar_1.9.0             rhdf5filters_1.14.0     
## [73] rappdirs_0.3.3           htmltools_0.5.6.1        GenomeInfoDbData_1.2.11 
## [76] BSgenome_1.70.0          R6_2.5.1                 dbplyr_2.3.4            
## [79] evaluate_0.22            lattice_0.22-5           png_0.1-8               
## [82] memoise_2.0.1            bslib_0.5.1              Rcpp_1.0.11             
## [85] SparseArray_1.2.0        xfun_0.40                alabaster.string_1.2.0  
## [88] pkgconfig_2.0.3