single-cell-data · aaronwolen · Nov 17, 2022 · Nov 10, 2022 · Nov 11, 2022 · Nov 11, 2022
diff --git a/apis/r/DESCRIPTION b/apis/r/DESCRIPTION
@@ -7,7 +7,7 @@ Description: Interface for working with 'TileDB'-based Stack of Matrices,
     from and export to in-memory formats used by popular toolchains like
     'Seurat', 'Bioconductor', and even 'AnnData' using the companion Python
     package.
-Version: 0.1.16.9000
+Version: 0.1.16.9001
 Authors@R: c(
     person(given = "Aaron",
            family = "Wolen",

diff --git a/apis/r/NEWS.md b/apis/r/NEWS.md
@@ -1,5 +1,9 @@
 # tiledbsoma (development version)
 
+## Features
+
+* `AnnotationMatrix`'s `to_matrix()` method now supports batched reads via the `batch_mode` argument. This functionality is also available from `SOMA`'s `get_seurat_dimreductions_list()` and `get_seurat_dimreduction()` methods.
+
 ## Fixes
 * Don't use default assay name when recreating a `Seurat` object (thanks @dan11mcguire)
 

diff --git a/apis/r/R/AnnotationMatrix.R b/apis/r/R/AnnotationMatrix.R
@@ -3,6 +3,9 @@
 #' Base class for matrix-like data with rows aligned to the observations or
 #' features of a [`SOMA`].
 #'
+#' @param batch_mode logical, if `TRUE`, batch query mode is enabled, which
+#' provides the ability to detect partial query results and resubmit until
+#' all results are retrieved.
 #' @export
 
 AnnotationMatrix <- R6::R6Class(
@@ -42,15 +45,17 @@ AnnotationMatrix <- R6::R6Class(
     },
 
     #' @description Retrieve the annotation data from TileDB
+    #' @param attrs A character vector of the attribute names to retrieve. By
+    #' default, all attributes are retrieved.
     #' @return A [`matrix`]
-    to_matrix = function() {
-      if (self$verbose) {
-        message(
-          sprintf("Reading %s into memory from '%s'", self$class(), self$uri)
-        )
-      }
+    to_matrix = function(attrs = NULL, batch_mode = FALSE) {
+
+      df <- private$read_data(
+        attrs = attrs,
+        batch_mode = batch_mode,
+        return_as = "data.frame"
+      )
 
-      df <- self$tiledb_array(return_as = "data.frame")[]
       index_col <- self$dimnames()
       attr_cols <- setdiff(colnames(df), index_col)
 

diff --git a/apis/r/R/AssayMatrix.R b/apis/r/R/AssayMatrix.R
@@ -107,30 +107,11 @@ AssayMatrix <- R6::R6Class(
     #' all attributes are retrieved.
     #' @return A [`Matrix::dgTMatrix-class`].
     to_dataframe = function(attrs = NULL, batch_mode = FALSE) {
-      if (self$verbose) {
-        message(
-          sprintf("Reading %s into memory from '%s'", self$class(), self$uri)
-        )
-      }
-      arr <- self$object
-      tiledb::attrs(arr) <- attrs %||% character()
-      tiledb::return_as(arr) <- "data.frame"
-
-      if (batch_mode) {
-        if (self$verbose) message("...reading in batches")
-        batcher <- tiledb:::createBatched(arr)
-        results <- list()
-        i <- 1
-        while(isFALSE(tiledb::completedBatched(batcher))) {
-          if (self$verbose) message(sprintf("...retrieving batch %d", i))
-          results[[i]] <- tiledb::fetchBatched(arr, batcher)
-          i <- i + 1
-        }
-        results <- vctrs::vec_rbind(!!!results)
-      } else {
-        results <- arr[]
-      }
-      results
+      private$read_data(
+        attrs = attrs,
+        batch_mode = batch_mode,
+        return_as = "data.frame"
+      )
     },
 
     #' @description Retrieve assay data from TileDB as a 2D sparse matrix.
@@ -146,7 +127,11 @@ AssayMatrix <- R6::R6Class(
       }
       stopifnot(is_scalar_character(attr))
 
-      assay_data <- self$to_dataframe(attrs = attr, batch_mode = batch_mode)
+      assay_data <- private$read_data(
+        attrs = attr,
+        batch_mode = batch_mode,
+        return_as = "data.frame"
+      )
 
       # reverse index columns if transposing array dimensions
       if (transpose) assay_data <- assay_data[c(rev(self$dimnames()), attr)]

diff --git a/apis/r/R/SOMA.R b/apis/r/R/SOMA.R
@@ -576,7 +576,7 @@ SOMA <- R6::R6Class(
     #' @param technique Name of the dimensionality reduction technique. Used to
     #' identify which `obsm`/`varm` array will be retrieved. If `NULL`, we
     #' default to the first `obsm/dimreduction_` array.
-    get_seurat_dimreduction = function(technique = NULL) {
+    get_seurat_dimreduction = function(technique = NULL, batch_mode = FALSE) {
 
       # Identify all obsm/varm dimreduction_ arrays
       prefix <- "dimreduction_"
@@ -613,7 +613,9 @@ SOMA <- R6::R6Class(
       }
 
       # TODO: validate we're only returning 1 array per dimension
-      mats <- lapply(arrays, function(x) x[[1]]$to_matrix())
+      mats <- lapply(arrays,
+        function(x) x[[1]]$to_matrix(batch_mode = batch_mode)
+      )
 
       # TODO: validate all keys match? For now just take the first one
       key <- unlist(arrays)[[1]]$get_metadata(key = "dimreduction_key")
@@ -627,13 +629,13 @@ SOMA <- R6::R6Class(
     },
 
     #' @description Retrieve a list of all [`SeuratObject::DimReduc`] objects.
-    get_seurat_dimreductions_list = function() {
-      arrays <-self$get_annotation_matrix_arrays(prefix = "dimreduction_")
+    get_seurat_dimreductions_list = function(batch_mode = FALSE) {
+      arrays <- self$get_annotation_matrix_arrays(prefix = "dimreduction_")
       array_names <- names(unlist(arrays))
       techniques <- unique(sub("(obs|var)m\\.dimreduction_", "", array_names))
       sapply(
         techniques,
-        function(x) self$get_seurat_dimreduction(x),
+        function(x) self$get_seurat_dimreduction(x, batch_mode = batch_mode),
         simplify = FALSE,
         USE.NAMES = TRUE
       )

diff --git a/apis/r/R/SOMACollection.R b/apis/r/R/SOMACollection.R
@@ -199,7 +199,7 @@ SOMACollection <- R6::R6Class(
     #' @description Convert to a [SeuratObject::Seurat] object.
     #' @param project [`SeuratObject::Project`] name for the `Seurat` object
     #' @param batch_mode logical, if `TRUE`, batch query mode is enabled for
-    #' retrieving `X` layers. See
+    #' retrieving `X`, `obsm`/`varm`, and `obsp`/`varp` layers. See
     #' [`AssayMatrix$to_dataframe()`][`AssayMatrix`] for more information.
     to_seurat = function(project = "SeuratProject", batch_mode = FALSE) {
       stopifnot(is_scalar_character(project))
@@ -243,9 +243,8 @@ SOMACollection <- R6::R6Class(
       # Retrieve list of all techniques used in any soma's obsm/varm
       # dimensionality reduction arrays. The association between assay and
       # dimreduction is maintained by the DimReduc's `assay.used` slot.
-      dimreductions <- lapply(
-        self$somas,
-        function(x) x$get_seurat_dimreductions_list()
+      dimreductions <- lapply(self$somas,
+        function(x) x$get_seurat_dimreductions_list(batch_mode)
       )
       object@reductions <- Reduce(base::c, dimreductions)
 

diff --git a/apis/r/R/TileDBArray.R b/apis/r/R/TileDBArray.R
@@ -264,6 +264,43 @@ TileDBArray <- R6::R6Class(
     },
 
     # @description Ingest data into the TileDB array.
-    ingest_data = function() return(NULL)
+    ingest_data = function() return(NULL),
+
+    # @description Retrieve data from the TileDB array
+    # @param batch_mode logical, if `TRUE`, batch query mode is enabled, which
+    # provides the ability to detect partial query results and resubmit until
+    # all results are retrieved.
+    # @param return_as Format of the returned data: a `list` (the default,
+    # `"asis"`), `array`, `matrix`, `data.frame`, `data.table` or `tibble`.
+    read_data = function(attrs = NULL, batch_mode = FALSE, return_as = NULL) {
+      if (self$verbose) {
+        message(
+          sprintf("Reading %s into memory from '%s'", self$class(), self$uri)
+        )
+      }
+      arr <- self$object
+      tiledb::attrs(arr) <- attrs %||% character()
+      tiledb::return_as(arr) <- return_as %||% "asis"
+
+      if (batch_mode) {
+        if (self$verbose) message("...reading in batches")
+        batcher <- tiledb::createBatched(arr)
+        results <- list()
+        i <- 1
+        while (isFALSE(tiledb::completedBatched(batcher))) {
+          if (self$verbose) message(sprintf("...retrieving batch %d", i))
+          results[[i]] <- tiledb::fetchBatched(arr, batcher)
+          i <- i + 1
+        }
+
+        # TODO: tiledb-r's batched reader currently ignores `return_as` and
+        # always returns a data.frame. When this is addressed we'll need to
+        # add class-specific concatenation logic here.
+        results <- vctrs::vec_rbind(!!!results)
+      } else {
+        results <- arr[]
+      }
+      results
+    }
   )
 )
diff --git a/apis/r/man/AnnotationMatrix.Rd b/apis/r/man/AnnotationMatrix.Rd
diff --git a/apis/r/man/SOMA.Rd b/apis/r/man/SOMA.Rd
diff --git a/apis/r/man/SOMACollection.Rd b/apis/r/man/SOMACollection.Rd
diff --git a/apis/r/tests/testthat/test_AnnotationMatrix.R b/apis/r/tests/testthat/test_AnnotationMatrix.R
@@ -23,4 +23,8 @@ test_that("annotation matrix can be stored and retrieved", {
   expect_equal(sort(colnames(mat2)), sort(colnames(mat)))
 
   expect_identical(mat[rlabs, clabs], mat2[rlabs, clabs])
+
+  # test that result is identical with batch mode
+  mat3 <- annotmat$to_matrix(batch_mode = TRUE)
+  expect_identical(mat2, mat3)
 })