single-cell-data · johnkerl · Sep 23, 2022 · Aug 31, 2022 · Sep 1, 2022 · Sep 1, 2022
diff --git a/apis/python/src/tiledbsoma/annotation_dataframe.py b/apis/python/src/tiledbsoma/annotation_dataframe.py
@@ -1,7 +1,6 @@
 from concurrent.futures import ThreadPoolExecutor
 from typing import Optional, Sequence, Set, Tuple, Union
 
-import numpy as np
 import pandas as pd
 import pyarrow as pa
 import tiledb
@@ -76,10 +75,19 @@ def ids(self) -> Sequence[str]:
         """
         with self._open("r") as A:
             self.dim_name = A.domain.dim(0).name
-            # TileDB string dims are ASCII not UTF-8. Decode them so they readback
-            # not like `b"AKR1C3"` but rather like `"AKR1C3"`.
+
+            # TileDB string dims are ASCII not UTF-8. Decode them so they read back not like
+            # `b"AKR1C3"` but rather like `"AKR1C3"`. Update: as of
+            # https://github.com/TileDB-Inc/TileDB-Py/pull/1304 these dims will read back OK.
             retval = A.query(attrs=[], dims=[self.dim_name])[:][self.dim_name].tolist()
-            return [e.decode() for e in retval]
+
+            # Only bytes need decoding: `str` has no `.decode()`, so an unconditional decode
+            # here would raise AttributeError once string dims read back as `str`.
+            if len(retval) > 0 and isinstance(retval[0], bytes):
+                return [e.decode() for e in retval]
+            else:
+                # list(...) is there to appease the linter which thinks we're returning `Any`
+                return list(retval)
 
     # ----------------------------------------------------------------
     def __repr__(self) -> str:
@@ -185,6 +193,23 @@ def query(
         if query_string is None:
             return self.dim_select(ids, attrs=attrs, return_arrow=return_arrow)
 
+        return self._query_aux(
+            query_string=query_string, ids=ids, attrs=attrs, return_arrow=return_arrow
+        )
+
+    def _query_aux(
+        self,
+        query_string: Optional[str],
+        ids: Optional[Ids] = None,
+        attrs: Optional[Sequence[str]] = None,
+        *,
+        return_arrow: bool = False,
+    ) -> Union[pd.DataFrame, pa.Table]:
+        """
+        Helper method for `query`: as this has multiple `return` statements, it's easiest to track
+        elapsed-time stats in a call to this helper.
+        """
+
         with self._open() as A:
             self.dim_name = A.domain.dim(0).name
             qc = tiledb.QueryCondition(query_string)
@@ -381,7 +406,7 @@ def from_dataframe(self, dataframe: pd.DataFrame, extent: int = 2048) -> None:
             dfc = dataframe[column_name]
             if len(dfc) > 0 and type(dfc[0]) == str:
                 # Force ASCII storage if string, in order to make obs/var columns queryable.
-                column_types[column_name] = np.dtype("S")
+                column_types[column_name] = "ascii"
 
         tiledb.from_pandas(
             uri=self.uri,

diff --git a/apis/python/src/tiledbsoma/annotation_matrix.py b/apis/python/src/tiledbsoma/annotation_matrix.py
@@ -112,7 +112,6 @@ def from_matrix_and_dim_values(
         :param matrix: ``anndata.obsm['foo']``, ``anndata.varm['foo']``, or ``anndata.raw.varm['foo']``.
         :param dim_values: ``anndata.obs_names``, ``anndata.var_names``, or ``anndata.raw.var_names``.
         """
-
         s = util.get_start_stamp()
         log_io(None, f"{self._indent}START  WRITING {self.uri}")
 

diff --git a/apis/python/src/tiledbsoma/assay_matrix.py b/apis/python/src/tiledbsoma/assay_matrix.py
@@ -101,8 +101,10 @@ def dim_select(
                     df = query.df[obs_ids, :]
                 else:
                     df = query.df[obs_ids, var_ids]
+
         if not return_arrow:
             df.set_index([self.row_dim_name, self.col_dim_name], inplace=True)
+
         return df
 
     # ----------------------------------------------------------------
@@ -168,7 +170,6 @@ def from_matrix_and_dim_values(
         ``scipy.sparse.csr_matrix``, ``scipy.sparse.csc_matrix``, ``numpy.ndarray``, etc.
         For ingest from ``AnnData``, these should be ``ann.obs_names`` and ``ann.var_names``.
         """
-
         s = util.get_start_stamp()
         log_io(
             f"Writing {self.nested_name} ...",

diff --git a/apis/python/src/tiledbsoma/soma.py b/apis/python/src/tiledbsoma/soma.py
@@ -322,6 +322,34 @@ def query(
         from the source SOMAs; if they are specified, the slice will take the specified ``obs``/``var``
         """
 
+        retval = self._query_aux(
+            obs_attrs=obs_attrs,
+            obs_query_string=obs_query_string,
+            obs_ids=obs_ids,
+            var_attrs=var_attrs,
+            var_query_string=var_query_string,
+            var_ids=var_ids,
+            return_arrow=return_arrow,
+        )
+        return retval
+
+    # ----------------------------------------------------------------
+    def _query_aux(
+        self,
+        *,
+        obs_attrs: Optional[Sequence[str]] = None,
+        obs_query_string: Optional[str] = None,
+        obs_ids: Optional[Ids] = None,
+        var_attrs: Optional[Sequence[str]] = None,
+        var_query_string: Optional[str] = None,
+        var_ids: Optional[Ids] = None,
+        return_arrow: bool = False,
+    ) -> Optional[SOMASlice]:
+        """
+        Helper method for `query`: as this has multiple `return` statements, it's easiest to track
+        elapsed-time stats in a call to this helper.
+        """
+
         slice_obs_df = self.obs.query(
             query_string=obs_query_string,
             ids=obs_ids,

diff --git a/apis/python/tests/test_ascii_and_unicode.py b/apis/python/tests/test_ascii_and_unicode.py
@@ -1,11 +1,14 @@
 import anndata as ad
 import numpy as np
 import pandas as pd
+import pytest
 
 import tiledbsoma.io as io
 from tiledbsoma import SOMA
 
 
+# TODO: restore once https://github.com/single-cell-data/TileDB-SingleCell/issues/274 is in place.
+@pytest.mark.skip(reason="Unicode attributes temporarily unsupported")
 def test_readback(tmp_path):
     """
     Validate correct encode/decode of non-ASCII attribute text.

diff --git a/apis/python/tests/test_dim_select.py b/apis/python/tests/test_dim_select.py
@@ -140,7 +140,7 @@ def test_dim_select(adata):
         "VDAC3",
     ]
 
-    df = soma.obs.dim_select([b"AAGCGACTTTGACG", b"AATGCGTGGACGGA"])
+    df = soma.obs.dim_select(["AAGCGACTTTGACG", "AATGCGTGGACGGA"])
     assert df.shape == (2, 7)
     assert df.at["AAGCGACTTTGACG", "groups"] == "g1"
     assert df.at["AATGCGTGGACGGA", "nFeature_RNA"] == 73
@@ -150,23 +150,23 @@ def test_dim_select(adata):
     # AATGCGTGGACGGA           0       389.0            73                1              1     g1              1
     assert soma.obs.dim_select(None).shape == (80, 7)
 
-    df = soma.var.dim_select([b"AKR1C3", b"MYL9"])
+    df = soma.var.dim_select(["AKR1C3", "MYL9"])
     assert df.shape == (2, 5)
     assert df.at["AKR1C3", "vst.variable"] == 1
     assert df.at["MYL9", "vst.variable"] == 1
     assert soma.var.dim_select(None).shape == (20, 5)
 
     assert sorted(soma.obsm.keys()) == sorted(["X_tsne", "X_pca"])
 
-    df = soma.obsm["X_tsne"].dim_select([b"AAGCGACTTTGACG", b"AATGCGTGGACGGA"])
+    df = soma.obsm["X_tsne"].dim_select(["AAGCGACTTTGACG", "AATGCGTGGACGGA"])
     assert df.shape == (2, 2)
 
-    df = soma.obsm["X_pca"].dim_select([b"AAGCGACTTTGACG", b"AATGCGTGGACGGA"])
+    df = soma.obsm["X_pca"].dim_select(["AAGCGACTTTGACG", "AATGCGTGGACGGA"])
     assert df.shape == (2, 19)
 
-    assert soma.X["data"].dim_select([b"AAGCGACTTTGACG"], [b"AKR1C3"]).shape == (1, 1)
-    assert soma.X["data"].dim_select(None, [b"AKR1C3"]).shape == (80, 1)
-    assert soma.X["data"].dim_select([b"AAGCGACTTTGACG"], None).shape == (20, 1)
+    assert soma.X["data"].dim_select(["AAGCGACTTTGACG"], ["AKR1C3"]).shape == (1, 1)
+    assert soma.X["data"].dim_select(None, ["AKR1C3"]).shape == (80, 1)
+    assert soma.X["data"].dim_select(["AAGCGACTTTGACG"], None).shape == (20, 1)
     assert soma.X["data"].dim_select(None, None).shape == (1600, 1)
 
     tempdir.cleanup()
@@ -211,7 +211,8 @@ def test_zeroes_handling():
     n_obs = len(obs_ids)
     n_var = len(var_ids)
 
-    cell_types = ["blööd" if obs_id[1] == "A" else "lung" for obs_id in obs_ids]
+    # TODO: restore once https://github.com/single-cell-data/TileDB-SingleCell/issues/274 is in place.
+    cell_types = ["blood" if obs_id[1] == "A" else "lung" for obs_id in obs_ids]
     feature_names = [
         "ENSG00000999999" if var_id[1] < "M" else "ENSG00000123456"
         for var_id in var_ids

diff --git a/apis/python/tests/test_type_diversity.py b/apis/python/tests/test_type_diversity.py
@@ -108,8 +108,10 @@ def test_from_anndata_DataFrame_type(tmp_path):
     df_col_type_sweep = [
         ("bool", lambda a: a.astype(bool)),
         ("str", lambda a: a.astype(str)),
-        ("bytes", lambda a: a.astype(str).astype(bytes)),
-        # ("float16", lambda a: a.astype(np.dtype("float16"))),         TODO: Enable when #39 is fixed
+        # TODO: restore once #274 is in place.
+        # ("bytes", lambda a: a.astype(str).astype(bytes)),
+        # TODO: Enable when #39 is fixed
+        # ("float16", lambda a: a.astype(np.dtype("float16"))),
         ("float32", lambda a: a.astype("float32")),
         ("float64", lambda a: a.astype("float64")),
         ("int8", lambda a: a.astype("int8")),
@@ -147,9 +149,7 @@ def test_from_anndata_DataFrame_type(tmp_path):
             ),
         ),
     ]
-    index = (
-        np.arange(1, n + 1).astype(str).astype(bytes)
-    )  # AnnData requires string indices, TileDB wants bytes. Use LCD
+    index = np.arange(1, n + 1).astype(str)
     df = pd.DataFrame(
         data={
             f"col_{name}": cast(pd.Series(index=index, data=np.arange(n)))
@@ -158,7 +158,8 @@ def test_from_anndata_DataFrame_type(tmp_path):
     )
     X = np.ones((n, n), dtype=np.float32)
     adata = ad.AnnData(X=X, obs=df, var=df, dtype=X.dtype)
-    io.from_anndata(SOMA(tmp_path.as_posix()), adata)
+    soma = SOMA(tmp_path.as_posix())
+    io.from_anndata(soma, adata)
     assert all(
         (tmp_path / sub_array_path).exists()
         for sub_array_path in ["obs", "var", "X/data"]
@@ -176,33 +177,29 @@ def cmp_dtype(series, tdb: tiledb.Attr) -> bool:
             # TODO: see annotation_dataframe.py. Once Unicode attributes are queryable, we'll need
             # to remove this check which is verifying the current force-to-ASCII workaround.
             if ad_dtype.name == "str":
-                ad_dtype = np.dtype("S")
+                ad_dtype = np.dtype("bytes")
 
         return ad_dtype == tdb.dtype
 
     for df_name in ["var", "obs"]:
-        with tiledb.open((tmp_path / df_name).as_posix()) as arr:
-            df = getattr(adata, df_name)
+        annotation_dataframe = getattr(soma, df_name)
+        with annotation_dataframe._open() as A:
 
             # verify names match
-            assert set(arr.schema.attr(i).name for i in range(arr.schema.nattr)) == set(
+            assert set(A.schema.attr(i).name for i in range(A.schema.nattr)) == set(
                 getattr(adata, df_name).keys()
             )
 
             # verify length
-            assert n == len(arr.query(dims=[]).df[:])
+            assert n == len(A.query(dims=[]).df[:])
 
             # verify index
-            assert np.array_equal(
-                np.sort(df.index.to_numpy()), np.sort(arr[:][df_name + "_id"])
-            )
+            assert np.array_equal(np.sort(df.index.to_numpy()), np.sort(A.df[:].index))
 
             # verify individual column types
-            attr_idx = {
-                arr.schema.attr(idx).name: idx for idx in range(arr.schema.nattr)
-            }
+            attr_idx = {A.schema.attr(idx).name: idx for idx in range(A.schema.nattr)}
             for k in df.keys():
-                assert cmp_dtype(df[k], arr.schema.attr(attr_idx[k]))
+                assert cmp_dtype(df[k], A.schema.attr(attr_idx[k]))
 
 
 def test_from_anndata_annotations_empty(tmp_path):
@@ -212,30 +209,26 @@ def test_from_anndata_annotations_empty(tmp_path):
     n_obs = 100
     n_var = 10
 
-    # AnnData requires a string index. TileDB does not support UTF8, so use ASCII.
-    obs = pd.DataFrame(index=np.arange(n_obs).astype(bytes))
-    var = pd.DataFrame(index=np.arange(n_var).astype(bytes))
+    obs = pd.DataFrame(index=np.arange(n_obs).astype(str))
+    var = pd.DataFrame(index=np.arange(n_var).astype(str))
 
     X = np.ones((n_obs, n_var))
     adata = ad.AnnData(X=X, obs=obs, var=var, dtype=X.dtype)
 
-    io.from_anndata(SOMA(tmp_path.as_posix()), adata)
+    soma = SOMA(tmp_path.as_posix())
+    io.from_anndata(soma, adata)
 
-    assert all(
-        (tmp_path / sub_array_path).exists()
-        for sub_array_path in ["obs", "var", "X/data"]
-    )
+    assert soma.obs.exists()
+    assert soma.var.exists()
+    assert soma.X.data.exists()
 
     # obs/var are sparse. Sort before comparing contents.
-    with tiledb.open((tmp_path / "obs").as_posix()) as obs:
-        assert np.array_equal(
-            np.sort(adata.obs.index.to_numpy()), np.sort(obs[:]["obs_id"])
-        )
-
-    with tiledb.open((tmp_path / "var").as_posix()) as var:
-        assert np.array_equal(
-            np.sort(adata.var.index.to_numpy()), np.sort(var[:]["var_id"])
-        )
+    assert np.array_equal(
+        np.sort(adata.obs.index.to_numpy()), np.sort(soma.obs.df().index)
+    )
+    assert np.array_equal(
+        np.sort(adata.var.index.to_numpy()), np.sort(soma.var.df().index)
+    )
 
 
 def test_from_anndata_annotations_none(tmp_path):