diff --git a/apis/python/src/tiledbsoma/annotation_dataframe.py b/apis/python/src/tiledbsoma/annotation_dataframe.py index 455eae060b..57d7ab145d 100644 --- a/apis/python/src/tiledbsoma/annotation_dataframe.py +++ b/apis/python/src/tiledbsoma/annotation_dataframe.py @@ -1,7 +1,6 @@ from concurrent.futures import ThreadPoolExecutor from typing import Optional, Sequence, Set, Tuple, Union -import numpy as np import pandas as pd import pyarrow as pa import tiledb @@ -76,10 +75,19 @@ def ids(self) -> Sequence[str]: """ with self._open("r") as A: self.dim_name = A.domain.dim(0).name - # TileDB string dims are ASCII not UTF-8. Decode them so they readback - # not like `b"AKR1C3"` but rather like `"AKR1C3"`. + + # TileDB string dims are ASCII not UTF-8. Decode them so they readback not like + # `b"AKR1C3"` but rather like `"AKR1C3"`. Update as of + # https://github.com/TileDB-Inc/TileDB-Py/pull/1304 these dims will read back OK. retval = A.query(attrs=[], dims=[self.dim_name])[:][self.dim_name].tolist() - return [e.decode() for e in retval] + + # Decode only if bytes came back; str elements (newer TileDB-Py) have no .decode(). + + if len(retval) > 0 and isinstance(retval[0], bytes): + return [e.decode() for e in retval] + else: + # list(...) is there to appease the linter which thinks we're returning `Any` + return list(retval) # ---------------------------------------------------------------- def __repr__(self) -> str: @@ -185,6 +193,23 @@ def query( if query_string is None: return self.dim_select(ids, attrs=attrs, return_arrow=return_arrow) + return self._query_aux( + query_string=query_string, ids=ids, attrs=attrs, return_arrow=return_arrow + ) + + def _query_aux( + self, + query_string: Optional[str], + ids: Optional[Ids] = None, + attrs: Optional[Sequence[str]] = None, + *, + return_arrow: bool = False, + ) -> Union[pd.DataFrame, pa.Table]: + """ + Helper method for `query`: as this has multiple `return` statements, it's easiest to track + elapsed-time stats in a call to this helper. 
+ """ + with self._open() as A: self.dim_name = A.domain.dim(0).name qc = tiledb.QueryCondition(query_string) @@ -381,7 +406,7 @@ def from_dataframe(self, dataframe: pd.DataFrame, extent: int = 2048) -> None: dfc = dataframe[column_name] if len(dfc) > 0 and type(dfc[0]) == str: # Force ASCII storage if string, in order to make obs/var columns queryable. - column_types[column_name] = np.dtype("S") + column_types[column_name] = "ascii" tiledb.from_pandas( uri=self.uri, diff --git a/apis/python/src/tiledbsoma/annotation_matrix.py b/apis/python/src/tiledbsoma/annotation_matrix.py index e7e250a2ed..567dc70ba5 100644 --- a/apis/python/src/tiledbsoma/annotation_matrix.py +++ b/apis/python/src/tiledbsoma/annotation_matrix.py @@ -112,7 +112,6 @@ def from_matrix_and_dim_values( :param matrix: ``anndata.obsm['foo']``, ``anndata.varm['foo']``, or ``anndata.raw.varm['foo']``. :param dim_values: ``anndata.obs_names``, ``anndata.var_names``, or ``anndata.raw.var_names``. """ - s = util.get_start_stamp() log_io(None, f"{self._indent}START WRITING {self.uri}") diff --git a/apis/python/src/tiledbsoma/assay_matrix.py b/apis/python/src/tiledbsoma/assay_matrix.py index 24c0900866..a8f906baba 100644 --- a/apis/python/src/tiledbsoma/assay_matrix.py +++ b/apis/python/src/tiledbsoma/assay_matrix.py @@ -101,8 +101,10 @@ def dim_select( df = query.df[obs_ids, :] else: df = query.df[obs_ids, var_ids] + if not return_arrow: df.set_index([self.row_dim_name, self.col_dim_name], inplace=True) + return df # ---------------------------------------------------------------- @@ -168,7 +170,6 @@ def from_matrix_and_dim_values( ``scipy.sparse.csr_matrix``, ``scipy.sparse.csc_matrix``, ``numpy.ndarray``, etc. For ingest from ``AnnData``, these should be ``ann.obs_names`` and ``ann.var_names``. 
""" - s = util.get_start_stamp() log_io( f"Writing {self.nested_name} ...", diff --git a/apis/python/src/tiledbsoma/soma.py b/apis/python/src/tiledbsoma/soma.py index bad3bb98de..c015fe9480 100644 --- a/apis/python/src/tiledbsoma/soma.py +++ b/apis/python/src/tiledbsoma/soma.py @@ -322,6 +322,34 @@ def query( from the source SOMAs; if they are specified, the slice will take the specified ``obs``/``var`` """ + retval = self._query_aux( + obs_attrs=obs_attrs, + obs_query_string=obs_query_string, + obs_ids=obs_ids, + var_attrs=var_attrs, + var_query_string=var_query_string, + var_ids=var_ids, + return_arrow=return_arrow, + ) + return retval + + # ---------------------------------------------------------------- + def _query_aux( + self, + *, + obs_attrs: Optional[Sequence[str]] = None, + obs_query_string: Optional[str] = None, + obs_ids: Optional[Ids] = None, + var_attrs: Optional[Sequence[str]] = None, + var_query_string: Optional[str] = None, + var_ids: Optional[Ids] = None, + return_arrow: bool = False, + ) -> Optional[SOMASlice]: + """ + Helper method for `query`: as this has multiple `return` statements, it's easiest to track + elapsed-time stats in a call to this helper. + """ + slice_obs_df = self.obs.query( query_string=obs_query_string, ids=obs_ids, diff --git a/apis/python/tests/test_ascii_and_unicode.py b/apis/python/tests/test_ascii_and_unicode.py index 81c4847d38..dd8122dff1 100644 --- a/apis/python/tests/test_ascii_and_unicode.py +++ b/apis/python/tests/test_ascii_and_unicode.py @@ -1,11 +1,14 @@ import anndata as ad import numpy as np import pandas as pd +import pytest import tiledbsoma.io as io from tiledbsoma import SOMA +# TODO: restore once https://github.com/single-cell-data/TileDB-SingleCell/issues/274 is in place. +@pytest.mark.skip(reason="Unicode attributes temporarily unsupported") def test_readback(tmp_path): """ Validate correct encode/decode of non-ASCII attribute text. 
diff --git a/apis/python/tests/test_dim_select.py b/apis/python/tests/test_dim_select.py index 02430166e9..7071f11c36 100644 --- a/apis/python/tests/test_dim_select.py +++ b/apis/python/tests/test_dim_select.py @@ -140,7 +140,7 @@ def test_dim_select(adata): "VDAC3", ] - df = soma.obs.dim_select([b"AAGCGACTTTGACG", b"AATGCGTGGACGGA"]) + df = soma.obs.dim_select(["AAGCGACTTTGACG", "AATGCGTGGACGGA"]) assert df.shape == (2, 7) assert df.at["AAGCGACTTTGACG", "groups"] == "g1" assert df.at["AATGCGTGGACGGA", "nFeature_RNA"] == 73 @@ -150,7 +150,7 @@ def test_dim_select(adata): # AATGCGTGGACGGA 0 389.0 73 1 1 g1 1 assert soma.obs.dim_select(None).shape == (80, 7) - df = soma.var.dim_select([b"AKR1C3", b"MYL9"]) + df = soma.var.dim_select(["AKR1C3", "MYL9"]) assert df.shape == (2, 5) assert df.at["AKR1C3", "vst.variable"] == 1 assert df.at["MYL9", "vst.variable"] == 1 @@ -158,15 +158,15 @@ def test_dim_select(adata): assert sorted(soma.obsm.keys()) == sorted(["X_tsne", "X_pca"]) - df = soma.obsm["X_tsne"].dim_select([b"AAGCGACTTTGACG", b"AATGCGTGGACGGA"]) + df = soma.obsm["X_tsne"].dim_select(["AAGCGACTTTGACG", "AATGCGTGGACGGA"]) assert df.shape == (2, 2) - df = soma.obsm["X_pca"].dim_select([b"AAGCGACTTTGACG", b"AATGCGTGGACGGA"]) + df = soma.obsm["X_pca"].dim_select(["AAGCGACTTTGACG", "AATGCGTGGACGGA"]) assert df.shape == (2, 19) - assert soma.X["data"].dim_select([b"AAGCGACTTTGACG"], [b"AKR1C3"]).shape == (1, 1) - assert soma.X["data"].dim_select(None, [b"AKR1C3"]).shape == (80, 1) - assert soma.X["data"].dim_select([b"AAGCGACTTTGACG"], None).shape == (20, 1) + assert soma.X["data"].dim_select(["AAGCGACTTTGACG"], ["AKR1C3"]).shape == (1, 1) + assert soma.X["data"].dim_select(None, ["AKR1C3"]).shape == (80, 1) + assert soma.X["data"].dim_select(["AAGCGACTTTGACG"], None).shape == (20, 1) assert soma.X["data"].dim_select(None, None).shape == (1600, 1) tempdir.cleanup() @@ -211,7 +211,8 @@ def test_zeroes_handling(): n_obs = len(obs_ids) n_var = len(var_ids) - cell_types = 
["blööd" if obs_id[1] == "A" else "lung" for obs_id in obs_ids] + # TODO: restore once https://github.com/single-cell-data/TileDB-SingleCell/issues/274 is in place. + cell_types = ["blood" if obs_id[1] == "A" else "lung" for obs_id in obs_ids] feature_names = [ "ENSG00000999999" if var_id[1] < "M" else "ENSG00000123456" for var_id in var_ids diff --git a/apis/python/tests/test_type_diversity.py b/apis/python/tests/test_type_diversity.py index 7c399a7540..73b5398010 100644 --- a/apis/python/tests/test_type_diversity.py +++ b/apis/python/tests/test_type_diversity.py @@ -108,8 +108,10 @@ def test_from_anndata_DataFrame_type(tmp_path): df_col_type_sweep = [ ("bool", lambda a: a.astype(bool)), ("str", lambda a: a.astype(str)), - ("bytes", lambda a: a.astype(str).astype(bytes)), - # ("float16", lambda a: a.astype(np.dtype("float16"))), TODO: Enable when #39 is fixed + # TODO: restore once #274 is in place. + # ("bytes", lambda a: a.astype(str).astype(bytes)), + # TODO: Enable when #39 is fixed + # ("float16", lambda a: a.astype(np.dtype("float16"))), ("float32", lambda a: a.astype("float32")), ("float64", lambda a: a.astype("float64")), ("int8", lambda a: a.astype("int8")), @@ -147,9 +149,7 @@ def test_from_anndata_DataFrame_type(tmp_path): ), ), ] - index = ( - np.arange(1, n + 1).astype(str).astype(bytes) - ) # AnnData requires string indices, TileDB wants bytes. 
Use LCD + index = np.arange(1, n + 1).astype(str) df = pd.DataFrame( data={ f"col_{name}": cast(pd.Series(index=index, data=np.arange(n))) @@ -158,7 +158,8 @@ def test_from_anndata_DataFrame_type(tmp_path): ) X = np.ones((n, n), dtype=np.float32) adata = ad.AnnData(X=X, obs=df, var=df, dtype=X.dtype) - io.from_anndata(SOMA(tmp_path.as_posix()), adata) + soma = SOMA(tmp_path.as_posix()) + io.from_anndata(soma, adata) assert all( (tmp_path / sub_array_path).exists() for sub_array_path in ["obs", "var", "X/data"] @@ -176,33 +177,29 @@ def cmp_dtype(series, tdb: tiledb.Attr) -> bool: # TODO: see annotation_dataframe.py. Once Unicode attributes are queryable, we'll need # to remove this check which is verifying the current force-to-ASCII workaround. if ad_dtype.name == "str": - ad_dtype = np.dtype("S") + ad_dtype = np.dtype("bytes") return ad_dtype == tdb.dtype for df_name in ["var", "obs"]: - with tiledb.open((tmp_path / df_name).as_posix()) as arr: - df = getattr(adata, df_name) + annotation_dataframe = getattr(soma, df_name) + with annotation_dataframe._open() as A: # verify names match - assert set(arr.schema.attr(i).name for i in range(arr.schema.nattr)) == set( + assert set(A.schema.attr(i).name for i in range(A.schema.nattr)) == set( getattr(adata, df_name).keys() ) # verify length - assert n == len(arr.query(dims=[]).df[:]) + assert n == len(A.query(dims=[]).df[:]) # verify index - assert np.array_equal( - np.sort(df.index.to_numpy()), np.sort(arr[:][df_name + "_id"]) - ) + assert np.array_equal(np.sort(df.index.to_numpy()), np.sort(A.df[:].index)) # verify individual column types - attr_idx = { - arr.schema.attr(idx).name: idx for idx in range(arr.schema.nattr) - } + attr_idx = {A.schema.attr(idx).name: idx for idx in range(A.schema.nattr)} for k in df.keys(): - assert cmp_dtype(df[k], arr.schema.attr(attr_idx[k])) + assert cmp_dtype(df[k], A.schema.attr(attr_idx[k])) def test_from_anndata_annotations_empty(tmp_path): @@ -212,30 +209,26 @@ def
test_from_anndata_annotations_empty(tmp_path): n_obs = 100 n_var = 10 - # AnnData requires a string index. TileDB does not support UTF8, so use ASCII. - obs = pd.DataFrame(index=np.arange(n_obs).astype(bytes)) - var = pd.DataFrame(index=np.arange(n_var).astype(bytes)) + obs = pd.DataFrame(index=np.arange(n_obs).astype(str)) + var = pd.DataFrame(index=np.arange(n_var).astype(str)) X = np.ones((n_obs, n_var)) adata = ad.AnnData(X=X, obs=obs, var=var, dtype=X.dtype) - io.from_anndata(SOMA(tmp_path.as_posix()), adata) + soma = SOMA(tmp_path.as_posix()) + io.from_anndata(soma, adata) - assert all( - (tmp_path / sub_array_path).exists() - for sub_array_path in ["obs", "var", "X/data"] - ) + assert soma.obs.exists() + assert soma.var.exists() + assert soma.X.data.exists() # obs/var are sparse. Sort before comparing contents. - with tiledb.open((tmp_path / "obs").as_posix()) as obs: - assert np.array_equal( - np.sort(adata.obs.index.to_numpy()), np.sort(obs[:]["obs_id"]) - ) - - with tiledb.open((tmp_path / "var").as_posix()) as var: - assert np.array_equal( - np.sort(adata.var.index.to_numpy()), np.sort(var[:]["var_id"]) - ) + assert np.array_equal( + np.sort(adata.obs.index.to_numpy()), np.sort(soma.obs.df().index) + ) + assert np.array_equal( + np.sort(adata.var.index.to_numpy()), np.sort(soma.var.df().index) + ) def test_from_anndata_annotations_none(tmp_path):