Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
62 changes: 57 additions & 5 deletions src/scanpy/readwrite.py
Original file line number Diff line number Diff line change
Expand Up @@ -593,6 +593,41 @@ def read_10x_mtx(
return adata[:, gex_rows].copy()


def _read_mtx(
    filename: Path,
    *,
    dtype: str = "float32",
    sparse_format: Literal["csr", "csc", "coo"] = "csc",
) -> AnnData:
    """Read `.mtx` file with configurable sparse format.

    Inlines the logic from :func:`anndata.read_mtx` to allow choosing the
    sparse format directly, avoiding unnecessary conversions when the result
    will be transposed (e.g., for 10x data where ``CSC.T → CSR``).

    Parameters
    ----------
    filename
        Path to the ``.mtx`` file.
    dtype
        Numpy data type.
    sparse_format
        Sparse matrix format for the output. Defaults to ``'csc'`` so that
        a subsequent ``.T`` produces a CSR matrix with no extra conversion.

    Returns
    -------
    An :class:`~anndata.AnnData` object wrapping the matrix read from
    ``filename``, cast to ``dtype`` and stored in ``sparse_format``.

    Raises
    ------
    ValueError
        If ``sparse_format`` is not one of ``'csr'``, ``'csc'`` or ``'coo'``.
    """
    from scipy.io import mmread
    from scipy.sparse import (  # noqa: TID251
        coo_matrix,
        csc_matrix,
        csr_matrix,
        issparse,
    )

    converters = {"csr": csr_matrix, "csc": csc_matrix, "coo": coo_matrix}
    # Fail before touching the file: an unknown format would otherwise be
    # silently ignored and the caller would get whatever `mmread` produced.
    if sparse_format not in converters:
        msg = (
            f"Unknown sparse_format {sparse_format!r}, "
            f"expected one of {sorted(converters)}."
        )
        raise ValueError(msg)

    x = mmread(filename)
    # Only cast when needed so a matching dtype does not trigger a copy.
    if x.dtype != np.dtype(dtype):
        x = x.astype(dtype)
    # `mmread` yields COO for coordinate-format files but a dense ndarray for
    # array-format files. A sparse result already satisfies a "coo" request
    # and is left untouched; everything else goes through the converter.
    if sparse_format != "coo" or not issparse(x):
        x = converters[sparse_format](x)
    return AnnData(x)


def _read_10x_mtx(
path: Path,
*,
Expand All @@ -607,11 +642,28 @@ def _read_10x_mtx(
"""Read mex from output from Cell Ranger v2- or v3+."""
# Only append .gz if not a legacy file AND compression is requested
suffix = "" if is_legacy else (".gz" if compressed else "")
adata = read(
path / f"{prefix}matrix.mtx{suffix}",
cache=cache,
cache_compression=cache_compression,
).T # transpose the data
mtx_file = path / f"{prefix}matrix.mtx{suffix}"
ext = f"mtx{suffix}"

if cache:
path_cache: Path = settings.cachedir / _slugify(mtx_file).replace(
f".{ext}", ".h5ad"
)
if path_cache.is_file():
logg.info(f"... reading from cache file {path_cache}")
adata = read_h5ad(path_cache)
else:
adata = _read_mtx(mtx_file, sparse_format="csc")
if isinstance(cache_compression, Default):
cache_compression = settings.cache_compression
if not path_cache.parent.is_dir():
path_cache.parent.mkdir(parents=True)
adata.write(path_cache, compression=cache_compression)
else:
# Read MTX as CSC so that .T yields CSR (one conversion, not two)
adata = _read_mtx(mtx_file, sparse_format="csc")

adata = adata.T # transpose: 10x stores var×obs, anndata uses obs×var
genes = pd.read_csv(
path / f"{prefix}{'genes' if is_legacy else 'features'}.tsv{suffix}",
header=None,
Expand Down
4 changes: 4 additions & 0 deletions tests/test_read_10x.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
import pytest

import scanpy as sc
from scanpy._compat import CSRBase

if TYPE_CHECKING:
from typing import Literal
Expand Down Expand Up @@ -63,6 +64,9 @@ def test_read_10x(
if "3.0.0" in str(h5_path):
h5.var.drop(columns="genome", inplace=True)

# Verify CSR format (not CSC from transpose)
assert isinstance(mtx.X, CSRBase), f"Expected CSR matrix, got {type(mtx.X)}"

# Check equivalence
assert_anndata_equal(mtx, h5)

Expand Down
Loading