diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml new file mode 100644 index 0000000..969cf3a --- /dev/null +++ b/.github/workflows/ci.yml @@ -0,0 +1,39 @@ +name: CI + +on: + push: + branches: [master, main] + pull_request: + branches: [master, main] + +jobs: + test: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + + - name: Set up Python + uses: actions/setup-python@v5 + with: + python-version: "3.12" + cache: "pip" + + - name: Install dependencies + run: | + python -m pip install --upgrade pip + pip install -e ".[all]" + + - name: Lint with ruff + run: | + ruff check src/ tests/ + ruff format --check src/ tests/ + + - name: Run tests + run: | + pytest --cov=samalg --cov-report=xml --cov-report=term + + - name: Upload coverage to Codecov + uses: codecov/codecov-action@v4 + with: + files: ./coverage.xml + fail_ci_if_error: false diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml new file mode 100644 index 0000000..f0ebd9d --- /dev/null +++ b/.github/workflows/release.yml @@ -0,0 +1,46 @@ +name: Release to PyPI + +on: + release: + types: [published] + +jobs: + build: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + + - name: Set up Python + uses: actions/setup-python@v5 + with: + python-version: "3.11" + + - name: Install build tools + run: | + python -m pip install --upgrade pip + pip install build + + - name: Build package + run: python -m build + + - name: Upload artifacts + uses: actions/upload-artifact@v4 + with: + name: dist + path: dist/ + + publish: + needs: build + runs-on: ubuntu-latest + environment: pypi + permissions: + id-token: write + steps: + - name: Download artifacts + uses: actions/download-artifact@v4 + with: + name: dist + path: dist/ + + - name: Publish to PyPI + uses: pypa/gh-action-pypi-publish@release/v1 diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..5a2a789 --- /dev/null +++ b/.gitignore @@ -0,0 +1,102 @@ +# Byte-compiled / optimized / DLL 
files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +*.egg-info/ +*.egg + +# PyInstaller +*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.nox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +*.py,cover +.hypothesis/ +.pytest_cache/ + +# Translations +*.mo +*.pot + +# Jupyter Notebook +.ipynb_checkpoints/ + +# IPython +profile_default/ +ipython_config.py + +# pyenv +.python-version + +# Environments +.env +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ +.dmypy.json +dmypy.json + +# ruff +.ruff_cache/ + +# IDE +.idea/ +.vscode/ +*.swp +*.swo +*~ + +# OS +.DS_Store +Thumbs.db + +# Project specific +*.h5 +*.hdf5 diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml new file mode 100644 index 0000000..27af5a4 --- /dev/null +++ b/.pre-commit-config.yaml @@ -0,0 +1,28 @@ +repos: + - repo: https://github.com/pre-commit/pre-commit-hooks + rev: v4.5.0 + hooks: + - id: trailing-whitespace + - id: end-of-file-fixer + - id: check-yaml + - id: check-added-large-files + args: ['--maxkb=1000'] + - id: check-toml + + - repo: https://github.com/astral-sh/ruff-pre-commit + rev: v0.2.0 + hooks: + - id: ruff + args: [--fix] + - id: ruff-format + + - repo: https://github.com/pre-commit/mirrors-mypy + rev: v1.8.0 + hooks: + - id: mypy + additional_dependencies: + - numpy>=1.26.0 + - pandas-stubs + - types-requests + args: [--ignore-missing-imports] + files: ^src/ diff --git a/.travis.yml b/.travis.yml deleted file mode 100644 index 5b7485d..0000000 --- a/.travis.yml +++ /dev/null @@ -1,30 +0,0 @@ -# Try container-based infrastucture -#sudo: false - -notifications: 
- email: false - -language: python - -matrix: - include: - - python: 3.7 - os: linux - dist: bionic - sudo: true - - python: 3.8 - os: linux - dist: bionic - sudo: true - -before_install: - - ./.travis_before_install.sh - -install: - - ./.travis_install.sh - -script: - - ./.travis_test.sh - -#after_success: -# - ./.travis_deploy.sh diff --git a/.travis_before_install.sh b/.travis_before_install.sh deleted file mode 100755 index d570dfb..0000000 --- a/.travis_before_install.sh +++ /dev/null @@ -1,14 +0,0 @@ -#!/bin/bash -if [ $TRAVIS_OS_NAME == 'linux' ]; then - echo "Installing deps for linux" - sudo apt-get update - sudo apt-get install libhdf5-dev - #sudo apt-get install libgcc-5-dev - - #sudo add-apt-repository ppa:nschloe/swig-backports -y - #sudo apt-get -qq update - #sudo apt-get install -y swig3.0 -else - echo "OS not recognized: $TRAVIS_OS_NAME" - exit 1 -fi diff --git a/.travis_install.sh b/.travis_install.sh deleted file mode 100755 index dc373d5..0000000 --- a/.travis_install.sh +++ /dev/null @@ -1,28 +0,0 @@ -#!/bin/bash -if [ $TRAVIS_OS_NAME == 'osx' ]; then - export PATH="$HOME/miniconda/bin:$PATH" - source activate - ## Somehow we need this to execute the setup.py at all... - #pip install numpy -fi - -python setup.py install -## setuptools < 18.0 has issues with Cython as a dependency -#pip install Cython -#if [ $? != 0 ]; then -# exit 1 -#fi - -# deps #FIXME: do better -#python setup.py install -#pip install numpy -#pip install scipy -#pip install pandas -#pip install scikit-learn -#pip install matplotlib - -# old setuptools also has a bug for extras, but it still compiles -#pip install -v '.' -#if [ $? 
!= 0 ]; then -# exit 1 -#fi diff --git a/.travis_test.sh b/.travis_test.sh deleted file mode 100755 index c6f7065..0000000 --- a/.travis_test.sh +++ /dev/null @@ -1,20 +0,0 @@ -#!/bin/bash -if [ "$TRAVIS_OS_NAME" == 'osx' ]; then - export PATH="$HOME/miniconda/bin:$PATH" - source $HOME/miniconda/bin/activate - PYTHON="$HOME/miniconda/bin/python$PYTHON_VERSION" -else - PYTHON=${PYTHON:-python} -fi - -echo "python: ${PYTHON}" - -echo 'Running tests...' - -echo 'SAM test...' -${PYTHON} "test/test_sam.py" -if [ $? != 0 ]; then - exit 1 -fi - -echo 'done!' diff --git a/Docker/Dockerfile b/Docker/Dockerfile deleted file mode 100644 index dd1d95d..0000000 --- a/Docker/Dockerfile +++ /dev/null @@ -1,49 +0,0 @@ -FROM debian:buster-slim -ARG PATH="/root/miniconda/bin:${PATH}" -ENV PATH="/root/miniconda/bin:${PATH}" - -RUN apt-get update \ - && apt-get install -y wget \ - && apt-get install -y g++ \ - && apt-get install -y build-essential \ - && apt-get install -y git \ - && apt-get install -y libcurl4-gnutls-dev libxml2-dev libssl-dev \ - && rm -rf /var/lib/apt/lists/* - -RUN wget -O /tmp/miniconda.sh \ - https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh \ - && mkdir /root/.conda \ - && bash /tmp/miniconda.sh -b -p /root/miniconda \ - && rm -f /tmp/miniconda.sh \ - && conda update -n base -c defaults conda \ - && conda install python=3.7 pip \ - && conda clean -afy - -RUN conda install -c plotly -c conda-forge \ - jupyter==1.0.0 \ - plotly==4.0.0 \ - colorlover==0.3.0 \ - ipyevents==0.8.1 \ - numpy==1.19.2 \ - scipy==1.5.2 \ - python-igraph==0.8.3 \ - leidenalg==0.8.3 \ - pandas==1.0.0 \ - umap-learn==0.4.6 \ - && conda clean -afy - -RUN git clone https://github.com/atarashansky/self-assembling-manifold.git && \ - /root/miniconda/bin/pip install self-assembling-manifold/. 
&& rm -rf ~/.cache -RUN /root/miniconda/bin/pip install hnswlib==0.4.0 && rm -rf ~/.cache - -RUN chmod ugo+rwx /root -RUN chmod ugo+rwx /tmp && mkdir /jupyter && mkdir /jupyter/notebooks -WORKDIR /jupyter/ - -ARG USER_ID -ARG GROUP_ID -RUN addgroup --gid $GROUP_ID user -RUN adduser --disabled-password --gecos '' --uid $USER_ID --gid $GROUP_ID user -USER user - -CMD jupyter notebook --port=$PORT --no-browser --ip=0.0.0.0 --allow-root --NotebookApp.password="" --NotebookApp.token="" diff --git a/Docker/build_image.sh b/Docker/build_image.sh deleted file mode 100644 index 325fc7e..0000000 --- a/Docker/build_image.sh +++ /dev/null @@ -1,3 +0,0 @@ -#!/bin/bash -read -e -p "Docker image name: " image -docker build . -f Dockerfile -t $image --build-arg USER_ID=$(id -u) --build-arg GROUP_ID=$(id -g) diff --git a/Docker/run_image.sh b/Docker/run_image.sh deleted file mode 100644 index 72c0fe7..0000000 --- a/Docker/run_image.sh +++ /dev/null @@ -1,13 +0,0 @@ -#!/bin/bash -read -e -p "Docker image name: " image -read -e -p "Docker container name: " name -read -e -p "Volume name: " folder -read -e -p "Jupyter notebook port: " port - -path="${folder/#\~/$HOME}" -parentdir="$(dirname "$path")" -chmod 755 $parentdir - -docker run -d --rm --name=$name \ - -v $path:/jupyter/notebooks \ - -p $port:$port -e PORT=$port $image diff --git a/README.md b/README.md index 2fe2c8e..cc22d41 100644 --- a/README.md +++ b/README.md @@ -68,14 +68,14 @@ Having activated the environment, SAM can be downloaded from the PyPI repository PIP install: ``` -pip install sam-algorithm +pip install sc-sam ``` Development version install: ``` git clone https://github.com/atarashansky/self-assembling-manifold.git cd self-assembling-manifold -python setup.py install +pip install -e . 
``` For plotting, install `matplotlib`: diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 0000000..d393ae4 --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,193 @@ +[build-system] +requires = ["hatchling"] +build-backend = "hatchling.build" + +[project] +name = "sc-sam" +version = "2.0.2" +description = "The Self-Assembling-Manifold algorithm for single-cell RNA sequencing analysis" +readme = "README.md" +license = "MIT" +authors = [ + { name = "Alexander J. Tarashansky", email = "tarashan@stanford.edu" } +] +keywords = ["scrnaseq", "analysis", "manifold", "reconstruction", "single-cell"] +classifiers = [ + "Development Status :: 4 - Beta", + "Intended Audience :: Science/Research", + "License :: OSI Approved :: MIT License", + "Operating System :: OS Independent", + "Programming Language :: Python :: 3", + "Programming Language :: Python :: 3.11", + "Programming Language :: Python :: 3.12", + "Topic :: Scientific/Engineering :: Bio-Informatics", +] +requires-python = ">=3.11" +dependencies = [ + "numpy>=1.26.0", + "scipy>=1.11.0", + "pandas>=2.0.0", + "scikit-learn>=1.3.0", + "numba>=0.59.0", + "anndata>=0.10.0", + "h5py>=3.10.0", + "umap-learn>=0.5.0", + "hnswlib>=0.8.0", + "dill>=0.3.7", + "packaging>=23.0", +] + +[project.optional-dependencies] +clustering = [ + "leidenalg>=0.10.0", + "louvain>=0.8.0", + "python-igraph>=0.11.0", +] +viz = [ + "matplotlib>=3.8", + "scanpy>=1.10.0", +] +hdbscan = [ + "hdbscan>=0.8.33", +] +harmony = [ + "harmonypy>=0.0.5", +] +dev = [ + "pytest>=8.0", + "pytest-cov>=4.1", + "ruff>=0.2", + "mypy>=1.8", + "pre-commit>=3.6", +] +all = [ + "sc-sam[clustering,viz,hdbscan,harmony,dev]", +] + +[project.urls] +Homepage = "https://github.com/atarashansky/self-assembling-manifold" +Documentation = "https://github.com/atarashansky/self-assembling-manifold#readme" +Repository = "https://github.com/atarashansky/self-assembling-manifold" +Issues = "https://github.com/atarashansky/self-assembling-manifold/issues" + 
+[tool.hatch.build.targets.wheel] +packages = ["src/samalg"] + +[tool.hatch.build.targets.sdist] +include = [ + "/src", + "/tests", + "/README.md", + "/pyproject.toml", +] + +# Ruff configuration +[tool.ruff] +target-version = "py311" +line-length = 100 +src = ["src", "tests"] + +[tool.ruff.lint] +select = [ + "E", # pycodestyle errors + "W", # pycodestyle warnings + "F", # Pyflakes + "I", # isort + "B", # flake8-bugbear + "C4", # flake8-comprehensions + "UP", # pyupgrade + "SIM", # flake8-simplify + "RUF", # Ruff-specific rules + "NPY", # NumPy-specific rules +] +ignore = [ + "E501", # line too long (handled by formatter) + "B008", # do not perform function calls in argument defaults + "B028", # no stacklevel in warnings + "C901", # too complex + "NPY002", # legacy numpy random (too noisy to fix) + "SIM101", # multiple isinstance calls + "SIM102", # nested if statements + "SIM105", # use contextlib.suppress + "SIM108", # Use ternary operator instead of if-else-block + "SIM113", # use enumerate + "SIM115", # use context manager for open + "SIM118", # use key in dict instead of key in dict.keys() + "UP038", # use X | Y in isinstance (removed in newer ruff) +] + +[tool.ruff.lint.per-file-ignores] +"tests/*" = ["S101", "NPY002"] # Allow assert and legacy numpy random in tests +"__init__.py" = ["F401"] # Allow unused imports in __init__.py + +[tool.ruff.lint.isort] +known-first-party = ["samalg"] + +# Mypy configuration +[tool.mypy] +python_version = "3.11" +warn_return_any = true +warn_unused_configs = true +ignore_missing_imports = true +strict_optional = true +warn_redundant_casts = true +warn_unused_ignores = true +show_error_codes = true +exclude = [ + "tests/", + "build/", + "dist/", +] + +[[tool.mypy.overrides]] +module = [ + "scipy.*", + "sklearn.*", + "numba.*", + "umap.*", + "anndata.*", + "h5py.*", + "hnswlib.*", + "harmonypy.*", + "dill.*", + "leidenalg.*", + "louvain.*", + "igraph.*", + "hdbscan.*", + "matplotlib.*", + "scanpy.*", +] 
+ignore_missing_imports = true + +# Pytest configuration +[tool.pytest.ini_options] +testpaths = ["tests"] +python_files = ["test_*.py"] +python_functions = ["test_*"] +addopts = [ + "-v", + "--tb=short", + "--strict-markers", +] +filterwarnings = [ + "ignore::numba.core.errors.NumbaWarning", + "ignore::DeprecationWarning", +] + +# Coverage configuration +[tool.coverage.run] +source = ["src/samalg"] +branch = true +omit = [ + "*/tests/*", + "*/__init__.py", +] + +[tool.coverage.report] +exclude_lines = [ + "pragma: no cover", + "def __repr__", + "raise NotImplementedError", + "if TYPE_CHECKING:", + "if __name__ == .__main__.:", +] diff --git a/samalg/__init__.py b/samalg/__init__.py deleted file mode 100755 index a01470a..0000000 --- a/samalg/__init__.py +++ /dev/null @@ -1,1837 +0,0 @@ -import gc -import numpy as np -from anndata import AnnData -import anndata -import scipy.sparse as sp -import time -from sklearn.preprocessing import Normalizer, StandardScaler -import pickle -import pandas as pd -from . import utilities as ut -import sklearn.manifold as man -import sklearn.utils.sparsefuncs as sf -from packaging import version -import warnings -from numba.core.errors import NumbaWarning -from sklearn.utils.extmath import svd_flip -from scipy.linalg import block_diag -import harmonypy - -warnings.filterwarnings("ignore", category=NumbaWarning) - -__version__ = "1.0.0" - -""" -Copyright 2018, Alexander J. Tarashansky, All rights reserved. -Email: -""" - -class SAM(object): - """Self-Assembling Manifolds single-cell RNA sequencing analysis tool. - - SAM iteratively rescales the input gene expression matrix to emphasize - genes that are spatially variable along the intrinsic manifold of the data. - It outputs the gene weights, nearest neighbor matrix, and a 2D projection. 
- - Parameters - ---------- - counts : tuple or list (scipy.sparse matrix, numpy.ndarray,numpy.ndarray), - OR tuple or list (numpy.ndarray, numpy.ndarray,numpy.ndarray), OR - pandas.DataFrame, OR anndata.AnnData - - If a tuple or list, it should contain the gene expression data - (scipy.sparse or numpy.ndarray) matrix (cells x genes), numpy array of - gene IDs, and numpy array of cell IDs in that order. - - If a pandas.DataFrame, it should be (cells x genes) - - Only use this argument if you want to pass in preloaded data. Otherwise - use one of the load functions. - - - Attributes - ---------- - - preprocess_args: dict - Dictionary of arguments used for the 'preprocess_data' function. - - run_args: dict - Dictionary of arguments used for the 'run' function. - - adata_raw: AnnData - An AnnData object containing the raw, unfiltered input data. - - adata: AnnData - An AnnData object containing all processed data and SAM outputs. - - """ - - def __init__(self, counts=None, inplace=False): - - if isinstance(counts, tuple) or isinstance(counts, list): - raw_data, all_gene_names, all_cell_names = counts - if isinstance(raw_data, np.ndarray): - raw_data = sp.csr_matrix(raw_data) - - self.adata_raw = AnnData( - X=raw_data, - obs={"obs_names": all_cell_names}, - var={"var_names": all_gene_names}, - ) - - elif isinstance(counts, pd.DataFrame): - raw_data = sp.csr_matrix(counts.values) - all_gene_names = np.array(list(counts.columns.values)) - all_cell_names = np.array(list(counts.index.values)) - - self.adata_raw = AnnData( - X=raw_data, - obs={"obs_names": all_cell_names}, - var={"var_names": all_gene_names}, - ) - - elif isinstance(counts, AnnData): - all_cell_names = np.array(list(counts.obs_names)) - all_gene_names = np.array(list(counts.var_names)) - if counts.is_view: - counts = counts.copy() - - if inplace: - self.adata_raw = counts - else: - self.adata_raw = counts.copy() - - elif counts is not None: - raise Exception( - "'counts' must be either a tuple/list of " - 
"(data,gene IDs,cell IDs) or a Pandas DataFrame of" - "cells x genes" - ) - - if counts is not None: - if np.unique(all_gene_names).size != all_gene_names.size: - self.adata_raw.var_names_make_unique() - if np.unique(all_cell_names).size != all_cell_names.size: - self.adata_raw.obs_names_make_unique() - - if inplace: - self.adata = self.adata_raw - else: - self.adata = self.adata_raw.copy() - - if "X_disp" not in self.adata_raw.layers.keys(): - self.adata.layers["X_disp"] = self.adata.X - - self.run_args = {} - self.preprocess_args = {} - - def preprocess_data( - self, - div=1, - downsample=0, - sum_norm="cell_median", - norm="log", - min_expression=1, - thresh_low=0.0, - thresh_high=0.96, - thresh = None, - filter_genes=True, - ): - """Log-normalizes and filters the expression data. - - Parameters - ---------- - - div : float, optional, default 1 - The factor by which the gene expression will be divided prior to - normalization (e.g. log normalization). - - downsample : float, optional, default 0 - The factor by which to randomly downsample the data. If 0, the - data will not be downsampled. - - sum_norm : str or float, optional, default "cell_median" - If a float, the total number of transcripts in each cell will be - normalized to this value prior to normalization and filtering. - Otherwise, nothing happens. If 'cell_median', each cell is - normalized to have the median total read count per cell. If - 'gene_median', each gene is normalized to have the median total - read count per gene. - - norm : str, optional, default 'log' - If 'log', log-normalizes the expression data. If 'ftt', applies the - Freeman-Tukey variance-stabilization transformation. If - 'multinomial', applies the Pearson-residual transformation (this is - experimental and should only be used for raw, un-normalized UMI - datasets). If None, the data is not normalized. - - min_expression : float, optional, default 1 - The threshold above which a gene is considered - expressed. 
Gene expression values less than 'min_expression' are - set to zero. - - thresh_low : float, optional, default 0.0 - Keep genes expressed in greater than 'thresh_low'*100 % of cells, - where a gene is considered expressed if its expression value - exceeds 'min_expression'. - - thresh_high : float, optional, default 0.96 - Keep genes expressed in less than 'thresh_high'*100 % of cells, - where a gene is considered expressed if its expression value - exceeds 'min_expression'. - - filter_genes : bool, optional, default True - Setting this to False turns off filtering operations. - - """ - if thresh is not None: - thresh_low = thresh - thresh_high = 1-thresh - - self.preprocess_args = { - "div": div, - "sum_norm": sum_norm, - "norm": norm, - "min_expression": min_expression, - "thresh_low": thresh_low, - "thresh_high":thresh_high, - "filter_genes": filter_genes, - } - - self.run_args = self.adata.uns.get('run_args',{}) - # load data - try: - D = self.adata_raw.X - self.adata = self.adata_raw.copy() - - except AttributeError: - print("No data loaded") - - D = self.adata.X - if isinstance(D, np.ndarray): - D = sp.csr_matrix(D, dtype="float32") - else: - if str(D.dtype) != "float32": - D = D.astype("float32") - D.sort_indices() - - if D.getformat() == "csc": - D = D.tocsr() - - # sum-normalize - if sum_norm == "cell_median" and norm != "multinomial": - s = D.sum(1).A.flatten() - sum_norm = np.median(s) - D = D.multiply(1 / s[:, None] * sum_norm).tocsr() - elif sum_norm == "gene_median" and norm != "multinomial": - s = D.sum(0).A.flatten() - sum_norm = np.median(s[s > 0]) - s[s == 0] = 1 - D = D.multiply(1 / s[None, :] * sum_norm).tocsr() - - elif sum_norm is not None and norm != "multinomial": - D = D.multiply(1 / D.sum(1).A.flatten()[:, None] * sum_norm).tocsr() - - # normalize - self.adata.X = D - if norm is None: - D.data[:] = D.data / div - - elif norm.lower() == "log": - D.data[:] = np.log2(D.data / div + 1) - - elif norm.lower() == "ftt": - D.data[:] = 
np.sqrt(D.data / div) + np.sqrt(D.data / div + 1) - 1 - - elif norm.lower() == "asin": - D.data[:] = np.arcsinh(D.data / div) - elif norm.lower() == "multinomial": - ni = D.sum(1).A.flatten() # cells - pj = (D.sum(0) / D.sum()).A.flatten() # genes - col = D.indices - row = [] - for i in range(D.shape[0]): - row.append(i * np.ones(D.indptr[i + 1] - D.indptr[i])) - row = np.concatenate(row).astype("int32") - mu = sp.coo_matrix((ni[row] * pj[col], (row, col))).tocsr() - mu2 = mu.copy() - mu2.data[:] = mu2.data ** 2 - mu2 = mu2.multiply(1 / ni[:, None]) - mu.data[:] = (D.data - mu.data) / np.sqrt(mu.data - mu2.data) - - self.adata.X = mu - if sum_norm is None: - sum_norm = np.median(ni) - D = D.multiply(1 / ni[:, None] * sum_norm).tocsr() - D.data[:] = np.log2(D.data / div + 1) - - else: - D.data[:] = D.data / div - - # zero-out low-expressed genes - idx = np.where(D.data <= min_expression)[0] - D.data[idx] = 0 - - # filter genes - idx_genes = np.arange(D.shape[1]) - if filter_genes: - a, ct = np.unique(D.indices, return_counts=True) - c = np.zeros(D.shape[1]) - c[a] = ct - - keep = np.where( - np.logical_and(c / D.shape[0] > thresh_low, c / D.shape[0] <= thresh_high) - )[0] - - idx_genes = np.array(list(set(keep) & set(idx_genes))) - - mask_genes = np.zeros(D.shape[1], dtype="bool") - mask_genes[idx_genes] = True - - self.adata.X = self.adata.X.multiply(mask_genes[None, :]).tocsr() - self.adata.X.eliminate_zeros() - self.adata.var["mask_genes"] = mask_genes - - if norm == "multinomial": - self.adata.layers["X_disp"] = D.multiply(mask_genes[None, :]).tocsr() - self.adata.layers["X_disp"].eliminate_zeros() - else: - self.adata.layers["X_disp"] = self.adata.X - - self.calculate_mean_var() - - self.adata.uns["preprocess_args"] = self.preprocess_args - self.adata.uns['run_args'] = self.run_args - - def calculate_mean_var(self,adata=None): - if adata is None: - adata = self.adata - - if sp.issparse(adata.X): - mu,var = sf.mean_variance_axis(adata.X,axis=0) - else: - mu = 
adata.X.mean(0) - var = adata.X.var(0) - - adata.var['means'] = mu - adata.var['variances'] = var - - def get_avg_obsm(self, keym, keyl): - clu = self.get_labels_un(keyl) - cl = self.get_labels(keyl) - x = [] - for i in range(clu.size): - x.append(self.adata.obsm[keym][cl == clu[i]].mean(0)) - x = np.vstack(x) - return x - - def get_labels_un(self, key): - if key not in list(self.adata.obs.keys()): - print("Key does not exist in `obs`.") - return np.array([]) - else: - return np.array(list(np.unique(self.adata.obs[key]))) - - def get_labels(self, key): - if key not in list(self.adata.obs.keys()): - print("Key does not exist in `obs`.") - return np.array([]) - else: - return np.array(list(self.adata.obs[key])) - - def get_cells(self, label, key): - """Retrieves cells of a particular annotation. - - Parameters - ---------- - label - The annotation to retrieve - key - The key in `obs` from which to retrieve the annotation. - - """ - if key not in list(self.adata.obs.keys()): - print("Key does not exist in `obs`.") - return np.array([]) - else: - return np.array( - list(self.adata.obs_names[np.array(list(self.adata.obs[key])) == label]) - ) - - def load_data( - self, - filename, - transpose=True, - save_sparse_file=None, - sep=",", - calculate_avg=False, - **kwargs - ): - """Loads the specified data file. The file can be a table of - read counts (i.e. '.csv' or '.txt'), with genes as rows and cells - as columns by default. The file can also be a pickle file (output from - 'save_sparse_data') or an h5ad file (output from 'save_anndata'). - - This function that loads the file specified by 'filename'. - - Parameters - ---------- - filename - string - The path to the tabular raw expression counts file. - - sep - string, optional, default ',' - The delimeter used to read the input data table. By default - assumes the input table is delimited by commas. - - save_sparse_file - str, optional, default None - Path to which the sparse data will be saved ('.h5ad'). 
- Writes the SAM 'adata_raw' object to a h5ad file (the native AnnData - file format) for faster loading in the future. - - transpose - bool, optional, default True - By default, assumes file is (genes x cells). Set this to False if - the file has dimensions (cells x genes). - - calculate_avg - bool, optional, default True - If nearest neighbors are already calculated in the .h5ad file, - setting this parameter to True performs knn averaging and stores - the result in adata.layers['X_knn_avg']. This is a fairly dense - matrix, so set this to False if you do not need the averaged - expressions. - - """ - if filename.split(".")[-1] == "p": - raw_data, all_cell_names, all_gene_names = pickle.load(open(filename, "rb")) - - if transpose: - raw_data = raw_data.T - if raw_data.getformat() == "csc": - print("Converting sparse matrix to csr format...") - raw_data = raw_data.tocsr() - - save_sparse_file = None - elif filename.split(".")[-1] != "h5ad": - df = pd.read_csv(filename, sep=sep, index_col=0, **kwargs) - if transpose: - dataset = df.T - else: - dataset = df - - raw_data = sp.csr_matrix(dataset.values) - all_cell_names = np.array(list(dataset.index.values)) - all_gene_names = np.array(list(dataset.columns.values)) - - if filename.split(".")[-1] != "h5ad": - self.adata_raw = AnnData( - X=raw_data, - obs={"obs_names": all_cell_names}, - var={"var_names": all_gene_names}, - ) - - if np.unique(all_gene_names).size != all_gene_names.size: - self.adata_raw.var_names_make_unique() - if np.unique(all_cell_names).size != all_cell_names.size: - self.adata_raw.obs_names_make_unique() - - self.adata = self.adata_raw.copy() - self.adata.layers["X_disp"] = raw_data - - else: - self.adata = anndata.read_h5ad(filename, **kwargs) - if self.adata.raw is not None: - self.adata_raw = AnnData(X=self.adata.raw.X) - self.adata_raw.var_names = self.adata.var_names - self.adata_raw.obs_names = self.adata.obs_names - self.adata_raw.obs = self.adata.obs - - if 
version.parse(str(anndata.__version__)) >= version.parse("0.7rc1"): - del self.adata.raw - else: - self.adata.raw = None - - if ( - "X_knn_avg" not in self.adata.layers.keys() - and "connectivities" in self.adata.obsp.keys() - and calculate_avg - ): - self.dispersion_ranking_NN(save_avgs=True) - else: - self.adata_raw = self.adata - - if "X_disp" not in list(self.adata.layers.keys()): - self.adata.layers["X_disp"] = self.adata.X - save_sparse_file = None - filename = '.'.join(filename.split('.')[:-1])+'.h5ad' - self.adata.uns['path_to_file'] = filename - self.adata_raw.uns['path_to_file'] = filename - if save_sparse_file is not None: - if save_sparse_file.split(".")[-1] == "p": - self.save_sparse_data(save_sparse_file) - elif save_sparse_file.split(".")[-1] == "h5ad": - self.save_anndata(save_sparse_file) - - def save_anndata(self, fname='', save_knn=False, **kwargs): - """Saves `adata_raw` to a .h5ad file (AnnData's native file format). - - Parameters - ---------- - fname - string, default '' - The filename of the output file. If empty, defaults to `.adata.uns['path_to_file']`. - - save_knn - bool, optional, default = False - If True, saves `.layers['X_knn_avg']`. If False, does not save - this layer. Default value is set to False as the nearest-neighbor - averaged expression values can be quite dense. - - """ - if not save_knn: - try: - Xknn = self.adata.layers["X_knn_avg"] - del self.adata.layers["X_knn_avg"] - except: - 0 - if fname == '': - try: - fname = self.adata.uns['path_to_file'] - except KeyError: - raise KeyError('Path to file not known.') - - x = self.adata - x.raw = self.adata_raw - - # fixing weird issues when index name is an integer. 
- for y in [x.obs.columns,x.var.columns, - x.obs.index,x.var.index, - x.raw.var.index,x.raw.var.columns]: - y.name = str(y.name) if y.name is not None else None - - x.write_h5ad(fname, **kwargs) - if version.parse(str(anndata.__version__)) >= version.parse("0.7rc1"): - del x.raw - else: - x.raw = None - - try: - self.adata.layers["X_knn_avg"] = Xknn - except: - 0 - - def load_var_annotations(self, aname, sep=",", key_added="annotations"): - """Loads gene annotations. - - Loads the gene annotations into .adata_raw.var and .adata.var. The keys - added correspond to the column labels in the input table. - - Parameters - ---------- - aname - string or pandas.DataFrame - If string, it is the path to the annotations file, which should be - a table with the first column being the gene IDs and the first row - being the column names for the annotations. Alternatively, you can - directly pass in a preloaded pandas.DataFrame. - - sep - string, default ',' - The delimeter used in the file. Ignored if passing in a preloaded - pandas.DataFrame. - - """ - if isinstance(aname, pd.DataFrame): - ann = aname - else: - ann = pd.read_csv(aname, sep=sep, index_col=0) - - for i in range(ann.shape[1]): - self.adata_raw.var[ann.columns[i]] = ann[ann.columns[i]] - self.adata.var[ann.columns[i]] = ann[ann.columns[i]] - - def load_obs_annotations(self, aname, sep=","): - """Loads cell annotations. - - Loads the cell annotations into .adata_raw.obs and .adata.obs. The keys - added correspond to the column labels in the input table. - - Parameters - ---------- - aname - string or pandas.DataFrame - If string, it is the path to the annotations file, which should be - a table with the first column being the cell IDs and the first row - being the column names for the annotations. Alternatively, you can - directly pass in a preloaded pandas.DataFrame. - - sep - string, default ',' - The delimeter used in the file. Ignored if passing in a preloaded - pandas.DataFrame. 
- - """ - if isinstance(aname, pd.DataFrame): - ann = aname - else: - ann = pd.read_csv(aname, sep=sep, index_col=0) - - for i in range(ann.shape[1]): - self.adata_raw.obs[ann.columns[i]] = ann[ann.columns[i]] - self.adata.obs[ann.columns[i]] = ann[ann.columns[i]] - - def scatter( - self, - projection=None, - c=None, - colorspec=None, - cmap="rainbow", - linewidth=0.0, - edgecolor="k", - axes=None, - colorbar=True, - s=10, - **kwargs - ): - - """Display a scatter plot. - Displays a scatter plot using the SAM projection or another input - projection. - - Parameters - ---------- - projection - string, numpy.ndarray, default None - A case-sensitive string indicating the projection to display (a key - in adata.obsm) or a 2D numpy array with cell coordinates. If None, - projection defaults to UMAP. - - c - string, numpy.ndarray, default None - Categorical data to be mapped to a colormap and overlaid on top of - the projection. Can be a key from adata.obs or a 1D numpy array. - - colorspec - string, numpy.ndarray, default None - A string specifying a color or an array specifying the color - for each point (can be strings, RGBA, RGB, etc). Colorbar will be - turned off if `colorspec` is not None. - - axes - matplotlib axis, optional, default None - Plot output to the specified, existing axes. If None, create new - figure window. - - **kwargs - all keyword arguments in matplotlib.pyplot.scatter are eligible. - """ - - try: - import matplotlib.pyplot as plt - - if isinstance(projection, str): - try: - dt = self.adata.obsm[projection] - except KeyError: - print( - "Please create a projection first using run_umap or" "run_tsne" - ) - - elif projection is None: - try: - dt = self.adata.obsm["X_umap"] - except KeyError: - try: - dt = self.adata.obsm["X_tsne"] - except KeyError: - print( - "Please create either a t-SNE or UMAP projection" "first." 
- ) - return - else: - dt = projection - - if axes is None: - plt.figure() - axes = plt.gca() - - if colorspec is not None: - axes.scatter( - dt[:, 0], - dt[:, 1], - s=s, - linewidth=linewidth, - edgecolor=edgecolor, - c=colorspec, - **kwargs - ) - elif c is None: - axes.scatter( - dt[:, 0], - dt[:, 1], - s=s, - linewidth=linewidth, - edgecolor=edgecolor, - **kwargs - ) - else: - - if isinstance(c, str): - try: - c = self.get_labels(c) - except KeyError: - 0 # do nothing - - if (isinstance(c[0], str) or isinstance(c[0], np.str_)) and ( - isinstance(c, np.ndarray) or isinstance(c, list) - ): - i = ut.convert_annotations(c) - ui, ai = np.unique(i, return_index=True) - cax = axes.scatter( - dt[:, 0], - dt[:, 1], - c=i, - cmap=cmap, - s=s, - linewidth=linewidth, - edgecolor=edgecolor, - **kwargs - ) - - if colorbar: - cbar = plt.colorbar(cax, ax=axes, ticks=ui) - cbar.ax.set_yticklabels(c[ai]) - else: - if not (isinstance(c, np.ndarray) or isinstance(c, list)): - colorbar = False - i = c - - cax = axes.scatter( - dt[:, 0], - dt[:, 1], - c=i, - cmap=cmap, - s=s, - linewidth=linewidth, - edgecolor=edgecolor, - **kwargs - ) - - if colorbar: - plt.colorbar(cax, ax=axes) - return axes - except ImportError: - print("matplotlib not installed!") - - def show_gene_expression(self, gene, avg=True, axes=None, **kwargs): - """Display a gene's expressions. - Displays a scatter plot using the SAM projection or another input - projection with a particular gene's expressions overlaid. - Parameters - ---------- - gene - string - a case-sensitive string indicating the gene expression pattern - to display. - avg - bool, optional, default True - If True, the plots use the k-nearest-neighbor-averaged expression - values to smooth out noisy expression patterns and improves - visualization. - axes - matplotlib axis, optional, default None - Plot output to the specified, existing axes. If None, create new - figure window. - **kwargs - all keyword arguments in 'SAM.scatter' are eligible. 
- """ - all_gene_names = np.array(list(self.adata.var_names)) - cell_names = np.array(list(self.adata.obs_names)) - all_cell_names = np.array(list(self.adata_raw.obs_names)) - idx2 = np.where(np.in1d(all_cell_names, cell_names))[0] - idx = np.where(all_gene_names == gene)[0] - name = gene - if idx.size == 0: - print( - "Gene note found in the filtered dataset. Note that genes " - "are case sensitive." - ) - return - - if avg: - a = self.adata.layers["X_knn_avg"][:, idx].toarray().flatten() - if a.sum() == 0: - a = self.adata_raw.X[:, idx].toarray().flatten()[idx2] - try: - norm = self.preprocess_args["norm"] - except KeyError: - norm = "log" - if norm is not None: - if norm.lower() == "log": - a = np.log2(a + 1) - - elif norm.lower() == "ftt": - a = np.sqrt(a) + np.sqrt(a + 1) - elif norm.lower() == "asin": - a = np.arcsinh(a) - else: - a = self.adata_raw.X[:, idx].toarray().flatten()[idx2] - try: - norm = self.preprocess_args["norm"] - except KeyError: - norm = "log" - - if norm is not None: - if norm.lower() == "log": - a = np.log2(a + 1) - - elif norm.lower() == "ftt": - a = np.sqrt(a) + np.sqrt(a + 1) - elif norm.lower() == "asin": - a = np.arcsinh(a) - - axes = self.scatter(c=a, axes=axes, **kwargs) - axes.set_title(name) - - return axes, a - - def dispersion_ranking_NN(self, nnm=None, num_norm_avg=50, weight_mode='combined',save_avgs=False, adata=None): - """Computes the spatial dispersion factors for each gene. - - Parameters - ---------- - nnm - scipy.sparse, default None - Square cell-to-cell nearest-neighbor matrix. If None, uses the - nearest neighbor matrix in .adata.obsp['connectivities'] - - num_norm_avg - int, optional, default 50 - The top 'num_norm_avg' dispersions are averaged to determine the - normalization factor when calculating the weights. This ensures - that outlier genes do not significantly skew the weight - distribution. - - Returns: - ------- - weights - ndarray, float - The vector of gene weights. 
- """ - if adata is None: - adata = self.adata - - if nnm is None: - nnm = adata.obsp["connectivities"] - f = nnm.sum(1).A - f[f==0]=1 - D_avg = (nnm.multiply(1 / f)).dot(adata.layers["X_disp"]) - - if save_avgs: - adata.layers["X_knn_avg"] = D_avg.copy() - - if sp.issparse(D_avg): - mu, var = sf.mean_variance_axis(D_avg, axis=0) - if weight_mode == 'rms': - D_avg.data[:]=D_avg.data**2 - mu,_ =sf.mean_variance_axis(D_avg, axis=0) - mu=mu**0.5 - - if weight_mode == 'combined': - D_avg.data[:]=D_avg.data**2 - mu2,_ =sf.mean_variance_axis(D_avg, axis=0) - mu2=mu2**0.5 - else: - mu = D_avg.mean(0) - var = D_avg.var(0) - if weight_mode == 'rms': - mu =(D_avg**2).mean(0)**0.5 - if weight_mode == 'combined': - mu2 =(D_avg**2).mean(0)**0.5 - - - if not save_avgs: - del D_avg - gc.collect() - - if weight_mode == 'dispersion' or weight_mode == 'rms' or weight_mode == 'combined': - dispersions = np.zeros(var.size) - dispersions[mu > 0] = var[mu > 0] / mu[mu > 0] - adata.var["spatial_dispersions"] = dispersions.copy() - - if weight_mode == 'combined': - dispersions2 = np.zeros(var.size) - dispersions2[mu2 > 0] = var[mu2 > 0] / mu2[mu2 > 0] - - - elif weight_mode == 'variance': - dispersions = var - adata.var["spatial_variances"] = dispersions.copy() - else: - raise ValueError('`weight_mode` ',weight_mode,' not recognized.') - - ma = np.sort(dispersions)[-num_norm_avg:].mean() - dispersions[dispersions >= ma] = ma - - weights = ((dispersions / dispersions.max()) ** 0.5).flatten() - - if weight_mode == 'combined': - ma = np.sort(dispersions2)[-num_norm_avg:].mean() - dispersions2[dispersions2 >= ma] = ma - - weights2 = ((dispersions2 / dispersions2.max()) ** 0.5).flatten() - weights = np.vstack((weights,weights2)).max(0) - - return weights - - def run( - self, - max_iter=10, - verbose=True, - projection="umap", - stopping_condition=1e-2, - num_norm_avg=50, - k=20, - distance="cosine", - preprocessing="StandardScaler", - npcs=150, - n_genes=3000, - weight_PCs=False, - 
sparse_pca=False, - proj_kwargs={}, - seed = 0, - weight_mode='rms', - components=None, - batch_key=None - ): - """Runs the Self-Assembling Manifold algorithm. - - Parameters - ---------- - k - int, optional, default 20 - The number of nearest neighbors to identify for each cell. - - distance : string, optional, default 'correlation' - The distance metric to use when identifying nearest neighbors. - Can be any of the distance metrics supported by sklearn's 'pdist'. - - max_iter - int, optional, default 10 - The maximum number of iterations SAM will run. - - stopping_condition - float, optional, default 5e-3 - The stopping condition threshold for the RMSE between gene weights - in adjacent iterations. - - verbose - bool, optional, default True - If True, the iteration number and error between gene weights in - adjacent iterations will be displayed. - - projection - str, optional, default 'umap' - If 'tsne', generates a t-SNE embedding. If 'umap', generates a UMAP - embedding. Otherwise, no embedding will be generated. - - preprocessing - str, optional, default 'StandardScaler' - If 'Normalizer', use sklearn.preprocessing.Normalizer, which - normalizes expression data prior to PCA such that each cell has - unit L2 norm. If 'StandardScaler', use - sklearn.preprocessing.StandardScaler, which normalizes expression - data prior to PCA such that each gene has zero mean and unit - variance. Otherwise, do not normalize the expression data. We - recommend using 'StandardScaler' for large datasets and - 'Normalizer' otherwise. - - npcs - int, optional, default 150 - Sets the number of principal components to use. - - n_genes - int, optional, default 3000 - Sets the number of genes to include prior to PCA. Genes are selected - based on their SAM weights. If `None`, all genes are selected. - In this case, it is recommended to set `sparse_pca` equal to True. 
- - num_norm_avg - int, optional, default 50 - The top 'num_norm_avg' dispersions are averaged to determine the - normalization factor when calculating the weights. This prevents - genes with large spatial dispersions from skewing the distribution - of weights. - - sparse_pca - bool, optional, default False - If True, uses an implementation of PCA that accepts sparse inputs. - This is worth setting True for large datasets, where memory - constraints start becoming noticeable. - - weight_PCs - bool, optional, default False - Scale the principal components by their eigenvalues. If many - cell populations are expected, it is recommended to set this False - as the populations may be found on lesser-varying PCs. - - weight_mode - str, optional, default 'combined' - Can be one of 'dispersion', 'variance', 'rms', 'combined'. - 'dispersion' - Gene weights are calculated as the fano factors - (variance / mean) of kNN-averaged expressions. - - 'variance' - Gene weights are calculated as the variances of - kNN-averaged expressions. - - 'rms' - Gene weights are calculated as a modified fano factor - (variance / root-mean square) of kNN-averaged expressions. - This makes the weight more robust to genes with extremely low - mean expressions. - - 'combined' - Gene weights are calculated as the maximum of the - weights calculated from 'dispersion' and 'rms' modes. - - - proj_kwargs - dict, optional, default {} - A dictionary of keyword arguments to pass to the projection - functions. - """ - D = self.adata.X - if k < 5: - k = 5 - if k > D.shape[0] - 1: - k = D.shape[0] - 2 - - if preprocessing not in ["StandardScaler", "Normalizer", None, "None"]: - raise ValueError( - "preprocessing must be 'StandardScaler', 'Normalizer', or None" - ) - if weight_mode not in ["dispersion", "variance", "rms", "combined"]: - raise ValueError( - "weight_mode must be 'dispersion', 'variance', 'rms', or 'combined'." 
- ) - - if self.adata.layers['X_disp'].min() < 0 and weight_mode == 'dispersion': - print("`X_disp` layer contains negative values. Setting `weight_mode` to 'rms'.") - weight_mode = 'rms' - - numcells = D.shape[0] - - if n_genes == None: - n_genes = self.adata.shape[1] - if not sparse_pca and numcells > 10000: - warnings.warn('All genes are being used. It is recommended ' - 'to set `sparse_pca=False` to satisfy memory ' - 'constraints for datasets with more than ' - '10,000 cells. Setting `sparse_pca` to True.') - sparse_pca=True - - if not sparse_pca: - n_genes = min(n_genes, (D.sum(0) > 0).sum()) - - self.run_args = { - "max_iter": max_iter, - "verbose": verbose, - "projection": projection, - "stopping_condition": stopping_condition, - "num_norm_avg": num_norm_avg, - "k": k, - "distance": distance, - "preprocessing": preprocessing, - "npcs": npcs, - "n_genes": n_genes, - "weight_PCs": weight_PCs, - "proj_kwargs": proj_kwargs, - "sparse_pca": sparse_pca, - "weight_mode": weight_mode, - "seed": seed, - "components": components, - } - self.adata.uns["run_args"] = self.run_args - - - tinit = time.time() - np.random.seed(seed) - - if verbose: - print("RUNNING SAM") - - W = np.ones(D.shape[1]) - self.adata.var['weights'] = W - - old = np.zeros(W.size) - new = W - - i = 0 - err = ((new - old) ** 2).mean() ** 0.5 - - if max_iter < 5: - max_iter = 5 - - nnas = num_norm_avg - - while i < max_iter and err > stopping_condition: - - conv = err - if verbose: - print("Iteration: " + str(i) + ", Convergence: " + str(conv)) - - i += 1 - old = new - if i == 1: - first=True - else: - first=False - - W = self.calculate_nnm(batch_key=batch_key, - n_genes=n_genes, preprocessing=preprocessing, npcs=npcs, num_norm_avg=nnas, - weight_PCs=weight_PCs, sparse_pca=sparse_pca,weight_mode=weight_mode,seed=seed,components=components,first=first - ) - gc.collect() - new = W - err = ((new - old) ** 2).mean() ** 0.5 - self.adata.var['weights']=W - - all_gene_names = 
np.array(list(self.adata.var_names)) - indices = np.argsort(-W) - ranked_genes = all_gene_names[indices] - - self.adata.uns["ranked_genes"] = ranked_genes - - if projection == "tsne": - if verbose: - print("Computing the t-SNE embedding...") - self.run_tsne(**proj_kwargs) - elif projection == "umap": - if verbose: - print("Computing the UMAP embedding...") - self.run_umap(seed=seed,**proj_kwargs) - elif projection == "diff_umap": - if verbose: - print("Computing the diffusion UMAP embedding...") - self.run_diff_umap(**proj_kwargs) - - elapsed = time.time() - tinit - if verbose: - print("Elapsed time: " + str(elapsed) + " seconds") - - def calculate_nnm( - self, adata=None,batch_key=None,g_weighted=None, n_genes=3000, preprocessing='StandardScaler', npcs=150, num_norm_avg=50, weight_PCs=False, sparse_pca=False, - update_manifold=True,weight_mode='dispersion',seed=0,components=None,first=False - ): - if adata is None: - adata = self.adata - - numcells = adata.shape[0] - k = adata.uns['run_args'].get("k", 20) - distance = adata.uns['run_args'].get("distance", "correlation") - - D = adata.X - W = adata.var["weights"].values - - - if 'means' not in adata.var.keys() or 'variances' not in adata.var.keys(): - self.calculate_mean_var(adata) - - if n_genes is None: - gkeep = np.arange(W.size) - else: - if first: - mu = np.array(list(adata.var['means'])) - var = np.array(list(adata.var['variances'])) - mu[mu==0]=1 - dispersions = var/mu - gkeep = np.sort(np.argsort(-dispersions)[:n_genes]) - else: - gkeep = np.sort(np.argsort(-W)[:n_genes]) - - if g_weighted is None: - if preprocessing == "Normalizer": - Ds = D[:, gkeep] - if sp.issparse(Ds) and not sparse_pca: - Ds = Ds.toarray() - - Ds = Normalizer().fit_transform(Ds) - - elif preprocessing == "StandardScaler": - if not sparse_pca: - Ds = D[:, gkeep] - if sp.issparse(Ds): - Ds = Ds.toarray() - - v = adata.var['variances'].values[gkeep] - m = adata.var['means'].values[gkeep] - v[v==0]=1 - Ds = (Ds - m)/v**0.5 - - Ds[Ds > 10] 
= 10 - Ds[Ds < -10] = -10 - else: - Ds = D[:, gkeep] - v = adata.var['variances'].values[gkeep] - v[v==0]=1 - Ds = Ds.multiply(1/v**0.5).tocsr() - - else: - Ds = D[:, gkeep].toarray() - - if sp.issparse(Ds): - D_sub = Ds.multiply(W[gkeep]).tocsr() - else: - D_sub = Ds * (W[gkeep]) - - if components is None: - if not sparse_pca: - npcs = min(npcs, min((D.shape[0],gkeep.size))) - if numcells > 500: - g_weighted, pca = ut.weighted_PCA( - D_sub, - npcs=npcs, - do_weight=weight_PCs, - solver="auto",seed=seed - ) - else: - g_weighted, pca = ut.weighted_PCA( - D_sub, - npcs=npcs, - do_weight=weight_PCs, - solver="full",seed=seed - ) - pca_obj = pca - components = pca.components_ - - else: - npcs=min(npcs, min((D.shape[0],gkeep.size)) - 1) - v = adata.var['variances'].values[gkeep] - v[v==0]=1 - m = adata.var['means'].values[gkeep]*W[gkeep] - no = (m/v**0.5) if preprocessing=='StandardScaler' else D_sub.mean(0).A.flatten() - mean_correction = no - output = ut._pca_with_sparse(D_sub, npcs,mu = (no)[None,:], seed = seed) - components = output['components'] - g_weighted = output['X_pca'] - - if weight_PCs: - ev = output['variance'] - ev = ev / ev.max() - g_weighted = g_weighted * (ev ** 0.5) - else: - components = components[:,gkeep] - v = adata.var['variances'].values[gkeep] - v[v==0]=1 - m = adata.var['means'].values[gkeep]*W[gkeep] - ns = (m/v**0.5) if preprocessing=='StandardScaler' else D_sub.mean(0).A.flatten() - mean_correction = ns - - if sp.issparse(D_sub): - g_weighted = D_sub.dot(components.T) - ns.flatten().dot(components.T) - else: - g_weighted = (D_sub-ns).dot(components.T) - if weight_PCs: - ev = g_weighted.var(0) #fix this? 
- ev = ev / ev.max() - g_weighted = g_weighted * (ev ** 0.5) - - adata.varm["PCs"] = np.zeros(shape=(adata.n_vars, npcs)) - adata.varm["PCs"][gkeep] = components.T - adata.obsm["X_processed"] = D_sub - adata.uns["dimred_indices"] = gkeep - if sparse_pca: - mc = np.zeros(adata.shape[1]) - mc[gkeep] = mean_correction - adata.var["mean_correction"] = mc - - if distance == "euclidean": - pass;#g_weighted = Normalizer().fit_transform(g_weighted) - - if batch_key is not None: - harmony_out = harmonypy.run_harmony(g_weighted, adata.obs, batch_key, verbose=False) - g_weighted = harmony_out.Z_corr.T - - - if update_manifold: - edm = ut.calc_nnm(g_weighted, k, distance) - adata.obsp['distances'] = edm.tolil() - adata.obsp['distances'].setdiag(0) - adata.obsp['distances']=adata.obsp['distances'].tocsr() - - EDM = edm.copy() - EDM.data[:] = 1 - EDM = EDM.tolil(); EDM.setdiag(1); EDM = EDM.tocsr(); - - adata.obsp['connectivities'] = EDM - - if distance in ['correlation','cosine']: #keep edge weights and store in nnm if distance is bounded - edm.data[:] = 1-edm.data - edm = edm.tolil(); edm.setdiag(1); edm = edm.tocsr(); - edm.data[edm.data<0]=0.001 #if negative correlation, set close to zero but not zero to preserve kNN structure - adata.obsp['nnm'] = edm - else: - adata.obsp['nnm'] = EDM - W = self.dispersion_ranking_NN(EDM, weight_mode=weight_mode, num_norm_avg=num_norm_avg, adata=adata) - adata.obsm["X_pca"] = g_weighted - ge = np.array(list(adata.var_names[gkeep])) - return W - else: - print('Not updating the manifold...') - PCs = np.zeros(shape=(adata.n_vars, npcs)) - PCs[gkeep] = components.T - return PCs,g_weighted - - def run_tsne(self, X=None, metric="correlation", **kwargs): - """Wrapper for sklearn's t-SNE implementation. - - See sklearn for the t-SNE documentation. All arguments are the same - with the exception that 'metric' is set to 'correlation' by default. 
- """ - if X is not None: - dt = man.TSNE(metric=metric, **kwargs).fit_transform(X) - return dt - - else: - distance = self.adata.uns['run_args'].get("distance", "correlation") - dt = man.TSNE(metric=distance, **kwargs).fit_transform( - self.adata.obsm["X_pca"] - ) - tsne2d = dt - self.adata.obsm["X_tsne"] = tsne2d - - def run_umap(self, X="X_pca", metric=None,seed = None, **kwargs): - """Wrapper for umap-learn. - - See https://github.com/lmcinnes/umap sklearn for the documentation - and source code. - """ - - import umap as umap - - if metric is None: - metric = self.adata.uns['run_args'].get("distance", "correlation") - - if seed is None: - seed = self.adata.uns['run_args'].get('seed',0) - - if type(X) is str: - if X == "": - X = self.adata.X - else: - X = self.adata.obsm[X] - # print(X.shape) - umap_obj = umap.UMAP(metric=metric,random_state=seed, **kwargs) - umap2d = umap_obj.fit_transform(X) - self.adata.obsm["X_umap"] = umap2d - return umap_obj - else: - umap_obj = umap.UMAP(metric=metric,random_state=seed, **kwargs) - dt = umap_obj.fit_transform(X) - return dt, umap_obj - - def run_diff_umap( - self, use_rep="X_pca", metric="euclidean", n_comps=15, method="gauss", **kwargs - ): - """ - Experimental -- running UMAP on the diffusion components. Requires scanpy. 
- """ - import scanpy.api as sc - - k = self.adata.uns['run_args'].get("k", 20) - distance = self.adata.uns['run_args'].get("distance", "correlation") - sc.pp.neighbors( - self.adata, use_rep=use_rep, n_neighbors=k, metric=distance, method=method - ) - sc.tl.diffmap(self.adata, n_comps=n_comps) - sc.pp.neighbors( - self.adata, - use_rep="X_diffmap", - n_neighbors=k, - metric="euclidean", - method=method, - ) - - if "X_umap" in self.adata.obsm.keys(): - temp = self.adata.obsm["X_umap"].copy() - - sc.tl.umap(self.adata, min_dist=0.1, copy=False) - temp2 = self.adata.obsm["X_umap"] - self.adata.obsm["X_umap"] = temp - self.adata.obsm["X_diff_umap"] = temp2 - - def run_diff_map( - self, use_rep="X_pca", metric="euclidean", n_comps=15, method="gauss", **kwargs - ): - import scanpy.api as sc - - k = self.adata.uns['run_args'].get("k", 20) - distance = self.adata.uns['run_args'].get("distance", "correlation") - sc.pp.neighbors( - self.adata, use_rep=use_rep, n_neighbors=k, metric=distance, method=method - ) - sc.tl.diffmap(self.adata, n_comps=n_comps + 1) - self.adata.obsm["X_diffmap"] = self.adata.obsm["X_diffmap"][:, 1:] - - def density_clustering(self, X=None, eps=1, metric="euclidean", **kwargs): - from sklearn.cluster import DBSCAN - - if X is None: - X = self.adata.obsm["X_umap"] - save = True - else: - save = False - - cl = DBSCAN(eps=eps, metric=metric, **kwargs).fit_predict(X) - k = self.adata.uns['run_args'].get("k", 20) - idx0 = np.where(cl != -1)[0] - idx1 = np.where(cl == -1)[0] - if idx1.size > 0 and idx0.size > 0: - xcmap = ut.generate_euclidean_map(X[idx0, :], X[idx1, :]) - knn = np.argsort(xcmap.T, axis=1)[:, :k] - nnm = np.zeros(xcmap.shape).T - nnm[ - np.tile(np.arange(knn.shape[0])[:, None], (1, knn.shape[1])).flatten(), - knn.flatten(), - ] = 1 - nnmc = np.zeros((nnm.shape[0], cl.max() + 1)) - for i in range(cl.max() + 1): - nnmc[:, i] = nnm[:, cl[idx0] == i].sum(1) - - cl[idx1] = np.argmax(nnmc, axis=1) - - if save: - 
self.adata.obs["dbscan_clusters"] = pd.Categorical(cl) - else: - return cl - - def clustering(self, X=None, param=None, method="leiden"): - """A wrapper for clustering the SAM output using various clustering - algorithms - - Parameters - ---------- - X - data, optional, default None - Data to be passed into the selected clustering algorithm. If None, - uses the data in the SAM AnnData object. Different clustering - algorithms accept different types of data. For example, Louvain - and Leiden accept a scipy.sparse adjacency matrix representing the - nearest neighbor graph. Dbscan, kmeans, and Hdbscan accept - coordinates (like UMAP coordinates or PCA coordinates). If None, - cluster results are saved to the 'obs' attribute in the AnnData - object. Otherwise, cluster results are returned. - - param : float, optional, default None - The parameter used for the different clustering algorithms. For - louvain and leiden, it is the resolution parameter. For dbscan, it - is the distance parameter. For kmeans, it is the number of clusters. - - method : string, optional, default 'leiden' - Determines which clustering method is run. - 'leiden' - Leiden clustering with modularity optimization - 'leiden_sig' - Leiden clustering with significance optimization - 'louvain' - Louvain clustering with modularity optimization - 'louvain_sig' - Louvain clustering with significance optimization - 'kmeans' - Kmeans clustering - 'dbscan' - DBSCAN clustering - 'hdbscan' - HDBSCAN clustering - If X is None, cluster assignments are saved in '.adata.obs' with - key name equal to method + '_clusters'. Otherwise, they are returned. 
- """ - if method == "leiden": - if param is None: - param = 1 - cl = self.leiden_clustering(X=X, res=param, method="modularity") - elif method == "leiden_sig": - if param is None: - param = 1 - cl = self.leiden_clustering(X=X, res=param, method="significance") - elif method == "louvain": - if param is None: - param = 1 - cl = self.louvain_clustering(X=X, res=param, method="modularity") - elif method == "louvain_sig": - if param is None: - param = 1 - cl = self.louvain_clustering(X=X, res=param, method="significance") - elif method == "kmeans": - if param is None: - param = 6 - cl = self.kmeans_clustering(param, X=X) - elif method == "hdbscan": - if param is None: - param = 25 - cl = self.hdbknn_clustering(npcs=param) - elif method == "dbscan": - if param is None: - param = 0.5 - cl = self.density_clustering(eps=param) - else: - cl = None - return cl - - def louvain_clustering(self, X=None, res=1, method="modularity"): - """Runs Louvain clustering using the vtraag implementation. Assumes - that 'louvain' optional dependency is installed. - - Parameters - ---------- - res - float, optional, default 1 - The resolution parameter which tunes the number of clusters Louvain - finds. - - method - str, optional, default 'modularity' - Can be 'modularity' or 'significance', which are two different - optimizing funcitons in the Louvain algorithm. 
- - """ - - if X is None: - X = self.adata.obsp["connectivities"] - save = True - else: - if not sp.isspmatrix_csr(X): - X = sp.csr_matrix(X) - save = False - - import igraph as ig - import louvain - - adjacency = X - sources, targets = adjacency.nonzero() - weights = adjacency[sources, targets] - if isinstance(weights, np.matrix): - weights = weights.A1 - g = ig.Graph(directed=True) - g.add_vertices(adjacency.shape[0]) - g.add_edges(list(zip(sources, targets))) - try: - g.es["weight"] = weights - except BaseException: - pass - - if method == "significance": - cl = louvain.find_partition(g, louvain.SignificanceVertexPartition) - else: - cl = louvain.find_partition( - g, louvain.RBConfigurationVertexPartition, resolution_parameter=res - ) - - if save: - if method == "modularity": - self.adata.obs["louvain_clusters"] = pd.Categorical( - np.array(cl.membership) - ) - elif method == "significance": - self.adata.obs["louvain_sig_clusters"] = pd.Categorical( - np.array(cl.membership) - ) - else: - return np.array(cl.membership) - - def kmeans_clustering(self, numc, X=None, npcs=25): - """Performs k-means clustering. - - Parameters - ---------- - numc - int - Number of clusters - - npcs - int, optional, default 25 - Number of principal components to use as inpute for k-means - clustering. 
- - """ - - from sklearn.cluster import KMeans - - if X is None: - X = self.adata.obsm['X_pca'] - - km = KMeans(n_clusters=numc) - cl = km.fit_predict(Normalizer().fit_transform(X)) - - self.adata.obs["kmeans_clusters"] = pd.Categorical(cl) - return cl, km - - def leiden_clustering(self, X=None, res=1, method="modularity", seed = 0): - - if X is None: - X = self.adata.obsp["connectivities"] - save = True - else: - if not sp.isspmatrix_csr(X): - X = sp.csr_matrix(X) - save = False - - import igraph as ig - import leidenalg - - adjacency = X - sources, targets = adjacency.nonzero() - weights = adjacency[sources, targets] - if isinstance(weights, np.matrix): - weights = weights.A1 - g = ig.Graph(directed=True) - g.add_vertices(adjacency.shape[0]) - g.add_edges(list(zip(sources, targets))) - try: - g.es["weight"] = weights - except BaseException: - pass - - if method == "significance": - cl = leidenalg.find_partition(g, leidenalg.SignificanceVertexPartition,seed=seed) - else: - cl = leidenalg.find_partition( - g, leidenalg.RBConfigurationVertexPartition, resolution_parameter=res,seed=seed - ) - - if save: - if method == "modularity": - self.adata.obs["leiden_clusters"] = pd.Categorical( - np.array(cl.membership) - ) - elif method == "significance": - self.adata.obs["leiden_sig_clusters"] = pd.Categorical( - np.array(cl.membership) - ) - else: - return np.array(cl.membership) - - def hdbknn_clustering(self, X=None, k=None, npcs=15, **kwargs): - import hdbscan - - if X is None: - X = self.adata.obsm['X_pca'] - X = Normalizer().fit_transform(X) - save = True - else: - save = False - - if k is None: - k = self.adata.uns['run_args'].get("k", 20) - - hdb = hdbscan.HDBSCAN(metric="euclidean", **kwargs) - - cl = hdb.fit_predict(X) - - idx0 = np.where(cl != -1)[0] - idx1 = np.where(cl == -1)[0] - if idx1.size > 0 and idx0.size > 0: - xcmap = ut.generate_euclidean_map(X[idx0, :], X[idx1, :]) - knn = np.argsort(xcmap.T, axis=1)[:, :k] - nnm = np.zeros(xcmap.shape).T - nnm[ - 
np.tile(np.arange(knn.shape[0])[:, None], (1, knn.shape[1])).flatten(), - knn.flatten(), - ] = 1 - nnmc = np.zeros((nnm.shape[0], cl.max() + 1)) - for i in range(cl.max() + 1): - nnmc[:, i] = nnm[:, cl[idx0] == i].sum(1) - - cl[idx1] = np.argmax(nnmc, axis=1) - - if save: - self.adata.obs["hdbscan_clusters"] = pd.Categorical(cl) - else: - return cl - - def identify_marker_genes_rf(self, labels=None, clusters=None, n_genes=4000): - """ - Ranks marker genes for each cluster using a random forest - classification approach. - - Parameters - ---------- - - labels - numpy.array or str, optional, default None - Cluster labels to use for marker gene identification. - Can also be a string corresponding to any of the keys - in adata.obs. - - clusters - int/string or array-like, default None - A cluster ID (int or string depending on the labels used) - or vector corresponding to the specific cluster ID(s) for - which marker genes will be calculated. If None, marker genes - will be computed for all clusters, and the result will be written - to adata.uns. - - n_genes - int, optional, default 4000 - By default, trains the classifier on the top 4000 SAM-weighted - genes. - - Returns - ------- - (dictionary of markers for each cluster, - dictionary of marker scores for each cluster) - """ - - if labels is None: - try: - keys = np.array(list(self.adata.obs_keys())) - lbls = self.get_labels(ut.search_string(keys, "_clusters")[0][0]) - except KeyError: - print( - "Please generate cluster labels first or set the " - "'labels' keyword argument." 
- ) - return - elif isinstance(labels, str): - lbls = self.get_labels(labels) - else: - lbls = labels - - from sklearn.ensemble import RandomForestClassifier - - markers = {} - markers_scores = {} - if clusters == None: - lblsu = np.unique(lbls) - else: - lblsu = np.unique(clusters) - - indices = np.argsort(-self.adata.var["weights"].values) - X = self.adata.layers["X_disp"][:, indices[:n_genes]].toarray() - for K in range(lblsu.size): - # print(K) - y = np.zeros(lbls.size) - - y[lbls == lblsu[K]] = 1 - - clf = RandomForestClassifier( - n_estimators=100, max_depth=None, random_state=0 - ) - - clf.fit(X, y) - - idx = np.argsort(-clf.feature_importances_) - - markers[lblsu[K]] = self.adata.uns["ranked_genes"][idx] - markers_scores[lblsu[K]] = clf.feature_importances_[idx] - - if clusters is None: - if isinstance(labels, str): - self.adata.uns["rf_" + labels] = markers - else: - self.adata.uns["rf"] = markers - - return markers, markers_scores - - def identify_marker_genes_sw(self, labels=None, clusters=None, inplace=True): - """ - Ranks marker genes for each cluster using partial sums of spatial - dispersions. - - Parameters - ---------- - - labels - numpy.array or str, optional, default None - Cluster labels to use for marker gene identification. - Can also be a string corresponding to any of the keys - in adata.obs. - - clusters - int/string or array-like, default None - A cluster ID (int or string depending on the labels used) - or vector corresponding to the specific cluster ID(s) for - which marker genes will be calculated. If None, marker genes - will be computed for all clusters, and the result will be written - to adata.uns. 
- - inplace - bool, default True - If True, returns nothing and places marker scores in `var` - - """ - - if labels is None: - try: - keys = np.array(list(self.adata.obs_keys())) - lbls = self.get_labels(ut.search_string(keys, "_clusters")[0][0]) - except KeyError: - print( - "Please generate cluster labels first or set the " - "'labels' keyword argument." - ) - return - elif isinstance(labels, str): - lbls = self.get_labels(labels) - else: - lbls = labels - - markers_scores = [] - if clusters == None: - lblsu = np.unique(lbls) - else: - lblsu = np.unique(clusters) - - if "X_knn_avg" not in list(self.adata.layers.keys()): - print("Performing kNN-averaging...") - self.dispersion_ranking_NN(save_avgs=True) - l = self.adata.layers["X_knn_avg"] - m = l.mean(0).A.flatten() - cells = np.array(list(self.adata.obs_names)) - for K in range(lblsu.size): - selected = np.where(np.in1d(cells, self.get_cells(lblsu[K], labels)))[0] - ms = l[selected, :].mean(0).A.flatten() - lsub = l[selected, :] - lsub.data[:] = lsub.data ** 2 - ms2 = lsub.mean(0).A.flatten() - v = ms2 - 2 * ms * m + m ** 2 - wmu = np.zeros(v.size) - wmu[m > 0] = v[m > 0] / m[m > 0] - markers_scores.append(wmu) - A = pd.DataFrame( - data=np.vstack(markers_scores), index=lblsu, columns=self.adata.var_names - ).T - if inplace: - A.columns = labels + ";;" + A.columns.astype('str').astype('object') - for Ac in A.columns: - self.adata.var[Ac]=A[Ac] - else: - return A - - def identify_marker_genes_ratio(self, labels=None): - """ - Ranks marker genes for each cluster using a SAM-weighted - expression-ratio approach (works quite well). - - Parameters - ---------- - - labels - numpy.array or str, optional, default None - Cluster labels to use for marker gene identification. If None, - assumes that one of SAM's clustering algorithms has been run. Can - be a string (i.e. 'louvain_clusters', 'kmeans_clusters', etc) to - specify specific cluster labels in adata.obs. 
- - """ - if labels is None: - try: - keys = np.array(list(self.adata.obs_keys())) - lbls = self.get_labels(ut.search_string(keys, "_clusters")[0][0]) - except KeyError: - print( - "Please generate cluster labels first or set the " - "'labels' keyword argument." - ) - return - elif isinstance(labels, str): - lbls = self.get_labels(labels) - else: - lbls = labels - - all_gene_names = np.array(list(self.adata.var_names)) - - markers = {} - - s = np.array(self.adata.layers["X_disp"].sum(0)).flatten() - lblsu = np.unique(lbls) - for i in lblsu: - d = np.array(self.adata.layers["X_disp"][lbls == i, :].sum(0)).flatten() - rat = np.zeros(d.size) - rat[s > 0] = ( - d[s > 0] ** 2 / s[s > 0] * self.adata.var["weights"].values[s > 0] - ) - x = np.argsort(-rat) - markers[i] = all_gene_names[x[:]] - - self.adata.uns["marker_genes_ratio"] = markers - - return markers - - def gui(self): - if 'SamGui' in self.__dict__.keys(): - return self.SamGui.SamPlot - else: - from samalg.gui import SAMGUI - self.SamGui = SAMGUI(self) - return self.SamGui.SamPlot - - def save(self,fn): - import dill - if len(fn.split('.pkl')) == 1: - fn = fn + '.pkl' - self.path_to_file = fn - d = {} - for k in self.__dict__.keys(): - d[k] = self.__dict__[k] - if 'SamGui' in d.keys(): - del d['SamGui'] - with open(fn,'wb') as f: - dill.dump(d,f) - if 'SamGui' in self.__dict__.keys(): - dill.dump(self.SamGui,f) - - def load(self,fn): - import dill - if len(fn.split('.pkl')) == 1: - fn = fn + '.pkl' - with open(fn,'rb') as f: - self.__dict__ = dill.load(f) - try: - self.__dict__['SamGui'] = dill.load(f) - except: - pass; diff --git a/samalg/gui.py b/samalg/gui.py deleted file mode 100644 index e7fcd35..0000000 --- a/samalg/gui.py +++ /dev/null @@ -1,1987 +0,0 @@ -import pickle -import colorlover as cl -import numpy as np -import scipy.sparse as sp -import ipywidgets as widgets -import plotly.graph_objs as go -from . import SAM -from . 
import utilities as ut -import pandas as pd - -from ipyevents import Event -from ipywidgets import Widget, Layout -import warnings -from numba.core.errors import NumbaWarning -warnings.filterwarnings("ignore", category=NumbaWarning) - -__version__ = "0.8.7" - - -class SAMGUI(object): - def __init__(self, sam=None,title=None, close_all_widgets=False, default_proj='X_umap',height=600): - self.ACTION_LOG = [] - - if close_all_widgets: - Widget.close_all() - self.widget_height = height - self.log('Setting widget height to {}'.format(height)) - - self.default_proj = default_proj - self.titles = [] - self.stab = widgets.Tab(layout={"height": "95%", "width": "50%"}) - self.stab.observe(self.on_switch_tabs, "selected_index") - - if sam is not None: - self.log('Initializing GUI from input SAM object.') - if not isinstance(sam,list): - sams=[sam] - else: - sams = sam - sam = sams[0] - if title is not None: - if not isinstance(title,list): - titles=[title] - else: - titles = title - title = titles[0] - else: - titles = [None]*len(sams) - - self.init_from_sam(sam) - if titles[0] is None: - title = 'Full dataset' - else: - title = titles[0] - - self.create_plot(0, title) - items = [self.stab, self.tab] - self.titles = [title] - for i in range(1,len(sams)): - self.sams.append(sams[i]) - self.selected.append(np.ones(sams[i].adata.shape[0]).astype("bool")) - self.selected_cells.append(np.array(list(sams[i].adata.obs_names))) - self.active_labels.append(np.zeros(sams[i].adata.shape[0])) - self.dd_opts.append(["Toggle cluster"]) - self.gene_expressions.append(np.zeros(sams[i].adata.shape[0])) - self.marker_genes.append( - np.array(list(sams[i].adata.var_names))[ - np.argsort(-sams[i].adata.var["weights"].values) - ] - ) - self.marker_genes_tt.append("Genes ranked by SAM weights.") - self.ds.append(0) - if titles[i] is None: - title = 'Full dataset {}'.format(i) - else: - title = titles[i] - self.titles.append(title) - self.create_plot(i, title) - self.update_dropdowns(i) - else: - 
def __setstate__(self, d):
    """Rebuild the GUI from a pickled state dict by re-running __init__."""
    self.__init__(sam=d['sams'], title=d['titles'],
                  height=d['widget_height'], default_proj=d['default_proj'])
    # Restore the saved attributes on top of the freshly built widgets.
    self.__dict__.update(d)
    self.__dict__['ds'] = [0] * len(self.sams)
    for idx in range(len(self.sams)):
        self.create_plot(idx, self.titles[idx])

def __getstate__(self):
    """Return a picklable snapshot of the state, minus live widget objects."""
    skip = {'SamPlot', 'children', 'ps_dict', 'rb_dict', 'cs_dict', 'stab',
            'rs_box', 'cs_box', 'pp_box', 'out', 'tab', 'ds'}
    return {k: v for k, v in self.__dict__.items() if k not in skip}

def log(self, s):
    """Append an entry to the in-memory action log."""
    self.ACTION_LOG.append(s)
def close_all_tabs(self):
    """Close every subcluster tab and its per-tab state, keeping only tab 0."""
    self.log('Closing all tabs except tab 0.')
    self.stab.set_trait("selected_index", 0)
    pos = 1
    # Repeatedly remove position 1 until only the full-dataset tab remains.
    while len(self.sams) > 1:
        self.ds[pos].close()
        self.stab.children[pos].close()
        for state in (self.ds, self.sams, self.selected, self.selected_cells,
                      self.active_labels, self.dd_opts, self.marker_genes,
                      self.marker_genes_tt, self.gene_expressions):
            del state[pos]
    self.stab.children = (self.stab.children[0],)

def get_scatter(self, i):
    """Return the scatter trace of plot tab `i`."""
    return self.stab.children[i].data[0]
list(self.run_args.keys()): - if isinstance(self.run_args[i], np.ndarray): - self.run_args[i] = self.run_args[i][0] - except: - self.run_args = {} - - rm = list(invalidArgs(self.sams[0].run,self.run_args)) - for k in rm: - del self.run_args[k] - self.run_args_init = self.run_args.copy() - - - - def create_plot(self, i, title): - if self.SAM_LOADED: - projs = list(self.sams[i].adata.obsm.keys()) - if self.default_proj in projs: - p = self.default_proj - elif len(projs) > 1: - p = np.array(projs)[np.where(np.array(projs) != "X_pca")[0][0]] - else: - p = "X_pca" - - if p not in projs: - xdata = [] - ydata = [] - else: - xdata = self.sams[i].adata.obsm[p][:, 0] - ydata = self.sams[i].adata.obsm[p][:, 1] - - if i < len(self.stab.children): - f1 = self.stab.children[i] - f1.update_layout(autosize=False) - f1.data = [] - else: - f1 = go.FigureWidget() - - f1.add_scattergl(x=xdata, y=ydata) - - f1.for_each_trace(self.init_graph) - f1.update_layout( - margin_l=0, - margin_r=0, - margin_t=40, - margin_b=0, - xaxis_ticks="", - xaxis_showticklabels=False, - height=self.widget_height * 0.85, - title="", - # xaxis_showgrid=False, - # xaxis_zeroline=False, - # yaxis_showgrid=False, - # yaxis_zeroline=False, - yaxis_ticks="", - yaxis_showticklabels=False, - autosize=True, - dragmode="select", - ) - f1.update_yaxes(autorange=True) - f1.update_xaxes(autorange=True) - - f1.data[0].text = list(self.active_labels[i]) - f1.data[0].hoverinfo = "text" - f1.set_trait( - "_config", - { - "displayModeBar": True, - "scrollZoom": True, - "displaylogo": False, - "edits": {"titleText": False}, - }, - ) - - if i >= len(self.stab.children): - self.stab.children += (f1,) - # self.stab.set_trait('selected_index',i) - - if type(self.ds[i]) is int: - d = Event(source=f1, watched_events=["keydown"]) - self.ds[i] = d - d.on_dom_event(self.handle_events) - - slider = self.cs_dict["THR"] - slider.set_trait("min", slider.value) - slider.set_trait("max", slider.value) - - else: - f1 = go.FigureWidget() - 
def handle_events(self, event):
    """Dispatch keyboard shortcuts received while hovering the scatter plot."""
    if event["type"] != "keydown":
        return
    key = event["key"]
    if key == "ArrowRight":
        self.log('Scrolling ranked genes slider one to the right.')
        slider = self.cs_dict["RGENES"]
        slider.set_trait("value", slider.value + 1)
    elif key == "ArrowLeft":
        self.log('Scrolling ranked genes slider one to the left.')
        slider = self.cs_dict["RGENES"]
        # Clamp at zero: the slider cannot scroll past the top-ranked gene.
        slider.set_trait("value", max(slider.value - 1, 0))
    elif key == "Shift":
        self.irm_genes(None)
    elif key == "Enter":
        self.ism_genes(None)
    elif key == "x":
        self.unselect_all(None)
    elif key == "c":
        self.select_all(None)
    elif key == "v":
        self.reset_view(None)
    elif key == "a":
        self.log('Turning off k-nn averaging of gene expression values.')
        avg = self.cs_dict["AVG"]
        avg.set_trait("value", not avg.value)
len(self.stab.children)): - self.stab.set_title(i, titles[i]) - - """ BEGIN PREPROCESS INIT""" - - def init_preprocess(self): - - pdata = widgets.Button( - description = 'Preprocess', - disabled = False, - tooltip = 'Preprocess data', - icon = '' - ) - pdata.on_click(self.preprocess_sam) - - fgenes = widgets.Checkbox( - value=bool(self.preprocess_args.get("filter_genes", True)), - description="Filter genes", - ) - fgenes.observe(self.pp_filtergenes, names="value") - dfts = widgets.Button( - description="Set defaults", - disabled=False, - tooltip="Set default options", - icon="", - ) - dfts.on_click(self.set_pp_defaults) - - l1 = widgets.Label("Expr threshold:") - expr_thr = widgets.FloatSlider( - value=float(self.preprocess_args.get("thresh", 0.01)), - min=0, - max=0.1, - step=0.005, - disabled=False, - continuous_update=False, - orientation="horizontal", - readout=True, - readout_format="2f", - style={"description_width": "initial"}, - ) - expr_thr.observe(self.et_update, names="value") - - l2 = widgets.Label("Min expr:") - min_expr = widgets.FloatSlider( - value=float(self.preprocess_args.get("min_expression", 1)), - min=0, - max=6.0, - step=0.02, - disabled=False, - continuous_update=False, - orientation="horizontal", - readout=True, - readout_format="1f", - style={"description_width": "initial"}, - ) - min_expr.observe(self.me_update, names="value") - - init = self.preprocess_args.get("sum_norm", "None") - if isinstance(init, np.ndarray): - init = init[0] - if init is None: - init = "None" - sumnorm = widgets.Dropdown( - options=["cell_median", "gene_median", "None"], - value=init, - description="Library normalization:", - disabled=False, - style={"description_width": "initial"}, - ) - sumnorm.observe(self.sumnorm_submit, "value") - - init = self.preprocess_args.get("norm", "log") - if isinstance(init, np.ndarray): - init = init[0] - if init is None: - init = "None" - norm = widgets.Dropdown( - options=["log", "ftt", "None"], - value=init, - description="Data 
normalization:", - disabled=False, - style={"description_width": "initial"}, - ) - norm.observe(self.norm_submit, "value") - - load = widgets.Button( - description="Load data", - tooltip="Enter the path to the desired data file you wish to " - "load. Accepted filetypes are .csv (comma), .txt (tab), .h5ad " - "(AnnData), and .p (pickled SAM dictionary).", - disabled=False, - ) - load.on_click(self.load_data) - load_data = widgets.Text(value="", layout={"width": "100%%"}) - - loada = widgets.Button( - description="Load annotations", - tooltip="Enter the path to the desired annotations file you wish to " - "load. Accepted filetypes are .csv (comma) or .txt (tab).", - disabled=False, - ) - loada.on_click(self.load_ann) - load_dataa = widgets.Text(value="", layout={"width": "100%%"}) - - loadv = widgets.Button( - description="Load gene annotations", - tooltip="Enter the path to the desired gene annotations file " - "you wish to load. Accepted filetypes are .csv (comma) or .txt (tab).", - disabled=False, - ) - loadv.on_click(self.load_vann) - load_datav = widgets.Text(value="", layout={"width": "100%%"}) - - self.ps_dict = {} - self.ps_dict["PDATA"] = pdata - self.ps_dict["DFTS"] = dfts - self.ps_dict["FGENES"] = fgenes - self.ps_dict["NORM"] = norm - self.ps_dict["SUMNORM"] = sumnorm - self.ps_dict["L1"] = l1 - self.ps_dict["EXPR_THR"] = expr_thr - self.ps_dict["L2"] = l2 - self.ps_dict["MIN_EXPR"] = min_expr - self.ps_dict["LOAD"] = load - self.ps_dict["LOAD_DATA"] = load_data - self.ps_dict["LOADA"] = loada - self.ps_dict["LOAD_DATAA"] = load_dataa - self.ps_dict["LOADV"] = loadv - self.ps_dict["LOAD_DATAV"] = load_datav - - pp = widgets.VBox( - [ pdata, - widgets.HBox([dfts, fgenes]), - norm, - sumnorm, - widgets.HBox([l1, expr_thr]), - widgets.HBox([l2, min_expr]), - widgets.HBox([load, load_data]), - widgets.HBox([loada, load_dataa]), - widgets.HBox([loadv, load_datav]), - ], - layout={"height": "95%", "width": "100%"}, - ) - return pp - - def load_ann(self, 
def load_vann(self, event):
    """Load a gene ('var') annotation file into every open SAM tab.

    The path is read from the LOAD_DATAV text box; failures are reported
    in the output pane rather than raised (best-effort UI behavior).
    """
    path = self.ps_dict["LOAD_DATAV"].value
    self.log('Loading gene annotation file from {}'.format(path))
    try:
        for i in range(len(self.stab.children)):
            self.sams[i].load_var_annotations(path)
            self.update_dropdowns(i)
    except Exception:
        # Fix: was a bare `except:` -- no longer traps KeyboardInterrupt,
        # but still reports any load/parse failure to the user.
        with self.out:
            print("Annotation file not found or was improperly formatted.")

def init_load(self):
    """Build the initial 'load data' button + path box shown before a SAM is loaded."""
    load = widgets.Button(
        description="Load data",
        tooltip="Enter the path to the desired data file you wish to "
        "load. Accepted filetypes are .csv (comma), .txt (tab), .h5ad "
        "(AnnData), and .p (pickled SAM dictionary).",
        disabled=False,
    )
    load.on_click(self.load_data)
    load_data = widgets.Text(value="", layout={"width": "100%"})
    return widgets.HBox([load, load_data], layout={"width": "50%"})
def me_update(self, val):
    """Observer: record the new min_expression preprocessing value."""
    self.log('Setting min_expression to be {}'.format(val['new']))
    self.preprocess_args["min_expression"] = val["new"]

def et_update(self, val):
    """Observer: record the new expression-threshold preprocessing value."""
    self.log('Setting the threshold value to be {}'.format(val['new']))
    self.preprocess_args["thresh"] = val["new"]

def sumnorm_submit(self, txt):
    """Observer: record the library-normalization choice ('None' maps to None)."""
    self.log('Setting sum_norm value to be {}'.format(txt['new']))
    self.preprocess_args["sum_norm"] = None if txt["new"] == "None" else txt["new"]

def set_pp_defaults(self, event):
    """Reset the preprocessing widgets and args to their initial values."""
    self.log('Setting preprocessing parameters to defaults.')
    self.preprocess_args = self.preprocess_args_init.copy()
    fgenes = self.ps_dict["FGENES"]  # checkbox
    sumnorm = self.ps_dict["SUMNORM"]
    norm = self.ps_dict["NORM"]
    expr_thr = self.ps_dict["EXPR_THR"]  # expr_threshold
    min_expr = self.ps_dict["MIN_EXPR"]  # min_expr

    min_expr.set_trait("value", float(self.preprocess_args.get("min_expression", 1)))
    expr_thr.set_trait("value", float(self.preprocess_args.get("thresh", 0.01)))

    init = self.preprocess_args.get("sum_norm", "None")
    if isinstance(init, np.ndarray):
        init = init[0]
    sumnorm.set_trait("value", "None" if init is None else init)

    init = self.preprocess_args.get("norm", "log")
    if isinstance(init, np.ndarray):
        init = init[0]
    norm.set_trait("value", "None" if init is None else init)

    init = bool(self.preprocess_args.get("filter_genes", True))
    # Fix: the checkbox was unconditionally reset to literal True even when
    # the stored default was False (set_run_defaults uses `init` -- be
    # consistent and reflect the actual default).
    fgenes.set_trait("value", init)
    self.preprocess_args["filter_genes"] = init
def pp_filtergenes(self, event):
    """Observer: flip the filter_genes preprocessing flag."""
    current = bool(self.preprocess_args.get("filter_genes", True))
    self.log('Setting filter_genes parameter to be {}'.format(not current))
    self.preprocess_args["filter_genes"] = not current

def preprocess_sam(self, event):
    """Run preprocessing on the SAM object of the currently selected tab."""
    tab_idx = self.stab.selected_index
    self.log('Preprocessing SAM object in tab {}'.format(tab_idx))
    self.sams[tab_idx].preprocess_data(**self.preprocess_args)
Enter the title of the new tab in the text box on the right", - icon="", - ) - runb.on_click(self.subcluster) - title = widgets.Text(value="",) - wpca = widgets.Checkbox( - value=bool(self.run_args.get("weight_PCs", True)), description="Weight PCs" - ) - wpca.observe(self.weightpcs) - - - spca = widgets.Checkbox( - value=bool(self.run_args.get("sparse_pca", True)), description="Sparse PCA" - ) - spca.observe(self.sparsepca) - - - dfts = widgets.Button( - description="Set defaults", - disabled=False, - tooltip="Set default options", - icon="", - ) - dfts.on_click(self.set_run_defaults) - - l1 = widgets.Button( - description="# genes:", - disabled=True, - tooltip="The number of highest-weighted genes to select each iteration of SAM.", - icon="", - ) - init = self.run_args.get("n_genes", 3000) - if isinstance(init, np.ndarray): - init = init[0] - if init is None: - init = 3000 - ngenes = widgets.FloatSlider( - value=init, - min=min(100, self.sams[0].adata.shape[1]), - max=self.sams[0].adata.shape[1], - step=100, - disabled=False, - continuous_update=False, - orientation="horizontal", - readout=True, - readout_format="5d", - style={"description_width": "initial"}, - ) - ngenes.observe(self.ngenes_update, names="value") - - l2 = widgets.Button( - description="# PCs:", - disabled=True, - tooltip="The number of principal components to select in PCA.", - icon="", - ) - init = self.run_args.get("npcs", 150) - if isinstance(init, np.ndarray): - init = init[0] - if init is None: - init = 150 - npcs = widgets.FloatSlider( - value=init, - min=10, - max=500, - step=1, - disabled=False, - continuous_update=False, - orientation="horizontal", - readout=True, - readout_format="d", - style={"description_width": "initial"}, - ) - npcs.observe(self.npcs_update, names="value") - - l3 = widgets.Button( - description="# neighbors:", - disabled=True, - tooltip="The number of nearest neighbors to identify for each cell.", - icon="", - ) - init = self.run_args.get("k", 20) - if 
isinstance(init, np.ndarray): - init = init[0] - if init is None: - init = 20 - knn = widgets.FloatSlider( - value=init, - min=5, - max=200, - step=1, - disabled=False, - continuous_update=False, - orientation="horizontal", - readout=True, - readout_format="d", - style={"description_width": "initial"}, - ) - knn.observe(self.knn_update, names="value") - - l4 = widgets.Button( - description="num_norm_avg:", - disabled=True, - tooltip="The top 'num_norm_avg' dispersions are averaged to determine the " - "normalization factor when calculating the weights. This prevents " - "genes with large spatial dispersions from skewing the distribution " - "of weights.", - icon="", - ) - init = self.run_args.get("num_norm_avg", 50) - if isinstance(init, np.ndarray): - init = init[0] - if init is None: - init = 50 - nna = widgets.FloatSlider( - value=init, - min=1, - max=200, - step=1, - disabled=False, - continuous_update=False, - orientation="horizontal", - readout=True, - readout_format="d", - style={"description_width": "initial"}, - ) - nna.observe(self.nna_update, names="value") - - init = self.run_args.get("preprocessing", "Normalizer") - if isinstance(init, np.ndarray): - init = init[0] - if init is None: - init = "None" - norm = widgets.Dropdown( - options=["StandardScaler", "Normalizer", "None"], - value=init, - description="Preprocessing:", - disabled=False, - style={"description_width": "initial"}, - ) - norm.observe(self.rnorm_update, "value") - - init = self.run_args.get("distance", "correlation") - if isinstance(init, np.ndarray): - init = init[0] - if init is None: - init = "correlation" - distance = widgets.Dropdown( - options=["correlation", "euclidean", "cosine"], - value=init, - description="Distance:", - disabled=False, - style={"description_width": "initial"}, - ) - distance.observe(self.dist_update, "value") - - init = self.run_args.get("projection", "umap") - if isinstance(init, np.ndarray): - init = init[0] - if init is None: - init = "umap" - proj = 
def set_run_defaults(self, event):
    """Reset the SAM-run widgets and run args to their initial values."""
    self.log('Setting run parameters to defaults.')
    self.run_args = self.run_args_init.copy()
    wpca = self.rb_dict["WPCA"]
    ngenes = self.rb_dict["NGENES"]
    npcs = self.rb_dict["NPCS"]
    knn = self.rb_dict["KNN"]
    nna = self.rb_dict["NNA"]
    rnorm = self.rb_dict["NORM"]
    dist = self.rb_dict["DISTANCE"]
    proj = self.rb_dict["PROJ"]

    nna.set_trait("value", self.run_args.get("num_norm_avg", 50))
    knn.set_trait("value", self.run_args.get("k", 20))

    init = self.run_args.get("npcs", 150)
    npcs.set_trait("value", 150 if init is None else init)

    init = self.run_args.get("n_genes", 3000)
    ngenes.set_trait("value", 3000 if init is None else init)

    init = self.run_args.get("preprocessing", "Normalizer")
    # Fix: a None default was mapped to "", which is not one of the
    # dropdown's options ("StandardScaler"/"Normalizer"/"None") and would
    # raise a TraitError; map it to the "None" option instead.
    rnorm.set_trait("value", "None" if init is None else init)
    dist.set_trait("value", self.run_args.get("distance", "correlation"))
    proj.set_trait("value", self.run_args.get("projection", "umap"))

    init = bool(self.run_args.get("weight_PCs", True))
    wpca.set_trait("value", init)
    self.run_args["weight_PCs"] = init

def sam_weights(self, event):
    """Rank genes of the active tab by SAM weights and refresh the display."""
    self.log('Setting gene rankings to be ordered by the SAM weights.')
    s = self.sams[self.stab.selected_index]
    self.marker_genes[self.stab.selected_index] = np.array(
        list(s.adata.var_names[np.argsort(-s.adata.var["weights"].values)])
    )
    self.marker_genes_tt[self.stab.selected_index] = "Genes ranked by SAM weights."
    self.cs_dict["LGENES"].set_trait(
        "tooltip", self.marker_genes_tt[self.stab.selected_index]
    )
    # Resetting the slider triggers a redraw; if already at 0, redraw manually.
    if self.cs_dict["RGENES"].value != 0:
        self.cs_dict["RGENES"].set_trait("value", 0)
    else:
        self.show_expression(str(self.marker_genes[self.stab.selected_index][0]))

def weightpcs(self, event):
    """Observer: flip the weight_PCs run flag."""
    t = self.run_args.get("weight_PCs", True)
    self.log('Setting weight_PCs to be {}'.format(not t))
    self.run_args["weight_PCs"] = not t

def sparsepca(self, event):
    """Observer: flip the sparse_pca run flag."""
    t = self.run_args.get("sparse_pca", True)
    self.log('Setting sparse_pca to be {}'.format(not t))
    self.run_args["sparse_pca"] = not t

def npcs_update(self, val):
    """Observer: record the number of principal components (as int)."""
    self.log('Setting npcs to be {}'.format(int(val['new'])))
    self.run_args["npcs"] = int(val["new"])

def nna_update(self, val):
    """Observer: record num_norm_avg (as int)."""
    self.log('Setting num_norm_avg to be {}'.format(int(val['new'])))
    self.run_args["num_norm_avg"] = int(val["new"])

def knn_update(self, val):
    """Observer: record the number of nearest neighbors k (as int)."""
    self.log('Setting k to be {}'.format(int(val['new'])))
    self.run_args["k"] = int(val["new"])
self.run_args["n_genes"] = int(val["new"]) - - def rnorm_update(self, txt): - self.log('Setting preprocessing to be {}'.format(txt['new'])) - if txt["new"] == "None": - self.run_args["preprocessing"] = None - else: - self.run_args["preprocessing"] = txt["new"] - - def proj_update(self, txt): - self.log('Setting projection to be {}'.format(txt['new'])) - self.run_args["projection"] = txt["new"] - - def dist_update(self, txt): - self.log('Setting distance to be {}'.format(txt['new'])) - self.run_args["distance"] = txt["new"] - - def subcluster(self, event): - - execute = False - i = self.stab.selected_index - - selected = self.selected[i] - selected_cells = self.selected_cells[i] - - sam = self.sams[i] - if not np.all(selected) and selected.sum() > 0: - sam_subcluster = SAM(counts=sam.adata[selected_cells, :].copy()) - sam_subcluster.adata_raw = sam.adata_raw[selected_cells].copy() - - sam_subcluster.adata_raw.obs = sam_subcluster.adata.obs.copy() - sam_subcluster.adata_raw.obsm = sam_subcluster.adata.obsm.copy() - - #sam_subcluster.preprocess_data(**self.preprocess_args) - - self.out.clear_output() - with self.out: - sam_subcluster.run(**self.run_args) - self.sams.append(sam_subcluster) - self.selected.append(np.ones(sam_subcluster.adata.shape[0]).astype("bool")) - self.selected_cells.append(np.array(list(sam_subcluster.adata.obs_names))) - self.active_labels.append(np.zeros(sam_subcluster.adata.shape[0])) - self.dd_opts.append(["Toggle cluster"]) - self.gene_expressions.append(np.zeros(sam_subcluster.adata.shape[0])) - self.marker_genes.append( - np.array(list(sam_subcluster.adata.var_names))[ - np.argsort(-sam_subcluster.adata.var["weights"].values) - ] - ) - self.marker_genes_tt.append("Genes ranked by SAM weights.") - self.ds.append(0) - i = len(self.sams) - 1 - execute = True - - elif np.all(selected): - sam.preprocess_data(**self.preprocess_args) - - self.out.clear_output() - with self.out: - sam.run(**self.run_args) - self.marker_genes[i] = 
np.array(list(sam.adata.var_names))[ - np.argsort(-sam.adata.var["weights"].values) - ] - self.marker_genes_tt[i] = "Genes ranked by SAM weights." - execute = True - self.current_sam = sam - - if execute: - self.log(('Subclustering SAM object in tab {}'.format(i),selected_cells)) - title = self.rb_dict["TITLE"].value - if title == "" and i == 0: - title = "Full dataset" - elif title == "": - title = "Subcluster " + str(i) - self.create_plot(i, title) - try: - self.titles[i] = title - except: - self.titles.append(title) - - self.update_dropdowns(i) - - """ END RUN INIT""" - - """ BEGIN CS INIT""" - - def init_cs(self): - initl = list(self.sams[0].adata.obsm.keys()) - initl = [""] + initl - if self.default_proj in initl: - init = self.default_proj - else: - init = initl[0] - - dpm = widgets.Dropdown( - options=initl, value=init, description="", layout={"width": "60%"} - ) - dp = widgets.Button( - description="Display projection", - tooltip="Display the 2D projection selected in the dropdown menu to the right.", - layout={"width": "40%"}, - ) - dp.on_click(self.display_projection) - - initl = list(self.sams[0].adata.obs.keys()) - initl = [""] + initl - dam = widgets.Dropdown( - options=initl, value=initl[0], description="", layout={"width": "60%"} - ) - da = widgets.Button( - description="Display cell ann.", - tooltip="Overlay the annotations selected in the dropdown menu to the right.", - layout={"width": "40%"}, - ) - da.on_click(self.display_annotation) - - dv = widgets.Button( - description="Display gene ann.", - tooltip="Include the 'var' annotation for the corresponding gene in the title of the gene expression plots.", - layout={"width": "40%"}, - ) - dv.on_click(self.display_var_annotation) - - initl = list(self.sams[0].adata.var.keys()) - initl = [""] + initl - dvm = widgets.Dropdown( - options=initl, value=initl[0], description="", layout={"width": "60%"} - ) - - irm = widgets.Button( - description="Find markers (RF)", - tooltip="Identify marker genes of 
selected cells using a random forest classifier.", - layout={"width": "34%"}, - ) - irm.on_click(self.irm_genes) - ism = widgets.Button( - description="Find markers (SW)", - tooltip="Identify marker genes of selected cells by finding genes with large SAM weights among the selected cells.", - layout={"width": "33%"}, - ) - ism.on_click(self.ism_genes) - - sws = widgets.Button( - description="SAM ranking", - tooltip="Ranks genes by SAM weights.", - layout={"width": "33%"}, - ) - sws.on_click(self.sam_weights) - - us = widgets.Button( - description="Unselect all (x)", - tooltip="Unselect all cells. Pressing 'x' on the keyboard while hovering over the scatter plot will do the same thing.", - layout={"width": "34%"}, - ) - usa = widgets.Button( - description="Select all (c)", - tooltip="Select all cells. Pressing 'c' on the keyboard while hovering over the scatter plot will do the same thing.", - layout={"width": "33%"}, - ) - res = widgets.Button( - description="Reset view (v)", - tooltip="Resets the current plot to the defautl view. Pressing 'v' on the keyboard while hovering over the scatter plot" - " will do the same thing.", - layout={"width": "33%"}, - ) - us.on_click(self.unselect_all) - usa.on_click(self.select_all) - res.on_click(self.reset_view) - - avg = widgets.Checkbox( - indent=False, value=True, description="Avg expr", layout={"width": "20%"} - ) - - log = widgets.Checkbox( - indent=False, value=False, description="Log cbar", layout={"width": "20%"} - ) - - show_colorbar = widgets.Checkbox( - indent=False, value=False, description="Show cbar", layout={"width": "20%"} - ) - lann = widgets.Button( - description="Annotate", - tooltip=( - "Enter the key of the 'obs' annotation vector you wish to modify." - " If the key does not exist, a new vector in 'obs' will be created. Enter the" - " label you want to annotate the selected cells with in the right." 
- ), - disabled=True, - layout={"width": "20%"}, - ) - anno_name = widgets.Text( - value="", placeholder="Annotation column name", layout={"width": "40%"} - ) - anno = widgets.Text(value="", placeholder="Annotation", layout={"width": "40%"}) - anno.on_submit(self.annotate_pop) - - lgsm = widgets.Button( - description="Get similar genes", - tooltip="Rank genes in order of decreasing spatially averaged correlation with respect to the input gene.", - disabled=True, - layout={"width": "30%"}, - ) - gsm = widgets.Text( - value="", - placeholder=self.sams[0].adata.var_names[0], - layout={"width": "50%"}, - ) - gsm.on_submit(self.get_similar_genes) - lshg = widgets.Button( - description="Show expressions", - tooltip="Overlay the input gene expressions over the scatter plot. Only a substring of the gene need be provided.", - disabled=True, - layout={"width": "30%"}, - ) - shg = widgets.Text( - value="", - placeholder=self.sams[0].adata.var_names[0], - layout={"width": "50%"}, - ) - shg.on_submit(self.show_expression) - - lgenes = widgets.Button( - description="Ranked genes", - tooltip="Genes ranked by SAM weights.", - disabled=True, - layout={"width": "30%"}, - ) - rgenes = widgets.IntSlider( - value=0, - min=0, - max=200, - step=1, - disabled=False, - continuous_update=False, - orientation="horizontal", - readout=True, - description="", - style={"description_width": "initial"}, - layout={"width": "70%"}, - ) - rgenes.observe(self.gene_update, "value") - - lthr = widgets.Button( - description="Threshold expr", - tooltip="Select genes with greater than the chosen expression threshold.", - disabled=True, - layout={"width": "30%"}, - ) - thr = widgets.FloatSlider( - value=0, - min=0, - max=0, - step=0.05, - disabled=False, - continuous_update=False, - orientation="horizontal", - readout=True, - description="", - style={"description_width": "initial"}, - layout={"width": "70%"}, - ) - thr.observe(self.threshold_selection, "value") - - lsf = widgets.Button( - 
description="Save", - tooltip="Save the current SAM object. Filenames should end with .h5ad.", - disabled=True, - layout={"width": "30%"}, - ) - - sf = widgets.Text(value="", layout={"width": "70%"}) - sf.on_submit(self.save_data) - - close = widgets.Button( - description="Close current tab", - tooltip="Closes the currently open subclustering tab.", - disabled=False, - ) # layout={'width':'30%'}) - close.on_click(self.close_tab) - - hotkeys = widgets.Button( - description="ReadMe", - tooltip="""While hovering over the scatter plot, the following keyboard inputs are available: - - Left/Right arrows: Scroll through & display ranked genes - - Shift: Random Forest classifier marker gene identification of selected cells - - Enter: SAM weights-based approach to marker gene identification of selected cells - - x: Unselect all cells - - c: Select all cells - - v: Reset view - - a: Toggle expression spatial averaging ON/OFF - - Also note that clicking a cell will select/unselect all cells sharing its label.""", - disabled=True, - ) # layout={'width':'30%'}) - - amslider = widgets.Button( - description="Opacity", - tooltip="Changes the opacities of unselected points in the current plot.", - disabled=True, - layout={"width": "30%"}, - ) - aslider = widgets.FloatSlider( - value=0.05, - min=0.0, - max=1, - step=0.01, - disabled=False, - continuous_update=False, - orientation="horizontal", - readout=True, - readout_format="2f", - description="", - style={"description_width": "initial"}, - layout={"width": "70%"}, - ) - aslider.observe(self.change_alpha, "value") - - lmslider = widgets.Button( - description="Marker size", - tooltip="Changes the marker sizes in the current plot.", - disabled=True, - layout={"width": "30%"}, - ) - mslider = widgets.FloatSlider( - value=5, - min=0.1, - max=20, - step=0.1, - disabled=False, - continuous_update=False, - orientation="horizontal", - readout=True, - readout_format="2f", - description="", - style={"description_width": "initial"}, - 
layout={"width": "70%"}, - ) - mslider.observe(self.change_msize, "value") - - acc = widgets.Dropdown(value="Toggle cluster", options=["Toggle cluster"]) - acc.observe(self.pick_cells_dd, "value") - - self.cs_dict = {} - self.cs_dict["DP"] = dp - self.cs_dict["DPM"] = dpm - self.cs_dict["DA"] = da - self.cs_dict["DAM"] = dam - self.cs_dict["ACC"] = acc - self.cs_dict["DV"] = dv - self.cs_dict["DVM"] = dvm - self.cs_dict["IRM"] = irm - self.cs_dict["ISM"] = ism - self.cs_dict["SWS"] = sws - self.cs_dict["US"] = us - self.cs_dict["USA"] = usa - self.cs_dict["RES"] = res - self.cs_dict["LANN"] = lann - self.cs_dict["ANNO_NAME"] = anno_name - self.cs_dict["ANNO"] = anno - self.cs_dict["SHOW_COLORBAR"] = show_colorbar - self.cs_dict["LGSM"] = lgsm - self.cs_dict["GSM"] = gsm - self.cs_dict["AVG"] = avg - self.cs_dict["LSHG"] = lshg - self.cs_dict["SHG"] = shg - self.cs_dict["LOG"] = log - self.cs_dict["LGENES"] = lgenes - self.cs_dict["RGENES"] = rgenes - self.cs_dict["LTHR"] = lthr - self.cs_dict["THR"] = thr - self.cs_dict["LSF"] = lsf - self.cs_dict["SF"] = sf - self.cs_dict["AMSLIDER"] = amslider - self.cs_dict["ASLIDER"] = aslider - self.cs_dict["LMSLIDER"] = lmslider - self.cs_dict["MSLIDER"] = mslider - self.cs_dict["CLOSE"] = close - self.cs_dict["HOTKEYS"] = hotkeys - - return widgets.VBox( - [ - widgets.HBox([dp, dpm]), - widgets.HBox([da, dam, acc]), - widgets.HBox([dv, dvm]), - widgets.HBox([irm, ism, sws]), - widgets.HBox([us, usa, res]), - widgets.HBox([lann, anno_name, anno,]), - widgets.HBox([lgsm, gsm,]), - widgets.HBox([lshg, shg,]), - widgets.HBox([lgenes, rgenes]), - widgets.HBox([lthr, thr]), - widgets.HBox([amslider, aslider]), - widgets.HBox([show_colorbar, avg, log]), - widgets.HBox([lmslider, mslider]), - widgets.HBox([lsf, sf]), - widgets.HBox([close, hotkeys]), - ], - layout={"height": "95%", "width": "100%"}, - ) - - def reset_view(self, event): - i = self.stab.selected_index - self.log('Resetting view for tab {}'.format(i)) - 
self.create_plot(i, self.stab.get_title(i)) - self.marker_genes[i] = np.array(list(self.sams[i].adata.var_names))[ - np.argsort(-self.sams[i].adata.var["weights"].values) - ] - self.marker_genes_tt[i] = "Genes ranked by SAM weights." - self.cs_dict["LGENES"].set_trait("tooltip", self.marker_genes_tt[i]) - - def save_data(self, path): - path = path.value - self.log('Saving data from tab {} to {}'.format(self.stab.selected_index,path)) - if path != "": - if path.split(".")[-1] == "h5ad": - s = self.sams[self.stab.selected_index] - s.save_anndata(path) - elif ( - path.split(".")[-1] == "png" - or path.split(".")[-1] == "pdf" - or path.split(".")[-1] == "eps" - or path.split(".")[-1] == "jpg" - ): - if len(path.split("/")) > 1: - ut.create_folder("/".join(path.split("/")[:-1])) - self.stab.children[self.stab.selected_index].write_image(path) - - def show_expression(self, text): - if type(text) is not str: - gene = text.value - else: - gene = text - - s = self.sams[self.stab.selected_index] - if gene != "": - - try: - if not (gene in s.adata.var_names): - genes = ut.search_string( - np.array(list(s.adata.var_names)), gene, case_sensitive=True - )[0] - if genes is not -1: - gene = genes[0] - - if self.cs_dict["AVG"].value: - if "X_knn_avg" not in s.adata.layers.keys(): - s.dispersion_ranking_NN(save_avgs=True) - - x = s.adata[:, gene].layers["X_knn_avg"] - if sp.issparse(x): - a = x.A.flatten() - else: - a = x.flatten() - else: - x = s.adata[:, gene][s.adata.obs_names, :].X - if sp.issparse(x): - a = x.A.flatten() - else: - a = x.flatten() - - if self.GENE_KEY != "": - title = gene + "; " + str(s.adata.var[self.GENE_KEY].T[gene]) - else: - title = gene - - # self.select_all(None) - self.update_colors_expr(a, title) - self.log('Showing gene expression values for {} in tab {}'.format(title,self.stab.selected_index)) - except: - if self.cs_dict["AVG"].value: - if not (gene in s.adata.var_names): - with self.out: - print( - "X_knn_avg does not exist. 
Either run" - " sam.dispersion_ranking_NN() or uncheck" - " `avg` in the control panel." - ) - elif not (gene in s.adata.var_names): - with self.out: - print( - "Gene not found. Check your spelling and " "capitalization." - ) - - def get_similar_genes(self, txt): - gene = txt.value - if gene != "": - s = self.sams[self.stab.selected_index] - if "X_knn_avg" not in s.adata.layers.keys(): - s.dispersion_ranking_NN() - - genes = ut.search_string( - np.array(list(s.adata.var_names)), gene, case_sensitive=True - )[0] - if genes is not -1: - gene = genes[0] - else: - return - # quit - - markers = ut.find_corr_genes(s, gene).flatten() - _, i = np.unique(markers, return_index=True) - markers = markers[np.sort(i)] - self.marker_genes[self.stab.selected_index] = markers - - self.cs_dict["RGENES"].set_trait("value", 0) - self.marker_genes_tt[self.stab.selected_index] = ( - "Ranked genes from most to least spatially correlated with " - + gene - + "." - ) - self.cs_dict["LGENES"].set_trait( - "tooltip", self.marker_genes_tt[self.stab.selected_index] - ) - self.log('Getting genes with similar expression patterns to {}'.format(gene)) - self.show_expression(str(gene)) - - - def annotate_pop(self, text): - text = text.value - text_name = self.cs_dict["ANNO_NAME"].value - selected = self.selected[self.stab.selected_index] - selected_cells = self.selected_cells[self.stab.selected_index] - if text != "" and text_name != "" and selected.sum() != selected.size: - self.log(('Annotating cells with the {} label in the {} key.'.format(text,text_name),selected_cells)) - for it, s in enumerate(self.sams): - try: - x1 = np.array(list(s.adata.obs_names)) - - if text_name in list(s.adata.obs.keys()): - a = s.get_labels(text_name).copy().astype(" 0: - self.log(('Calculating marker genes using SW method in tab {}'.format(self.stab.selected_index),self.selected_cells[self.stab.selected_index])) - - if "X_knn_avg" not in s.adata.layers.keys(): - s.dispersion_ranking_NN(save_avgs=True) - l = 
s.adata.layers["X_knn_avg"] - m = l.mean(0).A.flatten() - ms = l[selected, :].mean(0).A.flatten() - lsub = l[selected, :] - lsub.data[:] = lsub.data ** 2 - ms2 = lsub.mean(0).A.flatten() - v = ms2 - 2 * ms * m + m ** 2 - wmu = np.zeros(v.size) - wmu[m > 0] = v[m > 0] / m[m > 0] - - self.marker_genes[self.stab.selected_index] = np.array( - list(s.adata.var_names[np.argsort(-wmu)]) - ) - if self.cs_dict["RGENES"].value != 0: - self.cs_dict["RGENES"].set_trait("value", 0) - else: - self.show_expression( - str(self.marker_genes[self.stab.selected_index][0]) - ) - self.marker_genes_tt[ - self.stab.selected_index - ] = "Ranked genes according to spatial SAM weights among selected cells." - self.cs_dict["LGENES"].set_trait( - "tooltip", self.marker_genes_tt[self.stab.selected_index] - ) - - def irm_genes(self, event): - selected = self.selected[self.stab.selected_index] - s = self.sams[self.stab.selected_index] - if not np.all(selected) and selected.sum() > 0: - self.log(('Calculating marker genes using RF method in tab {}'.format(self.stab.selected_index),self.selected_cells[self.stab.selected_index])) - a = np.zeros(s.adata.shape[0]) - a[selected] = 1 - markers, _ = s.identify_marker_genes_rf(labels=a, clusters=1) - self.marker_genes[self.stab.selected_index] = markers[1] - if self.cs_dict["RGENES"].value != 0: - self.cs_dict["RGENES"].set_trait("value", 0) - else: - self.show_expression( - str(self.marker_genes[self.stab.selected_index][0]) - ) - self.marker_genes_tt[ - self.stab.selected_index - ] = "Ranked genes according to random forest classification." 
- self.cs_dict["LGENES"].set_trait( - "tooltip", self.marker_genes_tt[self.stab.selected_index] - ) - - def gene_update(self, val): - val = val["new"] - markers = self.marker_genes[self.stab.selected_index] - - if int(val) < markers.size: - gene = markers[int(val)] - else: - gene = markers[-1] - self.cs_dict["SHG"].set_trait("value", gene) - self.show_expression(str(gene)) - - def update_dropdowns(self, i): - s = self.sams[i] - self.cs_dict["DPM"].options = [""] + list(s.adata.obsm.keys()) - self.cs_dict["DAM"].options = [""] + list(s.adata.obs.keys()) - self.cs_dict["DVM"].options = [""] + list(s.adata.var.keys()) - self.cs_dict["ACC"].options = self.dd_opts[i] - - def on_switch_tabs(self, event): - self.log('Switching to tab {}'.format(self.stab.selected_index)) - self.update_dropdowns(self.stab.selected_index) - self.cs_dict["LGENES"].set_trait( - "tooltip", self.marker_genes_tt[self.stab.selected_index] - ) - self.current_tab = self.stab.selected_index - self.current_sam = self.sams[self.current_tab] - for i in range(len(self.children)): - self.tab.set_title( - i, self.names[i] + " (" + str(self.stab.selected_index) + ")" - ) - - x = self.stab.children[self.stab.selected_index] - f = x.layout.xaxis.range - x.layout.xaxis.range=(-1,1) - x.layout.xaxis.range=f - - f = x.layout.yaxis.range - x.layout.yaxis.range=(-1,1) - x.layout.yaxis.range=f - - - - def display_var_annotation(self, event): - self.GENE_KEY = self.cs_dict["DVM"].value - self.log('Setting the var annotations to be from key {}'.format(self.GENE_KEY)) - - def display_annotation(self, event): - key = self.cs_dict["DAM"].value - self.log('Displaying cell annotations from obs key {}'.format(key)) - if key != "": - labels = np.array(list(self.sams[self.stab.selected_index].get_labels(key))) - - self.active_labels[self.stab.selected_index] = labels - self.update_colors_anno(labels) - - def update_colors_expr(self, a, title): - if self.cs_dict["LOG"].value: - a = np.log2(a + 1) - if 
self.cs_dict["SHOW_COLORBAR"].value: - showscale = True - else: - showscale = False - self.gene_expressions[self.stab.selected_index] = a - - f1 = self.stab.children[self.stab.selected_index] - f1.update_traces( - marker=dict( - color=a, - colorscale="spectral", - reversescale=True, - showscale=showscale, - colorbar_ticks="outside", - colorbar_tickmode="auto", - colorbar_title="", - opacity=1, - ) - ) - - f1.update_layout(hovermode="closest", title=title) - self.stab.children[self.stab.selected_index].data[0].text = list(a) - self.stab.children[self.stab.selected_index].data[0].hoverinfo = "text" - - slider = self.cs_dict["THR"] - slider.set_trait("min", 0) - slider.set_trait("max", a.max() + (a.max() - a.min()) / 100) - slider.set_trait("step", (a.max() - a.min()) / 100) - slider.set_trait("value", 0) - - def update_colors_anno(self, labels): - nlabels = np.unique(labels).size - title = self.cs_dict["DAM"].value.split("_clusters")[0] - - if issubclass(labels.dtype.type, np.number): - if issubclass(labels.dtype.type, np.float): - self.update_colors_expr(labels, title) - return - - if nlabels == 1: - x = "spectral" - elif nlabels <= 2: - x = cl.scales["3"]["qual"]["Paired"][:nlabels] - elif nlabels <= 11: - nlabels = str(nlabels) - x = cl.scales[nlabels]["qual"]["Paired"] - else: - nlabels = "11" - x = cl.scales[nlabels]["qual"]["Paired"] - x = cl.interp(x, int(nlabels)) - - lbls, inv = np.unique(labels, return_inverse=True) - self.dd_opts[self.stab.selected_index] = ["Toggle cluster"] + list(lbls) - self.cs_dict["ACC"].options = self.dd_opts[self.stab.selected_index] - - if issubclass(labels.dtype.type, np.character): - tickvals = np.arange(lbls.size) - ticktext = list(lbls) - else: - idx = np.round(np.linspace(0, len(lbls) - 1, 6)).astype(int) - tickvals = list(idx) - ticktext = lbls[tickvals] - - if self.cs_dict["SHOW_COLORBAR"].value: - showscale = True - else: - showscale = False - - f1 = self.stab.children[self.stab.selected_index] - f1.update_traces( - 
marker=dict( - color=inv, - colorscale=x, - colorbar_ticks="outside", - colorbar_tickmode="array", - colorbar_title="", - colorbar_tickvals=tickvals, - showscale=showscale, - colorbar_ticktext=ticktext, - opacity=1, - ) - ) - self.stab.children[self.stab.selected_index].layout.title = title - self.stab.children[self.stab.selected_index].data[0].text = list(lbls[inv]) - self.stab.children[self.stab.selected_index].data[0].hoverinfo = "text" - self.stab.children[self.stab.selected_index].layout.hovermode = "closest" - - slider = self.cs_dict["THR"] - slider.set_trait("min", slider.value) - slider.set_trait("max", slider.value) - - def threshold_selection(self, val): - val = val["new"] - self.log('Setting gene expression selection threshold to be {}'.format(val)) - s = self.sams[self.stab.selected_index] - - self.selected[self.stab.selected_index][:] = False - self.selected[self.stab.selected_index][ - self.gene_expressions[self.stab.selected_index] >= val - ] = True - self.selected_cells[self.stab.selected_index] = np.array( - list(s.adata.obs_names) - )[self.selected[self.stab.selected_index]] - self.stab.children[self.stab.selected_index].data[0].selectedpoints = np.where( - self.selected[self.stab.selected_index] - )[0] - - def rewire_cluster_slider(self, event): - x = self.rb_dict["CSLIDER"] - l = self.rb_dict["L"] - val = event["new"] - if val == "Kmeans cluster": - x.set_trait("min", 2) - x.set_trait("max", 200) - x.set_trait("value", 6) - x.set_trait("step", 1) - l.set_trait("value", "Kmeans 'k'") - - elif val == "Louvain cluster": - x.set_trait("min", 0.1) - x.set_trait("max", 10) - x.set_trait("value", 1) - x.set_trait("step", 0.1) - l.set_trait("value", "Louvain 'res'") - - elif val == "Density cluster": - x.set_trait("min", 0.1) - x.set_trait("max", 2) - x.set_trait("value", 0.5) - x.set_trait("step", 0.01) - l.set_trait("value", "Density 'eps'") - - elif val == "Hdbscan cluster": - l.set_trait("value", "Hdbscan 'N/A'") - - elif val == "Leiden cluster": - 
x.set_trait("min", 0.1) - x.set_trait("max", 10) - x.set_trait("value", 1) - x.set_trait("step", 0.1) - l.set_trait("value", "Leiden 'res'") - - def cluster_data(self, event): - s = self.sams[self.stab.selected_index] - val = self.rb_dict["CLM"].value - eps = self.rb_dict["CSLIDER"].value - self.log('Clustering data using the {} method with parameter {} in tab {}'.format(val,eps,self.stab.selected_index)) - if val == "Kmeans cluster": - s.kmeans_clustering(int(eps)) - - elif val == "Louvain cluster": - s.louvain_clustering(res=eps) - - elif val == "Density cluster": - s.density_clustering(eps=eps) - - elif val == "Hdbscan cluster": - s.hdbknn_clustering(k=int(eps)) - - elif val == "Leiden cluster": - s.leiden_clustering(res=eps) - - self.cs_dict["DAM"].options = [""] + list(s.adata.obs.keys()) - - def display_projection(self, event): - s = self.sams[self.stab.selected_index] - key = self.cs_dict["DPM"].value - self.log('Displaying projection from obsm key {} in tab {}'.format(key,self.stab.selected_index)) - if key != "": - X = s.adata.obsm[key][:, :2] - self.stab.children[self.stab.selected_index].data[0]["x"] = X[:, 0] - self.stab.children[self.stab.selected_index].data[0]["y"] = X[:, 1] - - def compute_projection(self, event): - i = self.stab.selected_index - s = self.sams[i] - val = self.rb_dict["CPM"].value - if val == "UMAP": - s.run_umap() - elif val == "t-SNE": - s.run_tsne() - elif val == "Diffusion map": - s.run_diff_map() - elif val == "Diffusion UMAP": - s.run_diff_umap() - self.log('Computing 2D projection using the {} method'.format(val)) - self.update_dropdowns(i) - - """ END CS INIT""" - - def select(self, trace, points, selector): - if np.array(points.point_inds).size > 0: - self.selected[self.stab.selected_index][np.array(points.point_inds)] = True - self.selected_cells[self.stab.selected_index] = np.array( - list( - self.sams[self.stab.selected_index].adata.obs_names[ - self.selected[self.stab.selected_index] - ] - ) - ) - trace.selectedpoints = 
list( - np.where(self.selected[self.stab.selected_index])[0] - ) - trace.unselected = {"marker": {"opacity": self.cs_dict["ASLIDER"].value}} - - def pick_cells_dd(self, txt): - if txt["new"] != "Toggle cluster": - al = str(txt["new"]) - sel = self.selected[self.stab.selected_index] - als = self.active_labels[self.stab.selected_index].astype("str") - ratio = sel[als == al].sum() / sel[als == al].size - if ratio >= 0.5: - sel[als == al] = False - else: - sel[als == al] = True - - self.selected_cells[self.stab.selected_index] = np.array( - list(self.sams[self.stab.selected_index].adata.obs_names[sel]) - ) - - self.stab.children[self.stab.selected_index].data[0].selectedpoints = list( - np.where(sel)[0] - ) - self.log('Toggling cluster {} in tab {}'.format(al,self.stab.selected_index)) - - self.cs_dict["ACC"].value = "Toggle cluster" - - def pick_cells(self, trace, points, selector): - tf = self.selected[self.stab.selected_index][points.point_inds[0]] - als = self.active_labels[self.stab.selected_index] - al = als[points.point_inds[0]] - - if tf: - self.selected[self.stab.selected_index][als == al] = False - else: - self.selected[self.stab.selected_index][als == al] = True - - self.selected_cells[self.stab.selected_index] = np.array( - list( - self.sams[self.stab.selected_index].adata.obs_names[ - self.selected[self.stab.selected_index] - ] - ) - ) - trace.selectedpoints = list( - np.where(self.selected[self.stab.selected_index])[0] - ) - - def change_alpha(self, val): - # m = self.stab.children[self.stab.selected_index].data[0].marker.copy() - # m['opacity'] = val['new'] - self.stab.children[self.stab.selected_index].data[0].unselected = { - "marker": {"opacity": val["new"]} - } - self.log('Changing alpha (opacity) to be {}'.format(val['new'])) - - def change_msize(self, val): - val = val["new"] - markers = self.stab.children[self.stab.selected_index].data[0].marker - markers.size = val - self.log('Changing marker size to be {}'.format(val)) - - def init_graph(self, 
trace): - trace.mode = "markers" - trace.marker.size = 5 - trace.on_selection(self.select) - trace.on_click(self.pick_cells) - - -def invalidArgs(func, argdict): - import inspect - args = inspect.getfullargspec(func).args - return set(argdict) - set(args) - diff --git a/samalg/utilities.py b/samalg/utilities.py deleted file mode 100644 index 7e2a518..0000000 --- a/samalg/utilities.py +++ /dev/null @@ -1,453 +0,0 @@ -import pandas as pd -import numpy as np -import scipy as sp -import os -import errno -from sklearn.decomposition import PCA -import umap.distances as dist -from sklearn.utils.extmath import svd_flip -from sklearn.utils import check_array, check_random_state -from scipy import sparse -import sklearn.utils.sparsefuncs as sf -from umap.umap_ import nearest_neighbors -__version__ = "0.8.7" - - -def find_corr_genes(sam, input_gene): - """Rank genes by their spatially averaged expression pattern correlations to - a desired gene. - - Parameters - ---------- - - sam - SAM - The analyzed SAM object - - input_gene - string - The gene ID with respect to which correlations will be computed. - - Returns - ------- - A ranked list of gene IDs based on correlation to the input gene. - """ - all_gene_names = np.array(list(sam.adata.var_names)) - - D_avg = sam.adata.layers["X_knn_avg"] - - input_gene = np.where(all_gene_names == input_gene)[0] - - if input_gene.size == 0: - print( - "Gene note found in the filtered dataset. Note " - "that genes are case sensitive." 
- ) - return - - pw_corr = generate_correlation_map(D_avg.T.A, D_avg[:, input_gene].T.A) - return all_gene_names[np.argsort(-pw_corr.flatten())] - -def _pca_with_sparse(X, npcs, solver='arpack', mu=None, seed=0, mu_axis=0): - random_state = check_random_state(seed) - np.random.set_state(random_state.get_state()) - random_init = np.random.rand(np.min(X.shape)) - X = check_array(X, accept_sparse=['csr', 'csc']) - - if mu is None: - if mu_axis == 0: - mu = X.mean(0).A.flatten()[None, :] - else: - mu = X.mean(1).A.flatten()[:, None] - - if mu_axis == 0: - mdot = mu.dot - mmat = mdot - mhdot = mu.T.dot - mhmat = mu.T.dot - Xdot = X.dot - Xmat = Xdot - XHdot = X.T.conj().dot - XHmat = XHdot - ones = np.ones(X.shape[0])[None, :].dot - - def matvec(x): - return Xdot(x) - mdot(x) - - def matmat(x): - return Xmat(x) - mmat(x) - - def rmatvec(x): - return XHdot(x) - mhdot(ones(x)) - - def rmatmat(x): - return XHmat(x) - mhmat(ones(x)) - - else: - mdot = mu.dot - mmat = mdot - mhdot = mu.T.dot - mhmat = mu.T.dot - Xdot = X.dot - Xmat = Xdot - XHdot = X.T.conj().dot - XHmat = XHdot - ones = np.ones(X.shape[1])[None, :].dot - - def matvec(x): - return Xdot(x) - mdot(ones(x)) - - def matmat(x): - return Xmat(x) - mmat(ones(x)) - - def rmatvec(x): - return XHdot(x) - mhdot(x) - - def rmatmat(x): - return XHmat(x) - mhmat(x) - - XL = sp.sparse.linalg.LinearOperator( - matvec=matvec, - dtype=X.dtype, - matmat=matmat, - shape=X.shape, - rmatvec=rmatvec, - rmatmat=rmatmat, - ) - - u, s, v = sp.sparse.linalg.svds(XL, solver=solver, k=npcs, v0=random_init) - u, v = svd_flip(u, v) - idx = np.argsort(-s) - v = v[idx, :] - - X_pca = (u * s)[:, idx] - ev = s[idx] ** 2 / (X.shape[0] - 1) - - total_var = sf.mean_variance_axis(X, axis=0)[1].sum() - ev_ratio = ev / total_var - - output = { - 'X_pca': X_pca, - 'variance': ev, - 'variance_ratio': ev_ratio, - 'components': v, - } - return output - - - -def 
nearest_neighbors_wrapper(X,n_neighbors=15,metric='correlation',metric_kwds={},angular=True,random_state=0): - random_state=np.random.RandomState(random_state) - return nearest_neighbors(X,n_neighbors,metric,metric_kwds,angular,random_state)[:2] - -def knndist(nnma): - x, y = nnma.nonzero() - data = nnma.data - knn = y.reshape((nnma.shape[0], nnma[0, :].data.size)) - val = data.reshape(knn.shape) - return knn, val - - -def save_figures(filename, fig_IDs=None, **kwargs): - """ - Save figures. - - Parameters - ---------- - filename - str - Name of output file - - fig_IDs - int, numpy.array, list, optional, default None - A list of open figure IDs or a figure ID that will be saved to a - pdf/png file respectively. - - **kwargs - - Extra keyword arguments passed into 'matplotlib.pyplot.savefig'. - - """ - import matplotlib.pyplot as plt - - if fig_IDs is not None: - if type(fig_IDs) is list: - savetype = "pdf" - else: - savetype = "png" - else: - savetype = "pdf" - - if savetype == "pdf": - from matplotlib.backends.backend_pdf import PdfPages - - if len(filename.split(".")) == 1: - filename = filename + ".pdf" - else: - filename = ".".join(filename.split(".")[:-1]) + ".pdf" - - pdf = PdfPages(filename) - - if fig_IDs is None: - figs = [plt.figure(n) for n in plt.get_fignums()] - else: - figs = [plt.figure(n) for n in fig_IDs] - - for fig in figs: - fig.savefig(pdf, format="pdf", **kwargs) - pdf.close() - elif savetype == "png": - plt.figure(fig_IDs).savefig(filename, **kwargs) - - -def weighted_PCA(mat, do_weight=True, npcs=None, solver="auto",seed = 0): - # mat = (mat - np.mean(mat, axis=0)) - if do_weight: - if min(mat.shape) >= 10000 and npcs is None: - print( - "More than 10,000 cells. Running with 'npcs' set to < 1000 is" - " recommended." 
- ) - - if npcs is None: - ncom = min(mat.shape) - else: - ncom = min((min(mat.shape), npcs)) - - pca = PCA(svd_solver=solver, n_components=ncom,random_state=check_random_state(seed)) - reduced = pca.fit_transform(mat) - scaled_eigenvalues = pca.explained_variance_ - scaled_eigenvalues = scaled_eigenvalues / scaled_eigenvalues.max() - reduced_weighted = reduced * scaled_eigenvalues[None, :] ** 0.5 - else: - pca = PCA(n_components=npcs, svd_solver=solver,random_state=check_random_state(seed)) - reduced = pca.fit_transform(mat) - if reduced.shape[1] == 1: - pca = PCA(n_components=2, svd_solver=solver,random_state=check_random_state(seed)) - reduced = pca.fit_transform(mat) - reduced_weighted = reduced - - return reduced_weighted, pca - - -def transform_wPCA(mat, pca): - mat = mat - pca.mean_ - reduced = mat.dot(pca.components_.T) - v = pca.explained_variance_ # .var(0) - scaled_eigenvalues = v / v.max() - reduced_weighted = np.array(reduced) * scaled_eigenvalues[None, :] ** 0.5 - return reduced_weighted - -def search_string(vec, s, case_sensitive=False, invert=False): - vec = np.array(vec) - - - if isinstance(s,list): - S = s - else: - S = [s] - - V=[]; M=[] - for s in S: - m = [] - if not case_sensitive: - s = s.lower() - for i in range(len(vec)): - if case_sensitive: - st = vec[i] - else: - st = vec[i].lower() - b = st.find(s) - if not invert and b != -1 or invert and b == -1: - m.append(i) - if len(m) > 0: - V.append(vec[np.array(m)]); M.append(np.array(m)) - if len(V)>0: - i = len(V) - if not invert: - V = np.concatenate(V); M = np.concatenate(M); - if i > 1: - ix = np.sort(np.unique(M,return_index=True)[1]) - V=V[ix]; M=M[ix]; - else: - for i in range(len(V)): - V[i]=list(set(V[i]).intersection(*V)) - V = vec[np.in1d(vec,np.unique(np.concatenate(V)))] - M = np.array([np.where(vec==x)[0][0] for x in V]) - return V,M - else: - return -1,-1 - - -def distance_matrix_error(dist1, dist2): - s = 0 - for k in range(dist1.shape[0]): - s += np.corrcoef(dist1[k, :], 
dist2[k, :])[0, 1] - return 1 - s / dist1.shape[0] - - -def generate_euclidean_map(A, B): - a = (A ** 2).sum(1).flatten() - b = (B ** 2).sum(1).flatten() - x = a[:, None] + b[None, :] - 2 * np.dot(A, B.T) - x[x < 0] = 0 - return np.sqrt(x) - - -def generate_correlation_map(x, y): - mu_x = x.mean(1) - mu_y = y.mean(1) - n = x.shape[1] - if n != y.shape[1]: - raise ValueError("x and y must " + "have the same number of timepoints.") - s_x = x.std(1, ddof=n - 1) - s_y = y.std(1, ddof=n - 1) - s_x[s_x == 0] = 1 - s_y[s_y == 0] = 1 - cov = np.dot(x, y.T) - n * np.dot(mu_x[:, None], mu_y[None, :]) - return cov / np.dot(s_x[:, None], s_y[None, :]) - - -def extract_annotation(cn, x, c="_"): - m = [] - if x is not None: - for i in range(cn.size): - f = cn[i].split(c) - x = min(len(f) - 1, x) - m.append(f[x]) - return np.array(m) - else: - ms = [] - ls = [] - for i in range(cn.size): - f = cn[i].split(c) - m = [] - for x in range(len(f)): - m.append(f[x]) - ms.append(m) - ls.append(len(m)) - ml = max(ls) - for i in range(len(ms)): - ms[i].extend([""] * (ml - len(ms[i]))) - if ml - len(ms[i]) > 0: - ms[i] = np.concatenate(ms[i]) - ms = np.vstack(ms) - MS = [] - for i in range(ms.shape[1]): - MS.append(ms[:, i]) - return MS - - -def isolate(dt, x1, x2, y1, y2): - return np.where( - np.logical_and( - np.logical_and(dt[:, 0] > x1, dt[:, 0] < x2), - np.logical_and(dt[:, 1] > y1, dt[:, 1] < y2), - ) - )[0] - - -def to_lower(y): - x = y.copy().flatten() - for i in range(x.size): - x[i] = x[i].lower() - return x - - -def to_upper(y): - x = y.copy().flatten() - for i in range(x.size): - x[i] = x[i].upper() - return x - - -def create_folder(path): - try: - os.makedirs(path) - except OSError as exception: - if exception.errno != errno.EEXIST: - raise - - -def convert_annotations(A): - x = np.unique(A) - y = np.zeros(A.size) - z = 0 - for i in x: - y[A == i] = z - z += 1 - - return y.astype("int") - - -def nearest_neighbors_hnsw(x,ef=200,M=48,n_neighbors = 100): - import hnswlib - labels 
= np.arange(x.shape[0]) - p = hnswlib.Index(space = 'cosine', dim = x.shape[1]) - p.init_index(max_elements = x.shape[0], ef_construction = ef, M = M) - p.add_items(x, labels) - p.set_ef(ef) - idx, dist = p.knn_query(x, k = n_neighbors) - return idx,dist - - -def calc_nnm(g_weighted, k, distance=None): - if g_weighted.shape[0] > 0: - if distance == 'cosine': - nnm, dists = nearest_neighbors_hnsw(g_weighted, n_neighbors=k) - else: - nnm, dists = nearest_neighbors_wrapper(g_weighted, n_neighbors=k, metric=distance) - EDM = gen_sparse_knn(nnm, dists) - EDM = EDM.tocsr() - return EDM - - -def compute_distances(A, dm): - if dm == "euclidean": - m = np.dot(A, A.T) - h = np.diag(m) - x = h[:, None] + h[None, :] - 2 * m - x[x < 0] = 0 - dist = np.sqrt(x) - elif dm == "correlation": - dist = 1 - np.corrcoef(A) - else: - dist = sp.spatial.distance.squareform(sp.spatial.distance.pdist(A, metric=dm)) - return dist - - -def dist_to_nn(d, K): # , offset = 0): - E = d.copy() - np.fill_diagonal(E, -1) - M = np.max(E) * 2 - x = np.argsort(E, axis=1)[:, :K] # offset:K+offset] - E[ - np.tile( - np.arange(E.shape[0]).reshape(E.shape[0], -1), (1, x.shape[1]) - ).flatten(), - x.flatten(), - ] = M - - E[E < M] = 0 - E[E > 0] = 1 - return E # ,x - - -def to_sparse_knn(D1, k): - for i in range(D1.shape[0]): - x = D1.data[D1.indptr[i] : D1.indptr[i + 1]] - idx = np.argsort(x) - if idx.size > k: - x[idx[:-k]] = 0 - D1.data[D1.indptr[i] : D1.indptr[i + 1]] = x - D1.eliminate_zeros() - return D1 - - -def gen_sparse_knn(knni, knnd, shape=None): - if shape is None: - shape = (knni.shape[0], knni.shape[0]) - - D1 = sp.sparse.lil_matrix(shape) - - D1[ - np.tile(np.arange(knni.shape[0])[:, None], (1, knni.shape[1])).flatten().astype('int32'), - knni.flatten().astype('int32'), - ] = knnd.flatten() - D1 = D1.tocsr() - return D1 diff --git a/setup.py b/setup.py deleted file mode 100644 index f206ba8..0000000 --- a/setup.py +++ /dev/null @@ -1,39 +0,0 @@ -from setuptools import setup, find_packages - 
-setup( - name="sam-algorithm", - version="1.0.2", - description="The Self-Assembling-Manifold algorithm", - long_description="The Self-Assembling-Manifold algorithm for analyzing single-cell RNA sequencing data.", - long_description_content_type="text/markdown", - url="https://github.com/atarashansky/self-assembling-manifold", - author="Alexander J. Tarashansky", - author_email="tarashan@stanford.edu", - keywords="scrnaseq analysis manifold reconstruction", - python_requires=">=3.6", - # py_modules=["SAM", "utilities", "SAMGUI"], - install_requires=[ - "numpy>=1.19.0", - "scipy>=1.3.1", - "pandas>1.0.0", - "scikit-learn>=0.23.1", - "packaging>=0.20.0", - "numba>=0.50.1", - "umap-learn>=0.4.6", - "dill", - "anndata>=0.7.4", - "h5py", - "harmonypy", - "hnswlib" - ], - extras_require={ - "louvain": ["louvain", "cython", "python-igraph"], - "leiden": ["leidenalg", "cython", "python-igraph"], - "hdbscan": ["hdbscan"], - "plot": ["ipythonwidgets", "jupyter", "plotly==4.0.0", "matplotlib"], - "scanpy": ["scanpy"], - }, - packages=find_packages(), - include_package_data=True, - zip_safe=False, -) diff --git a/src/samalg/__init__.py b/src/samalg/__init__.py new file mode 100644 index 0000000..3ab3eb4 --- /dev/null +++ b/src/samalg/__init__.py @@ -0,0 +1,66 @@ +"""SAM (Self-Assembling Manifold) algorithm for single-cell RNA sequencing analysis. + +SAM iteratively rescales the input gene expression matrix to emphasize +genes that are spatially variable along the intrinsic manifold of the data. + +Example +------- +>>> from samalg import SAM +>>> sam = SAM() +>>> sam.load_data("expression_matrix.h5ad") +>>> sam.preprocess_data() +>>> sam.run() +>>> sam.leiden_clustering() + +Copyright 2018, Alexander J. Tarashansky, All rights reserved. 
+""" + +from __future__ import annotations + +from ._logging import get_logger, set_verbosity, setup_logging +from .exceptions import ( + ClusteringError, + ConvergenceError, + DataNotLoadedError, + DimensionalityReductionError, + FileLoadError, + FileSaveError, + InvalidParameterError, + PreprocessingError, + ProcessingError, + SAMError, +) +from .sam import SAM +from .utilities import ( + calc_nnm, + find_corr_genes, + generate_correlation_map, + generate_euclidean_map, + nearest_neighbors_hnsw, + weighted_PCA, +) + +__version__ = "2.0.0" + +__all__ = [ + "SAM", + "ClusteringError", + "ConvergenceError", + "DataNotLoadedError", + "DimensionalityReductionError", + "FileLoadError", + "FileSaveError", + "InvalidParameterError", + "PreprocessingError", + "ProcessingError", + "SAMError", + "calc_nnm", + "find_corr_genes", + "generate_correlation_map", + "generate_euclidean_map", + "get_logger", + "nearest_neighbors_hnsw", + "set_verbosity", + "setup_logging", + "weighted_PCA", +] diff --git a/src/samalg/_logging.py b/src/samalg/_logging.py new file mode 100644 index 0000000..8b04332 --- /dev/null +++ b/src/samalg/_logging.py @@ -0,0 +1,123 @@ +"""Logging infrastructure for the SAM algorithm. + +This module provides a centralized logging configuration for the samalg package. +All logging throughout the package should use the logger obtained from get_logger(). +""" + +from __future__ import annotations + +import logging +import sys +from typing import TYPE_CHECKING + +if TYPE_CHECKING: + pass + +# Package-level logger name +LOGGER_NAME = "samalg" + +# Default format for log messages +DEFAULT_FORMAT = "%(asctime)s - %(name)s - %(levelname)s - %(message)s" +SIMPLE_FORMAT = "%(levelname)s: %(message)s" + + +def get_logger(name: str | None = None) -> logging.Logger: + """Get a logger instance for the samalg package. + + Parameters + ---------- + name : str | None, optional + The name for the logger. If None, returns the root samalg logger. 
+ If provided, returns a child logger (e.g., 'samalg.sam'). + + Returns + ------- + logging.Logger + A configured logger instance. + + Examples + -------- + >>> logger = get_logger() # Returns 'samalg' logger + >>> logger = get_logger('sam') # Returns 'samalg.sam' logger + """ + if name is None: + return logging.getLogger(LOGGER_NAME) + return logging.getLogger(f"{LOGGER_NAME}.{name}") + + +def setup_logging( + level: int | str = logging.INFO, + format_string: str | None = None, + stream: bool = True, +) -> None: + """Configure logging for the samalg package. + + Parameters + ---------- + level : int | str, optional + The logging level. Can be an integer (e.g., logging.DEBUG) or + a string (e.g., 'DEBUG'). Default is logging.INFO. + format_string : str | None, optional + The format string for log messages. If None, uses SIMPLE_FORMAT. + stream : bool, optional + Whether to add a StreamHandler to output to stderr. Default is True. + + Examples + -------- + >>> setup_logging(level='DEBUG') + >>> setup_logging(level=logging.WARNING, format_string='%(message)s') + """ + logger = get_logger() + + # Convert string level to int if needed + if isinstance(level, str): + level = getattr(logging, level.upper(), logging.INFO) + + logger.setLevel(level) + + # Remove existing handlers to avoid duplicates + logger.handlers.clear() + + if stream: + handler = logging.StreamHandler(sys.stderr) + handler.setLevel(level) + + formatter = logging.Formatter(format_string or SIMPLE_FORMAT) + handler.setFormatter(formatter) + + logger.addHandler(handler) + + +def set_verbosity(verbose: bool) -> None: + """Set the verbosity level of the samalg logger. + + Parameters + ---------- + verbose : bool + If True, set logging level to INFO. + If False, set logging level to WARNING. 
+ + Examples + -------- + >>> set_verbosity(True) # Show INFO messages + >>> set_verbosity(False) # Only show WARNING and above + """ + logger = get_logger() + if verbose: + logger.setLevel(logging.INFO) + else: + logger.setLevel(logging.WARNING) + + +# Initialize the logger with default settings +# This ensures the logger exists even if setup_logging is not called +_logger = get_logger() +if not _logger.handlers: + # Only add handler if none exist (avoids duplicate handlers) + _handler = logging.StreamHandler(sys.stderr) + _handler.setFormatter(logging.Formatter(SIMPLE_FORMAT)) + _logger.addHandler(_handler) + _logger.setLevel(logging.INFO) + +# Convenience: export the main logger directly +logger = get_logger() diff --git a/src/samalg/exceptions.py b/src/samalg/exceptions.py new file mode 100644 index 0000000..d041ee8 --- /dev/null +++ b/src/samalg/exceptions.py @@ -0,0 +1,193 @@ +"""Custom exceptions for the SAM algorithm. + +This module defines exception classes used throughout the samalg package +to provide clear, specific error messages for different failure modes. +""" + +from __future__ import annotations + + +class SAMError(Exception): + """Base exception for all SAM-related errors. + + All custom exceptions in the samalg package inherit from this class, + making it easy to catch any SAM-specific error. + + Examples + -------- + >>> try: + ... sam.run() + ... except SAMError as e: + ... print(f"SAM operation failed: {e}") + """ + + pass + + +class DataNotLoadedError(SAMError): + """Raised when an operation requires data that hasn't been loaded. + + This error is raised when attempting to perform analysis operations + (preprocessing, running SAM, etc.) before loading expression data. + + Examples + -------- + >>> sam = SAM() + >>> sam.run() # No data loaded yet + DataNotLoadedError: No data has been loaded. Use load_data() or pass data to constructor. 
+ """ + + def __init__(self, message: str | None = None) -> None: + if message is None: + message = "No data has been loaded. Use load_data() or pass data to the constructor." + super().__init__(message) + + +class PreprocessingError(SAMError): + """Raised when data preprocessing fails. + + This error indicates a problem during the preprocessing step, + such as invalid normalization parameters or data format issues. + + Examples + -------- + >>> sam.preprocess_data(norm='invalid') + PreprocessingError: Unknown normalization method: 'invalid' + """ + + pass + + +class ProcessingError(SAMError): + """Raised when a general processing operation fails. + + This is a general error for processing failures that don't fit + into more specific categories. + """ + + pass + + +class ClusteringError(SAMError): + """Raised when a clustering operation fails. + + This error is raised when clustering algorithms fail to complete, + typically due to invalid parameters or missing dependencies. + + Examples + -------- + >>> sam.leiden_clustering(res=-1) + ClusteringError: Resolution parameter must be positive. + """ + + pass + + +class DimensionalityReductionError(SAMError): + """Raised when dimensionality reduction fails. + + This error indicates problems with PCA, UMAP, t-SNE, or other + dimensionality reduction operations. + + Examples + -------- + >>> sam.run(npcs=10000) # More PCs than features + DimensionalityReductionError: Cannot compute 10000 PCs with only 5000 genes. + """ + + pass + + +class ConvergenceError(SAMError): + """Raised when the SAM algorithm fails to converge. + + This error is raised when the iterative SAM algorithm doesn't + reach the specified convergence threshold within max_iter iterations. + + Examples + -------- + >>> sam.run(max_iter=1, stopping_condition=1e-10) + ConvergenceError: SAM did not converge after 1 iterations. 
+ """ + + def __init__(self, iterations: int, final_error: float | None = None) -> None: + message = f"SAM did not converge after {iterations} iterations." + if final_error is not None: + message += f" Final error: {final_error:.6f}" + super().__init__(message) + self.iterations = iterations + self.final_error = final_error + + +class InvalidParameterError(SAMError): + """Raised when an invalid parameter value is provided. + + This error provides clear feedback about which parameter is invalid + and what values are acceptable. + + Examples + -------- + >>> sam.run(weight_mode='invalid') + InvalidParameterError: Invalid value for 'weight_mode': 'invalid'. + Expected one of: 'dispersion', 'variance', 'rms', 'combined' + """ + + def __init__( + self, + param_name: str, + value: object, + valid_values: list[str] | None = None, + reason: str | None = None, + ) -> None: + message = f"Invalid value for '{param_name}': {value!r}." + if valid_values: + valid_str = ", ".join(repr(v) for v in valid_values) + message += f" Expected one of: {valid_str}" + elif reason: + message += f" {reason}" + super().__init__(message) + self.param_name = param_name + self.value = value + self.valid_values = valid_values + + +class FileLoadError(SAMError): + """Raised when loading a file fails. + + This error provides context about what file couldn't be loaded + and why. + + Examples + -------- + >>> sam.load_data('nonexistent.h5ad') + FileLoadError: Failed to load file 'nonexistent.h5ad': File not found. + """ + + def __init__(self, filepath: str, reason: str | None = None) -> None: + message = f"Failed to load file '{filepath}'." + if reason: + message += f" {reason}" + super().__init__(message) + self.filepath = filepath + self.reason = reason + + +class FileSaveError(SAMError): + """Raised when saving a file fails. + + This error provides context about what file couldn't be saved + and why. 
+ + Examples + -------- + >>> sam.save_anndata('/readonly/path/data.h5ad') + FileSaveError: Failed to save file '/readonly/path/data.h5ad': Permission denied. + """ + + def __init__(self, filepath: str, reason: str | None = None) -> None: + message = f"Failed to save file '{filepath}'." + if reason: + message += f" {reason}" + super().__init__(message) + self.filepath = filepath + self.reason = reason diff --git a/src/samalg/py.typed b/src/samalg/py.typed new file mode 100644 index 0000000..e69de29 diff --git a/src/samalg/sam.py b/src/samalg/sam.py new file mode 100644 index 0000000..d231455 --- /dev/null +++ b/src/samalg/sam.py @@ -0,0 +1,2043 @@ +"""Self-Assembling Manifold (SAM) algorithm for single-cell RNA sequencing analysis. + +This module contains the main SAM class that implements iterative manifold-aware +feature weighting for single-cell analysis. + +Copyright 2018, Alexander J. Tarashansky, All rights reserved. +Email: +""" + +from __future__ import annotations + +import gc +import pickle +import time +import warnings +from typing import TYPE_CHECKING, Any, Literal + +import anndata +import numpy as np +import pandas as pd +import scipy.sparse as sp +import sklearn.manifold as man +import sklearn.utils.sparsefuncs as sf +from anndata import AnnData +from numba.core.errors import NumbaWarning +from sklearn.preprocessing import Normalizer + +from . import utilities as ut +from ._logging import get_logger +from .exceptions import DataNotLoadedError, InvalidParameterError + +if TYPE_CHECKING: + from collections.abc import Sequence + + from numpy.typing import NDArray + +# Suppress NumbaWarning +warnings.filterwarnings("ignore", category=NumbaWarning) + +# Get module logger +logger = get_logger("sam") + + +class SAM: + """Self-Assembling Manifolds single-cell RNA sequencing analysis tool. + + SAM iteratively rescales the input gene expression matrix to emphasize + genes that are spatially variable along the intrinsic manifold of the data. 
+ It outputs the gene weights, nearest neighbor matrix, and a 2D projection. + + Parameters + ---------- + counts : tuple | list | pd.DataFrame | AnnData | None + Input data in one of the following formats: + - tuple/list: (data, gene_names, cell_names) where data is sparse/dense matrix + - pd.DataFrame: cells x genes expression matrix + - AnnData: annotated data object + Only use this argument if you want to pass in preloaded data. + Otherwise use one of the load functions. + inplace : bool, optional + If True and counts is AnnData, use the object directly without copying. + Default is False. + + Attributes + ---------- + preprocess_args : dict + Dictionary of arguments used for the 'preprocess_data' function. + run_args : dict + Dictionary of arguments used for the 'run' function. + adata_raw : AnnData + An AnnData object containing the raw, unfiltered input data. + adata : AnnData + An AnnData object containing all processed data and SAM outputs. + + Examples + -------- + >>> sam = SAM() + >>> sam.load_data("expression_matrix.h5ad") + >>> sam.preprocess_data() + >>> sam.run() + >>> sam.leiden_clustering() + """ + + def __init__( + self, + counts: ( + tuple[sp.spmatrix | NDArray[np.floating[Any]], NDArray[Any], NDArray[Any]] + | list[Any] + | pd.DataFrame + | AnnData + | None + ) = None, + inplace: bool = False, + ) -> None: + self.run_args: dict[str, Any] = {} + self.preprocess_args: dict[str, Any] = {} + + if isinstance(counts, (tuple, list)): + raw_data, all_gene_names, all_cell_names = counts + if isinstance(raw_data, np.ndarray): + raw_data = sp.csr_matrix(raw_data) + + self.adata_raw = AnnData( + X=raw_data, + obs={"obs_names": all_cell_names}, + var={"var_names": all_gene_names}, + ) + + elif isinstance(counts, pd.DataFrame): + raw_data = sp.csr_matrix(counts.values) + all_gene_names = np.array(list(counts.columns.values)) + all_cell_names = np.array(list(counts.index.values)) + + self.adata_raw = AnnData( + X=raw_data, + obs={"obs_names": all_cell_names}, 
+ var={"var_names": all_gene_names}, + ) + + elif isinstance(counts, AnnData): + all_cell_names = np.array(list(counts.obs_names)) + all_gene_names = np.array(list(counts.var_names)) + if counts.is_view: + counts = counts.copy() + + if inplace: + self.adata_raw = counts + else: + self.adata_raw = counts.copy() + + elif counts is not None: + raise TypeError( + "'counts' must be either a tuple/list of " + "(data, gene IDs, cell IDs), a Pandas DataFrame of " + "cells x genes, or an AnnData object." + ) + + if counts is not None: + if np.unique(all_gene_names).size != all_gene_names.size: + self.adata_raw.var_names_make_unique() + if np.unique(all_cell_names).size != all_cell_names.size: + self.adata_raw.obs_names_make_unique() + + if inplace: + self.adata = self.adata_raw + else: + self.adata = self.adata_raw.copy() + + if "X_disp" not in self.adata_raw.layers.keys(): + self.adata.layers["X_disp"] = self.adata.X + + def preprocess_data( + self, + div: float = 1, + downsample: float = 0, + sum_norm: str | float | None = "cell_median", + norm: str | None = "log", + min_expression: float = 1, + thresh_low: float = 0.0, + thresh_high: float = 0.96, + thresh: float | None = None, + filter_genes: bool = True, + ) -> None: + """Log-normalizes and filters the expression data. + + Parameters + ---------- + div : float, optional + The factor by which the gene expression will be divided prior to + normalization. Default is 1. + downsample : float, optional + The factor by which to randomly downsample the data. If 0, the + data will not be downsampled. Default is 0. + sum_norm : str | float | None, optional + Library normalization method. Options: + - float: Normalize each cell to this total count + - 'cell_median': Normalize to median total count per cell + - 'gene_median': Normalize genes to median total count per gene + - None: No normalization + Default is 'cell_median'. + norm : str | None, optional + Data transformation method. 
Options: + - 'log': log2(x + 1) transformation + - 'ftt': Freeman-Tukey variance-stabilizing transformation + - 'asin': arcsinh transformation + - 'multinomial': Pearson residual transformation (experimental) + - None: No transformation + Default is 'log'. + min_expression : float, optional + Threshold above which a gene is considered expressed. Values below + this are set to zero. Default is 1. + thresh_low : float, optional + Keep genes expressed in greater than thresh_low*100% of cells. + Default is 0.0. + thresh_high : float, optional + Keep genes expressed in less than thresh_high*100% of cells. + Default is 0.96. + thresh : float | None, optional + If provided, sets thresh_low=thresh and thresh_high=1-thresh. + filter_genes : bool, optional + Whether to apply gene filtering. Default is True. + """ + if thresh is not None: + thresh_low = thresh + thresh_high = 1 - thresh + + # Load data - check this first before accessing adata + if not hasattr(self, "adata_raw"): + raise DataNotLoadedError() + + self.preprocess_args = { + "div": div, + "sum_norm": sum_norm, + "norm": norm, + "min_expression": min_expression, + "thresh_low": thresh_low, + "thresh_high": thresh_high, + "filter_genes": filter_genes, + } + + self.run_args = self.adata.uns.get("run_args", {}) + + D = self.adata_raw.X + self.adata = self.adata_raw.copy() + + D = self.adata.X + if isinstance(D, np.ndarray): + D = sp.csr_matrix(D, dtype="float32") + else: + if str(D.dtype) != "float32": + D = D.astype("float32") + D.sort_indices() + + if D.getformat() == "csc": + D = D.tocsr() + + # Sum-normalize + if sum_norm == "cell_median" and norm != "multinomial": + s = np.asarray(D.sum(1)).flatten() + sum_norm_val = np.median(s) + D = D.multiply(1 / s[:, None] * sum_norm_val).tocsr() + elif sum_norm == "gene_median" and norm != "multinomial": + s = np.asarray(D.sum(0)).flatten() + sum_norm_val = np.median(s[s > 0]) + s[s == 0] = 1 + D = D.multiply(1 / s[None, :] * sum_norm_val).tocsr() + elif sum_norm is not 
None and norm != "multinomial": + D = D.multiply(1 / np.asarray(D.sum(1)).flatten()[:, None] * sum_norm).tocsr() + + # Normalize + self.adata.X = D + if norm is None: + D.data[:] = D.data / div + + elif norm.lower() == "log": + D.data[:] = np.log2(D.data / div + 1) + + elif norm.lower() == "ftt": + D.data[:] = np.sqrt(D.data / div) + np.sqrt(D.data / div + 1) - 1 + + elif norm.lower() == "asin": + D.data[:] = np.arcsinh(D.data / div) + + elif norm.lower() == "multinomial": + ni = np.asarray(D.sum(1)).flatten() # cells + pj = np.asarray(D.sum(0) / D.sum()).flatten() # genes + col = D.indices + row = [] + for i in range(D.shape[0]): + row.append(i * np.ones(D.indptr[i + 1] - D.indptr[i])) + row = np.concatenate(row).astype("int32") + mu = sp.coo_matrix((ni[row] * pj[col], (row, col))).tocsr() + mu2 = mu.copy() + mu2.data[:] = mu2.data**2 + mu2 = mu2.multiply(1 / ni[:, None]) + mu.data[:] = (D.data - mu.data) / np.sqrt(mu.data - mu2.data) + + self.adata.X = mu + if sum_norm is None: + sum_norm = np.median(ni) + D = D.multiply(1 / ni[:, None] * sum_norm).tocsr() + D.data[:] = np.log2(D.data / div + 1) + + else: + D.data[:] = D.data / div + + # Zero-out low-expressed genes + idx = np.where(D.data <= min_expression)[0] + D.data[idx] = 0 + + # Filter genes + idx_genes = np.arange(D.shape[1]) + if filter_genes: + a, ct = np.unique(D.indices, return_counts=True) + c = np.zeros(D.shape[1]) + c[a] = ct + + keep = np.where( + np.logical_and(c / D.shape[0] > thresh_low, c / D.shape[0] <= thresh_high) + )[0] + + idx_genes = np.array(list(set(keep) & set(idx_genes)), dtype=np.intp) + + mask_genes = np.zeros(D.shape[1], dtype="bool") + mask_genes[idx_genes] = True + + self.adata.X = self.adata.X.multiply(mask_genes[None, :]).tocsr() + self.adata.X.eliminate_zeros() + self.adata.var["mask_genes"] = mask_genes + + if norm == "multinomial": + self.adata.layers["X_disp"] = D.multiply(mask_genes[None, :]).tocsr() + self.adata.layers["X_disp"].eliminate_zeros() + else: + 
self.adata.layers["X_disp"] = self.adata.X + + self.calculate_mean_var() + + self.adata.uns["preprocess_args"] = self.preprocess_args + self.adata.uns["run_args"] = self.run_args + + def calculate_mean_var(self, adata: AnnData | None = None) -> None: + """Calculate mean and variance for each gene. + + Parameters + ---------- + adata : AnnData | None, optional + The AnnData object to calculate statistics for. + If None, uses self.adata. + """ + if adata is None: + adata = self.adata + + if sp.issparse(adata.X): + mu, var = sf.mean_variance_axis(adata.X, axis=0) + else: + mu = adata.X.mean(0) + var = adata.X.var(0) + + adata.var["means"] = mu + adata.var["variances"] = var + + def get_avg_obsm(self, keym: str, keyl: str) -> NDArray[np.floating[Any]]: + """Get average obsm values grouped by label. + + Parameters + ---------- + keym : str + Key in adata.obsm to average. + keyl : str + Key in adata.obs containing labels for grouping. + + Returns + ------- + NDArray + Array of averaged values for each unique label. + """ + clu = self.get_labels_un(keyl) + cl = self.get_labels(keyl) + x = [] + for i in range(clu.size): + x.append(self.adata.obsm[keym][cl == clu[i]].mean(0)) + return np.vstack(x) + + def get_labels_un(self, key: str) -> NDArray[Any]: + """Get unique labels from obs. + + Parameters + ---------- + key : str + Key in adata.obs. + + Returns + ------- + NDArray + Array of unique labels. + """ + if key not in list(self.adata.obs.keys()): + logger.warning("Key '%s' does not exist in `obs`.", key) + return np.array([]) + return np.array(list(np.unique(self.adata.obs[key]))) + + def get_labels(self, key: str) -> NDArray[Any]: + """Get labels from obs. + + Parameters + ---------- + key : str + Key in adata.obs. + + Returns + ------- + NDArray + Array of labels. 
+ """ + if key not in list(self.adata.obs.keys()): + logger.warning("Key '%s' does not exist in `obs`.", key) + return np.array([]) + return np.array(list(self.adata.obs[key])) + + def get_cells(self, label: Any, key: str) -> NDArray[Any]: + """Retrieves cells of a particular annotation. + + Parameters + ---------- + label : Any + The annotation value to retrieve. + key : str + The key in `obs` from which to retrieve the annotation. + + Returns + ------- + NDArray + Array of cell names matching the label. + """ + if key not in list(self.adata.obs.keys()): + logger.warning("Key '%s' does not exist in `obs`.", key) + return np.array([]) + return np.array(list(self.adata.obs_names[np.array(list(self.adata.obs[key])) == label])) + + def load_data( + self, + filename: str, + transpose: bool = True, + save_sparse_file: str | None = None, + sep: str = ",", + calculate_avg: bool = False, + **kwargs: Any, + ) -> None: + """Load expression data from file. + + Parameters + ---------- + filename : str + Path to the data file. Supported formats: + - .csv/.txt: Tabular format (genes x cells by default) + - .h5ad: AnnData format + - .p: Pickle format + transpose : bool, optional + If True (default), assumes file is genes x cells. + Set to False if file is cells x genes. + save_sparse_file : str | None, optional + If provided, save processed data to this path (.h5ad format). + sep : str, optional + Delimiter for CSV/TXT files. Default is ','. + calculate_avg : bool, optional + If True and loading .h5ad with existing neighbors, perform + kNN averaging. Default is False. + **kwargs + Additional arguments passed to file loading functions. 
+ """ + if filename.split(".")[-1] == "p": + raw_data, all_cell_names, all_gene_names = pickle.load(open(filename, "rb")) + + if transpose: + raw_data = raw_data.T + if raw_data.getformat() == "csc": + logger.info("Converting sparse matrix to csr format...") + raw_data = raw_data.tocsr() + + save_sparse_file = None + elif filename.split(".")[-1] != "h5ad": + df = pd.read_csv(filename, sep=sep, index_col=0, **kwargs) + if transpose: + dataset = df.T + else: + dataset = df + + raw_data = sp.csr_matrix(dataset.values) + all_cell_names = np.array(list(dataset.index.values)) + all_gene_names = np.array(list(dataset.columns.values)) + + if filename.split(".")[-1] != "h5ad": + self.adata_raw = AnnData( + X=raw_data, + obs={"obs_names": all_cell_names}, + var={"var_names": all_gene_names}, + ) + + if np.unique(all_gene_names).size != all_gene_names.size: + self.adata_raw.var_names_make_unique() + if np.unique(all_cell_names).size != all_cell_names.size: + self.adata_raw.obs_names_make_unique() + + self.adata = self.adata_raw.copy() + self.adata.layers["X_disp"] = raw_data + + else: + self.adata = anndata.read_h5ad(filename, **kwargs) + if self.adata.raw is not None: + self.adata_raw = AnnData(X=self.adata.raw.X) + self.adata_raw.var_names = self.adata.var_names + self.adata_raw.obs_names = self.adata.obs_names + self.adata_raw.obs = self.adata.obs + + del self.adata.raw + + if ( + "X_knn_avg" not in self.adata.layers.keys() + and "connectivities" in self.adata.obsp.keys() + and calculate_avg + ): + self.dispersion_ranking_NN(save_avgs=True) + else: + self.adata_raw = self.adata + + if "X_disp" not in list(self.adata.layers.keys()): + self.adata.layers["X_disp"] = self.adata.X + save_sparse_file = None + + filename = ".".join(filename.split(".")[:-1]) + ".h5ad" + self.adata.uns["path_to_file"] = filename + self.adata_raw.uns["path_to_file"] = filename + + if save_sparse_file is not None: + if save_sparse_file.split(".")[-1] == "p": + self.save_sparse_data(save_sparse_file) 
+ elif save_sparse_file.split(".")[-1] == "h5ad": + self.save_anndata(save_sparse_file) + + def save_anndata(self, fname: str = "", save_knn: bool = False, **kwargs: Any) -> None: + """Save adata to an h5ad file. + + Parameters + ---------- + fname : str, optional + Output file path. If empty, uses path from adata.uns['path_to_file']. + save_knn : bool, optional + If True, include X_knn_avg layer. Default is False (layer can be large). + **kwargs + Additional arguments passed to AnnData.write_h5ad(). + """ + Xknn = None + if not save_knn: + if "X_knn_avg" in self.adata.layers: + Xknn = self.adata.layers["X_knn_avg"] + del self.adata.layers["X_knn_avg"] + + if fname == "": + if "path_to_file" not in self.adata.uns: + raise KeyError("Path to file not known.") + fname = self.adata.uns["path_to_file"] + + x = self.adata + x.raw = self.adata_raw + + # Fix weird issues when index name is an integer + for y in [ + x.obs.columns, + x.var.columns, + x.obs.index, + x.var.index, + x.raw.var.index, + x.raw.var.columns, + ]: + y.name = str(y.name) if y.name is not None else None + + x.write_h5ad(fname, **kwargs) + del x.raw + + if Xknn is not None: + self.adata.layers["X_knn_avg"] = Xknn + + def load_var_annotations( + self, aname: str | pd.DataFrame, sep: str = ",", key_added: str = "annotations" + ) -> None: + """Load gene annotations. + + Parameters + ---------- + aname : str | pd.DataFrame + Path to annotations file or DataFrame. First column should be gene IDs. + sep : str, optional + Delimiter for file. Default is ','. + key_added : str, optional + Unused parameter for backwards compatibility. + """ + if isinstance(aname, pd.DataFrame): + ann = aname + else: + ann = pd.read_csv(aname, sep=sep, index_col=0) + + for i in range(ann.shape[1]): + self.adata_raw.var[ann.columns[i]] = ann[ann.columns[i]] + self.adata.var[ann.columns[i]] = ann[ann.columns[i]] + + def load_obs_annotations(self, aname: str | pd.DataFrame, sep: str = ",") -> None: + """Load cell annotations. 
+ + Parameters + ---------- + aname : str | pd.DataFrame + Path to annotations file or DataFrame. First column should be cell IDs. + sep : str, optional + Delimiter for file. Default is ','. + """ + if isinstance(aname, pd.DataFrame): + ann = aname + else: + ann = pd.read_csv(aname, sep=sep, index_col=0) + + for i in range(ann.shape[1]): + self.adata_raw.obs[ann.columns[i]] = ann[ann.columns[i]] + self.adata.obs[ann.columns[i]] = ann[ann.columns[i]] + + def scatter( + self, + projection: str | NDArray[np.floating[Any]] | None = None, + c: str | NDArray[Any] | None = None, + colorspec: str | NDArray[Any] | None = None, + cmap: str = "rainbow", + linewidth: float = 0.0, + edgecolor: str = "k", + axes: Any | None = None, + colorbar: bool = True, + s: float = 10, + **kwargs: Any, + ) -> Any: + """Display a scatter plot. + + Parameters + ---------- + projection : str | NDArray | None, optional + Key in adata.obsm or 2D coordinates array. Default is UMAP. + c : str | NDArray | None, optional + Color data - key in adata.obs or array. + colorspec : str | NDArray | None, optional + Direct color specification. + cmap : str, optional + Colormap name. Default is 'rainbow'. + linewidth : float, optional + Marker edge width. Default is 0.0. + edgecolor : str, optional + Marker edge color. Default is 'k'. + axes : matplotlib.axes.Axes | None, optional + Existing axes to plot on. + colorbar : bool, optional + Whether to show colorbar. Default is True. + s : float, optional + Marker size. Default is 10. + **kwargs + Additional arguments passed to matplotlib.pyplot.scatter. + + Returns + ------- + matplotlib.axes.Axes + The axes object. 
+ """ + try: + import matplotlib.pyplot as plt + + if isinstance(projection, str): + if projection not in self.adata.obsm: + logger.error("Please create a projection first using run_umap or run_tsne") + return None + dt = self.adata.obsm[projection] + + elif projection is None: + if "X_umap" in self.adata.obsm: + dt = self.adata.obsm["X_umap"] + elif "X_tsne" in self.adata.obsm: + dt = self.adata.obsm["X_tsne"] + else: + logger.error("Please create either a t-SNE or UMAP projection first.") + return None + else: + dt = projection + + if axes is None: + plt.figure() + axes = plt.gca() + + if colorspec is not None: + axes.scatter( + dt[:, 0], + dt[:, 1], + s=s, + linewidth=linewidth, + edgecolor=edgecolor, + c=colorspec, + **kwargs, + ) + elif c is None: + axes.scatter( + dt[:, 0], + dt[:, 1], + s=s, + linewidth=linewidth, + edgecolor=edgecolor, + **kwargs, + ) + else: + if isinstance(c, str): + try: + c = self.get_labels(c) + except KeyError: + pass + + if (isinstance(c[0], str) or isinstance(c[0], np.str_)) and ( + isinstance(c, np.ndarray) or isinstance(c, list) + ): + i = ut.convert_annotations(c) + ui, ai = np.unique(i, return_index=True) + cax = axes.scatter( + dt[:, 0], + dt[:, 1], + c=i, + cmap=cmap, + s=s, + linewidth=linewidth, + edgecolor=edgecolor, + **kwargs, + ) + + if colorbar: + cbar = plt.colorbar(cax, ax=axes, ticks=ui) + cbar.ax.set_yticklabels(c[ai]) + else: + if not (isinstance(c, np.ndarray) or isinstance(c, list)): + colorbar = False + i = c + + # Only pass cmap if c is numeric data (not color specs) + scatter_kwargs: dict[str, Any] = { + "c": i, + "s": s, + "linewidth": linewidth, + "edgecolor": edgecolor, + **kwargs, + } + # Check if c is numeric array data suitable for colormapping + if isinstance(i, np.ndarray) and np.issubdtype(i.dtype, np.number): + scatter_kwargs["cmap"] = cmap + + cax = axes.scatter( + dt[:, 0], + dt[:, 1], + **scatter_kwargs, + ) + + if colorbar: + plt.colorbar(cax, ax=axes) + return axes + except ImportError: + 
logger.error("matplotlib not installed!") + return None + + def show_gene_expression( + self, gene: str, avg: bool = True, axes: Any | None = None, **kwargs: Any + ) -> tuple[Any, NDArray[np.floating[Any]]] | None: + """Display a gene's expression pattern. + + Parameters + ---------- + gene : str + Gene name to display. + avg : bool, optional + If True, use k-nearest-neighbor averaged expression. Default is True. + axes : matplotlib.axes.Axes | None, optional + Existing axes to plot on. + **kwargs + Additional arguments passed to scatter(). + + Returns + ------- + tuple[Axes, NDArray] | None + The axes object and expression values, or None if gene not found. + """ + all_gene_names = np.array(list(self.adata.var_names)) + cell_names = np.array(list(self.adata.obs_names)) + all_cell_names = np.array(list(self.adata_raw.obs_names)) + idx2 = np.where(np.isin(all_cell_names, cell_names))[0] + idx = np.where(all_gene_names == gene)[0] + name = gene + if idx.size == 0: + logger.warning( + "Gene not found in the filtered dataset. Note that genes are case sensitive." 
+ ) + return None + + if avg: + a = self.adata.layers["X_knn_avg"][:, idx].toarray().flatten() + if a.sum() == 0: + a = self.adata_raw.X[:, idx].toarray().flatten()[idx2] + norm = self.preprocess_args.get("norm", "log") + if norm is not None: + if norm.lower() == "log": + a = np.log2(a + 1) + elif norm.lower() == "ftt": + a = np.sqrt(a) + np.sqrt(a + 1) + elif norm.lower() == "asin": + a = np.arcsinh(a) + else: + a = self.adata_raw.X[:, idx].toarray().flatten()[idx2] + norm = self.preprocess_args.get("norm", "log") + if norm is not None: + if norm.lower() == "log": + a = np.log2(a + 1) + elif norm.lower() == "ftt": + a = np.sqrt(a) + np.sqrt(a + 1) + elif norm.lower() == "asin": + a = np.arcsinh(a) + + axes = self.scatter(c=a, axes=axes, **kwargs) + if axes is not None: + axes.set_title(name) + + return axes, a + + def dispersion_ranking_NN( + self, + nnm: sp.spmatrix | None = None, + num_norm_avg: int = 50, + weight_mode: Literal["dispersion", "variance", "rms", "combined"] = "combined", + save_avgs: bool = False, + adata: AnnData | None = None, + ) -> NDArray[np.float64]: + """Compute spatial dispersion factors for each gene. + + Parameters + ---------- + nnm : scipy.sparse.spmatrix | None, optional + Cell-to-cell nearest-neighbor matrix. If None, uses + adata.obsp['connectivities']. + num_norm_avg : int, optional + Number of top dispersions to average for normalization. Default is 50. + weight_mode : str, optional + Weight calculation method. One of 'dispersion', 'variance', 'rms', + 'combined'. Default is 'combined'. + save_avgs : bool, optional + If True, save kNN-averaged values to layers['X_knn_avg']. Default is False. + adata : AnnData | None, optional + AnnData object to use. If None, uses self.adata. + + Returns + ------- + NDArray[np.float64] + Vector of gene weights. 
+ """ + if adata is None: + adata = self.adata + + if nnm is None: + nnm = adata.obsp["connectivities"] + f = np.asarray(nnm.sum(1)) + f[f == 0] = 1 + D_avg = (nnm.multiply(1 / f)).dot(adata.layers["X_disp"]) + + if save_avgs: + adata.layers["X_knn_avg"] = D_avg.copy() + + if sp.issparse(D_avg): + mu, var = sf.mean_variance_axis(D_avg, axis=0) + if weight_mode == "rms": + D_avg.data[:] = D_avg.data**2 + mu, _ = sf.mean_variance_axis(D_avg, axis=0) + mu = mu**0.5 + + if weight_mode == "combined": + D_avg.data[:] = D_avg.data**2 + mu2, _ = sf.mean_variance_axis(D_avg, axis=0) + mu2 = mu2**0.5 + else: + mu = D_avg.mean(0) + var = D_avg.var(0) + if weight_mode == "rms": + mu = (D_avg**2).mean(0) ** 0.5 + if weight_mode == "combined": + mu2 = (D_avg**2).mean(0) ** 0.5 + + if not save_avgs: + del D_avg + gc.collect() + + if weight_mode in ("dispersion", "rms", "combined"): + dispersions = np.zeros(var.size) + dispersions[mu > 0] = var[mu > 0] / mu[mu > 0] + adata.var["spatial_dispersions"] = dispersions.copy() + + if weight_mode == "combined": + dispersions2 = np.zeros(var.size) + dispersions2[mu2 > 0] = var[mu2 > 0] / mu2[mu2 > 0] + + elif weight_mode == "variance": + dispersions = var + adata.var["spatial_variances"] = dispersions.copy() + else: + raise InvalidParameterError( + "weight_mode", + weight_mode, + valid_values=["dispersion", "variance", "rms", "combined"], + ) + + ma = np.sort(dispersions)[-num_norm_avg:].mean() + dispersions[dispersions >= ma] = ma + + weights = ((dispersions / dispersions.max()) ** 0.5).flatten() + + if weight_mode == "combined": + ma = np.sort(dispersions2)[-num_norm_avg:].mean() + dispersions2[dispersions2 >= ma] = ma + + weights2 = ((dispersions2 / dispersions2.max()) ** 0.5).flatten() + weights = np.vstack((weights, weights2)).max(0) + + return weights + + def run( + self, + max_iter: int = 10, + verbose: bool = True, + projection: Literal["umap", "tsne", "diff_umap"] | None = "umap", + stopping_condition: float = 1e-2, + 
num_norm_avg: int = 50, + k: int = 20, + distance: Literal["correlation", "euclidean", "cosine"] = "cosine", + preprocessing: Literal["StandardScaler", "Normalizer"] | None = "StandardScaler", + npcs: int = 150, + n_genes: int | None = 3000, + weight_PCs: bool = False, + sparse_pca: bool = False, + proj_kwargs: dict[str, Any] | None = None, + seed: int = 0, + weight_mode: Literal["dispersion", "variance", "rms", "combined"] = "rms", + components: NDArray[np.floating[Any]] | None = None, + batch_key: str | None = None, + ) -> None: + """Run the Self-Assembling Manifold algorithm. + + Parameters + ---------- + max_iter : int, optional + Maximum number of iterations. Default is 10. + verbose : bool, optional + If True, print progress. Default is True. + projection : str | None, optional + Projection method: 'umap', 'tsne', 'diff_umap', or None. Default is 'umap'. + stopping_condition : float, optional + RMSE threshold for convergence. Default is 1e-2. + num_norm_avg : int, optional + Top dispersions to average for normalization. Default is 50. + k : int, optional + Number of nearest neighbors. Default is 20. + distance : str, optional + Distance metric: 'correlation', 'euclidean', 'cosine'. Default is 'cosine'. + preprocessing : str | None, optional + Preprocessing method: 'StandardScaler', 'Normalizer', None. Default is 'StandardScaler'. + npcs : int, optional + Number of principal components. Default is 150. + n_genes : int | None, optional + Number of genes to use. Default is 3000. If None, uses all genes. + weight_PCs : bool, optional + Weight PCs by eigenvalues. Default is False. + sparse_pca : bool, optional + Use sparse PCA implementation. Default is False. + proj_kwargs : dict | None, optional + Additional arguments for projection. Default is None. + seed : int, optional + Random seed. Default is 0. + weight_mode : str, optional + Weight calculation mode. Default is 'rms'. + components : NDArray | None, optional + Pre-computed PCA components. Default is None. 
+ batch_key : str | None, optional + Key in obs for batch correction with Harmony. Default is None. + """ + if proj_kwargs is None: + proj_kwargs = {} + + D = self.adata.X + if k < 5: + k = 5 + if k > D.shape[0] - 1: + k = D.shape[0] - 2 + + if preprocessing not in ("StandardScaler", "Normalizer", None, "None"): + raise InvalidParameterError( + "preprocessing", + preprocessing, + valid_values=["StandardScaler", "Normalizer", None], + ) + if weight_mode not in ("dispersion", "variance", "rms", "combined"): + raise InvalidParameterError( + "weight_mode", + weight_mode, + valid_values=["dispersion", "variance", "rms", "combined"], + ) + + if self.adata.layers["X_disp"].min() < 0 and weight_mode == "dispersion": + logger.warning( + "`X_disp` layer contains negative values. Setting `weight_mode` to 'rms'." + ) + weight_mode = "rms" + + numcells = D.shape[0] + + if n_genes is None: + n_genes = self.adata.shape[1] + if not sparse_pca and numcells > 10000: + warnings.warn( + "All genes are being used. It is recommended " + "to set `sparse_pca=True` to satisfy memory " + "constraints for datasets with more than " + "10,000 cells. Setting `sparse_pca` to True." 
+ ) + sparse_pca = True + + if not sparse_pca: + n_genes = min(n_genes, (D.sum(0) > 0).sum()) + + self.run_args = { + "max_iter": max_iter, + "verbose": verbose, + "projection": projection, + "stopping_condition": stopping_condition, + "num_norm_avg": num_norm_avg, + "k": k, + "distance": distance, + "preprocessing": preprocessing, + "npcs": npcs, + "n_genes": n_genes, + "weight_PCs": weight_PCs, + "proj_kwargs": proj_kwargs, + "sparse_pca": sparse_pca, + "weight_mode": weight_mode, + "seed": seed, + "components": components, + } + self.adata.uns["run_args"] = self.run_args + + tinit = time.time() + np.random.seed(seed) + + if verbose: + logger.info("Running SAM algorithm") + + W = np.ones(D.shape[1]) + self.adata.var["weights"] = W + + old = np.zeros(W.size) + new = W + + i = 0 + err = ((new - old) ** 2).mean() ** 0.5 + + if max_iter < 5: + max_iter = 5 + + nnas = num_norm_avg + + while i < max_iter and err > stopping_condition: + conv = err + if verbose: + logger.info("Iteration: %d, Convergence: %.6f", i, conv) + + i += 1 + old = new + first = i == 1 + + W = self.calculate_nnm( + batch_key=batch_key, + n_genes=n_genes, + preprocessing=preprocessing, + npcs=npcs, + num_norm_avg=nnas, + weight_PCs=weight_PCs, + sparse_pca=sparse_pca, + weight_mode=weight_mode, + seed=seed, + components=components, + first=first, + ) + gc.collect() + new = W + err = ((new - old) ** 2).mean() ** 0.5 + self.adata.var["weights"] = W + + all_gene_names = np.array(list(self.adata.var_names)) + indices = np.argsort(-W) + ranked_genes = all_gene_names[indices] + + self.adata.uns["ranked_genes"] = ranked_genes + + if projection == "tsne": + if verbose: + logger.info("Computing the t-SNE embedding...") + self.run_tsne(**proj_kwargs) + elif projection == "umap": + if verbose: + logger.info("Computing the UMAP embedding...") + self.run_umap(seed=seed, **proj_kwargs) + elif projection == "diff_umap": + if verbose: + logger.info("Computing the diffusion UMAP embedding...") + 
self.run_diff_umap(**proj_kwargs) + + elapsed = time.time() - tinit + if verbose: + logger.info("Elapsed time: %.2f seconds", elapsed) + + def calculate_nnm( + self, + adata: AnnData | None = None, + batch_key: str | None = None, + g_weighted: NDArray[np.floating[Any]] | None = None, + n_genes: int = 3000, + preprocessing: str | None = "StandardScaler", + npcs: int = 150, + num_norm_avg: int = 50, + weight_PCs: bool = False, + sparse_pca: bool = False, + update_manifold: bool = True, + weight_mode: str = "dispersion", + seed: int = 0, + components: NDArray[np.floating[Any]] | None = None, + first: bool = False, + ) -> NDArray[np.float64] | tuple[NDArray[np.floating[Any]], NDArray[np.floating[Any]]]: + """Calculate nearest neighbor matrix and update weights. + + This is the core iteration step of the SAM algorithm. + + Parameters + ---------- + adata : AnnData | None + AnnData object to use. + batch_key : str | None + Key for batch correction. + g_weighted : NDArray | None + Pre-computed weighted coordinates. + n_genes : int + Number of genes to use. + preprocessing : str | None + Preprocessing method. + npcs : int + Number of PCs. + num_norm_avg : int + Normalization averaging. + weight_PCs : bool + Weight by eigenvalues. + sparse_pca : bool + Use sparse PCA. + update_manifold : bool + Update manifold structure. + weight_mode : str + Weight calculation mode. + seed : int + Random seed. + components : NDArray | None + Pre-computed components. + first : bool + Is this the first iteration. + + Returns + ------- + NDArray | tuple + Gene weights, or (PCs, weighted_coords) if not updating manifold. 
+ """ + if adata is None: + adata = self.adata + + numcells = adata.shape[0] + k = adata.uns["run_args"].get("k", 20) + distance = adata.uns["run_args"].get("distance", "correlation") + + D = adata.X + W = adata.var["weights"].values + + if "means" not in adata.var.keys() or "variances" not in adata.var.keys(): + self.calculate_mean_var(adata) + + if n_genes is None: + gkeep = np.arange(W.size) + else: + if first: + mu = np.array(list(adata.var["means"])) + var = np.array(list(adata.var["variances"])) + mu[mu == 0] = 1 + dispersions = var / mu + gkeep = np.sort(np.argsort(-dispersions)[:n_genes]) + else: + gkeep = np.sort(np.argsort(-W)[:n_genes]) + + if g_weighted is None: + if preprocessing == "Normalizer": + Ds = D[:, gkeep] + if sp.issparse(Ds) and not sparse_pca: + Ds = Ds.toarray() + + Ds = Normalizer().fit_transform(Ds) + + elif preprocessing == "StandardScaler": + if not sparse_pca: + Ds = D[:, gkeep] + if sp.issparse(Ds): + Ds = Ds.toarray() + + v = adata.var["variances"].values[gkeep] + m = adata.var["means"].values[gkeep] + v[v == 0] = 1 + Ds = (Ds - m) / v**0.5 + + Ds[Ds > 10] = 10 + Ds[Ds < -10] = -10 + else: + Ds = D[:, gkeep] + v = adata.var["variances"].values[gkeep] + v[v == 0] = 1 + Ds = Ds.multiply(1 / v**0.5).tocsr() + + else: + Ds = D[:, gkeep].toarray() + + if sp.issparse(Ds): + D_sub = Ds.multiply(W[gkeep]).tocsr() + else: + D_sub = Ds * (W[gkeep]) + + if components is None: + if not sparse_pca: + npcs = min(npcs, min((D.shape[0], gkeep.size))) + if numcells > 500: + g_weighted, pca = ut.weighted_PCA( + D_sub, + npcs=npcs, + do_weight=weight_PCs, + solver="auto", + seed=seed, + ) + else: + g_weighted, pca = ut.weighted_PCA( + D_sub, + npcs=npcs, + do_weight=weight_PCs, + solver="full", + seed=seed, + ) + components = pca.components_ + + else: + npcs = min(npcs, min((D.shape[0], gkeep.size)) - 1) + v = adata.var["variances"].values[gkeep] + v[v == 0] = 1 + m = adata.var["means"].values[gkeep] * W[gkeep] + if preprocessing == "StandardScaler": 
+ no = m / v**0.5 + else: + no = np.asarray(D_sub.mean(0)).flatten() + mean_correction = no + output = ut._pca_with_sparse(D_sub, npcs, mu=(no)[None, :], seed=seed) + components = output["components"] + g_weighted = output["X_pca"] + + if weight_PCs: + ev = output["variance"] + ev = ev / ev.max() + g_weighted = g_weighted * (ev**0.5) + else: + components = components[:, gkeep] + v = adata.var["variances"].values[gkeep] + v[v == 0] = 1 + m = adata.var["means"].values[gkeep] * W[gkeep] + if preprocessing == "StandardScaler": + ns = m / v**0.5 + else: + ns = np.asarray(D_sub.mean(0)).flatten() + mean_correction = ns + + if sp.issparse(D_sub): + g_weighted = D_sub.dot(components.T) - ns.flatten().dot(components.T) + else: + g_weighted = (D_sub - ns).dot(components.T) + if weight_PCs: + ev = g_weighted.var(0) + ev = ev / ev.max() + g_weighted = g_weighted * (ev**0.5) + + adata.varm["PCs"] = np.zeros(shape=(adata.n_vars, npcs)) + adata.varm["PCs"][gkeep] = components.T + adata.obsm["X_processed"] = D_sub + adata.uns["dimred_indices"] = gkeep + if sparse_pca: + mc = np.zeros(adata.shape[1]) + mc[gkeep] = mean_correction + adata.var["mean_correction"] = mc + + if batch_key is not None: + try: + import harmonypy + + harmony_out = harmonypy.run_harmony(g_weighted, adata.obs, batch_key, verbose=False) + g_weighted = harmony_out.Z_corr.T + except ImportError as err: + raise ImportError( + "harmonypy is required for batch correction. 
" + "Install it with: pip install harmonypy" + ) from err + + if update_manifold: + edm = ut.calc_nnm(g_weighted, k, distance) + # Use lil format temporarily for setdiag, then convert back to csr + edm_lil = edm.tolil() + edm_lil.setdiag(0) + adata.obsp["distances"] = edm_lil.tocsr() + + EDM = edm.copy() + EDM.data[:] = 1 + EDM_lil = EDM.tolil() + EDM_lil.setdiag(1) + EDM = EDM_lil.tocsr() + + adata.obsp["connectivities"] = EDM + + if distance in ("correlation", "cosine"): + edm.data[:] = 1 - edm.data + edm_lil = edm.tolil() + edm_lil.setdiag(1) + edm = edm_lil.tocsr() + edm.data[edm.data < 0] = 0.001 + adata.obsp["nnm"] = edm + else: + adata.obsp["nnm"] = EDM + W = self.dispersion_ranking_NN( + EDM, weight_mode=weight_mode, num_norm_avg=num_norm_avg, adata=adata + ) + adata.obsm["X_pca"] = g_weighted + return W + else: + logger.info("Not updating the manifold...") + PCs = np.zeros(shape=(adata.n_vars, npcs)) + PCs[gkeep] = components.T + return PCs, g_weighted + + def run_tsne( + self, + X: NDArray[np.floating[Any]] | None = None, + metric: str = "correlation", + **kwargs: Any, + ) -> NDArray[np.floating[Any]] | None: + """Compute t-SNE embedding. + + Parameters + ---------- + X : NDArray | None, optional + Input data. If None, uses X_pca from adata. + metric : str, optional + Distance metric. Default is 'correlation'. + **kwargs + Additional arguments passed to sklearn.manifold.TSNE. + + Returns + ------- + NDArray | None + t-SNE coordinates if X provided, None otherwise. + """ + if X is not None: + dt = man.TSNE(metric=metric, **kwargs).fit_transform(X) + return dt + + distance = self.adata.uns["run_args"].get("distance", "correlation") + dt = man.TSNE(metric=distance, **kwargs).fit_transform(self.adata.obsm["X_pca"]) + self.adata.obsm["X_tsne"] = dt + return None + + def run_umap( + self, + X: str | NDArray[np.floating[Any]] = "X_pca", + metric: str | None = None, + seed: int | None = None, + **kwargs: Any, + ) -> Any: + """Compute UMAP embedding. 
+ + Parameters + ---------- + X : str | NDArray, optional + Key in obsm or data array. Default is 'X_pca'. + metric : str | None, optional + Distance metric. If None, uses value from run_args. + seed : int | None, optional + Random seed. If None, uses value from run_args. + **kwargs + Additional arguments passed to umap.UMAP. + + Returns + ------- + umap.UMAP | tuple + UMAP object if using key, or (coords, UMAP) if using array. + """ + import umap + + if metric is None: + metric = self.adata.uns["run_args"].get("distance", "correlation") + + if seed is None: + seed = self.adata.uns["run_args"].get("seed", 0) + + if isinstance(X, str): + if X == "": + X_data = self.adata.X + else: + X_data = self.adata.obsm[X] + umap_obj = umap.UMAP(metric=metric, random_state=seed, **kwargs) + umap2d = umap_obj.fit_transform(X_data) + self.adata.obsm["X_umap"] = umap2d + return umap_obj + else: + umap_obj = umap.UMAP(metric=metric, random_state=seed, **kwargs) + dt = umap_obj.fit_transform(X) + return dt, umap_obj + + def run_diff_umap( + self, + use_rep: str = "X_pca", + metric: str = "euclidean", + n_comps: int = 15, + method: str = "gauss", + **kwargs: Any, + ) -> None: + """Compute diffusion UMAP embedding. + + Requires scanpy. + + Parameters + ---------- + use_rep : str, optional + Key in obsm to use. Default is 'X_pca'. + metric : str, optional + Distance metric. Default is 'euclidean'. + n_comps : int, optional + Number of diffusion components. Default is 15. + method : str, optional + Method for scanpy neighbors. Default is 'gauss'. + **kwargs + Additional arguments. 
+ """ + import scanpy.api as sc + + k = self.adata.uns["run_args"].get("k", 20) + distance = self.adata.uns["run_args"].get("distance", "correlation") + sc.pp.neighbors(self.adata, use_rep=use_rep, n_neighbors=k, metric=distance, method=method) + sc.tl.diffmap(self.adata, n_comps=n_comps) + sc.pp.neighbors( + self.adata, + use_rep="X_diffmap", + n_neighbors=k, + metric="euclidean", + method=method, + ) + + if "X_umap" in self.adata.obsm.keys(): + temp = self.adata.obsm["X_umap"].copy() + + sc.tl.umap(self.adata, min_dist=0.1, copy=False) + temp2 = self.adata.obsm["X_umap"] + self.adata.obsm["X_umap"] = temp + self.adata.obsm["X_diff_umap"] = temp2 + + def run_diff_map( + self, + use_rep: str = "X_pca", + metric: str = "euclidean", + n_comps: int = 15, + method: str = "gauss", + **kwargs: Any, + ) -> None: + """Compute diffusion map embedding. + + Requires scanpy. + + Parameters + ---------- + use_rep : str, optional + Key in obsm to use. Default is 'X_pca'. + metric : str, optional + Distance metric. Default is 'euclidean'. + n_comps : int, optional + Number of components. Default is 15. + method : str, optional + Method for neighbors. Default is 'gauss'. + **kwargs + Additional arguments. + """ + import scanpy.api as sc + + k = self.adata.uns["run_args"].get("k", 20) + distance = self.adata.uns["run_args"].get("distance", "correlation") + sc.pp.neighbors(self.adata, use_rep=use_rep, n_neighbors=k, metric=distance, method=method) + sc.tl.diffmap(self.adata, n_comps=n_comps + 1) + self.adata.obsm["X_diffmap"] = self.adata.obsm["X_diffmap"][:, 1:] + + def density_clustering( + self, + X: NDArray[np.floating[Any]] | None = None, + eps: float = 1, + metric: str = "euclidean", + **kwargs: Any, + ) -> NDArray[np.int64] | None: + """Perform DBSCAN clustering. + + Parameters + ---------- + X : NDArray | None, optional + Input coordinates. If None, uses X_umap. + eps : float, optional + DBSCAN epsilon parameter. Default is 1. + metric : str, optional + Distance metric. 
Default is 'euclidean'. + **kwargs + Additional arguments passed to DBSCAN. + + Returns + ------- + NDArray | None + Cluster labels if X provided, None otherwise. + """ + from sklearn.cluster import DBSCAN + + if X is None: + X = self.adata.obsm["X_umap"] + save = True + else: + save = False + + cl = DBSCAN(eps=eps, metric=metric, **kwargs).fit_predict(X) + k = self.adata.uns["run_args"].get("k", 20) + idx0 = np.where(cl != -1)[0] + idx1 = np.where(cl == -1)[0] + if idx1.size > 0 and idx0.size > 0: + xcmap = ut.generate_euclidean_map(X[idx0, :], X[idx1, :]) + knn = np.argsort(xcmap.T, axis=1)[:, :k] + nnm = np.zeros(xcmap.shape).T + nnm[ + np.tile(np.arange(knn.shape[0])[:, None], (1, knn.shape[1])).flatten(), + knn.flatten(), + ] = 1 + nnmc = np.zeros((nnm.shape[0], cl.max() + 1)) + for i in range(cl.max() + 1): + nnmc[:, i] = nnm[:, cl[idx0] == i].sum(1) + + cl[idx1] = np.argmax(nnmc, axis=1) + + if save: + self.adata.obs["dbscan_clusters"] = pd.Categorical(cl) + return None + return cl + + def clustering( + self, + X: sp.spmatrix | NDArray[np.floating[Any]] | None = None, + param: float | int | None = None, + method: Literal[ + "leiden", "leiden_sig", "louvain", "louvain_sig", "kmeans", "hdbscan", "dbscan" + ] = "leiden", + ) -> NDArray[np.int64] | None: + """Wrapper for various clustering algorithms. + + Parameters + ---------- + X : sparse matrix | NDArray | None, optional + Input data. Type depends on method. If None, uses internal data. + param : float | int | None, optional + Method-specific parameter. Resolution for leiden/louvain, + number of clusters for kmeans, eps for dbscan. + method : str, optional + Clustering method. Default is 'leiden'. + + Returns + ------- + NDArray | None + Cluster labels if X provided, None otherwise. 
+ """ + if method == "leiden": + if param is None: + param = 1 + return self.leiden_clustering(X=X, res=param, method="modularity") + elif method == "leiden_sig": + if param is None: + param = 1 + return self.leiden_clustering(X=X, res=param, method="significance") + elif method == "louvain": + if param is None: + param = 1 + return self.louvain_clustering(X=X, res=param, method="modularity") + elif method == "louvain_sig": + if param is None: + param = 1 + return self.louvain_clustering(X=X, res=param, method="significance") + elif method == "kmeans": + if param is None: + param = 6 + return self.kmeans_clustering(int(param), X=X)[0] + elif method == "hdbscan": + if param is None: + param = 25 + return self.hdbknn_clustering(npcs=int(param)) + elif method == "dbscan": + if param is None: + param = 0.5 + return self.density_clustering(eps=param) + return None + + def louvain_clustering( + self, + X: sp.spmatrix | None = None, + res: float = 1, + method: Literal["modularity", "significance"] = "modularity", + ) -> NDArray[np.int64] | None: + """Perform Louvain clustering. + + Requires louvain and igraph packages. + + Parameters + ---------- + X : sparse matrix | None, optional + Adjacency matrix. If None, uses connectivities. + res : float, optional + Resolution parameter. Default is 1. + method : str, optional + Optimization method. Default is 'modularity'. + + Returns + ------- + NDArray | None + Cluster labels if X provided, None otherwise. 
+ """ + if X is None: + X = self.adata.obsp["connectivities"] + save = True + else: + if not sp.isspmatrix_csr(X): + X = sp.csr_matrix(X) + save = False + + import igraph as ig + import louvain + + adjacency = X + sources, targets = adjacency.nonzero() + weights = adjacency[sources, targets] + if isinstance(weights, np.matrix): + weights = np.asarray(weights).flatten() + g = ig.Graph(directed=True) + g.add_vertices(adjacency.shape[0]) + g.add_edges(list(zip(sources, targets, strict=False))) + try: + g.es["weight"] = weights + except (ValueError, TypeError): + pass + + if method == "significance": + cl = louvain.find_partition(g, louvain.SignificanceVertexPartition) + else: + cl = louvain.find_partition( + g, louvain.RBConfigurationVertexPartition, resolution_parameter=res + ) + + if save: + if method == "modularity": + self.adata.obs["louvain_clusters"] = pd.Categorical(np.array(cl.membership)) + elif method == "significance": + self.adata.obs["louvain_sig_clusters"] = pd.Categorical(np.array(cl.membership)) + return None + return np.array(cl.membership) + + def kmeans_clustering( + self, + numc: int, + X: NDArray[np.floating[Any]] | None = None, + npcs: int = 25, + ) -> tuple[NDArray[np.int64], Any]: + """Perform k-means clustering. + + Parameters + ---------- + numc : int + Number of clusters. + X : NDArray | None, optional + Input coordinates. If None, uses X_pca. + npcs : int, optional + Unused parameter for backward compatibility. 
+ + Returns + ------- + tuple + (cluster_labels, kmeans_object) + """ + from sklearn.cluster import KMeans + + if X is None: + X = self.adata.obsm["X_pca"] + + km = KMeans(n_clusters=numc) + cl = km.fit_predict(Normalizer().fit_transform(X)) + + self.adata.obs["kmeans_clusters"] = pd.Categorical(cl) + return cl, km + + def leiden_clustering( + self, + X: sp.spmatrix | None = None, + res: float = 1, + method: Literal["modularity", "significance"] = "modularity", + seed: int = 0, + ) -> NDArray[np.int64] | None: + """Perform Leiden clustering. + + Requires leidenalg and igraph packages. + + Parameters + ---------- + X : sparse matrix | None, optional + Adjacency matrix. If None, uses connectivities. + res : float, optional + Resolution parameter. Default is 1. + method : str, optional + Optimization method. Default is 'modularity'. + seed : int, optional + Random seed. Default is 0. + + Returns + ------- + NDArray | None + Cluster labels if X provided, None otherwise. + """ + if X is None: + X = self.adata.obsp["connectivities"] + save = True + else: + if not sp.isspmatrix_csr(X): + X = sp.csr_matrix(X) + save = False + + import igraph as ig + import leidenalg + + adjacency = X + sources, targets = adjacency.nonzero() + weights = adjacency[sources, targets] + if isinstance(weights, np.matrix): + weights = np.asarray(weights).flatten() + g = ig.Graph(directed=True) + g.add_vertices(adjacency.shape[0]) + g.add_edges(list(zip(sources, targets, strict=False))) + try: + g.es["weight"] = weights + except (ValueError, TypeError): + pass + + if method == "significance": + cl = leidenalg.find_partition(g, leidenalg.SignificanceVertexPartition, seed=seed) + else: + cl = leidenalg.find_partition( + g, leidenalg.RBConfigurationVertexPartition, resolution_parameter=res, seed=seed + ) + + if save: + if method == "modularity": + self.adata.obs["leiden_clusters"] = pd.Categorical(np.array(cl.membership)) + elif method == "significance": + self.adata.obs["leiden_sig_clusters"] = 
pd.Categorical(np.array(cl.membership)) + return None + return np.array(cl.membership) + + def hdbknn_clustering( + self, + X: NDArray[np.floating[Any]] | None = None, + k: int | None = None, + npcs: int = 15, + **kwargs: Any, + ) -> NDArray[np.int64] | None: + """Perform HDBSCAN clustering. + + Requires hdbscan package. + + Parameters + ---------- + X : NDArray | None, optional + Input coordinates. If None, uses X_pca. + k : int | None, optional + Number of neighbors for unassigned cells. If None, uses run_args k. + npcs : int, optional + Unused parameter for backward compatibility. + **kwargs + Additional arguments passed to HDBSCAN. + + Returns + ------- + NDArray | None + Cluster labels if X provided, None otherwise. + """ + import hdbscan + + if X is None: + X = self.adata.obsm["X_pca"] + X = Normalizer().fit_transform(X) + save = True + else: + save = False + + if k is None: + k = self.adata.uns["run_args"].get("k", 20) + + hdb = hdbscan.HDBSCAN(metric="euclidean", **kwargs) + + cl = hdb.fit_predict(X) + + idx0 = np.where(cl != -1)[0] + idx1 = np.where(cl == -1)[0] + if idx1.size > 0 and idx0.size > 0: + xcmap = ut.generate_euclidean_map(X[idx0, :], X[idx1, :]) + knn = np.argsort(xcmap.T, axis=1)[:, :k] + nnm = np.zeros(xcmap.shape).T + nnm[ + np.tile(np.arange(knn.shape[0])[:, None], (1, knn.shape[1])).flatten(), + knn.flatten(), + ] = 1 + nnmc = np.zeros((nnm.shape[0], cl.max() + 1)) + for i in range(cl.max() + 1): + nnmc[:, i] = nnm[:, cl[idx0] == i].sum(1) + + cl[idx1] = np.argmax(nnmc, axis=1) + + if save: + self.adata.obs["hdbscan_clusters"] = pd.Categorical(cl) + return None + return cl + + def identify_marker_genes_rf( + self, + labels: str | NDArray[Any] | None = None, + clusters: int | str | Sequence[int | str] | None = None, + n_genes: int = 4000, + ) -> tuple[dict[Any, NDArray[Any]], dict[Any, NDArray[np.floating[Any]]]]: + """Identify marker genes using random forest classification. 
+ + Parameters + ---------- + labels : str | NDArray | None, optional + Cluster labels or key in obs. If None, auto-detects clusters. + clusters : int | str | Sequence | None, optional + Specific cluster(s) to analyze. If None, analyzes all. + n_genes : int, optional + Number of genes for classifier training. Default is 4000. + + Returns + ------- + tuple + (markers dict, scores dict) mapping cluster IDs to gene arrays. + """ + if labels is None: + keys = np.array(list(self.adata.obs_keys())) + matches = ut.search_string(keys, "_clusters") + if matches[0] == -1: + logger.error( + "Please generate cluster labels first or set the 'labels' keyword argument." + ) + return {}, {} + lbls = self.get_labels(matches[0][0]) + elif isinstance(labels, str): + lbls = self.get_labels(labels) + else: + lbls = labels + + from sklearn.ensemble import RandomForestClassifier + + markers: dict[Any, NDArray[Any]] = {} + markers_scores: dict[Any, NDArray[np.floating[Any]]] = {} + if clusters is None: + lblsu = np.unique(lbls) + else: + lblsu = np.unique(clusters) + + indices = np.argsort(-self.adata.var["weights"].values) + X = self.adata.layers["X_disp"][:, indices[:n_genes]].toarray() + for K in range(lblsu.size): + y = np.zeros(lbls.size) + + y[lbls == lblsu[K]] = 1 + + clf = RandomForestClassifier(n_estimators=100, max_depth=None, random_state=0) + + clf.fit(X, y) + + idx = np.argsort(-clf.feature_importances_) + + markers[lblsu[K]] = self.adata.uns["ranked_genes"][idx] + markers_scores[lblsu[K]] = clf.feature_importances_[idx] + + if clusters is None: + if isinstance(labels, str): + self.adata.uns["rf_" + labels] = markers + else: + self.adata.uns["rf"] = markers + + return markers, markers_scores + + def identify_marker_genes_sw( + self, + labels: str | NDArray[Any] | None = None, + clusters: int | str | Sequence[int | str] | None = None, + inplace: bool = True, + ) -> pd.DataFrame | None: + """Identify marker genes using spatial dispersion weights. 
+ + Parameters + ---------- + labels : str | NDArray | None, optional + Cluster labels or key in obs. + clusters : int | str | Sequence | None, optional + Specific cluster(s) to analyze. + inplace : bool, optional + If True, stores scores in var. Default is True. + + Returns + ------- + pd.DataFrame | None + DataFrame of scores if inplace=False, None otherwise. + """ + if labels is None: + keys = np.array(list(self.adata.obs_keys())) + matches = ut.search_string(keys, "_clusters") + if matches[0] == -1: + logger.error( + "Please generate cluster labels first or set the 'labels' keyword argument." + ) + return None + lbls = self.get_labels(matches[0][0]) + elif isinstance(labels, str): + lbls = self.get_labels(labels) + else: + lbls = labels + + markers_scores = [] + if clusters is None: + lblsu = np.unique(lbls) + else: + lblsu = np.unique(clusters) + + if "X_knn_avg" not in list(self.adata.layers.keys()): + logger.info("Performing kNN-averaging...") + self.dispersion_ranking_NN(save_avgs=True) + layer = self.adata.layers["X_knn_avg"] + m = np.asarray(layer.mean(0)).flatten() + cells = np.array(list(self.adata.obs_names)) + for K in range(lblsu.size): + selected = np.where(np.isin(cells, self.get_cells(lblsu[K], labels)))[0] + ms = np.asarray(layer[selected, :].mean(0)).flatten() + lsub = layer[selected, :] + lsub.data[:] = lsub.data**2 + ms2 = np.asarray(lsub.mean(0)).flatten() + v = ms2 - 2 * ms * m + m**2 + wmu = np.zeros(v.size) + wmu[m > 0] = v[m > 0] / m[m > 0] + markers_scores.append(wmu) + A = pd.DataFrame( + data=np.vstack(markers_scores), index=lblsu, columns=self.adata.var_names + ).T + if inplace: + A.columns = labels + ";;" + A.columns.astype("str").astype("object") + for Ac in A.columns: + self.adata.var[Ac] = A[Ac] + return None + return A + + def identify_marker_genes_ratio( + self, labels: str | NDArray[Any] | None = None + ) -> dict[Any, NDArray[Any]]: + """Identify marker genes using SAM-weighted expression ratio. 
+ + Parameters + ---------- + labels : str | NDArray | None, optional + Cluster labels or key in obs. + + Returns + ------- + dict + Mapping cluster IDs to ranked gene arrays. + """ + if labels is None: + keys = np.array(list(self.adata.obs_keys())) + matches = ut.search_string(keys, "_clusters") + if matches[0] == -1: + logger.error( + "Please generate cluster labels first or set the 'labels' keyword argument." + ) + return {} + lbls = self.get_labels(matches[0][0]) + elif isinstance(labels, str): + lbls = self.get_labels(labels) + else: + lbls = labels + + all_gene_names = np.array(list(self.adata.var_names)) + + markers: dict[Any, NDArray[Any]] = {} + + s = np.array(self.adata.layers["X_disp"].sum(0)).flatten() + lblsu = np.unique(lbls) + for i in lblsu: + d = np.array(self.adata.layers["X_disp"][lbls == i, :].sum(0)).flatten() + rat = np.zeros(d.size) + rat[s > 0] = d[s > 0] ** 2 / s[s > 0] * self.adata.var["weights"].values[s > 0] + x = np.argsort(-rat) + markers[i] = all_gene_names[x[:]] + + self.adata.uns["marker_genes_ratio"] = markers + + return markers + + def save(self, fn: str) -> None: + """Save the SAM object to a pickle file. + + Parameters + ---------- + fn : str + Output file path. + """ + import dill + + if len(fn.split(".pkl")) == 1: + fn = fn + ".pkl" + self.path_to_file = fn + d = {} + for k in self.__dict__.keys(): + d[k] = self.__dict__[k] + with open(fn, "wb") as f: + dill.dump(d, f) + + def load(self, fn: str) -> None: + """Load a SAM object from a pickle file. + + Parameters + ---------- + fn : str + Input file path. + """ + import dill + + if len(fn.split(".pkl")) == 1: + fn = fn + ".pkl" + with open(fn, "rb") as f: + self.__dict__ = dill.load(f) diff --git a/src/samalg/utilities.py b/src/samalg/utilities.py new file mode 100644 index 0000000..7c86035 --- /dev/null +++ b/src/samalg/utilities.py @@ -0,0 +1,844 @@ +"""Utility functions for the SAM algorithm. 
+ +This module provides helper functions for PCA, nearest neighbor computation, +gene correlation analysis, and other common operations. +""" + +from __future__ import annotations + +import errno +import os +from typing import TYPE_CHECKING, Any + +import numpy as np +import scipy as sp +import sklearn.utils.sparsefuncs as sf +from scipy import sparse +from sklearn.decomposition import PCA +from sklearn.utils import check_array, check_random_state +from sklearn.utils.extmath import svd_flip +from umap.umap_ import nearest_neighbors + +from ._logging import get_logger + +if TYPE_CHECKING: + from numpy.typing import NDArray + + from .sam import SAM + +__version__ = "2.0.0" + +# Get module logger +logger = get_logger("utilities") + + +def find_corr_genes(sam: SAM, input_gene: str) -> NDArray[Any] | None: + """Rank genes by correlation of spatially averaged expression patterns. + + Parameters + ---------- + sam : SAM + The analyzed SAM object. + input_gene : str + The gene ID with respect to which correlations will be computed. + + Returns + ------- + NDArray | None + A ranked list of gene IDs based on correlation to the input gene, + or None if the gene is not found. + """ + all_gene_names = np.array(list(sam.adata.var_names)) + + D_avg = sam.adata.layers["X_knn_avg"] + + input_gene_idx = np.where(all_gene_names == input_gene)[0] + + if input_gene_idx.size == 0: + logger.warning( + "Gene not found in the filtered dataset. Note that genes are case sensitive." + ) + return None + + pw_corr = generate_correlation_map(D_avg.T.toarray(), D_avg[:, input_gene_idx].T.toarray()) + return all_gene_names[np.argsort(-pw_corr.flatten())] + + +def _pca_with_sparse( + X: sparse.spmatrix, + npcs: int, + solver: str = "arpack", + mu: NDArray[np.floating[Any]] | None = None, + seed: int = 0, + mu_axis: int = 0, +) -> dict[str, NDArray[np.floating[Any]]]: + """Perform PCA on sparse matrices using iterative SVD. + + Parameters + ---------- + X : sparse.spmatrix + Input sparse matrix. 
+ npcs : int + Number of principal components. + solver : str, optional + SVD solver to use. Default is 'arpack'. + mu : NDArray | None, optional + Pre-computed mean. If None, computed from X. + seed : int, optional + Random seed. Default is 0. + mu_axis : int, optional + Axis along which mean was computed. Default is 0. + + Returns + ------- + dict + Dictionary with keys 'X_pca', 'variance', 'variance_ratio', 'components'. + """ + random_state = check_random_state(seed) + np.random.set_state(random_state.get_state()) + random_init = np.random.rand(np.min(X.shape)) + X = check_array(X, accept_sparse=["csr", "csc"]) + + if mu is None: + if mu_axis == 0: + mu = np.asarray(X.mean(0)).flatten()[None, :] + else: + mu = np.asarray(X.mean(1)).flatten()[:, None] + + if mu_axis == 0: + mdot = mu.dot + mmat = mdot + mhdot = mu.T.dot + mhmat = mu.T.dot + Xdot = X.dot + Xmat = Xdot + XHdot = X.T.conj().dot + XHmat = XHdot + ones = np.ones(X.shape[0])[None, :].dot + + def matvec(x: NDArray[Any]) -> NDArray[Any]: + return Xdot(x) - mdot(x) + + def matmat(x: NDArray[Any]) -> NDArray[Any]: + return Xmat(x) - mmat(x) + + def rmatvec(x: NDArray[Any]) -> NDArray[Any]: + return XHdot(x) - mhdot(ones(x)) + + def rmatmat(x: NDArray[Any]) -> NDArray[Any]: + return XHmat(x) - mhmat(ones(x)) + + else: + mdot = mu.dot + mmat = mdot + mhdot = mu.T.dot + mhmat = mu.T.dot + Xdot = X.dot + Xmat = Xdot + XHdot = X.T.conj().dot + XHmat = XHdot + ones = np.ones(X.shape[1])[None, :].dot + + def matvec(x: NDArray[Any]) -> NDArray[Any]: + return Xdot(x) - mdot(ones(x)) + + def matmat(x: NDArray[Any]) -> NDArray[Any]: + return Xmat(x) - mmat(ones(x)) + + def rmatvec(x: NDArray[Any]) -> NDArray[Any]: + return XHdot(x) - mhdot(x) + + def rmatmat(x: NDArray[Any]) -> NDArray[Any]: + return XHmat(x) - mhmat(x) + + XL = sp.sparse.linalg.LinearOperator( + matvec=matvec, + dtype=X.dtype, + matmat=matmat, + shape=X.shape, + rmatvec=rmatvec, + rmatmat=rmatmat, + ) + + u, s, v = sp.sparse.linalg.svds(XL, 
def nearest_neighbors_wrapper(
    X: NDArray[np.floating[Any]],
    n_neighbors: int = 15,
    metric: str = "correlation",
    metric_kwds: dict[str, Any] | None = None,
    angular: bool = True,
    random_state: int = 0,
) -> tuple[NDArray[np.int64], NDArray[np.floating[Any]]]:
    """Thin wrapper around UMAP's ``nearest_neighbors``.

    Parameters
    ----------
    X : NDArray
        Input data matrix.
    n_neighbors : int, optional
        Number of neighbors. Default is 15.
    metric : str, optional
        Distance metric. Default is 'correlation'.
    metric_kwds : dict | None, optional
        Additional metric arguments.
    angular : bool, optional
        Use angular distance. Default is True.
    random_state : int, optional
        Random seed. Default is 0.

    Returns
    -------
    tuple
        (indices, distances) arrays.
    """
    kwds = {} if metric_kwds is None else metric_kwds
    rng = np.random.RandomState(random_state)
    # UMAP returns (indices, distances, rp-forest); callers only need two.
    return nearest_neighbors(X, n_neighbors, metric, kwds, angular, rng)[:2]


def knndist(
    nnma: sparse.spmatrix,
) -> tuple[NDArray[np.int64], NDArray[np.floating[Any]]]:
    """Extract k-nearest-neighbor indices and distances from a sparse matrix.

    Parameters
    ----------
    nnma : sparse.spmatrix
        Sparse nearest-neighbor matrix with the same number of stored
        entries in every row.

    Returns
    -------
    tuple
        (indices, distances) arrays, one row per sample.
    """
    _, cols = nnma.nonzero()
    # Every row is assumed to hold the same neighbor count as row 0.
    per_row = nnma[0, :].data.size
    knn = cols.reshape((nnma.shape[0], per_row))
    val = nnma.data.reshape(knn.shape)
    return knn, val


def save_figures(
    filename: str,
    fig_IDs: int | list[int] | None = None,
    **kwargs: Any,
) -> None:
    """Save matplotlib figures to file.

    Parameters
    ----------
    filename : str
        Output filename.
    fig_IDs : int | list[int] | None, optional
        Figure IDs to save. A single int saves that figure as PNG; a list
        (or None, meaning all open figures) bundles everything into a PDF.
    **kwargs
        Additional arguments passed to matplotlib's ``savefig``.
    """
    import matplotlib.pyplot as plt

    single_png = fig_IDs is not None and not isinstance(fig_IDs, list)
    if single_png:
        plt.figure(fig_IDs).savefig(filename, **kwargs)
        return

    from matplotlib.backends.backend_pdf import PdfPages

    # Force a .pdf extension, replacing any existing one.
    parts = filename.split(".")
    filename = filename + ".pdf" if len(parts) == 1 else ".".join(parts[:-1]) + ".pdf"

    pdf = PdfPages(filename)
    ids = plt.get_fignums() if fig_IDs is None else fig_IDs
    for fig in (plt.figure(n) for n in ids):
        fig.savefig(pdf, format="pdf", **kwargs)
    pdf.close()
def weighted_PCA(
    mat: NDArray[np.floating[Any]],
    do_weight: bool = True,
    npcs: int | None = None,
    solver: str = "auto",
    seed: int = 0,
) -> tuple[NDArray[np.floating[Any]], PCA]:
    """Perform PCA with optional eigenvalue weighting.

    Parameters
    ----------
    mat : NDArray
        Input data matrix.
    do_weight : bool, optional
        If True, weight PCs by eigenvalues. Default is True.
    npcs : int | None, optional
        Number of components. If None, uses min(mat.shape).
    solver : str, optional
        SVD solver. Default is 'auto'.
    seed : int, optional
        Random seed. Default is 0.

    Returns
    -------
    tuple
        (reduced_weighted, pca_object)
    """
    if not do_weight:
        pca = PCA(n_components=npcs, svd_solver=solver, random_state=check_random_state(seed))
        reduced = pca.fit_transform(mat)
        if reduced.shape[1] == 1:
            # A single component breaks downstream 2-D assumptions; redo with 2.
            pca = PCA(n_components=2, svd_solver=solver, random_state=check_random_state(seed))
            reduced = pca.fit_transform(mat)
        return reduced, pca

    if min(mat.shape) >= 10000 and npcs is None:
        logger.warning(
            "More than 10,000 cells. Running with 'npcs' set to < 1000 is recommended."
        )

    n_comp = min(mat.shape) if npcs is None else min(min(mat.shape), npcs)
    pca = PCA(svd_solver=solver, n_components=n_comp, random_state=check_random_state(seed))
    reduced = pca.fit_transform(mat)
    # Scale each PC by the square root of its max-normalized eigenvalue.
    scaled = pca.explained_variance_ / pca.explained_variance_.max()
    return reduced * scaled[None, :] ** 0.5, pca


def transform_wPCA(mat: NDArray[np.floating[Any]], pca: PCA) -> NDArray[np.floating[Any]]:
    """Transform data using a fitted weighted PCA model.

    Parameters
    ----------
    mat : NDArray
        Input data matrix.
    pca : PCA
        Fitted PCA object.

    Returns
    -------
    NDArray
        Transformed and eigenvalue-weighted data.
    """
    centered = mat - pca.mean_
    projected = centered.dot(pca.components_.T)
    scaled = pca.explained_variance_ / pca.explained_variance_.max()
    return np.array(projected) * scaled[None, :] ** 0.5


def search_string(
    vec: NDArray[Any] | list[str],
    s: str | list[str],
    case_sensitive: bool = False,
    invert: bool = False,
) -> tuple[NDArray[Any] | int, NDArray[np.int64] | int]:
    """Search for strings matching (or not matching) substring patterns.

    Parameters
    ----------
    vec : NDArray | list
        Array of strings to search.
    s : str | list
        Substring pattern(s) to search for.
    case_sensitive : bool, optional
        Whether the search is case-sensitive. Default is False.
    invert : bool, optional
        If True, return non-matching strings. Default is False.

    Returns
    -------
    tuple
        (matching_strings, indices) or (-1, -1) if no matches.
    """
    vec = np.array(vec)
    patterns = s if isinstance(s, list) else [s]

    hit_strs: list[NDArray[Any]] = []
    hit_idx: list[NDArray[np.int64]] = []
    for pat in patterns:
        if not case_sensitive:
            pat = pat.lower()
        found = []
        for i in range(len(vec)):
            entry = vec[i] if case_sensitive else vec[i].lower()
            # "hit" means substring present XOR inverted search.
            if (entry.find(pat) != -1) != invert:
                found.append(i)
        if found:
            hit_strs.append(vec[np.array(found)])
            hit_idx.append(np.array(found))

    if not hit_strs:
        return -1, -1

    if not invert:
        V_arr = np.concatenate(hit_strs)
        M_arr = np.concatenate(hit_idx)
        if len(hit_strs) > 1:
            # Deduplicate across patterns, keeping first-occurrence order.
            keep = np.sort(np.unique(M_arr, return_index=True)[1])
            V_arr = V_arr[keep]
            M_arr = M_arr[keep]
        return V_arr, M_arr

    # Inverted search keeps only strings missing *every* pattern.
    for j in range(len(hit_strs)):
        hit_strs[j] = list(set(hit_strs[j]).intersection(*hit_strs))
    V_arr = vec[np.isin(vec, np.unique(np.concatenate(hit_strs)))]
    M_arr = np.array([np.where(vec == x)[0][0] for x in V_arr])
    return V_arr, M_arr


def distance_matrix_error(
    dist1: NDArray[np.floating[Any]], dist2: NDArray[np.floating[Any]]
) -> float:
    """Compute a correlation-based error between two distance matrices.

    Parameters
    ----------
    dist1 : NDArray
        First distance matrix.
    dist2 : NDArray
        Second distance matrix.

    Returns
    -------
    float
        1 minus the mean per-row correlation (0 for identical matrices).
    """
    n = dist1.shape[0]
    total = sum(np.corrcoef(dist1[k, :], dist2[k, :])[0, 1] for k in range(n))
    return 1 - total / n
+ """ + a = (A**2).sum(1).flatten() + b = (B**2).sum(1).flatten() + x = a[:, None] + b[None, :] - 2 * np.dot(A, B.T) + x[x < 0] = 0 + return np.sqrt(x) + + +def generate_correlation_map( + x: NDArray[np.floating[Any]], y: NDArray[np.floating[Any]] +) -> NDArray[np.floating[Any]]: + """Compute pairwise correlations between two sets of vectors. + + Parameters + ---------- + x : NDArray + First set of vectors (n x d). + y : NDArray + Second set of vectors (m x d). + + Returns + ------- + NDArray + Correlation matrix (n x m). + """ + mu_x = x.mean(1) + mu_y = y.mean(1) + n = x.shape[1] + if n != y.shape[1]: + raise ValueError("x and y must have the same number of timepoints.") + s_x = x.std(1, ddof=n - 1) + s_y = y.std(1, ddof=n - 1) + s_x[s_x == 0] = 1 + s_y[s_y == 0] = 1 + cov = np.dot(x, y.T) - n * np.dot(mu_x[:, None], mu_y[None, :]) + return cov / np.dot(s_x[:, None], s_y[None, :]) + + +def extract_annotation( + cn: NDArray[Any], + x: int | None, + c: str = "_", +) -> NDArray[Any] | list[NDArray[Any]]: + """Extract annotations from cell names by splitting on delimiter. + + Parameters + ---------- + cn : NDArray + Array of cell names. + x : int | None + Index of annotation field to extract. If None, returns all fields. + c : str, optional + Delimiter character. Default is '_'. + + Returns + ------- + NDArray | list + Extracted annotations. 
+ """ + m = [] + if x is not None: + for i in range(cn.size): + f = cn[i].split(c) + x = min(len(f) - 1, x) + m.append(f[x]) + return np.array(m) + else: + ms: list[list[str]] = [] + ls = [] + for i in range(cn.size): + f = cn[i].split(c) + m_inner = [] + for field_x in range(len(f)): + m_inner.append(f[field_x]) + ms.append(m_inner) + ls.append(len(m_inner)) + ml = max(ls) + for i in range(len(ms)): + ms[i].extend([""] * (ml - len(ms[i]))) + if ml - len(ms[i]) > 0: + ms[i] = list(np.concatenate(ms[i])) + ms_arr = np.vstack(ms) + MS = [] + for i in range(ms_arr.shape[1]): + MS.append(ms_arr[:, i]) + return MS + + +def isolate( + dt: NDArray[np.floating[Any]], x1: float, x2: float, y1: float, y2: float +) -> NDArray[np.int64]: + """Get indices of points within a rectangular region. + + Parameters + ---------- + dt : NDArray + 2D coordinates (n x 2). + x1, x2 : float + X-axis bounds. + y1, y2 : float + Y-axis bounds. + + Returns + ------- + NDArray + Indices of points within the region. + """ + return np.where( + np.logical_and( + np.logical_and(dt[:, 0] > x1, dt[:, 0] < x2), + np.logical_and(dt[:, 1] > y1, dt[:, 1] < y2), + ) + )[0] + + +def to_lower(y: NDArray[Any]) -> NDArray[Any]: + """Convert string array to lowercase. + + Parameters + ---------- + y : NDArray + Array of strings. + + Returns + ------- + NDArray + Lowercase strings. + """ + x = y.copy().flatten() + for i in range(x.size): + x[i] = x[i].lower() + return x + + +def to_upper(y: NDArray[Any]) -> NDArray[Any]: + """Convert string array to uppercase. + + Parameters + ---------- + y : NDArray + Array of strings. + + Returns + ------- + NDArray + Uppercase strings. + """ + x = y.copy().flatten() + for i in range(x.size): + x[i] = x[i].upper() + return x + + +def create_folder(path: str) -> None: + """Create a directory if it doesn't exist. + + Parameters + ---------- + path : str + Directory path to create. 
+ """ + try: + os.makedirs(path) + except OSError as exception: + if exception.errno != errno.EEXIST: + raise + + +def convert_annotations(A: NDArray[Any]) -> NDArray[np.int64]: + """Convert categorical annotations to integer codes. + + Parameters + ---------- + A : NDArray + Array of categorical values. + + Returns + ------- + NDArray + Integer codes. + """ + x = np.unique(A) + y = np.zeros(A.size) + z = 0 + for i in x: + y[i == A] = z + z += 1 + + return y.astype("int") + + +def nearest_neighbors_hnsw( + x: NDArray[np.floating[Any]], + ef: int = 200, + M: int = 48, + n_neighbors: int = 100, +) -> tuple[NDArray[np.int64], NDArray[np.floating[Any]]]: + """Compute approximate nearest neighbors using HNSW algorithm. + + Parameters + ---------- + x : NDArray + Input data matrix. + ef : int, optional + HNSW ef parameter. Default is 200. + M : int, optional + HNSW M parameter. Default is 48. + n_neighbors : int, optional + Number of neighbors. Default is 100. + + Returns + ------- + tuple + (indices, distances) arrays. + """ + import hnswlib + + labels = np.arange(x.shape[0]) + p = hnswlib.Index(space="cosine", dim=x.shape[1]) + p.init_index(max_elements=x.shape[0], ef_construction=ef, M=M) + p.add_items(x, labels) + p.set_ef(ef) + idx, dist = p.knn_query(x, k=n_neighbors) + return idx, dist + + +def calc_nnm( + g_weighted: NDArray[np.floating[Any]], + k: int, + distance: str | None = None, +) -> sparse.csr_matrix: + """Calculate k-nearest neighbor matrix. + + Parameters + ---------- + g_weighted : NDArray + Input coordinates. + k : int + Number of neighbors. + distance : str | None, optional + Distance metric. If 'cosine', uses HNSW. + + Returns + ------- + sparse.csr_matrix + Sparse k-NN matrix with distances. 
+ """ + if g_weighted.shape[0] > 0: + if distance == "cosine": + nnm, dists = nearest_neighbors_hnsw(g_weighted, n_neighbors=k) + else: + nnm, dists = nearest_neighbors_wrapper(g_weighted, n_neighbors=k, metric=distance) + EDM = gen_sparse_knn(nnm, dists) + EDM = EDM.tocsr() + return EDM + + +def compute_distances(A: NDArray[np.floating[Any]], dm: str) -> NDArray[np.floating[Any]]: + """Compute pairwise distance matrix. + + Parameters + ---------- + A : NDArray + Input data matrix. + dm : str + Distance metric ('euclidean', 'correlation', or scipy metric). + + Returns + ------- + NDArray + Square distance matrix. + """ + if dm == "euclidean": + m = np.dot(A, A.T) + h = np.diag(m) + x = h[:, None] + h[None, :] - 2 * m + x[x < 0] = 0 + dist = np.sqrt(x) + elif dm == "correlation": + dist = 1 - np.corrcoef(A) + else: + dist = sp.spatial.distance.squareform(sp.spatial.distance.pdist(A, metric=dm)) + return dist + + +def dist_to_nn(d: NDArray[np.floating[Any]], K: int) -> NDArray[np.floating[Any]]: + """Convert distance matrix to binary k-NN adjacency matrix. + + Parameters + ---------- + d : NDArray + Square distance matrix. + K : int + Number of neighbors. + + Returns + ------- + NDArray + Binary adjacency matrix. + """ + E = d.copy() + np.fill_diagonal(E, -1) + M = np.max(E) * 2 + x = np.argsort(E, axis=1)[:, :K] + E[ + np.tile(np.arange(E.shape[0]).reshape(E.shape[0], -1), (1, x.shape[1])).flatten(), + x.flatten(), + ] = M + + E[E < M] = 0 + E[E > 0] = 1 + return E + + +def to_sparse_knn(D1: sparse.csr_matrix, k: int) -> sparse.csr_matrix: + """Sparsify matrix to keep only k nearest neighbors per row. + + Parameters + ---------- + D1 : sparse.csr_matrix + Input sparse matrix. + k : int + Number of neighbors to keep. + + Returns + ------- + sparse.csr_matrix + Sparsified matrix. 
+ """ + for i in range(D1.shape[0]): + x = D1.data[D1.indptr[i] : D1.indptr[i + 1]] + idx = np.argsort(x) + if idx.size > k: + x[idx[:-k]] = 0 + D1.data[D1.indptr[i] : D1.indptr[i + 1]] = x + D1.eliminate_zeros() + return D1 + + +def gen_sparse_knn( + knni: NDArray[np.int64], + knnd: NDArray[np.floating[Any]], + shape: tuple[int, int] | None = None, +) -> sparse.csr_matrix: + """Generate sparse k-NN matrix from indices and distances. + + Parameters + ---------- + knni : NDArray + k-NN indices (n x k). + knnd : NDArray + k-NN distances (n x k). + shape : tuple | None, optional + Output shape. If None, uses (n, n). + + Returns + ------- + sparse.csr_matrix + Sparse k-NN matrix. + """ + if shape is None: + shape = (knni.shape[0], knni.shape[0]) + + D1 = sp.sparse.lil_matrix(shape) + + D1[ + np.tile(np.arange(knni.shape[0])[:, None], (1, knni.shape[1])).flatten().astype("int32"), + knni.flatten().astype("int32"), + ] = knnd.flatten() + D1 = D1.tocsr() + return D1 diff --git a/test/test_sam.py b/test/test_sam.py deleted file mode 100644 index b2f2c3a..0000000 --- a/test/test_sam.py +++ /dev/null @@ -1,25 +0,0 @@ -#!/usr/bin/env python3 -# -*- coding: utf-8 -*- -""" -author: Alexander Tarashansky -date: 10/01/2018 -content: Tests for SAM. 
-""" -# Modules -from samalg import SAM - - -# Script -if __name__ == "__main__": - - sam = SAM() - sam.load_data("example_data/darmanis_data.csv.gz") - sam.load_obs_annotations("example_data/darmanis_ann.csv") - sam.preprocess_data() - sam.run(projection=None) - sam.run_umap() - sam.run_tsne() - sam.kmeans_clustering(4) - sam.identify_marker_genes_ratio() - sam.identify_marker_genes_rf() - umap_obj = sam.umap_obj # checking to see if umap_obj is stored in sam object diff --git a/tests/__init__.py b/tests/__init__.py new file mode 100644 index 0000000..91207cb --- /dev/null +++ b/tests/__init__.py @@ -0,0 +1 @@ +"""Tests for the samalg package.""" diff --git a/tests/conftest.py b/tests/conftest.py new file mode 100644 index 0000000..dd4169d --- /dev/null +++ b/tests/conftest.py @@ -0,0 +1,165 @@ +"""Pytest fixtures for samalg tests.""" + +from __future__ import annotations + +import numpy as np +import pytest +import scipy.sparse as sp +from anndata import AnnData + + +@pytest.fixture +def random_seed() -> int: + """Provide a consistent random seed for reproducibility.""" + return 42 + + +@pytest.fixture +def small_expression_matrix(random_seed: int) -> tuple[sp.csr_matrix, np.ndarray, np.ndarray]: + """Create a small test expression matrix. 
+ + Returns + ------- + tuple + (sparse_matrix, gene_names, cell_names) + """ + np.random.seed(random_seed) + n_cells = 100 + n_genes = 200 + + # Create sparse count matrix with realistic properties + # Most entries are 0, some genes highly expressed + data = np.random.negative_binomial(n=5, p=0.3, size=(n_cells, n_genes)).astype(np.float32) + # Make it sparse (set low values to 0) + data[data < 2] = 0 + sparse_data = sp.csr_matrix(data) + + gene_names = np.array([f"Gene_{i}" for i in range(n_genes)]) + cell_names = np.array([f"Cell_{i}" for i in range(n_cells)]) + + return sparse_data, gene_names, cell_names + + +@pytest.fixture +def medium_expression_matrix(random_seed: int) -> tuple[sp.csr_matrix, np.ndarray, np.ndarray]: + """Create a medium-sized test expression matrix. + + Returns + ------- + tuple + (sparse_matrix, gene_names, cell_names) + """ + np.random.seed(random_seed) + n_cells = 500 + n_genes = 1000 + + # Create sparse count matrix with more realistic expression patterns + # Higher n parameter to ensure more genes are expressed + data = np.random.negative_binomial(n=10, p=0.5, size=(n_cells, n_genes)).astype(np.float32) + # Less aggressive sparsification to keep more genes + sparse_data = sp.csr_matrix(data) + + gene_names = np.array([f"Gene_{i}" for i in range(n_genes)]) + cell_names = np.array([f"Cell_{i}" for i in range(n_cells)]) + + return sparse_data, gene_names, cell_names + + +@pytest.fixture +def simple_adata(small_expression_matrix: tuple) -> AnnData: + """Create a simple AnnData object for testing. + + Parameters + ---------- + small_expression_matrix : tuple + Fixture providing expression matrix data. + + Returns + ------- + AnnData + Annotated data object. 
+ """ + X, gene_names, cell_names = small_expression_matrix + adata = AnnData(X=X) + adata.var_names = gene_names + adata.obs_names = cell_names + return adata + + +@pytest.fixture +def adata_with_clusters(simple_adata: AnnData, random_seed: int) -> AnnData: + """Create an AnnData object with cluster annotations. + + Parameters + ---------- + simple_adata : AnnData + Fixture providing basic AnnData. + random_seed : int + Random seed for reproducibility. + + Returns + ------- + AnnData + AnnData with cluster labels. + """ + np.random.seed(random_seed) + n_cells = simple_adata.n_obs + n_clusters = 4 + + # Assign random cluster labels + clusters = np.random.choice([f"Cluster_{i}" for i in range(n_clusters)], size=n_cells) + simple_adata.obs["leiden_clusters"] = clusters + + return simple_adata + + +@pytest.fixture +def dense_matrix(random_seed: int) -> np.ndarray: + """Create a dense matrix for PCA testing. + + Parameters + ---------- + random_seed : int + Random seed for reproducibility. + + Returns + ------- + np.ndarray + Dense matrix with structure. + """ + np.random.seed(random_seed) + n_samples = 100 + n_features = 50 + + # Create matrix with some structure (for PCA to find) + # 3 underlying components + components = np.random.randn(3, n_features) + loadings = np.random.randn(n_samples, 3) + noise = np.random.randn(n_samples, n_features) * 0.1 + + return loadings @ components + noise + + +@pytest.fixture +def knn_test_data(random_seed: int) -> np.ndarray: + """Create data for k-NN testing with clear clusters. + + Parameters + ---------- + random_seed : int + Random seed for reproducibility. + + Returns + ------- + np.ndarray + Data with 3 clear clusters. 
+ """ + np.random.seed(random_seed) + n_per_cluster = 30 + + # Create 3 well-separated clusters + cluster1 = np.random.randn(n_per_cluster, 10) + np.array([5, 0, 0, 0, 0, 0, 0, 0, 0, 0]) + cluster2 = np.random.randn(n_per_cluster, 10) + np.array([0, 5, 0, 0, 0, 0, 0, 0, 0, 0]) + cluster3 = np.random.randn(n_per_cluster, 10) + np.array([0, 0, 5, 0, 0, 0, 0, 0, 0, 0]) + + return np.vstack([cluster1, cluster2, cluster3]) diff --git a/tests/integration/__init__.py b/tests/integration/__init__.py new file mode 100644 index 0000000..6fe9c77 --- /dev/null +++ b/tests/integration/__init__.py @@ -0,0 +1 @@ +"""Integration tests for samalg.""" diff --git a/tests/integration/test_sam_workflow.py b/tests/integration/test_sam_workflow.py new file mode 100644 index 0000000..23ec400 --- /dev/null +++ b/tests/integration/test_sam_workflow.py @@ -0,0 +1,278 @@ +"""Integration tests for the SAM workflow.""" + +from __future__ import annotations + +import numpy as np +import pytest +import scipy.sparse as sp +from anndata import AnnData + +from samalg import SAM +from samalg.exceptions import DataNotLoadedError + + +class TestSAMInitialization: + """Tests for SAM object initialization.""" + + def test_init_empty(self) -> None: + """Test creating an empty SAM object.""" + sam = SAM() + + assert not hasattr(sam, "adata") + assert sam.run_args == {} + assert sam.preprocess_args == {} + + def test_init_with_tuple( + self, small_expression_matrix: tuple[sp.csr_matrix, np.ndarray, np.ndarray] + ) -> None: + """Test initialization with tuple of (data, genes, cells).""" + X, genes, cells = small_expression_matrix + sam = SAM(counts=(X, genes, cells)) + + assert hasattr(sam, "adata") + assert hasattr(sam, "adata_raw") + assert sam.adata.n_obs == len(cells) + assert sam.adata.n_vars == len(genes) + + def test_init_with_anndata(self, simple_adata: AnnData) -> None: + """Test initialization with AnnData object.""" + sam = SAM(counts=simple_adata) + + assert hasattr(sam, "adata") + assert 
sam.adata.n_obs == simple_adata.n_obs + assert sam.adata.n_vars == simple_adata.n_vars + + def test_init_inplace(self, simple_adata: AnnData) -> None: + """Test inplace initialization doesn't copy.""" + sam = SAM(counts=simple_adata, inplace=True) + + # Should be the same object + assert sam.adata_raw is simple_adata + + def test_init_not_inplace(self, simple_adata: AnnData) -> None: + """Test non-inplace initialization copies data.""" + sam = SAM(counts=simple_adata, inplace=False) + + # Should be a different object + assert sam.adata_raw is not simple_adata + + +class TestPreprocessing: + """Tests for data preprocessing.""" + + def test_preprocess_no_data(self) -> None: + """Test preprocessing without loaded data raises error.""" + sam = SAM() + + with pytest.raises(DataNotLoadedError): + sam.preprocess_data() + + def test_preprocess_default( + self, small_expression_matrix: tuple[sp.csr_matrix, np.ndarray, np.ndarray] + ) -> None: + """Test preprocessing with default parameters.""" + X, genes, cells = small_expression_matrix + sam = SAM(counts=(X, genes, cells)) + sam.preprocess_data() + + # Check preprocessing was stored + assert "preprocess_args" in sam.adata.uns + assert sam.preprocess_args["norm"] == "log" + assert sam.preprocess_args["sum_norm"] == "cell_median" + + def test_preprocess_custom_norm( + self, small_expression_matrix: tuple[sp.csr_matrix, np.ndarray, np.ndarray] + ) -> None: + """Test preprocessing with custom normalization.""" + X, genes, cells = small_expression_matrix + sam = SAM(counts=(X, genes, cells)) + sam.preprocess_data(norm="ftt", sum_norm=None) + + assert sam.preprocess_args["norm"] == "ftt" + assert sam.preprocess_args["sum_norm"] is None + + def test_preprocess_filters_genes( + self, small_expression_matrix: tuple[sp.csr_matrix, np.ndarray, np.ndarray] + ) -> None: + """Test that gene filtering works.""" + X, genes, cells = small_expression_matrix + sam = SAM(counts=(X, genes, cells)) + sam.preprocess_data(filter_genes=True, 
thresh_low=0.1, thresh_high=0.9) + + # Should have filtered some genes (mask_genes should have False values) + assert "mask_genes" in sam.adata.var + # Some genes should be filtered out + n_kept = sam.adata.var["mask_genes"].sum() + assert n_kept < len(genes) + + def test_preprocess_stores_layers( + self, small_expression_matrix: tuple[sp.csr_matrix, np.ndarray, np.ndarray] + ) -> None: + """Test that preprocessing creates X_disp layer.""" + X, genes, cells = small_expression_matrix + sam = SAM(counts=(X, genes, cells)) + sam.preprocess_data() + + assert "X_disp" in sam.adata.layers + + +class TestSAMRun: + """Tests for running the SAM algorithm.""" + + def test_run_basic( + self, medium_expression_matrix: tuple[sp.csr_matrix, np.ndarray, np.ndarray] + ) -> None: + """Test basic SAM run.""" + X, genes, cells = medium_expression_matrix + sam = SAM(counts=(X, genes, cells)) + sam.preprocess_data(filter_genes=False, min_expression=0) + sam.run(max_iter=2, projection=None, verbose=False) + + # Check outputs + assert "weights" in sam.adata.var + assert "X_pca" in sam.adata.obsm + assert "connectivities" in sam.adata.obsp + assert "ranked_genes" in sam.adata.uns + + def test_run_with_umap( + self, medium_expression_matrix: tuple[sp.csr_matrix, np.ndarray, np.ndarray] + ) -> None: + """Test SAM run with UMAP projection.""" + X, genes, cells = medium_expression_matrix + sam = SAM(counts=(X, genes, cells)) + sam.preprocess_data(filter_genes=False, min_expression=0) + sam.run(max_iter=2, projection="umap", verbose=False) + + assert "X_umap" in sam.adata.obsm + assert sam.adata.obsm["X_umap"].shape == (len(cells), 2) + + def test_run_stores_args( + self, medium_expression_matrix: tuple[sp.csr_matrix, np.ndarray, np.ndarray] + ) -> None: + """Test that run arguments are stored.""" + X, genes, cells = medium_expression_matrix + sam = SAM(counts=(X, genes, cells)) + sam.preprocess_data(filter_genes=False, min_expression=0) + sam.run(max_iter=3, k=15, npcs=50, projection=None, 
verbose=False) + + assert sam.run_args["max_iter"] == 3 + assert sam.run_args["k"] == 15 + assert sam.run_args["npcs"] == 50 + + def test_run_weight_modes( + self, medium_expression_matrix: tuple[sp.csr_matrix, np.ndarray, np.ndarray] + ) -> None: + """Test different weight modes.""" + X, genes, cells = medium_expression_matrix + + for mode in ["dispersion", "variance", "rms", "combined"]: + sam = SAM(counts=(X, genes, cells)) + sam.preprocess_data(filter_genes=False, min_expression=0) + sam.run(max_iter=2, weight_mode=mode, projection=None, verbose=False) + + assert "weights" in sam.adata.var + # Weights should be between 0 and 1 + weights = sam.adata.var["weights"].values + assert np.all(weights >= 0) and np.all(weights <= 1) + + +class TestClustering: + """Tests for clustering methods.""" + + def test_leiden_clustering( + self, medium_expression_matrix: tuple[sp.csr_matrix, np.ndarray, np.ndarray] + ) -> None: + """Test Leiden clustering.""" + pytest.importorskip("leidenalg") + pytest.importorskip("igraph") + + X, genes, cells = medium_expression_matrix + sam = SAM(counts=(X, genes, cells)) + sam.preprocess_data(filter_genes=False, min_expression=0) + sam.run(max_iter=2, projection=None, verbose=False) + sam.leiden_clustering(res=1.0) + + assert "leiden_clusters" in sam.adata.obs + + def test_kmeans_clustering( + self, medium_expression_matrix: tuple[sp.csr_matrix, np.ndarray, np.ndarray] + ) -> None: + """Test k-means clustering.""" + X, genes, cells = medium_expression_matrix + sam = SAM(counts=(X, genes, cells)) + sam.preprocess_data(filter_genes=False, min_expression=0) + sam.run(max_iter=2, projection=None, verbose=False) + cl, _km = sam.kmeans_clustering(numc=5) + + assert "kmeans_clusters" in sam.adata.obs + assert len(np.unique(cl)) == 5 + + +class TestDispersionRanking: + """Tests for dispersion ranking.""" + + def test_dispersion_ranking( + self, medium_expression_matrix: tuple[sp.csr_matrix, np.ndarray, np.ndarray] + ) -> None: + """Test dispersion 
ranking NN.""" + X, genes, cells = medium_expression_matrix + sam = SAM(counts=(X, genes, cells)) + sam.preprocess_data(filter_genes=False, min_expression=0) + sam.run(max_iter=2, projection=None, verbose=False) + + weights = sam.dispersion_ranking_NN(weight_mode="dispersion") + + assert len(weights) == sam.adata.n_vars + assert np.all(weights >= 0) and np.all(weights <= 1) + + def test_dispersion_saves_averages( + self, medium_expression_matrix: tuple[sp.csr_matrix, np.ndarray, np.ndarray] + ) -> None: + """Test that save_avgs creates X_knn_avg layer.""" + X, genes, cells = medium_expression_matrix + sam = SAM(counts=(X, genes, cells)) + sam.preprocess_data(filter_genes=False, min_expression=0) + sam.run(max_iter=2, projection=None, verbose=False) + + sam.dispersion_ranking_NN(save_avgs=True) + + assert "X_knn_avg" in sam.adata.layers + + +class TestHelperMethods: + """Tests for helper methods.""" + + def test_get_labels(self, adata_with_clusters: AnnData) -> None: + """Test get_labels method.""" + sam = SAM(counts=adata_with_clusters) + + labels = sam.get_labels("leiden_clusters") + + assert len(labels) == sam.adata.n_obs + + def test_get_labels_un(self, adata_with_clusters: AnnData) -> None: + """Test get_labels_un returns unique labels.""" + sam = SAM(counts=adata_with_clusters) + + unique_labels = sam.get_labels_un("leiden_clusters") + + assert len(unique_labels) == 4 # We created 4 clusters + + def test_get_cells(self, adata_with_clusters: AnnData) -> None: + """Test get_cells retrieves cells with specific label.""" + sam = SAM(counts=adata_with_clusters) + + cells = sam.get_cells("Cluster_0", "leiden_clusters") + + # All returned cells should have the right label + for cell in cells: + idx = np.where(sam.adata.obs_names == cell)[0][0] + assert sam.adata.obs["leiden_clusters"].iloc[idx] == "Cluster_0" + + def test_get_labels_missing_key(self, simple_adata: AnnData) -> None: + """Test get_labels with missing key returns empty array.""" + sam = 
SAM(counts=simple_adata) + + labels = sam.get_labels("nonexistent_key") + + assert len(labels) == 0 diff --git a/tests/unit/__init__.py b/tests/unit/__init__.py new file mode 100644 index 0000000..7cc8f76 --- /dev/null +++ b/tests/unit/__init__.py @@ -0,0 +1 @@ +"""Unit tests for samalg.""" diff --git a/tests/unit/test_utilities.py b/tests/unit/test_utilities.py new file mode 100644 index 0000000..f45cb05 --- /dev/null +++ b/tests/unit/test_utilities.py @@ -0,0 +1,246 @@ +"""Unit tests for samalg.utilities module.""" + +from __future__ import annotations + +import numpy as np +import scipy.sparse as sp + +from samalg import utilities as ut + + +class TestWeightedPCA: + """Tests for weighted_PCA function.""" + + def test_basic_pca(self, dense_matrix: np.ndarray) -> None: + """Test basic PCA functionality.""" + reduced, pca = ut.weighted_PCA(dense_matrix, do_weight=False, npcs=10) + + assert reduced.shape[0] == dense_matrix.shape[0] + assert reduced.shape[1] == 10 + assert hasattr(pca, "components_") + assert pca.n_components_ == 10 + + def test_weighted_pca(self, dense_matrix: np.ndarray) -> None: + """Test weighted PCA with eigenvalue scaling.""" + reduced, pca = ut.weighted_PCA(dense_matrix, do_weight=True, npcs=10) + + assert reduced.shape[0] == dense_matrix.shape[0] + assert reduced.shape[1] == 10 + # Weighted variance should be normalized + eigenvalues = pca.explained_variance_ + assert eigenvalues[0] >= eigenvalues[-1] + + def test_pca_npcs_limit(self, dense_matrix: np.ndarray) -> None: + """Test that npcs is correctly bounded.""" + max_npcs = min(dense_matrix.shape) + reduced, _pca = ut.weighted_PCA(dense_matrix, npcs=max_npcs + 100) + + assert reduced.shape[1] <= max_npcs + + def test_pca_reproducibility(self, dense_matrix: np.ndarray) -> None: + """Test PCA is reproducible with same seed.""" + reduced1, _ = ut.weighted_PCA(dense_matrix, npcs=5, seed=42) + reduced2, _ = ut.weighted_PCA(dense_matrix, npcs=5, seed=42) + + 
np.testing.assert_array_almost_equal(np.abs(reduced1), np.abs(reduced2)) + + +class TestSparsePCA: + """Tests for sparse PCA function.""" + + def test_sparse_pca_basic(self, random_seed: int) -> None: + """Test sparse PCA on sparse matrix.""" + np.random.seed(random_seed) + n, m = 100, 50 + data = np.random.randn(n, m) + data[data < 0.5] = 0 + sparse_data = sp.csr_matrix(data) + + result = ut._pca_with_sparse(sparse_data, npcs=10) + + assert "X_pca" in result + assert "variance" in result + assert "components" in result + assert result["X_pca"].shape == (n, 10) + assert result["components"].shape == (10, m) + + def test_sparse_pca_variance_ordering(self, random_seed: int) -> None: + """Test that variance is in descending order.""" + np.random.seed(random_seed) + data = np.random.randn(100, 50) + data[data < 0.5] = 0 + sparse_data = sp.csr_matrix(data) + + result = ut._pca_with_sparse(sparse_data, npcs=10) + variance = result["variance"] + + # Variance should be in descending order + assert np.all(variance[:-1] >= variance[1:]) + + +class TestNearestNeighbors: + """Tests for nearest neighbor functions.""" + + def test_calc_nnm_basic(self, knn_test_data: np.ndarray) -> None: + """Test basic k-NN computation.""" + nnm = ut.calc_nnm(knn_test_data, k=5, distance="euclidean") + + assert sp.issparse(nnm) + assert nnm.shape[0] == nnm.shape[1] == knn_test_data.shape[0] + + def test_calc_nnm_cosine(self, knn_test_data: np.ndarray) -> None: + """Test k-NN with cosine distance (HNSW).""" + nnm = ut.calc_nnm(knn_test_data, k=5, distance="cosine") + + assert sp.issparse(nnm) + assert nnm.shape[0] == knn_test_data.shape[0] + + def test_gen_sparse_knn(self) -> None: + """Test sparse k-NN matrix generation.""" + n_samples, k = 10, 3 + indices = np.tile(np.arange(k)[None, :], (n_samples, 1)) + distances = np.random.rand(n_samples, k) + + sparse_nnm = ut.gen_sparse_knn(indices, distances) + + assert sp.issparse(sparse_nnm) + assert sparse_nnm.shape == (n_samples, n_samples) + + 
class TestDistanceFunctions:
    """Tests for distance computation functions."""

    def test_euclidean_map(self) -> None:
        """Test Euclidean distance computation."""
        A = np.array([[0, 0], [1, 0], [0, 1]])
        B = np.array([[0, 0], [2, 0]])

        dist = ut.generate_euclidean_map(A, B)

        assert dist.shape == (3, 2)
        np.testing.assert_almost_equal(dist[0, 0], 0.0)  # A[0] to B[0]
        np.testing.assert_almost_equal(dist[1, 0], 1.0)  # A[1] to B[0]
        np.testing.assert_almost_equal(dist[0, 1], 2.0)  # A[0] to B[1]

    def test_correlation_map(self) -> None:
        """Test correlation computation."""
        x = np.array([[1, 2, 3], [4, 5, 6]])
        y = np.array([[1, 2, 3], [-1, -2, -3]])

        corr = ut.generate_correlation_map(x, y)

        assert corr.shape == (2, 2)
        # Perfect positive correlation
        np.testing.assert_almost_equal(corr[0, 0], 1.0, decimal=5)
        # Perfect negative correlation
        np.testing.assert_almost_equal(corr[0, 1], -1.0, decimal=5)


class TestSearchString:
    """Tests for string search function."""

    def test_basic_search(self) -> None:
        """Test basic string search."""
        vec = np.array(["Gene_A", "Gene_B", "Other_C", "gene_d"])

        matches, indices = ut.search_string(vec, "Gene")

        assert len(matches) == 3  # Gene_A, Gene_B, gene_d (case insensitive)
        assert len(indices) == 3

    def test_case_sensitive_search(self) -> None:
        """Test case-sensitive search."""
        vec = np.array(["Gene_A", "Gene_B", "gene_c"])

        matches, _indices = ut.search_string(vec, "Gene", case_sensitive=True)

        assert len(matches) == 2  # Only Gene_A, Gene_B
        assert "gene_c" not in matches

    def test_inverted_search(self) -> None:
        """Test inverted (non-matching) search."""
        vec = np.array(["Gene_A", "Gene_B", "Other_C"])

        matches, _indices = ut.search_string(vec, "Gene", invert=True)

        assert len(matches) == 1
        assert matches[0] == "Other_C"

    def test_no_matches(self) -> None:
        """Test search with no matches."""
        vec = np.array(["Gene_A", "Gene_B"])

        matches, indices = ut.search_string(vec, "NotFound")

        # search_string signals "no match" with -1 sentinels rather than
        # empty arrays
        assert matches == -1
        assert indices == -1


class TestConvertAnnotations:
    """Tests for annotation conversion."""

    def test_basic_conversion(self) -> None:
        """Test converting string labels to integers."""
        labels = np.array(["A", "B", "A", "C", "B"])

        codes = ut.convert_annotations(labels)

        # NOTE(review): exact int64 dtype is platform-dependent for some
        # integer-producing APIs (e.g. int32 on Windows) — confirm
        # convert_annotations pins int64 on all platforms
        assert codes.dtype == np.int64
        assert len(np.unique(codes)) == 3
        # Same labels should have same codes
        assert codes[0] == codes[2]  # Both "A"
        assert codes[1] == codes[4]  # Both "B"

    def test_numeric_labels(self) -> None:
        """Test with numeric labels."""
        labels = np.array([1, 2, 1, 3, 2])

        codes = ut.convert_annotations(labels)

        assert codes.dtype == np.int64
        assert len(np.unique(codes)) == 3


class TestDistToNN:
    """Tests for distance to nearest neighbor conversion."""

    def test_basic_dist_to_nn(self) -> None:
        """Test basic distance matrix to k-NN conversion."""
        # Simple 4x4 distance matrix
        dist = np.array([[0, 1, 2, 3], [1, 0, 1.5, 2.5], [2, 1.5, 0, 1], [3, 2.5, 1, 0]])

        nn = ut.dist_to_nn(dist, K=2)

        assert nn.shape == dist.shape
        # Each row should have K neighbors (value 1)
        assert np.sum(nn[0]) == 2
        assert np.sum(nn[1]) == 2


class TestHelperFunctions:
    """Tests for miscellaneous helper functions."""

    def test_to_lower(self) -> None:
        """Test lowercase conversion."""
        arr = np.array(["GENE_A", "Gene_B", "gene_c"])
        result = ut.to_lower(arr)

        assert np.all(result == np.array(["gene_a", "gene_b", "gene_c"]))

    def test_to_upper(self) -> None:
        """Test uppercase conversion."""
        arr = np.array(["gene_a", "Gene_B", "GENE_C"])
        result = ut.to_upper(arr)

        assert np.all(result == np.array(["GENE_A", "GENE_B", "GENE_C"]))

    def test_isolate(self) -> None:
        """Test point isolation in rectangular region."""
        coords = np.array([[0, 0], [1, 1], [5, 5], [10, 10]])

        # Get points in region (0.5, 6) x (0.5, 6)
        idx = ut.isolate(coords, 0.5, 6, 0.5, 6)

        assert len(idx) == 2
        assert 1 in idx  # (1, 1)
        assert 2 in idx  # (5, 5)