diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml new file mode 100644 index 0000000..969cf3a --- /dev/null +++ b/.github/workflows/ci.yml @@ -0,0 +1,39 @@ +name: CI + +on: + push: + branches: [master, main] + pull_request: + branches: [master, main] + +jobs: + test: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + + - name: Set up Python + uses: actions/setup-python@v5 + with: + python-version: "3.12" + cache: "pip" + + - name: Install dependencies + run: | + python -m pip install --upgrade pip + pip install -e ".[all]" + + - name: Lint with ruff + run: | + ruff check src/ tests/ + ruff format --check src/ tests/ + + - name: Run tests + run: | + pytest --cov=samalg --cov-report=xml --cov-report=term + + - name: Upload coverage to Codecov + uses: codecov/codecov-action@v4 + with: + files: ./coverage.xml + fail_ci_if_error: false diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml new file mode 100644 index 0000000..f0ebd9d --- /dev/null +++ b/.github/workflows/release.yml @@ -0,0 +1,46 @@ +name: Release to PyPI + +on: + release: + types: [published] + +jobs: + build: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + + - name: Set up Python + uses: actions/setup-python@v5 + with: + python-version: "3.11" + + - name: Install build tools + run: | + python -m pip install --upgrade pip + pip install build + + - name: Build package + run: python -m build + + - name: Upload artifacts + uses: actions/upload-artifact@v4 + with: + name: dist + path: dist/ + + publish: + needs: build + runs-on: ubuntu-latest + environment: pypi + permissions: + id-token: write + steps: + - name: Download artifacts + uses: actions/download-artifact@v4 + with: + name: dist + path: dist/ + + - name: Publish to PyPI + uses: pypa/gh-action-pypi-publish@release/v1 diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..5a2a789 --- /dev/null +++ b/.gitignore @@ -0,0 +1,102 @@ +# Byte-compiled / optimized / DLL 
files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +*.egg-info/ +*.egg + +# PyInstaller +*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.nox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +*.py,cover +.hypothesis/ +.pytest_cache/ + +# Translations +*.mo +*.pot + +# Jupyter Notebook +.ipynb_checkpoints/ + +# IPython +profile_default/ +ipython_config.py + +# pyenv +.python-version + +# Environments +.env +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ +.dmypy.json +dmypy.json + +# ruff +.ruff_cache/ + +# IDE +.idea/ +.vscode/ +*.swp +*.swo +*~ + +# OS +.DS_Store +Thumbs.db + +# Project specific +*.h5 +*.hdf5 diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml new file mode 100644 index 0000000..27af5a4 --- /dev/null +++ b/.pre-commit-config.yaml @@ -0,0 +1,28 @@ +repos: + - repo: https://github.com/pre-commit/pre-commit-hooks + rev: v4.5.0 + hooks: + - id: trailing-whitespace + - id: end-of-file-fixer + - id: check-yaml + - id: check-added-large-files + args: ['--maxkb=1000'] + - id: check-toml + + - repo: https://github.com/astral-sh/ruff-pre-commit + rev: v0.2.0 + hooks: + - id: ruff + args: [--fix] + - id: ruff-format + + - repo: https://github.com/pre-commit/mirrors-mypy + rev: v1.8.0 + hooks: + - id: mypy + additional_dependencies: + - numpy>=1.26.0 + - pandas-stubs + - types-requests + args: [--ignore-missing-imports] + files: ^src/ diff --git a/.travis.yml b/.travis.yml deleted file mode 100644 index 5b7485d..0000000 --- a/.travis.yml +++ /dev/null @@ -1,30 +0,0 @@ -# Try container-based infrastucture -#sudo: false - -notifications: 
- email: false - -language: python - -matrix: - include: - - python: 3.7 - os: linux - dist: bionic - sudo: true - - python: 3.8 - os: linux - dist: bionic - sudo: true - -before_install: - - ./.travis_before_install.sh - -install: - - ./.travis_install.sh - -script: - - ./.travis_test.sh - -#after_success: -# - ./.travis_deploy.sh diff --git a/.travis_before_install.sh b/.travis_before_install.sh deleted file mode 100755 index d570dfb..0000000 --- a/.travis_before_install.sh +++ /dev/null @@ -1,14 +0,0 @@ -#!/bin/bash -if [ $TRAVIS_OS_NAME == 'linux' ]; then - echo "Installing deps for linux" - sudo apt-get update - sudo apt-get install libhdf5-dev - #sudo apt-get install libgcc-5-dev - - #sudo add-apt-repository ppa:nschloe/swig-backports -y - #sudo apt-get -qq update - #sudo apt-get install -y swig3.0 -else - echo "OS not recognized: $TRAVIS_OS_NAME" - exit 1 -fi diff --git a/.travis_install.sh b/.travis_install.sh deleted file mode 100755 index dc373d5..0000000 --- a/.travis_install.sh +++ /dev/null @@ -1,28 +0,0 @@ -#!/bin/bash -if [ $TRAVIS_OS_NAME == 'osx' ]; then - export PATH="$HOME/miniconda/bin:$PATH" - source activate - ## Somehow we need this to execute the setup.py at all... - #pip install numpy -fi - -python setup.py install -## setuptools < 18.0 has issues with Cython as a dependency -#pip install Cython -#if [ $? != 0 ]; then -# exit 1 -#fi - -# deps #FIXME: do better -#python setup.py install -#pip install numpy -#pip install scipy -#pip install pandas -#pip install scikit-learn -#pip install matplotlib - -# old setuptools also has a bug for extras, but it still compiles -#pip install -v '.' -#if [ $? 
!= 0 ]; then -# exit 1 -#fi diff --git a/.travis_test.sh b/.travis_test.sh deleted file mode 100755 index c6f7065..0000000 --- a/.travis_test.sh +++ /dev/null @@ -1,20 +0,0 @@ -#!/bin/bash -if [ "$TRAVIS_OS_NAME" == 'osx' ]; then - export PATH="$HOME/miniconda/bin:$PATH" - source $HOME/miniconda/bin/activate - PYTHON="$HOME/miniconda/bin/python$PYTHON_VERSION" -else - PYTHON=${PYTHON:-python} -fi - -echo "python: ${PYTHON}" - -echo 'Running tests...' - -echo 'SAM test...' -${PYTHON} "test/test_sam.py" -if [ $? != 0 ]; then - exit 1 -fi - -echo 'done!' diff --git a/Docker/Dockerfile b/Docker/Dockerfile deleted file mode 100644 index dd1d95d..0000000 --- a/Docker/Dockerfile +++ /dev/null @@ -1,49 +0,0 @@ -FROM debian:buster-slim -ARG PATH="/root/miniconda/bin:${PATH}" -ENV PATH="/root/miniconda/bin:${PATH}" - -RUN apt-get update \ - && apt-get install -y wget \ - && apt-get install -y g++ \ - && apt-get install -y build-essential \ - && apt-get install -y git \ - && apt-get install -y libcurl4-gnutls-dev libxml2-dev libssl-dev \ - && rm -rf /var/lib/apt/lists/* - -RUN wget -O /tmp/miniconda.sh \ - https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh \ - && mkdir /root/.conda \ - && bash /tmp/miniconda.sh -b -p /root/miniconda \ - && rm -f /tmp/miniconda.sh \ - && conda update -n base -c defaults conda \ - && conda install python=3.7 pip \ - && conda clean -afy - -RUN conda install -c plotly -c conda-forge \ - jupyter==1.0.0 \ - plotly==4.0.0 \ - colorlover==0.3.0 \ - ipyevents==0.8.1 \ - numpy==1.19.2 \ - scipy==1.5.2 \ - python-igraph==0.8.3 \ - leidenalg==0.8.3 \ - pandas==1.0.0 \ - umap-learn==0.4.6 \ - && conda clean -afy - -RUN git clone https://github.com/atarashansky/self-assembling-manifold.git && \ - /root/miniconda/bin/pip install self-assembling-manifold/. 
&& rm -rf ~/.cache -RUN /root/miniconda/bin/pip install hnswlib==0.4.0 && rm -rf ~/.cache - -RUN chmod ugo+rwx /root -RUN chmod ugo+rwx /tmp && mkdir /jupyter && mkdir /jupyter/notebooks -WORKDIR /jupyter/ - -ARG USER_ID -ARG GROUP_ID -RUN addgroup --gid $GROUP_ID user -RUN adduser --disabled-password --gecos '' --uid $USER_ID --gid $GROUP_ID user -USER user - -CMD jupyter notebook --port=$PORT --no-browser --ip=0.0.0.0 --allow-root --NotebookApp.password="" --NotebookApp.token="" diff --git a/Docker/build_image.sh b/Docker/build_image.sh deleted file mode 100644 index 325fc7e..0000000 --- a/Docker/build_image.sh +++ /dev/null @@ -1,3 +0,0 @@ -#!/bin/bash -read -e -p "Docker image name: " image -docker build . -f Dockerfile -t $image --build-arg USER_ID=$(id -u) --build-arg GROUP_ID=$(id -g) diff --git a/Docker/run_image.sh b/Docker/run_image.sh deleted file mode 100644 index 72c0fe7..0000000 --- a/Docker/run_image.sh +++ /dev/null @@ -1,13 +0,0 @@ -#!/bin/bash -read -e -p "Docker image name: " image -read -e -p "Docker container name: " name -read -e -p "Volume name: " folder -read -e -p "Jupyter notebook port: " port - -path="${folder/#\~/$HOME}" -parentdir="$(dirname "$path")" -chmod 755 $parentdir - -docker run -d --rm --name=$name \ - -v $path:/jupyter/notebooks \ - -p $port:$port -e PORT=$port $image diff --git a/README.md b/README.md index 2fe2c8e..cc22d41 100644 --- a/README.md +++ b/README.md @@ -68,14 +68,14 @@ Having activated the environment, SAM can be downloaded from the PyPI repository PIP install: ``` -pip install sam-algorithm +pip install sc-sam ``` Development version install: ``` git clone https://github.com/atarashansky/self-assembling-manifold.git cd self-assembling-manifold -python setup.py install +pip install -e . 
``` For plotting, install `matplotlib`: diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 0000000..d393ae4 --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,193 @@ +[build-system] +requires = ["hatchling"] +build-backend = "hatchling.build" + +[project] +name = "sc-sam" +version = "2.0.2" +description = "The Self-Assembling-Manifold algorithm for single-cell RNA sequencing analysis" +readme = "README.md" +license = "MIT" +authors = [ + { name = "Alexander J. Tarashansky", email = "tarashan@stanford.edu" } +] +keywords = ["scrnaseq", "analysis", "manifold", "reconstruction", "single-cell"] +classifiers = [ + "Development Status :: 4 - Beta", + "Intended Audience :: Science/Research", + "License :: OSI Approved :: MIT License", + "Operating System :: OS Independent", + "Programming Language :: Python :: 3", + "Programming Language :: Python :: 3.11", + "Programming Language :: Python :: 3.12", + "Topic :: Scientific/Engineering :: Bio-Informatics", +] +requires-python = ">=3.11" +dependencies = [ + "numpy>=1.26.0", + "scipy>=1.11.0", + "pandas>=2.0.0", + "scikit-learn>=1.3.0", + "numba>=0.59.0", + "anndata>=0.10.0", + "h5py>=3.10.0", + "umap-learn>=0.5.0", + "hnswlib>=0.8.0", + "dill>=0.3.7", + "packaging>=23.0", +] + +[project.optional-dependencies] +clustering = [ + "leidenalg>=0.10.0", + "louvain>=0.8.0", + "python-igraph>=0.11.0", +] +viz = [ + "matplotlib>=3.8", + "scanpy>=1.10.0", +] +hdbscan = [ + "hdbscan>=0.8.33", +] +harmony = [ + "harmonypy>=0.0.5", +] +dev = [ + "pytest>=8.0", + "pytest-cov>=4.1", + "ruff>=0.2", + "mypy>=1.8", + "pre-commit>=3.6", +] +all = [ + "sc-sam[clustering,viz,hdbscan,harmony,dev]", +] + +[project.urls] +Homepage = "https://github.com/atarashansky/self-assembling-manifold" +Documentation = "https://github.com/atarashansky/self-assembling-manifold#readme" +Repository = "https://github.com/atarashansky/self-assembling-manifold" +Issues = "https://github.com/atarashansky/self-assembling-manifold/issues" + 
+[tool.hatch.build.targets.wheel] +packages = ["src/samalg"] + +[tool.hatch.build.targets.sdist] +include = [ + "/src", + "/tests", + "/README.md", + "/pyproject.toml", +] + +# Ruff configuration +[tool.ruff] +target-version = "py311" +line-length = 100 +src = ["src", "tests"] + +[tool.ruff.lint] +select = [ + "E", # pycodestyle errors + "W", # pycodestyle warnings + "F", # Pyflakes + "I", # isort + "B", # flake8-bugbear + "C4", # flake8-comprehensions + "UP", # pyupgrade + "SIM", # flake8-simplify + "RUF", # Ruff-specific rules + "NPY", # NumPy-specific rules +] +ignore = [ + "E501", # line too long (handled by formatter) + "B008", # do not perform function calls in argument defaults + "B028", # no stacklevel in warnings + "C901", # too complex + "NPY002", # legacy numpy random (too noisy to fix) + "SIM101", # multiple isinstance calls + "SIM102", # nested if statements + "SIM105", # use contextlib.suppress + "SIM108", # Use ternary operator instead of if-else-block + "SIM113", # use enumerate + "SIM115", # use context manager for open + "SIM118", # use key in dict instead of key in dict.keys() + "UP038", # use X | Y in isinstance (removed in newer ruff) +] + +[tool.ruff.lint.per-file-ignores] +"tests/*" = ["S101", "NPY002"] # Allow assert and legacy numpy random in tests +"__init__.py" = ["F401"] # Allow unused imports in __init__.py + +[tool.ruff.lint.isort] +known-first-party = ["samalg"] + +# Mypy configuration +[tool.mypy] +python_version = "3.11" +warn_return_any = true +warn_unused_configs = true +ignore_missing_imports = true +strict_optional = true +warn_redundant_casts = true +warn_unused_ignores = true +show_error_codes = true +exclude = [ + "tests/", + "build/", + "dist/", +] + +[[tool.mypy.overrides]] +module = [ + "scipy.*", + "sklearn.*", + "numba.*", + "umap.*", + "anndata.*", + "h5py.*", + "hnswlib.*", + "harmonypy.*", + "dill.*", + "leidenalg.*", + "louvain.*", + "igraph.*", + "hdbscan.*", + "matplotlib.*", + "scanpy.*", +] 
+ignore_missing_imports = true + +# Pytest configuration +[tool.pytest.ini_options] +testpaths = ["tests"] +python_files = ["test_*.py"] +python_functions = ["test_*"] +addopts = [ + "-v", + "--tb=short", + "--strict-markers", +] +filterwarnings = [ + "ignore::numba.core.errors.NumbaWarning", + "ignore::DeprecationWarning", +] + +# Coverage configuration +[tool.coverage.run] +source = ["src/samalg"] +branch = true +omit = [ + "*/tests/*", + "*/__init__.py", +] + +[tool.coverage.report] +exclude_lines = [ + "pragma: no cover", + "def __repr__", + "raise NotImplementedError", + "if TYPE_CHECKING:", + "if __name__ == .__main__.:", +] diff --git a/samalg/__init__.py b/samalg/__init__.py deleted file mode 100755 index a01470a..0000000 --- a/samalg/__init__.py +++ /dev/null @@ -1,1837 +0,0 @@ -import gc -import numpy as np -from anndata import AnnData -import anndata -import scipy.sparse as sp -import time -from sklearn.preprocessing import Normalizer, StandardScaler -import pickle -import pandas as pd -from . import utilities as ut -import sklearn.manifold as man -import sklearn.utils.sparsefuncs as sf -from packaging import version -import warnings -from numba.core.errors import NumbaWarning -from sklearn.utils.extmath import svd_flip -from scipy.linalg import block_diag -import harmonypy - -warnings.filterwarnings("ignore", category=NumbaWarning) - -__version__ = "1.0.0" - -""" -Copyright 2018, Alexander J. Tarashansky, All rights reserved. -Email: -""" - -class SAM(object): - """Self-Assembling Manifolds single-cell RNA sequencing analysis tool. - - SAM iteratively rescales the input gene expression matrix to emphasize - genes that are spatially variable along the intrinsic manifold of the data. - It outputs the gene weights, nearest neighbor matrix, and a 2D projection. 
- - Parameters - ---------- - counts : tuple or list (scipy.sparse matrix, numpy.ndarray,numpy.ndarray), - OR tuple or list (numpy.ndarray, numpy.ndarray,numpy.ndarray), OR - pandas.DataFrame, OR anndata.AnnData - - If a tuple or list, it should contain the gene expression data - (scipy.sparse or numpy.ndarray) matrix (cells x genes), numpy array of - gene IDs, and numpy array of cell IDs in that order. - - If a pandas.DataFrame, it should be (cells x genes) - - Only use this argument if you want to pass in preloaded data. Otherwise - use one of the load functions. - - - Attributes - ---------- - - preprocess_args: dict - Dictionary of arguments used for the 'preprocess_data' function. - - run_args: dict - Dictionary of arguments used for the 'run' function. - - adata_raw: AnnData - An AnnData object containing the raw, unfiltered input data. - - adata: AnnData - An AnnData object containing all processed data and SAM outputs. - - """ - - def __init__(self, counts=None, inplace=False): - - if isinstance(counts, tuple) or isinstance(counts, list): - raw_data, all_gene_names, all_cell_names = counts - if isinstance(raw_data, np.ndarray): - raw_data = sp.csr_matrix(raw_data) - - self.adata_raw = AnnData( - X=raw_data, - obs={"obs_names": all_cell_names}, - var={"var_names": all_gene_names}, - ) - - elif isinstance(counts, pd.DataFrame): - raw_data = sp.csr_matrix(counts.values) - all_gene_names = np.array(list(counts.columns.values)) - all_cell_names = np.array(list(counts.index.values)) - - self.adata_raw = AnnData( - X=raw_data, - obs={"obs_names": all_cell_names}, - var={"var_names": all_gene_names}, - ) - - elif isinstance(counts, AnnData): - all_cell_names = np.array(list(counts.obs_names)) - all_gene_names = np.array(list(counts.var_names)) - if counts.is_view: - counts = counts.copy() - - if inplace: - self.adata_raw = counts - else: - self.adata_raw = counts.copy() - - elif counts is not None: - raise Exception( - "'counts' must be either a tuple/list of " - 
"(data,gene IDs,cell IDs) or a Pandas DataFrame of" - "cells x genes" - ) - - if counts is not None: - if np.unique(all_gene_names).size != all_gene_names.size: - self.adata_raw.var_names_make_unique() - if np.unique(all_cell_names).size != all_cell_names.size: - self.adata_raw.obs_names_make_unique() - - if inplace: - self.adata = self.adata_raw - else: - self.adata = self.adata_raw.copy() - - if "X_disp" not in self.adata_raw.layers.keys(): - self.adata.layers["X_disp"] = self.adata.X - - self.run_args = {} - self.preprocess_args = {} - - def preprocess_data( - self, - div=1, - downsample=0, - sum_norm="cell_median", - norm="log", - min_expression=1, - thresh_low=0.0, - thresh_high=0.96, - thresh = None, - filter_genes=True, - ): - """Log-normalizes and filters the expression data. - - Parameters - ---------- - - div : float, optional, default 1 - The factor by which the gene expression will be divided prior to - normalization (e.g. log normalization). - - downsample : float, optional, default 0 - The factor by which to randomly downsample the data. If 0, the - data will not be downsampled. - - sum_norm : str or float, optional, default "cell_median" - If a float, the total number of transcripts in each cell will be - normalized to this value prior to normalization and filtering. - Otherwise, nothing happens. If 'cell_median', each cell is - normalized to have the median total read count per cell. If - 'gene_median', each gene is normalized to have the median total - read count per gene. - - norm : str, optional, default 'log' - If 'log', log-normalizes the expression data. If 'ftt', applies the - Freeman-Tukey variance-stabilization transformation. If - 'multinomial', applies the Pearson-residual transformation (this is - experimental and should only be used for raw, un-normalized UMI - datasets). If None, the data is not normalized. - - min_expression : float, optional, default 1 - The threshold above which a gene is considered - expressed. 
Gene expression values less than 'min_expression' are - set to zero. - - thresh_low : float, optional, default 0.0 - Keep genes expressed in greater than 'thresh_low'*100 % of cells, - where a gene is considered expressed if its expression value - exceeds 'min_expression'. - - thresh_high : float, optional, default 0.96 - Keep genes expressed in less than 'thresh_high'*100 % of cells, - where a gene is considered expressed if its expression value - exceeds 'min_expression'. - - filter_genes : bool, optional, default True - Setting this to False turns off filtering operations. - - """ - if thresh is not None: - thresh_low = thresh - thresh_high = 1-thresh - - self.preprocess_args = { - "div": div, - "sum_norm": sum_norm, - "norm": norm, - "min_expression": min_expression, - "thresh_low": thresh_low, - "thresh_high":thresh_high, - "filter_genes": filter_genes, - } - - self.run_args = self.adata.uns.get('run_args',{}) - # load data - try: - D = self.adata_raw.X - self.adata = self.adata_raw.copy() - - except AttributeError: - print("No data loaded") - - D = self.adata.X - if isinstance(D, np.ndarray): - D = sp.csr_matrix(D, dtype="float32") - else: - if str(D.dtype) != "float32": - D = D.astype("float32") - D.sort_indices() - - if D.getformat() == "csc": - D = D.tocsr() - - # sum-normalize - if sum_norm == "cell_median" and norm != "multinomial": - s = D.sum(1).A.flatten() - sum_norm = np.median(s) - D = D.multiply(1 / s[:, None] * sum_norm).tocsr() - elif sum_norm == "gene_median" and norm != "multinomial": - s = D.sum(0).A.flatten() - sum_norm = np.median(s[s > 0]) - s[s == 0] = 1 - D = D.multiply(1 / s[None, :] * sum_norm).tocsr() - - elif sum_norm is not None and norm != "multinomial": - D = D.multiply(1 / D.sum(1).A.flatten()[:, None] * sum_norm).tocsr() - - # normalize - self.adata.X = D - if norm is None: - D.data[:] = D.data / div - - elif norm.lower() == "log": - D.data[:] = np.log2(D.data / div + 1) - - elif norm.lower() == "ftt": - D.data[:] = 
np.sqrt(D.data / div) + np.sqrt(D.data / div + 1) - 1 - - elif norm.lower() == "asin": - D.data[:] = np.arcsinh(D.data / div) - elif norm.lower() == "multinomial": - ni = D.sum(1).A.flatten() # cells - pj = (D.sum(0) / D.sum()).A.flatten() # genes - col = D.indices - row = [] - for i in range(D.shape[0]): - row.append(i * np.ones(D.indptr[i + 1] - D.indptr[i])) - row = np.concatenate(row).astype("int32") - mu = sp.coo_matrix((ni[row] * pj[col], (row, col))).tocsr() - mu2 = mu.copy() - mu2.data[:] = mu2.data ** 2 - mu2 = mu2.multiply(1 / ni[:, None]) - mu.data[:] = (D.data - mu.data) / np.sqrt(mu.data - mu2.data) - - self.adata.X = mu - if sum_norm is None: - sum_norm = np.median(ni) - D = D.multiply(1 / ni[:, None] * sum_norm).tocsr() - D.data[:] = np.log2(D.data / div + 1) - - else: - D.data[:] = D.data / div - - # zero-out low-expressed genes - idx = np.where(D.data <= min_expression)[0] - D.data[idx] = 0 - - # filter genes - idx_genes = np.arange(D.shape[1]) - if filter_genes: - a, ct = np.unique(D.indices, return_counts=True) - c = np.zeros(D.shape[1]) - c[a] = ct - - keep = np.where( - np.logical_and(c / D.shape[0] > thresh_low, c / D.shape[0] <= thresh_high) - )[0] - - idx_genes = np.array(list(set(keep) & set(idx_genes))) - - mask_genes = np.zeros(D.shape[1], dtype="bool") - mask_genes[idx_genes] = True - - self.adata.X = self.adata.X.multiply(mask_genes[None, :]).tocsr() - self.adata.X.eliminate_zeros() - self.adata.var["mask_genes"] = mask_genes - - if norm == "multinomial": - self.adata.layers["X_disp"] = D.multiply(mask_genes[None, :]).tocsr() - self.adata.layers["X_disp"].eliminate_zeros() - else: - self.adata.layers["X_disp"] = self.adata.X - - self.calculate_mean_var() - - self.adata.uns["preprocess_args"] = self.preprocess_args - self.adata.uns['run_args'] = self.run_args - - def calculate_mean_var(self,adata=None): - if adata is None: - adata = self.adata - - if sp.issparse(adata.X): - mu,var = sf.mean_variance_axis(adata.X,axis=0) - else: - mu = 
adata.X.mean(0) - var = adata.X.var(0) - - adata.var['means'] = mu - adata.var['variances'] = var - - def get_avg_obsm(self, keym, keyl): - clu = self.get_labels_un(keyl) - cl = self.get_labels(keyl) - x = [] - for i in range(clu.size): - x.append(self.adata.obsm[keym][cl == clu[i]].mean(0)) - x = np.vstack(x) - return x - - def get_labels_un(self, key): - if key not in list(self.adata.obs.keys()): - print("Key does not exist in `obs`.") - return np.array([]) - else: - return np.array(list(np.unique(self.adata.obs[key]))) - - def get_labels(self, key): - if key not in list(self.adata.obs.keys()): - print("Key does not exist in `obs`.") - return np.array([]) - else: - return np.array(list(self.adata.obs[key])) - - def get_cells(self, label, key): - """Retrieves cells of a particular annotation. - - Parameters - ---------- - label - The annotation to retrieve - key - The key in `obs` from which to retrieve the annotation. - - """ - if key not in list(self.adata.obs.keys()): - print("Key does not exist in `obs`.") - return np.array([]) - else: - return np.array( - list(self.adata.obs_names[np.array(list(self.adata.obs[key])) == label]) - ) - - def load_data( - self, - filename, - transpose=True, - save_sparse_file=None, - sep=",", - calculate_avg=False, - **kwargs - ): - """Loads the specified data file. The file can be a table of - read counts (i.e. '.csv' or '.txt'), with genes as rows and cells - as columns by default. The file can also be a pickle file (output from - 'save_sparse_data') or an h5ad file (output from 'save_anndata'). - - This function that loads the file specified by 'filename'. - - Parameters - ---------- - filename - string - The path to the tabular raw expression counts file. - - sep - string, optional, default ',' - The delimeter used to read the input data table. By default - assumes the input table is delimited by commas. - - save_sparse_file - str, optional, default None - Path to which the sparse data will be saved ('.h5ad'). 
- Writes the SAM 'adata_raw' object to a h5ad file (the native AnnData - file format) for faster loading in the future. - - transpose - bool, optional, default True - By default, assumes file is (genes x cells). Set this to False if - the file has dimensions (cells x genes). - - calculate_avg - bool, optional, default True - If nearest neighbors are already calculated in the .h5ad file, - setting this parameter to True performs knn averaging and stores - the result in adata.layers['X_knn_avg']. This is a fairly dense - matrix, so set this to False if you do not need the averaged - expressions. - - """ - if filename.split(".")[-1] == "p": - raw_data, all_cell_names, all_gene_names = pickle.load(open(filename, "rb")) - - if transpose: - raw_data = raw_data.T - if raw_data.getformat() == "csc": - print("Converting sparse matrix to csr format...") - raw_data = raw_data.tocsr() - - save_sparse_file = None - elif filename.split(".")[-1] != "h5ad": - df = pd.read_csv(filename, sep=sep, index_col=0, **kwargs) - if transpose: - dataset = df.T - else: - dataset = df - - raw_data = sp.csr_matrix(dataset.values) - all_cell_names = np.array(list(dataset.index.values)) - all_gene_names = np.array(list(dataset.columns.values)) - - if filename.split(".")[-1] != "h5ad": - self.adata_raw = AnnData( - X=raw_data, - obs={"obs_names": all_cell_names}, - var={"var_names": all_gene_names}, - ) - - if np.unique(all_gene_names).size != all_gene_names.size: - self.adata_raw.var_names_make_unique() - if np.unique(all_cell_names).size != all_cell_names.size: - self.adata_raw.obs_names_make_unique() - - self.adata = self.adata_raw.copy() - self.adata.layers["X_disp"] = raw_data - - else: - self.adata = anndata.read_h5ad(filename, **kwargs) - if self.adata.raw is not None: - self.adata_raw = AnnData(X=self.adata.raw.X) - self.adata_raw.var_names = self.adata.var_names - self.adata_raw.obs_names = self.adata.obs_names - self.adata_raw.obs = self.adata.obs - - if 
version.parse(str(anndata.__version__)) >= version.parse("0.7rc1"): - del self.adata.raw - else: - self.adata.raw = None - - if ( - "X_knn_avg" not in self.adata.layers.keys() - and "connectivities" in self.adata.obsp.keys() - and calculate_avg - ): - self.dispersion_ranking_NN(save_avgs=True) - else: - self.adata_raw = self.adata - - if "X_disp" not in list(self.adata.layers.keys()): - self.adata.layers["X_disp"] = self.adata.X - save_sparse_file = None - filename = '.'.join(filename.split('.')[:-1])+'.h5ad' - self.adata.uns['path_to_file'] = filename - self.adata_raw.uns['path_to_file'] = filename - if save_sparse_file is not None: - if save_sparse_file.split(".")[-1] == "p": - self.save_sparse_data(save_sparse_file) - elif save_sparse_file.split(".")[-1] == "h5ad": - self.save_anndata(save_sparse_file) - - def save_anndata(self, fname='', save_knn=False, **kwargs): - """Saves `adata_raw` to a .h5ad file (AnnData's native file format). - - Parameters - ---------- - fname - string, default '' - The filename of the output file. If empty, defaults to `.adata.uns['path_to_file']`. - - save_knn - bool, optional, default = False - If True, saves `.layers['X_knn_avg']`. If False, does not save - this layer. Default value is set to False as the nearest-neighbor - averaged expression values can be quite dense. - - """ - if not save_knn: - try: - Xknn = self.adata.layers["X_knn_avg"] - del self.adata.layers["X_knn_avg"] - except: - 0 - if fname == '': - try: - fname = self.adata.uns['path_to_file'] - except KeyError: - raise KeyError('Path to file not known.') - - x = self.adata - x.raw = self.adata_raw - - # fixing weird issues when index name is an integer. 
- for y in [x.obs.columns,x.var.columns, - x.obs.index,x.var.index, - x.raw.var.index,x.raw.var.columns]: - y.name = str(y.name) if y.name is not None else None - - x.write_h5ad(fname, **kwargs) - if version.parse(str(anndata.__version__)) >= version.parse("0.7rc1"): - del x.raw - else: - x.raw = None - - try: - self.adata.layers["X_knn_avg"] = Xknn - except: - 0 - - def load_var_annotations(self, aname, sep=",", key_added="annotations"): - """Loads gene annotations. - - Loads the gene annotations into .adata_raw.var and .adata.var. The keys - added correspond to the column labels in the input table. - - Parameters - ---------- - aname - string or pandas.DataFrame - If string, it is the path to the annotations file, which should be - a table with the first column being the gene IDs and the first row - being the column names for the annotations. Alternatively, you can - directly pass in a preloaded pandas.DataFrame. - - sep - string, default ',' - The delimeter used in the file. Ignored if passing in a preloaded - pandas.DataFrame. - - """ - if isinstance(aname, pd.DataFrame): - ann = aname - else: - ann = pd.read_csv(aname, sep=sep, index_col=0) - - for i in range(ann.shape[1]): - self.adata_raw.var[ann.columns[i]] = ann[ann.columns[i]] - self.adata.var[ann.columns[i]] = ann[ann.columns[i]] - - def load_obs_annotations(self, aname, sep=","): - """Loads cell annotations. - - Loads the cell annotations into .adata_raw.obs and .adata.obs. The keys - added correspond to the column labels in the input table. - - Parameters - ---------- - aname - string or pandas.DataFrame - If string, it is the path to the annotations file, which should be - a table with the first column being the cell IDs and the first row - being the column names for the annotations. Alternatively, you can - directly pass in a preloaded pandas.DataFrame. - - sep - string, default ',' - The delimeter used in the file. Ignored if passing in a preloaded - pandas.DataFrame. 
- - """ - if isinstance(aname, pd.DataFrame): - ann = aname - else: - ann = pd.read_csv(aname, sep=sep, index_col=0) - - for i in range(ann.shape[1]): - self.adata_raw.obs[ann.columns[i]] = ann[ann.columns[i]] - self.adata.obs[ann.columns[i]] = ann[ann.columns[i]] - - def scatter( - self, - projection=None, - c=None, - colorspec=None, - cmap="rainbow", - linewidth=0.0, - edgecolor="k", - axes=None, - colorbar=True, - s=10, - **kwargs - ): - - """Display a scatter plot. - Displays a scatter plot using the SAM projection or another input - projection. - - Parameters - ---------- - projection - string, numpy.ndarray, default None - A case-sensitive string indicating the projection to display (a key - in adata.obsm) or a 2D numpy array with cell coordinates. If None, - projection defaults to UMAP. - - c - string, numpy.ndarray, default None - Categorical data to be mapped to a colormap and overlaid on top of - the projection. Can be a key from adata.obs or a 1D numpy array. - - colorspec - string, numpy.ndarray, default None - A string specifying a color or an array specifying the color - for each point (can be strings, RGBA, RGB, etc). Colorbar will be - turned off if `colorspec` is not None. - - axes - matplotlib axis, optional, default None - Plot output to the specified, existing axes. If None, create new - figure window. - - **kwargs - all keyword arguments in matplotlib.pyplot.scatter are eligible. - """ - - try: - import matplotlib.pyplot as plt - - if isinstance(projection, str): - try: - dt = self.adata.obsm[projection] - except KeyError: - print( - "Please create a projection first using run_umap or" "run_tsne" - ) - - elif projection is None: - try: - dt = self.adata.obsm["X_umap"] - except KeyError: - try: - dt = self.adata.obsm["X_tsne"] - except KeyError: - print( - "Please create either a t-SNE or UMAP projection" "first." 
- ) - return - else: - dt = projection - - if axes is None: - plt.figure() - axes = plt.gca() - - if colorspec is not None: - axes.scatter( - dt[:, 0], - dt[:, 1], - s=s, - linewidth=linewidth, - edgecolor=edgecolor, - c=colorspec, - **kwargs - ) - elif c is None: - axes.scatter( - dt[:, 0], - dt[:, 1], - s=s, - linewidth=linewidth, - edgecolor=edgecolor, - **kwargs - ) - else: - - if isinstance(c, str): - try: - c = self.get_labels(c) - except KeyError: - 0 # do nothing - - if (isinstance(c[0], str) or isinstance(c[0], np.str_)) and ( - isinstance(c, np.ndarray) or isinstance(c, list) - ): - i = ut.convert_annotations(c) - ui, ai = np.unique(i, return_index=True) - cax = axes.scatter( - dt[:, 0], - dt[:, 1], - c=i, - cmap=cmap, - s=s, - linewidth=linewidth, - edgecolor=edgecolor, - **kwargs - ) - - if colorbar: - cbar = plt.colorbar(cax, ax=axes, ticks=ui) - cbar.ax.set_yticklabels(c[ai]) - else: - if not (isinstance(c, np.ndarray) or isinstance(c, list)): - colorbar = False - i = c - - cax = axes.scatter( - dt[:, 0], - dt[:, 1], - c=i, - cmap=cmap, - s=s, - linewidth=linewidth, - edgecolor=edgecolor, - **kwargs - ) - - if colorbar: - plt.colorbar(cax, ax=axes) - return axes - except ImportError: - print("matplotlib not installed!") - - def show_gene_expression(self, gene, avg=True, axes=None, **kwargs): - """Display a gene's expressions. - Displays a scatter plot using the SAM projection or another input - projection with a particular gene's expressions overlaid. - Parameters - ---------- - gene - string - a case-sensitive string indicating the gene expression pattern - to display. - avg - bool, optional, default True - If True, the plots use the k-nearest-neighbor-averaged expression - values to smooth out noisy expression patterns and improves - visualization. - axes - matplotlib axis, optional, default None - Plot output to the specified, existing axes. If None, create new - figure window. - **kwargs - all keyword arguments in 'SAM.scatter' are eligible. 
- """ - all_gene_names = np.array(list(self.adata.var_names)) - cell_names = np.array(list(self.adata.obs_names)) - all_cell_names = np.array(list(self.adata_raw.obs_names)) - idx2 = np.where(np.in1d(all_cell_names, cell_names))[0] - idx = np.where(all_gene_names == gene)[0] - name = gene - if idx.size == 0: - print( - "Gene note found in the filtered dataset. Note that genes " - "are case sensitive." - ) - return - - if avg: - a = self.adata.layers["X_knn_avg"][:, idx].toarray().flatten() - if a.sum() == 0: - a = self.adata_raw.X[:, idx].toarray().flatten()[idx2] - try: - norm = self.preprocess_args["norm"] - except KeyError: - norm = "log" - if norm is not None: - if norm.lower() == "log": - a = np.log2(a + 1) - - elif norm.lower() == "ftt": - a = np.sqrt(a) + np.sqrt(a + 1) - elif norm.lower() == "asin": - a = np.arcsinh(a) - else: - a = self.adata_raw.X[:, idx].toarray().flatten()[idx2] - try: - norm = self.preprocess_args["norm"] - except KeyError: - norm = "log" - - if norm is not None: - if norm.lower() == "log": - a = np.log2(a + 1) - - elif norm.lower() == "ftt": - a = np.sqrt(a) + np.sqrt(a + 1) - elif norm.lower() == "asin": - a = np.arcsinh(a) - - axes = self.scatter(c=a, axes=axes, **kwargs) - axes.set_title(name) - - return axes, a - - def dispersion_ranking_NN(self, nnm=None, num_norm_avg=50, weight_mode='combined',save_avgs=False, adata=None): - """Computes the spatial dispersion factors for each gene. - - Parameters - ---------- - nnm - scipy.sparse, default None - Square cell-to-cell nearest-neighbor matrix. If None, uses the - nearest neighbor matrix in .adata.obsp['connectivities'] - - num_norm_avg - int, optional, default 50 - The top 'num_norm_avg' dispersions are averaged to determine the - normalization factor when calculating the weights. This ensures - that outlier genes do not significantly skew the weight - distribution. - - Returns: - ------- - weights - ndarray, float - The vector of gene weights. 
- """ - if adata is None: - adata = self.adata - - if nnm is None: - nnm = adata.obsp["connectivities"] - f = nnm.sum(1).A - f[f==0]=1 - D_avg = (nnm.multiply(1 / f)).dot(adata.layers["X_disp"]) - - if save_avgs: - adata.layers["X_knn_avg"] = D_avg.copy() - - if sp.issparse(D_avg): - mu, var = sf.mean_variance_axis(D_avg, axis=0) - if weight_mode == 'rms': - D_avg.data[:]=D_avg.data**2 - mu,_ =sf.mean_variance_axis(D_avg, axis=0) - mu=mu**0.5 - - if weight_mode == 'combined': - D_avg.data[:]=D_avg.data**2 - mu2,_ =sf.mean_variance_axis(D_avg, axis=0) - mu2=mu2**0.5 - else: - mu = D_avg.mean(0) - var = D_avg.var(0) - if weight_mode == 'rms': - mu =(D_avg**2).mean(0)**0.5 - if weight_mode == 'combined': - mu2 =(D_avg**2).mean(0)**0.5 - - - if not save_avgs: - del D_avg - gc.collect() - - if weight_mode == 'dispersion' or weight_mode == 'rms' or weight_mode == 'combined': - dispersions = np.zeros(var.size) - dispersions[mu > 0] = var[mu > 0] / mu[mu > 0] - adata.var["spatial_dispersions"] = dispersions.copy() - - if weight_mode == 'combined': - dispersions2 = np.zeros(var.size) - dispersions2[mu2 > 0] = var[mu2 > 0] / mu2[mu2 > 0] - - - elif weight_mode == 'variance': - dispersions = var - adata.var["spatial_variances"] = dispersions.copy() - else: - raise ValueError('`weight_mode` ',weight_mode,' not recognized.') - - ma = np.sort(dispersions)[-num_norm_avg:].mean() - dispersions[dispersions >= ma] = ma - - weights = ((dispersions / dispersions.max()) ** 0.5).flatten() - - if weight_mode == 'combined': - ma = np.sort(dispersions2)[-num_norm_avg:].mean() - dispersions2[dispersions2 >= ma] = ma - - weights2 = ((dispersions2 / dispersions2.max()) ** 0.5).flatten() - weights = np.vstack((weights,weights2)).max(0) - - return weights - - def run( - self, - max_iter=10, - verbose=True, - projection="umap", - stopping_condition=1e-2, - num_norm_avg=50, - k=20, - distance="cosine", - preprocessing="StandardScaler", - npcs=150, - n_genes=3000, - weight_PCs=False, - 
sparse_pca=False, - proj_kwargs={}, - seed = 0, - weight_mode='rms', - components=None, - batch_key=None - ): - """Runs the Self-Assembling Manifold algorithm. - - Parameters - ---------- - k - int, optional, default 20 - The number of nearest neighbors to identify for each cell. - - distance : string, optional, default 'correlation' - The distance metric to use when identifying nearest neighbors. - Can be any of the distance metrics supported by sklearn's 'pdist'. - - max_iter - int, optional, default 10 - The maximum number of iterations SAM will run. - - stopping_condition - float, optional, default 5e-3 - The stopping condition threshold for the RMSE between gene weights - in adjacent iterations. - - verbose - bool, optional, default True - If True, the iteration number and error between gene weights in - adjacent iterations will be displayed. - - projection - str, optional, default 'umap' - If 'tsne', generates a t-SNE embedding. If 'umap', generates a UMAP - embedding. Otherwise, no embedding will be generated. - - preprocessing - str, optional, default 'StandardScaler' - If 'Normalizer', use sklearn.preprocessing.Normalizer, which - normalizes expression data prior to PCA such that each cell has - unit L2 norm. If 'StandardScaler', use - sklearn.preprocessing.StandardScaler, which normalizes expression - data prior to PCA such that each gene has zero mean and unit - variance. Otherwise, do not normalize the expression data. We - recommend using 'StandardScaler' for large datasets and - 'Normalizer' otherwise. - - npcs - int, optional, default 150 - Sets the number of principal components to use. - - n_genes - int, optional, default 3000 - Sets the number of genes to include prior to PCA. Genes are selected - based on their SAM weights. If `None`, all genes are selected. - In this case, it is recommended to set `sparse_pca` equal to True. 
- - num_norm_avg - int, optional, default 50 - The top 'num_norm_avg' dispersions are averaged to determine the - normalization factor when calculating the weights. This prevents - genes with large spatial dispersions from skewing the distribution - of weights. - - sparse_pca - bool, optional, default False - If True, uses an implementation of PCA that accepts sparse inputs. - This is worth setting True for large datasets, where memory - constraints start becoming noticeable. - - weight_PCs - bool, optional, default False - Scale the principal components by their eigenvalues. If many - cell populations are expected, it is recommended to set this False - as the populations may be found on lesser-varying PCs. - - weight_mode - str, optional, default 'combined' - Can be one of 'dispersion', 'variance', 'rms', 'combined'. - 'dispersion' - Gene weights are calculated as the fano factors - (variance / mean) of kNN-averaged expressions. - - 'variance' - Gene weights are calculated as the variances of - kNN-averaged expressions. - - 'rms' - Gene weights are calculated as a modified fano factor - (variance / root-mean square) of kNN-averaged expressions. - This makes the weight more robust to genes with extremely low - mean expressions. - - 'combined' - Gene weights are calculated as the maximum of the - weights calculated from 'dispersion' and 'rms' modes. - - - proj_kwargs - dict, optional, default {} - A dictionary of keyword arguments to pass to the projection - functions. - """ - D = self.adata.X - if k < 5: - k = 5 - if k > D.shape[0] - 1: - k = D.shape[0] - 2 - - if preprocessing not in ["StandardScaler", "Normalizer", None, "None"]: - raise ValueError( - "preprocessing must be 'StandardScaler', 'Normalizer', or None" - ) - if weight_mode not in ["dispersion", "variance", "rms", "combined"]: - raise ValueError( - "weight_mode must be 'dispersion', 'variance', 'rms', or 'combined'." 
- ) - - if self.adata.layers['X_disp'].min() < 0 and weight_mode == 'dispersion': - print("`X_disp` layer contains negative values. Setting `weight_mode` to 'rms'.") - weight_mode = 'rms' - - numcells = D.shape[0] - - if n_genes == None: - n_genes = self.adata.shape[1] - if not sparse_pca and numcells > 10000: - warnings.warn('All genes are being used. It is recommended ' - 'to set `sparse_pca=False` to satisfy memory ' - 'constraints for datasets with more than ' - '10,000 cells. Setting `sparse_pca` to True.') - sparse_pca=True - - if not sparse_pca: - n_genes = min(n_genes, (D.sum(0) > 0).sum()) - - self.run_args = { - "max_iter": max_iter, - "verbose": verbose, - "projection": projection, - "stopping_condition": stopping_condition, - "num_norm_avg": num_norm_avg, - "k": k, - "distance": distance, - "preprocessing": preprocessing, - "npcs": npcs, - "n_genes": n_genes, - "weight_PCs": weight_PCs, - "proj_kwargs": proj_kwargs, - "sparse_pca": sparse_pca, - "weight_mode": weight_mode, - "seed": seed, - "components": components, - } - self.adata.uns["run_args"] = self.run_args - - - tinit = time.time() - np.random.seed(seed) - - if verbose: - print("RUNNING SAM") - - W = np.ones(D.shape[1]) - self.adata.var['weights'] = W - - old = np.zeros(W.size) - new = W - - i = 0 - err = ((new - old) ** 2).mean() ** 0.5 - - if max_iter < 5: - max_iter = 5 - - nnas = num_norm_avg - - while i < max_iter and err > stopping_condition: - - conv = err - if verbose: - print("Iteration: " + str(i) + ", Convergence: " + str(conv)) - - i += 1 - old = new - if i == 1: - first=True - else: - first=False - - W = self.calculate_nnm(batch_key=batch_key, - n_genes=n_genes, preprocessing=preprocessing, npcs=npcs, num_norm_avg=nnas, - weight_PCs=weight_PCs, sparse_pca=sparse_pca,weight_mode=weight_mode,seed=seed,components=components,first=first - ) - gc.collect() - new = W - err = ((new - old) ** 2).mean() ** 0.5 - self.adata.var['weights']=W - - all_gene_names = 
np.array(list(self.adata.var_names)) - indices = np.argsort(-W) - ranked_genes = all_gene_names[indices] - - self.adata.uns["ranked_genes"] = ranked_genes - - if projection == "tsne": - if verbose: - print("Computing the t-SNE embedding...") - self.run_tsne(**proj_kwargs) - elif projection == "umap": - if verbose: - print("Computing the UMAP embedding...") - self.run_umap(seed=seed,**proj_kwargs) - elif projection == "diff_umap": - if verbose: - print("Computing the diffusion UMAP embedding...") - self.run_diff_umap(**proj_kwargs) - - elapsed = time.time() - tinit - if verbose: - print("Elapsed time: " + str(elapsed) + " seconds") - - def calculate_nnm( - self, adata=None,batch_key=None,g_weighted=None, n_genes=3000, preprocessing='StandardScaler', npcs=150, num_norm_avg=50, weight_PCs=False, sparse_pca=False, - update_manifold=True,weight_mode='dispersion',seed=0,components=None,first=False - ): - if adata is None: - adata = self.adata - - numcells = adata.shape[0] - k = adata.uns['run_args'].get("k", 20) - distance = adata.uns['run_args'].get("distance", "correlation") - - D = adata.X - W = adata.var["weights"].values - - - if 'means' not in adata.var.keys() or 'variances' not in adata.var.keys(): - self.calculate_mean_var(adata) - - if n_genes is None: - gkeep = np.arange(W.size) - else: - if first: - mu = np.array(list(adata.var['means'])) - var = np.array(list(adata.var['variances'])) - mu[mu==0]=1 - dispersions = var/mu - gkeep = np.sort(np.argsort(-dispersions)[:n_genes]) - else: - gkeep = np.sort(np.argsort(-W)[:n_genes]) - - if g_weighted is None: - if preprocessing == "Normalizer": - Ds = D[:, gkeep] - if sp.issparse(Ds) and not sparse_pca: - Ds = Ds.toarray() - - Ds = Normalizer().fit_transform(Ds) - - elif preprocessing == "StandardScaler": - if not sparse_pca: - Ds = D[:, gkeep] - if sp.issparse(Ds): - Ds = Ds.toarray() - - v = adata.var['variances'].values[gkeep] - m = adata.var['means'].values[gkeep] - v[v==0]=1 - Ds = (Ds - m)/v**0.5 - - Ds[Ds > 10] 
= 10 - Ds[Ds < -10] = -10 - else: - Ds = D[:, gkeep] - v = adata.var['variances'].values[gkeep] - v[v==0]=1 - Ds = Ds.multiply(1/v**0.5).tocsr() - - else: - Ds = D[:, gkeep].toarray() - - if sp.issparse(Ds): - D_sub = Ds.multiply(W[gkeep]).tocsr() - else: - D_sub = Ds * (W[gkeep]) - - if components is None: - if not sparse_pca: - npcs = min(npcs, min((D.shape[0],gkeep.size))) - if numcells > 500: - g_weighted, pca = ut.weighted_PCA( - D_sub, - npcs=npcs, - do_weight=weight_PCs, - solver="auto",seed=seed - ) - else: - g_weighted, pca = ut.weighted_PCA( - D_sub, - npcs=npcs, - do_weight=weight_PCs, - solver="full",seed=seed - ) - pca_obj = pca - components = pca.components_ - - else: - npcs=min(npcs, min((D.shape[0],gkeep.size)) - 1) - v = adata.var['variances'].values[gkeep] - v[v==0]=1 - m = adata.var['means'].values[gkeep]*W[gkeep] - no = (m/v**0.5) if preprocessing=='StandardScaler' else D_sub.mean(0).A.flatten() - mean_correction = no - output = ut._pca_with_sparse(D_sub, npcs,mu = (no)[None,:], seed = seed) - components = output['components'] - g_weighted = output['X_pca'] - - if weight_PCs: - ev = output['variance'] - ev = ev / ev.max() - g_weighted = g_weighted * (ev ** 0.5) - else: - components = components[:,gkeep] - v = adata.var['variances'].values[gkeep] - v[v==0]=1 - m = adata.var['means'].values[gkeep]*W[gkeep] - ns = (m/v**0.5) if preprocessing=='StandardScaler' else D_sub.mean(0).A.flatten() - mean_correction = ns - - if sp.issparse(D_sub): - g_weighted = D_sub.dot(components.T) - ns.flatten().dot(components.T) - else: - g_weighted = (D_sub-ns).dot(components.T) - if weight_PCs: - ev = g_weighted.var(0) #fix this? 
- ev = ev / ev.max() - g_weighted = g_weighted * (ev ** 0.5) - - adata.varm["PCs"] = np.zeros(shape=(adata.n_vars, npcs)) - adata.varm["PCs"][gkeep] = components.T - adata.obsm["X_processed"] = D_sub - adata.uns["dimred_indices"] = gkeep - if sparse_pca: - mc = np.zeros(adata.shape[1]) - mc[gkeep] = mean_correction - adata.var["mean_correction"] = mc - - if distance == "euclidean": - pass;#g_weighted = Normalizer().fit_transform(g_weighted) - - if batch_key is not None: - harmony_out = harmonypy.run_harmony(g_weighted, adata.obs, batch_key, verbose=False) - g_weighted = harmony_out.Z_corr.T - - - if update_manifold: - edm = ut.calc_nnm(g_weighted, k, distance) - adata.obsp['distances'] = edm.tolil() - adata.obsp['distances'].setdiag(0) - adata.obsp['distances']=adata.obsp['distances'].tocsr() - - EDM = edm.copy() - EDM.data[:] = 1 - EDM = EDM.tolil(); EDM.setdiag(1); EDM = EDM.tocsr(); - - adata.obsp['connectivities'] = EDM - - if distance in ['correlation','cosine']: #keep edge weights and store in nnm if distance is bounded - edm.data[:] = 1-edm.data - edm = edm.tolil(); edm.setdiag(1); edm = edm.tocsr(); - edm.data[edm.data<0]=0.001 #if negative correlation, set close to zero but not zero to preserve kNN structure - adata.obsp['nnm'] = edm - else: - adata.obsp['nnm'] = EDM - W = self.dispersion_ranking_NN(EDM, weight_mode=weight_mode, num_norm_avg=num_norm_avg, adata=adata) - adata.obsm["X_pca"] = g_weighted - ge = np.array(list(adata.var_names[gkeep])) - return W - else: - print('Not updating the manifold...') - PCs = np.zeros(shape=(adata.n_vars, npcs)) - PCs[gkeep] = components.T - return PCs,g_weighted - - def run_tsne(self, X=None, metric="correlation", **kwargs): - """Wrapper for sklearn's t-SNE implementation. - - See sklearn for the t-SNE documentation. All arguments are the same - with the exception that 'metric' is set to 'correlation' by default. 
- """ - if X is not None: - dt = man.TSNE(metric=metric, **kwargs).fit_transform(X) - return dt - - else: - distance = self.adata.uns['run_args'].get("distance", "correlation") - dt = man.TSNE(metric=distance, **kwargs).fit_transform( - self.adata.obsm["X_pca"] - ) - tsne2d = dt - self.adata.obsm["X_tsne"] = tsne2d - - def run_umap(self, X="X_pca", metric=None,seed = None, **kwargs): - """Wrapper for umap-learn. - - See https://github.com/lmcinnes/umap sklearn for the documentation - and source code. - """ - - import umap as umap - - if metric is None: - metric = self.adata.uns['run_args'].get("distance", "correlation") - - if seed is None: - seed = self.adata.uns['run_args'].get('seed',0) - - if type(X) is str: - if X == "": - X = self.adata.X - else: - X = self.adata.obsm[X] - # print(X.shape) - umap_obj = umap.UMAP(metric=metric,random_state=seed, **kwargs) - umap2d = umap_obj.fit_transform(X) - self.adata.obsm["X_umap"] = umap2d - return umap_obj - else: - umap_obj = umap.UMAP(metric=metric,random_state=seed, **kwargs) - dt = umap_obj.fit_transform(X) - return dt, umap_obj - - def run_diff_umap( - self, use_rep="X_pca", metric="euclidean", n_comps=15, method="gauss", **kwargs - ): - """ - Experimental -- running UMAP on the diffusion components. Requires scanpy. 
- """ - import scanpy.api as sc - - k = self.adata.uns['run_args'].get("k", 20) - distance = self.adata.uns['run_args'].get("distance", "correlation") - sc.pp.neighbors( - self.adata, use_rep=use_rep, n_neighbors=k, metric=distance, method=method - ) - sc.tl.diffmap(self.adata, n_comps=n_comps) - sc.pp.neighbors( - self.adata, - use_rep="X_diffmap", - n_neighbors=k, - metric="euclidean", - method=method, - ) - - if "X_umap" in self.adata.obsm.keys(): - temp = self.adata.obsm["X_umap"].copy() - - sc.tl.umap(self.adata, min_dist=0.1, copy=False) - temp2 = self.adata.obsm["X_umap"] - self.adata.obsm["X_umap"] = temp - self.adata.obsm["X_diff_umap"] = temp2 - - def run_diff_map( - self, use_rep="X_pca", metric="euclidean", n_comps=15, method="gauss", **kwargs - ): - import scanpy.api as sc - - k = self.adata.uns['run_args'].get("k", 20) - distance = self.adata.uns['run_args'].get("distance", "correlation") - sc.pp.neighbors( - self.adata, use_rep=use_rep, n_neighbors=k, metric=distance, method=method - ) - sc.tl.diffmap(self.adata, n_comps=n_comps + 1) - self.adata.obsm["X_diffmap"] = self.adata.obsm["X_diffmap"][:, 1:] - - def density_clustering(self, X=None, eps=1, metric="euclidean", **kwargs): - from sklearn.cluster import DBSCAN - - if X is None: - X = self.adata.obsm["X_umap"] - save = True - else: - save = False - - cl = DBSCAN(eps=eps, metric=metric, **kwargs).fit_predict(X) - k = self.adata.uns['run_args'].get("k", 20) - idx0 = np.where(cl != -1)[0] - idx1 = np.where(cl == -1)[0] - if idx1.size > 0 and idx0.size > 0: - xcmap = ut.generate_euclidean_map(X[idx0, :], X[idx1, :]) - knn = np.argsort(xcmap.T, axis=1)[:, :k] - nnm = np.zeros(xcmap.shape).T - nnm[ - np.tile(np.arange(knn.shape[0])[:, None], (1, knn.shape[1])).flatten(), - knn.flatten(), - ] = 1 - nnmc = np.zeros((nnm.shape[0], cl.max() + 1)) - for i in range(cl.max() + 1): - nnmc[:, i] = nnm[:, cl[idx0] == i].sum(1) - - cl[idx1] = np.argmax(nnmc, axis=1) - - if save: - 
self.adata.obs["dbscan_clusters"] = pd.Categorical(cl) - else: - return cl - - def clustering(self, X=None, param=None, method="leiden"): - """A wrapper for clustering the SAM output using various clustering - algorithms - - Parameters - ---------- - X - data, optional, default None - Data to be passed into the selected clustering algorithm. If None, - uses the data in the SAM AnnData object. Different clustering - algorithms accept different types of data. For example, Louvain - and Leiden accept a scipy.sparse adjacency matrix representing the - nearest neighbor graph. Dbscan, kmeans, and Hdbscan accept - coordinates (like UMAP coordinates or PCA coordinates). If None, - cluster results are saved to the 'obs' attribute in the AnnData - object. Otherwise, cluster results are returned. - - param : float, optional, default None - The parameter used for the different clustering algorithms. For - louvain and leiden, it is the resolution parameter. For dbscan, it - is the distance parameter. For kmeans, it is the number of clusters. - - method : string, optional, default 'leiden' - Determines which clustering method is run. - 'leiden' - Leiden clustering with modularity optimization - 'leiden_sig' - Leiden clustering with significance optimization - 'louvain' - Louvain clustering with modularity optimization - 'louvain_sig' - Louvain clustering with significance optimization - 'kmeans' - Kmeans clustering - 'dbscan' - DBSCAN clustering - 'hdbscan' - HDBSCAN clustering - If X is None, cluster assignments are saved in '.adata.obs' with - key name equal to method + '_clusters'. Otherwise, they are returned. 
- """ - if method == "leiden": - if param is None: - param = 1 - cl = self.leiden_clustering(X=X, res=param, method="modularity") - elif method == "leiden_sig": - if param is None: - param = 1 - cl = self.leiden_clustering(X=X, res=param, method="significance") - elif method == "louvain": - if param is None: - param = 1 - cl = self.louvain_clustering(X=X, res=param, method="modularity") - elif method == "louvain_sig": - if param is None: - param = 1 - cl = self.louvain_clustering(X=X, res=param, method="significance") - elif method == "kmeans": - if param is None: - param = 6 - cl = self.kmeans_clustering(param, X=X) - elif method == "hdbscan": - if param is None: - param = 25 - cl = self.hdbknn_clustering(npcs=param) - elif method == "dbscan": - if param is None: - param = 0.5 - cl = self.density_clustering(eps=param) - else: - cl = None - return cl - - def louvain_clustering(self, X=None, res=1, method="modularity"): - """Runs Louvain clustering using the vtraag implementation. Assumes - that 'louvain' optional dependency is installed. - - Parameters - ---------- - res - float, optional, default 1 - The resolution parameter which tunes the number of clusters Louvain - finds. - - method - str, optional, default 'modularity' - Can be 'modularity' or 'significance', which are two different - optimizing funcitons in the Louvain algorithm. 
- - """ - - if X is None: - X = self.adata.obsp["connectivities"] - save = True - else: - if not sp.isspmatrix_csr(X): - X = sp.csr_matrix(X) - save = False - - import igraph as ig - import louvain - - adjacency = X - sources, targets = adjacency.nonzero() - weights = adjacency[sources, targets] - if isinstance(weights, np.matrix): - weights = weights.A1 - g = ig.Graph(directed=True) - g.add_vertices(adjacency.shape[0]) - g.add_edges(list(zip(sources, targets))) - try: - g.es["weight"] = weights - except BaseException: - pass - - if method == "significance": - cl = louvain.find_partition(g, louvain.SignificanceVertexPartition) - else: - cl = louvain.find_partition( - g, louvain.RBConfigurationVertexPartition, resolution_parameter=res - ) - - if save: - if method == "modularity": - self.adata.obs["louvain_clusters"] = pd.Categorical( - np.array(cl.membership) - ) - elif method == "significance": - self.adata.obs["louvain_sig_clusters"] = pd.Categorical( - np.array(cl.membership) - ) - else: - return np.array(cl.membership) - - def kmeans_clustering(self, numc, X=None, npcs=25): - """Performs k-means clustering. - - Parameters - ---------- - numc - int - Number of clusters - - npcs - int, optional, default 25 - Number of principal components to use as inpute for k-means - clustering. 
- - """ - - from sklearn.cluster import KMeans - - if X is None: - X = self.adata.obsm['X_pca'] - - km = KMeans(n_clusters=numc) - cl = km.fit_predict(Normalizer().fit_transform(X)) - - self.adata.obs["kmeans_clusters"] = pd.Categorical(cl) - return cl, km - - def leiden_clustering(self, X=None, res=1, method="modularity", seed = 0): - - if X is None: - X = self.adata.obsp["connectivities"] - save = True - else: - if not sp.isspmatrix_csr(X): - X = sp.csr_matrix(X) - save = False - - import igraph as ig - import leidenalg - - adjacency = X - sources, targets = adjacency.nonzero() - weights = adjacency[sources, targets] - if isinstance(weights, np.matrix): - weights = weights.A1 - g = ig.Graph(directed=True) - g.add_vertices(adjacency.shape[0]) - g.add_edges(list(zip(sources, targets))) - try: - g.es["weight"] = weights - except BaseException: - pass - - if method == "significance": - cl = leidenalg.find_partition(g, leidenalg.SignificanceVertexPartition,seed=seed) - else: - cl = leidenalg.find_partition( - g, leidenalg.RBConfigurationVertexPartition, resolution_parameter=res,seed=seed - ) - - if save: - if method == "modularity": - self.adata.obs["leiden_clusters"] = pd.Categorical( - np.array(cl.membership) - ) - elif method == "significance": - self.adata.obs["leiden_sig_clusters"] = pd.Categorical( - np.array(cl.membership) - ) - else: - return np.array(cl.membership) - - def hdbknn_clustering(self, X=None, k=None, npcs=15, **kwargs): - import hdbscan - - if X is None: - X = self.adata.obsm['X_pca'] - X = Normalizer().fit_transform(X) - save = True - else: - save = False - - if k is None: - k = self.adata.uns['run_args'].get("k", 20) - - hdb = hdbscan.HDBSCAN(metric="euclidean", **kwargs) - - cl = hdb.fit_predict(X) - - idx0 = np.where(cl != -1)[0] - idx1 = np.where(cl == -1)[0] - if idx1.size > 0 and idx0.size > 0: - xcmap = ut.generate_euclidean_map(X[idx0, :], X[idx1, :]) - knn = np.argsort(xcmap.T, axis=1)[:, :k] - nnm = np.zeros(xcmap.shape).T - nnm[ - 
np.tile(np.arange(knn.shape[0])[:, None], (1, knn.shape[1])).flatten(), - knn.flatten(), - ] = 1 - nnmc = np.zeros((nnm.shape[0], cl.max() + 1)) - for i in range(cl.max() + 1): - nnmc[:, i] = nnm[:, cl[idx0] == i].sum(1) - - cl[idx1] = np.argmax(nnmc, axis=1) - - if save: - self.adata.obs["hdbscan_clusters"] = pd.Categorical(cl) - else: - return cl - - def identify_marker_genes_rf(self, labels=None, clusters=None, n_genes=4000): - """ - Ranks marker genes for each cluster using a random forest - classification approach. - - Parameters - ---------- - - labels - numpy.array or str, optional, default None - Cluster labels to use for marker gene identification. - Can also be a string corresponding to any of the keys - in adata.obs. - - clusters - int/string or array-like, default None - A cluster ID (int or string depending on the labels used) - or vector corresponding to the specific cluster ID(s) for - which marker genes will be calculated. If None, marker genes - will be computed for all clusters, and the result will be written - to adata.uns. - - n_genes - int, optional, default 4000 - By default, trains the classifier on the top 4000 SAM-weighted - genes. - - Returns - ------- - (dictionary of markers for each cluster, - dictionary of marker scores for each cluster) - """ - - if labels is None: - try: - keys = np.array(list(self.adata.obs_keys())) - lbls = self.get_labels(ut.search_string(keys, "_clusters")[0][0]) - except KeyError: - print( - "Please generate cluster labels first or set the " - "'labels' keyword argument." 
- ) - return - elif isinstance(labels, str): - lbls = self.get_labels(labels) - else: - lbls = labels - - from sklearn.ensemble import RandomForestClassifier - - markers = {} - markers_scores = {} - if clusters == None: - lblsu = np.unique(lbls) - else: - lblsu = np.unique(clusters) - - indices = np.argsort(-self.adata.var["weights"].values) - X = self.adata.layers["X_disp"][:, indices[:n_genes]].toarray() - for K in range(lblsu.size): - # print(K) - y = np.zeros(lbls.size) - - y[lbls == lblsu[K]] = 1 - - clf = RandomForestClassifier( - n_estimators=100, max_depth=None, random_state=0 - ) - - clf.fit(X, y) - - idx = np.argsort(-clf.feature_importances_) - - markers[lblsu[K]] = self.adata.uns["ranked_genes"][idx] - markers_scores[lblsu[K]] = clf.feature_importances_[idx] - - if clusters is None: - if isinstance(labels, str): - self.adata.uns["rf_" + labels] = markers - else: - self.adata.uns["rf"] = markers - - return markers, markers_scores - - def identify_marker_genes_sw(self, labels=None, clusters=None, inplace=True): - """ - Ranks marker genes for each cluster using partial sums of spatial - dispersions. - - Parameters - ---------- - - labels - numpy.array or str, optional, default None - Cluster labels to use for marker gene identification. - Can also be a string corresponding to any of the keys - in adata.obs. - - clusters - int/string or array-like, default None - A cluster ID (int or string depending on the labels used) - or vector corresponding to the specific cluster ID(s) for - which marker genes will be calculated. If None, marker genes - will be computed for all clusters, and the result will be written - to adata.uns. 
- - inplace - bool, default True - If True, returns nothing and places marker scores in `var` - - """ - - if labels is None: - try: - keys = np.array(list(self.adata.obs_keys())) - lbls = self.get_labels(ut.search_string(keys, "_clusters")[0][0]) - except KeyError: - print( - "Please generate cluster labels first or set the " - "'labels' keyword argument." - ) - return - elif isinstance(labels, str): - lbls = self.get_labels(labels) - else: - lbls = labels - - markers_scores = [] - if clusters == None: - lblsu = np.unique(lbls) - else: - lblsu = np.unique(clusters) - - if "X_knn_avg" not in list(self.adata.layers.keys()): - print("Performing kNN-averaging...") - self.dispersion_ranking_NN(save_avgs=True) - l = self.adata.layers["X_knn_avg"] - m = l.mean(0).A.flatten() - cells = np.array(list(self.adata.obs_names)) - for K in range(lblsu.size): - selected = np.where(np.in1d(cells, self.get_cells(lblsu[K], labels)))[0] - ms = l[selected, :].mean(0).A.flatten() - lsub = l[selected, :] - lsub.data[:] = lsub.data ** 2 - ms2 = lsub.mean(0).A.flatten() - v = ms2 - 2 * ms * m + m ** 2 - wmu = np.zeros(v.size) - wmu[m > 0] = v[m > 0] / m[m > 0] - markers_scores.append(wmu) - A = pd.DataFrame( - data=np.vstack(markers_scores), index=lblsu, columns=self.adata.var_names - ).T - if inplace: - A.columns = labels + ";;" + A.columns.astype('str').astype('object') - for Ac in A.columns: - self.adata.var[Ac]=A[Ac] - else: - return A - - def identify_marker_genes_ratio(self, labels=None): - """ - Ranks marker genes for each cluster using a SAM-weighted - expression-ratio approach (works quite well). - - Parameters - ---------- - - labels - numpy.array or str, optional, default None - Cluster labels to use for marker gene identification. If None, - assumes that one of SAM's clustering algorithms has been run. Can - be a string (i.e. 'louvain_clusters', 'kmeans_clusters', etc) to - specify specific cluster labels in adata.obs. 
- - """ - if labels is None: - try: - keys = np.array(list(self.adata.obs_keys())) - lbls = self.get_labels(ut.search_string(keys, "_clusters")[0][0]) - except KeyError: - print( - "Please generate cluster labels first or set the " - "'labels' keyword argument." - ) - return - elif isinstance(labels, str): - lbls = self.get_labels(labels) - else: - lbls = labels - - all_gene_names = np.array(list(self.adata.var_names)) - - markers = {} - - s = np.array(self.adata.layers["X_disp"].sum(0)).flatten() - lblsu = np.unique(lbls) - for i in lblsu: - d = np.array(self.adata.layers["X_disp"][lbls == i, :].sum(0)).flatten() - rat = np.zeros(d.size) - rat[s > 0] = ( - d[s > 0] ** 2 / s[s > 0] * self.adata.var["weights"].values[s > 0] - ) - x = np.argsort(-rat) - markers[i] = all_gene_names[x[:]] - - self.adata.uns["marker_genes_ratio"] = markers - - return markers - - def gui(self): - if 'SamGui' in self.__dict__.keys(): - return self.SamGui.SamPlot - else: - from samalg.gui import SAMGUI - self.SamGui = SAMGUI(self) - return self.SamGui.SamPlot - - def save(self,fn): - import dill - if len(fn.split('.pkl')) == 1: - fn = fn + '.pkl' - self.path_to_file = fn - d = {} - for k in self.__dict__.keys(): - d[k] = self.__dict__[k] - if 'SamGui' in d.keys(): - del d['SamGui'] - with open(fn,'wb') as f: - dill.dump(d,f) - if 'SamGui' in self.__dict__.keys(): - dill.dump(self.SamGui,f) - - def load(self,fn): - import dill - if len(fn.split('.pkl')) == 1: - fn = fn + '.pkl' - with open(fn,'rb') as f: - self.__dict__ = dill.load(f) - try: - self.__dict__['SamGui'] = dill.load(f) - except: - pass; diff --git a/samalg/gui.py b/samalg/gui.py deleted file mode 100644 index e7fcd35..0000000 --- a/samalg/gui.py +++ /dev/null @@ -1,1987 +0,0 @@ -import pickle -import colorlover as cl -import numpy as np -import scipy.sparse as sp -import ipywidgets as widgets -import plotly.graph_objs as go -from . import SAM -from . 
import utilities as ut -import pandas as pd - -from ipyevents import Event -from ipywidgets import Widget, Layout -import warnings -from numba.core.errors import NumbaWarning -warnings.filterwarnings("ignore", category=NumbaWarning) - -__version__ = "0.8.7" - - -class SAMGUI(object): - def __init__(self, sam=None,title=None, close_all_widgets=False, default_proj='X_umap',height=600): - self.ACTION_LOG = [] - - if close_all_widgets: - Widget.close_all() - self.widget_height = height - self.log('Setting widget height to {}'.format(height)) - - self.default_proj = default_proj - self.titles = [] - self.stab = widgets.Tab(layout={"height": "95%", "width": "50%"}) - self.stab.observe(self.on_switch_tabs, "selected_index") - - if sam is not None: - self.log('Initializing GUI from input SAM object.') - if not isinstance(sam,list): - sams=[sam] - else: - sams = sam - sam = sams[0] - if title is not None: - if not isinstance(title,list): - titles=[title] - else: - titles = title - title = titles[0] - else: - titles = [None]*len(sams) - - self.init_from_sam(sam) - if titles[0] is None: - title = 'Full dataset' - else: - title = titles[0] - - self.create_plot(0, title) - items = [self.stab, self.tab] - self.titles = [title] - for i in range(1,len(sams)): - self.sams.append(sams[i]) - self.selected.append(np.ones(sams[i].adata.shape[0]).astype("bool")) - self.selected_cells.append(np.array(list(sams[i].adata.obs_names))) - self.active_labels.append(np.zeros(sams[i].adata.shape[0])) - self.dd_opts.append(["Toggle cluster"]) - self.gene_expressions.append(np.zeros(sams[i].adata.shape[0])) - self.marker_genes.append( - np.array(list(sams[i].adata.var_names))[ - np.argsort(-sams[i].adata.var["weights"].values) - ] - ) - self.marker_genes_tt.append("Genes ranked by SAM weights.") - self.ds.append(0) - if titles[i] is None: - title = 'Full dataset {}'.format(i) - else: - title = titles[i] - self.titles.append(title) - self.create_plot(i, title) - self.update_dropdowns(i) - else: - 
def __setstate__(self, d):
    """Rebuild the GUI from a pickled state dict by re-running __init__."""
    self.__init__(sam=d['sams'], title=d['titles'],
                  height=d['widget_height'], default_proj=d['default_proj'])
    # Restore the saved attributes on top of the freshly built widgets.
    self.__dict__.update(d)
    self.__dict__['ds'] = [0] * len(self.sams)
    for idx in range(len(self.sams)):
        self.create_plot(idx, self.titles[idx])

def __getstate__(self):
    """Return a picklable snapshot of the state, minus live widget objects."""
    skip = {'SamPlot', 'children', 'ps_dict', 'rb_dict', 'cs_dict', 'stab',
            'rs_box', 'cs_box', 'pp_box', 'out', 'tab', 'ds'}
    return {k: v for k, v in self.__dict__.items() if k not in skip}

def log(self, s):
    """Append an entry to the in-memory action log."""
    self.ACTION_LOG.append(s)
def close_all_tabs(self):
    """Close every subcluster tab and its per-tab state, keeping only tab 0."""
    self.log('Closing all tabs except tab 0.')
    self.stab.set_trait("selected_index", 0)
    pos = 1
    # Repeatedly remove position 1 until only the full-dataset tab remains.
    while len(self.sams) > 1:
        self.ds[pos].close()
        self.stab.children[pos].close()
        for state in (self.ds, self.sams, self.selected, self.selected_cells,
                      self.active_labels, self.dd_opts, self.marker_genes,
                      self.marker_genes_tt, self.gene_expressions):
            del state[pos]
    self.stab.children = (self.stab.children[0],)

def get_scatter(self, i):
    """Return the scatter trace of plot tab `i`."""
    return self.stab.children[i].data[0]
list(self.run_args.keys()): - if isinstance(self.run_args[i], np.ndarray): - self.run_args[i] = self.run_args[i][0] - except: - self.run_args = {} - - rm = list(invalidArgs(self.sams[0].run,self.run_args)) - for k in rm: - del self.run_args[k] - self.run_args_init = self.run_args.copy() - - - - def create_plot(self, i, title): - if self.SAM_LOADED: - projs = list(self.sams[i].adata.obsm.keys()) - if self.default_proj in projs: - p = self.default_proj - elif len(projs) > 1: - p = np.array(projs)[np.where(np.array(projs) != "X_pca")[0][0]] - else: - p = "X_pca" - - if p not in projs: - xdata = [] - ydata = [] - else: - xdata = self.sams[i].adata.obsm[p][:, 0] - ydata = self.sams[i].adata.obsm[p][:, 1] - - if i < len(self.stab.children): - f1 = self.stab.children[i] - f1.update_layout(autosize=False) - f1.data = [] - else: - f1 = go.FigureWidget() - - f1.add_scattergl(x=xdata, y=ydata) - - f1.for_each_trace(self.init_graph) - f1.update_layout( - margin_l=0, - margin_r=0, - margin_t=40, - margin_b=0, - xaxis_ticks="", - xaxis_showticklabels=False, - height=self.widget_height * 0.85, - title="", - # xaxis_showgrid=False, - # xaxis_zeroline=False, - # yaxis_showgrid=False, - # yaxis_zeroline=False, - yaxis_ticks="", - yaxis_showticklabels=False, - autosize=True, - dragmode="select", - ) - f1.update_yaxes(autorange=True) - f1.update_xaxes(autorange=True) - - f1.data[0].text = list(self.active_labels[i]) - f1.data[0].hoverinfo = "text" - f1.set_trait( - "_config", - { - "displayModeBar": True, - "scrollZoom": True, - "displaylogo": False, - "edits": {"titleText": False}, - }, - ) - - if i >= len(self.stab.children): - self.stab.children += (f1,) - # self.stab.set_trait('selected_index',i) - - if type(self.ds[i]) is int: - d = Event(source=f1, watched_events=["keydown"]) - self.ds[i] = d - d.on_dom_event(self.handle_events) - - slider = self.cs_dict["THR"] - slider.set_trait("min", slider.value) - slider.set_trait("max", slider.value) - - else: - f1 = go.FigureWidget() - 
def handle_events(self, event):
    """Dispatch keyboard shortcuts received while hovering the scatter plot."""
    if event["type"] != "keydown":
        return
    key = event["key"]
    if key == "ArrowRight":
        self.log('Scrolling ranked genes slider one to the right.')
        slider = self.cs_dict["RGENES"]
        slider.set_trait("value", slider.value + 1)
    elif key == "ArrowLeft":
        self.log('Scrolling ranked genes slider one to the left.')
        slider = self.cs_dict["RGENES"]
        # Clamp at zero: the slider cannot scroll past the top-ranked gene.
        slider.set_trait("value", max(slider.value - 1, 0))
    elif key == "Shift":
        self.irm_genes(None)
    elif key == "Enter":
        self.ism_genes(None)
    elif key == "x":
        self.unselect_all(None)
    elif key == "c":
        self.select_all(None)
    elif key == "v":
        self.reset_view(None)
    elif key == "a":
        self.log('Turning off k-nn averaging of gene expression values.')
        avg = self.cs_dict["AVG"]
        avg.set_trait("value", not avg.value)
len(self.stab.children)): - self.stab.set_title(i, titles[i]) - - """ BEGIN PREPROCESS INIT""" - - def init_preprocess(self): - - pdata = widgets.Button( - description = 'Preprocess', - disabled = False, - tooltip = 'Preprocess data', - icon = '' - ) - pdata.on_click(self.preprocess_sam) - - fgenes = widgets.Checkbox( - value=bool(self.preprocess_args.get("filter_genes", True)), - description="Filter genes", - ) - fgenes.observe(self.pp_filtergenes, names="value") - dfts = widgets.Button( - description="Set defaults", - disabled=False, - tooltip="Set default options", - icon="", - ) - dfts.on_click(self.set_pp_defaults) - - l1 = widgets.Label("Expr threshold:") - expr_thr = widgets.FloatSlider( - value=float(self.preprocess_args.get("thresh", 0.01)), - min=0, - max=0.1, - step=0.005, - disabled=False, - continuous_update=False, - orientation="horizontal", - readout=True, - readout_format="2f", - style={"description_width": "initial"}, - ) - expr_thr.observe(self.et_update, names="value") - - l2 = widgets.Label("Min expr:") - min_expr = widgets.FloatSlider( - value=float(self.preprocess_args.get("min_expression", 1)), - min=0, - max=6.0, - step=0.02, - disabled=False, - continuous_update=False, - orientation="horizontal", - readout=True, - readout_format="1f", - style={"description_width": "initial"}, - ) - min_expr.observe(self.me_update, names="value") - - init = self.preprocess_args.get("sum_norm", "None") - if isinstance(init, np.ndarray): - init = init[0] - if init is None: - init = "None" - sumnorm = widgets.Dropdown( - options=["cell_median", "gene_median", "None"], - value=init, - description="Library normalization:", - disabled=False, - style={"description_width": "initial"}, - ) - sumnorm.observe(self.sumnorm_submit, "value") - - init = self.preprocess_args.get("norm", "log") - if isinstance(init, np.ndarray): - init = init[0] - if init is None: - init = "None" - norm = widgets.Dropdown( - options=["log", "ftt", "None"], - value=init, - description="Data 
normalization:", - disabled=False, - style={"description_width": "initial"}, - ) - norm.observe(self.norm_submit, "value") - - load = widgets.Button( - description="Load data", - tooltip="Enter the path to the desired data file you wish to " - "load. Accepted filetypes are .csv (comma), .txt (tab), .h5ad " - "(AnnData), and .p (pickled SAM dictionary).", - disabled=False, - ) - load.on_click(self.load_data) - load_data = widgets.Text(value="", layout={"width": "100%%"}) - - loada = widgets.Button( - description="Load annotations", - tooltip="Enter the path to the desired annotations file you wish to " - "load. Accepted filetypes are .csv (comma) or .txt (tab).", - disabled=False, - ) - loada.on_click(self.load_ann) - load_dataa = widgets.Text(value="", layout={"width": "100%%"}) - - loadv = widgets.Button( - description="Load gene annotations", - tooltip="Enter the path to the desired gene annotations file " - "you wish to load. Accepted filetypes are .csv (comma) or .txt (tab).", - disabled=False, - ) - loadv.on_click(self.load_vann) - load_datav = widgets.Text(value="", layout={"width": "100%%"}) - - self.ps_dict = {} - self.ps_dict["PDATA"] = pdata - self.ps_dict["DFTS"] = dfts - self.ps_dict["FGENES"] = fgenes - self.ps_dict["NORM"] = norm - self.ps_dict["SUMNORM"] = sumnorm - self.ps_dict["L1"] = l1 - self.ps_dict["EXPR_THR"] = expr_thr - self.ps_dict["L2"] = l2 - self.ps_dict["MIN_EXPR"] = min_expr - self.ps_dict["LOAD"] = load - self.ps_dict["LOAD_DATA"] = load_data - self.ps_dict["LOADA"] = loada - self.ps_dict["LOAD_DATAA"] = load_dataa - self.ps_dict["LOADV"] = loadv - self.ps_dict["LOAD_DATAV"] = load_datav - - pp = widgets.VBox( - [ pdata, - widgets.HBox([dfts, fgenes]), - norm, - sumnorm, - widgets.HBox([l1, expr_thr]), - widgets.HBox([l2, min_expr]), - widgets.HBox([load, load_data]), - widgets.HBox([loada, load_dataa]), - widgets.HBox([loadv, load_datav]), - ], - layout={"height": "95%", "width": "100%"}, - ) - return pp - - def load_ann(self, 
def load_vann(self, event):
    """Load a gene ('var') annotation file into every open SAM tab.

    The path is read from the LOAD_DATAV text box; failures are reported
    in the output pane rather than raised (best-effort UI behavior).
    """
    path = self.ps_dict["LOAD_DATAV"].value
    self.log('Loading gene annotation file from {}'.format(path))
    try:
        for i in range(len(self.stab.children)):
            self.sams[i].load_var_annotations(path)
            self.update_dropdowns(i)
    except Exception:
        # Fix: was a bare `except:` -- no longer traps KeyboardInterrupt,
        # but still reports any load/parse failure to the user.
        with self.out:
            print("Annotation file not found or was improperly formatted.")

def init_load(self):
    """Build the initial 'load data' button + path box shown before a SAM is loaded."""
    load = widgets.Button(
        description="Load data",
        tooltip="Enter the path to the desired data file you wish to "
        "load. Accepted filetypes are .csv (comma), .txt (tab), .h5ad "
        "(AnnData), and .p (pickled SAM dictionary).",
        disabled=False,
    )
    load.on_click(self.load_data)
    load_data = widgets.Text(value="", layout={"width": "100%"})
    return widgets.HBox([load, load_data], layout={"width": "50%"})
def me_update(self, val):
    """Observer: record the new min_expression preprocessing value."""
    self.log('Setting min_expression to be {}'.format(val['new']))
    self.preprocess_args["min_expression"] = val["new"]

def et_update(self, val):
    """Observer: record the new expression-threshold preprocessing value."""
    self.log('Setting the threshold value to be {}'.format(val['new']))
    self.preprocess_args["thresh"] = val["new"]

def sumnorm_submit(self, txt):
    """Observer: record the library-normalization choice ('None' maps to None)."""
    self.log('Setting sum_norm value to be {}'.format(txt['new']))
    self.preprocess_args["sum_norm"] = None if txt["new"] == "None" else txt["new"]

def set_pp_defaults(self, event):
    """Reset the preprocessing widgets and args to their initial values."""
    self.log('Setting preprocessing parameters to defaults.')
    self.preprocess_args = self.preprocess_args_init.copy()
    fgenes = self.ps_dict["FGENES"]  # checkbox
    sumnorm = self.ps_dict["SUMNORM"]
    norm = self.ps_dict["NORM"]
    expr_thr = self.ps_dict["EXPR_THR"]  # expr_threshold
    min_expr = self.ps_dict["MIN_EXPR"]  # min_expr

    min_expr.set_trait("value", float(self.preprocess_args.get("min_expression", 1)))
    expr_thr.set_trait("value", float(self.preprocess_args.get("thresh", 0.01)))

    init = self.preprocess_args.get("sum_norm", "None")
    if isinstance(init, np.ndarray):
        init = init[0]
    sumnorm.set_trait("value", "None" if init is None else init)

    init = self.preprocess_args.get("norm", "log")
    if isinstance(init, np.ndarray):
        init = init[0]
    norm.set_trait("value", "None" if init is None else init)

    init = bool(self.preprocess_args.get("filter_genes", True))
    # Fix: the checkbox was unconditionally reset to literal True even when
    # the stored default was False (set_run_defaults uses `init` -- be
    # consistent and reflect the actual default).
    fgenes.set_trait("value", init)
    self.preprocess_args["filter_genes"] = init
def pp_filtergenes(self, event):
    """Observer: flip the filter_genes preprocessing flag."""
    current = bool(self.preprocess_args.get("filter_genes", True))
    self.log('Setting filter_genes parameter to be {}'.format(not current))
    self.preprocess_args["filter_genes"] = not current

def preprocess_sam(self, event):
    """Run preprocessing on the SAM object of the currently selected tab."""
    tab_idx = self.stab.selected_index
    self.log('Preprocessing SAM object in tab {}'.format(tab_idx))
    self.sams[tab_idx].preprocess_data(**self.preprocess_args)
Enter the title of the new tab in the text box on the right", - icon="", - ) - runb.on_click(self.subcluster) - title = widgets.Text(value="",) - wpca = widgets.Checkbox( - value=bool(self.run_args.get("weight_PCs", True)), description="Weight PCs" - ) - wpca.observe(self.weightpcs) - - - spca = widgets.Checkbox( - value=bool(self.run_args.get("sparse_pca", True)), description="Sparse PCA" - ) - spca.observe(self.sparsepca) - - - dfts = widgets.Button( - description="Set defaults", - disabled=False, - tooltip="Set default options", - icon="", - ) - dfts.on_click(self.set_run_defaults) - - l1 = widgets.Button( - description="# genes:", - disabled=True, - tooltip="The number of highest-weighted genes to select each iteration of SAM.", - icon="", - ) - init = self.run_args.get("n_genes", 3000) - if isinstance(init, np.ndarray): - init = init[0] - if init is None: - init = 3000 - ngenes = widgets.FloatSlider( - value=init, - min=min(100, self.sams[0].adata.shape[1]), - max=self.sams[0].adata.shape[1], - step=100, - disabled=False, - continuous_update=False, - orientation="horizontal", - readout=True, - readout_format="5d", - style={"description_width": "initial"}, - ) - ngenes.observe(self.ngenes_update, names="value") - - l2 = widgets.Button( - description="# PCs:", - disabled=True, - tooltip="The number of principal components to select in PCA.", - icon="", - ) - init = self.run_args.get("npcs", 150) - if isinstance(init, np.ndarray): - init = init[0] - if init is None: - init = 150 - npcs = widgets.FloatSlider( - value=init, - min=10, - max=500, - step=1, - disabled=False, - continuous_update=False, - orientation="horizontal", - readout=True, - readout_format="d", - style={"description_width": "initial"}, - ) - npcs.observe(self.npcs_update, names="value") - - l3 = widgets.Button( - description="# neighbors:", - disabled=True, - tooltip="The number of nearest neighbors to identify for each cell.", - icon="", - ) - init = self.run_args.get("k", 20) - if 
isinstance(init, np.ndarray): - init = init[0] - if init is None: - init = 20 - knn = widgets.FloatSlider( - value=init, - min=5, - max=200, - step=1, - disabled=False, - continuous_update=False, - orientation="horizontal", - readout=True, - readout_format="d", - style={"description_width": "initial"}, - ) - knn.observe(self.knn_update, names="value") - - l4 = widgets.Button( - description="num_norm_avg:", - disabled=True, - tooltip="The top 'num_norm_avg' dispersions are averaged to determine the " - "normalization factor when calculating the weights. This prevents " - "genes with large spatial dispersions from skewing the distribution " - "of weights.", - icon="", - ) - init = self.run_args.get("num_norm_avg", 50) - if isinstance(init, np.ndarray): - init = init[0] - if init is None: - init = 50 - nna = widgets.FloatSlider( - value=init, - min=1, - max=200, - step=1, - disabled=False, - continuous_update=False, - orientation="horizontal", - readout=True, - readout_format="d", - style={"description_width": "initial"}, - ) - nna.observe(self.nna_update, names="value") - - init = self.run_args.get("preprocessing", "Normalizer") - if isinstance(init, np.ndarray): - init = init[0] - if init is None: - init = "None" - norm = widgets.Dropdown( - options=["StandardScaler", "Normalizer", "None"], - value=init, - description="Preprocessing:", - disabled=False, - style={"description_width": "initial"}, - ) - norm.observe(self.rnorm_update, "value") - - init = self.run_args.get("distance", "correlation") - if isinstance(init, np.ndarray): - init = init[0] - if init is None: - init = "correlation" - distance = widgets.Dropdown( - options=["correlation", "euclidean", "cosine"], - value=init, - description="Distance:", - disabled=False, - style={"description_width": "initial"}, - ) - distance.observe(self.dist_update, "value") - - init = self.run_args.get("projection", "umap") - if isinstance(init, np.ndarray): - init = init[0] - if init is None: - init = "umap" - proj = 
def set_run_defaults(self, event):
    """Reset the SAM-run widgets and run args to their initial values."""
    self.log('Setting run parameters to defaults.')
    self.run_args = self.run_args_init.copy()
    wpca = self.rb_dict["WPCA"]
    ngenes = self.rb_dict["NGENES"]
    npcs = self.rb_dict["NPCS"]
    knn = self.rb_dict["KNN"]
    nna = self.rb_dict["NNA"]
    rnorm = self.rb_dict["NORM"]
    dist = self.rb_dict["DISTANCE"]
    proj = self.rb_dict["PROJ"]

    nna.set_trait("value", self.run_args.get("num_norm_avg", 50))
    knn.set_trait("value", self.run_args.get("k", 20))

    init = self.run_args.get("npcs", 150)
    npcs.set_trait("value", 150 if init is None else init)

    init = self.run_args.get("n_genes", 3000)
    ngenes.set_trait("value", 3000 if init is None else init)

    init = self.run_args.get("preprocessing", "Normalizer")
    # Fix: a None default was mapped to "", which is not one of the
    # dropdown's options ("StandardScaler"/"Normalizer"/"None") and would
    # raise a TraitError; map it to the "None" option instead.
    rnorm.set_trait("value", "None" if init is None else init)
    dist.set_trait("value", self.run_args.get("distance", "correlation"))
    proj.set_trait("value", self.run_args.get("projection", "umap"))

    init = bool(self.run_args.get("weight_PCs", True))
    wpca.set_trait("value", init)
    self.run_args["weight_PCs"] = init

def sam_weights(self, event):
    """Rank genes of the active tab by SAM weights and refresh the display."""
    self.log('Setting gene rankings to be ordered by the SAM weights.')
    s = self.sams[self.stab.selected_index]
    self.marker_genes[self.stab.selected_index] = np.array(
        list(s.adata.var_names[np.argsort(-s.adata.var["weights"].values)])
    )
    self.marker_genes_tt[self.stab.selected_index] = "Genes ranked by SAM weights."
    self.cs_dict["LGENES"].set_trait(
        "tooltip", self.marker_genes_tt[self.stab.selected_index]
    )
    # Resetting the slider triggers a redraw; if already at 0, redraw manually.
    if self.cs_dict["RGENES"].value != 0:
        self.cs_dict["RGENES"].set_trait("value", 0)
    else:
        self.show_expression(str(self.marker_genes[self.stab.selected_index][0]))

def weightpcs(self, event):
    """Observer: flip the weight_PCs run flag."""
    t = self.run_args.get("weight_PCs", True)
    self.log('Setting weight_PCs to be {}'.format(not t))
    self.run_args["weight_PCs"] = not t

def sparsepca(self, event):
    """Observer: flip the sparse_pca run flag."""
    t = self.run_args.get("sparse_pca", True)
    self.log('Setting sparse_pca to be {}'.format(not t))
    self.run_args["sparse_pca"] = not t

def npcs_update(self, val):
    """Observer: record the number of principal components (as int)."""
    self.log('Setting npcs to be {}'.format(int(val['new'])))
    self.run_args["npcs"] = int(val["new"])

def nna_update(self, val):
    """Observer: record num_norm_avg (as int)."""
    self.log('Setting num_norm_avg to be {}'.format(int(val['new'])))
    self.run_args["num_norm_avg"] = int(val["new"])

def knn_update(self, val):
    """Observer: record the number of nearest neighbors k (as int)."""
    self.log('Setting k to be {}'.format(int(val['new'])))
    self.run_args["k"] = int(val["new"])
self.run_args["n_genes"] = int(val["new"]) - - def rnorm_update(self, txt): - self.log('Setting preprocessing to be {}'.format(txt['new'])) - if txt["new"] == "None": - self.run_args["preprocessing"] = None - else: - self.run_args["preprocessing"] = txt["new"] - - def proj_update(self, txt): - self.log('Setting projection to be {}'.format(txt['new'])) - self.run_args["projection"] = txt["new"] - - def dist_update(self, txt): - self.log('Setting distance to be {}'.format(txt['new'])) - self.run_args["distance"] = txt["new"] - - def subcluster(self, event): - - execute = False - i = self.stab.selected_index - - selected = self.selected[i] - selected_cells = self.selected_cells[i] - - sam = self.sams[i] - if not np.all(selected) and selected.sum() > 0: - sam_subcluster = SAM(counts=sam.adata[selected_cells, :].copy()) - sam_subcluster.adata_raw = sam.adata_raw[selected_cells].copy() - - sam_subcluster.adata_raw.obs = sam_subcluster.adata.obs.copy() - sam_subcluster.adata_raw.obsm = sam_subcluster.adata.obsm.copy() - - #sam_subcluster.preprocess_data(**self.preprocess_args) - - self.out.clear_output() - with self.out: - sam_subcluster.run(**self.run_args) - self.sams.append(sam_subcluster) - self.selected.append(np.ones(sam_subcluster.adata.shape[0]).astype("bool")) - self.selected_cells.append(np.array(list(sam_subcluster.adata.obs_names))) - self.active_labels.append(np.zeros(sam_subcluster.adata.shape[0])) - self.dd_opts.append(["Toggle cluster"]) - self.gene_expressions.append(np.zeros(sam_subcluster.adata.shape[0])) - self.marker_genes.append( - np.array(list(sam_subcluster.adata.var_names))[ - np.argsort(-sam_subcluster.adata.var["weights"].values) - ] - ) - self.marker_genes_tt.append("Genes ranked by SAM weights.") - self.ds.append(0) - i = len(self.sams) - 1 - execute = True - - elif np.all(selected): - sam.preprocess_data(**self.preprocess_args) - - self.out.clear_output() - with self.out: - sam.run(**self.run_args) - self.marker_genes[i] = 
np.array(list(sam.adata.var_names))[ - np.argsort(-sam.adata.var["weights"].values) - ] - self.marker_genes_tt[i] = "Genes ranked by SAM weights." - execute = True - self.current_sam = sam - - if execute: - self.log(('Subclustering SAM object in tab {}'.format(i),selected_cells)) - title = self.rb_dict["TITLE"].value - if title == "" and i == 0: - title = "Full dataset" - elif title == "": - title = "Subcluster " + str(i) - self.create_plot(i, title) - try: - self.titles[i] = title - except: - self.titles.append(title) - - self.update_dropdowns(i) - - """ END RUN INIT""" - - """ BEGIN CS INIT""" - - def init_cs(self): - initl = list(self.sams[0].adata.obsm.keys()) - initl = [""] + initl - if self.default_proj in initl: - init = self.default_proj - else: - init = initl[0] - - dpm = widgets.Dropdown( - options=initl, value=init, description="", layout={"width": "60%"} - ) - dp = widgets.Button( - description="Display projection", - tooltip="Display the 2D projection selected in the dropdown menu to the right.", - layout={"width": "40%"}, - ) - dp.on_click(self.display_projection) - - initl = list(self.sams[0].adata.obs.keys()) - initl = [""] + initl - dam = widgets.Dropdown( - options=initl, value=initl[0], description="", layout={"width": "60%"} - ) - da = widgets.Button( - description="Display cell ann.", - tooltip="Overlay the annotations selected in the dropdown menu to the right.", - layout={"width": "40%"}, - ) - da.on_click(self.display_annotation) - - dv = widgets.Button( - description="Display gene ann.", - tooltip="Include the 'var' annotation for the corresponding gene in the title of the gene expression plots.", - layout={"width": "40%"}, - ) - dv.on_click(self.display_var_annotation) - - initl = list(self.sams[0].adata.var.keys()) - initl = [""] + initl - dvm = widgets.Dropdown( - options=initl, value=initl[0], description="", layout={"width": "60%"} - ) - - irm = widgets.Button( - description="Find markers (RF)", - tooltip="Identify marker genes of 
selected cells using a random forest classifier.", - layout={"width": "34%"}, - ) - irm.on_click(self.irm_genes) - ism = widgets.Button( - description="Find markers (SW)", - tooltip="Identify marker genes of selected cells by finding genes with large SAM weights among the selected cells.", - layout={"width": "33%"}, - ) - ism.on_click(self.ism_genes) - - sws = widgets.Button( - description="SAM ranking", - tooltip="Ranks genes by SAM weights.", - layout={"width": "33%"}, - ) - sws.on_click(self.sam_weights) - - us = widgets.Button( - description="Unselect all (x)", - tooltip="Unselect all cells. Pressing 'x' on the keyboard while hovering over the scatter plot will do the same thing.", - layout={"width": "34%"}, - ) - usa = widgets.Button( - description="Select all (c)", - tooltip="Select all cells. Pressing 'c' on the keyboard while hovering over the scatter plot will do the same thing.", - layout={"width": "33%"}, - ) - res = widgets.Button( - description="Reset view (v)", - tooltip="Resets the current plot to the defautl view. Pressing 'v' on the keyboard while hovering over the scatter plot" - " will do the same thing.", - layout={"width": "33%"}, - ) - us.on_click(self.unselect_all) - usa.on_click(self.select_all) - res.on_click(self.reset_view) - - avg = widgets.Checkbox( - indent=False, value=True, description="Avg expr", layout={"width": "20%"} - ) - - log = widgets.Checkbox( - indent=False, value=False, description="Log cbar", layout={"width": "20%"} - ) - - show_colorbar = widgets.Checkbox( - indent=False, value=False, description="Show cbar", layout={"width": "20%"} - ) - lann = widgets.Button( - description="Annotate", - tooltip=( - "Enter the key of the 'obs' annotation vector you wish to modify." - " If the key does not exist, a new vector in 'obs' will be created. Enter the" - " label you want to annotate the selected cells with in the right." 
- ), - disabled=True, - layout={"width": "20%"}, - ) - anno_name = widgets.Text( - value="", placeholder="Annotation column name", layout={"width": "40%"} - ) - anno = widgets.Text(value="", placeholder="Annotation", layout={"width": "40%"}) - anno.on_submit(self.annotate_pop) - - lgsm = widgets.Button( - description="Get similar genes", - tooltip="Rank genes in order of decreasing spatially averaged correlation with respect to the input gene.", - disabled=True, - layout={"width": "30%"}, - ) - gsm = widgets.Text( - value="", - placeholder=self.sams[0].adata.var_names[0], - layout={"width": "50%"}, - ) - gsm.on_submit(self.get_similar_genes) - lshg = widgets.Button( - description="Show expressions", - tooltip="Overlay the input gene expressions over the scatter plot. Only a substring of the gene need be provided.", - disabled=True, - layout={"width": "30%"}, - ) - shg = widgets.Text( - value="", - placeholder=self.sams[0].adata.var_names[0], - layout={"width": "50%"}, - ) - shg.on_submit(self.show_expression) - - lgenes = widgets.Button( - description="Ranked genes", - tooltip="Genes ranked by SAM weights.", - disabled=True, - layout={"width": "30%"}, - ) - rgenes = widgets.IntSlider( - value=0, - min=0, - max=200, - step=1, - disabled=False, - continuous_update=False, - orientation="horizontal", - readout=True, - description="", - style={"description_width": "initial"}, - layout={"width": "70%"}, - ) - rgenes.observe(self.gene_update, "value") - - lthr = widgets.Button( - description="Threshold expr", - tooltip="Select genes with greater than the chosen expression threshold.", - disabled=True, - layout={"width": "30%"}, - ) - thr = widgets.FloatSlider( - value=0, - min=0, - max=0, - step=0.05, - disabled=False, - continuous_update=False, - orientation="horizontal", - readout=True, - description="", - style={"description_width": "initial"}, - layout={"width": "70%"}, - ) - thr.observe(self.threshold_selection, "value") - - lsf = widgets.Button( - 
description="Save", - tooltip="Save the current SAM object. Filenames should end with .h5ad.", - disabled=True, - layout={"width": "30%"}, - ) - - sf = widgets.Text(value="", layout={"width": "70%"}) - sf.on_submit(self.save_data) - - close = widgets.Button( - description="Close current tab", - tooltip="Closes the currently open subclustering tab.", - disabled=False, - ) # layout={'width':'30%'}) - close.on_click(self.close_tab) - - hotkeys = widgets.Button( - description="ReadMe", - tooltip="""While hovering over the scatter plot, the following keyboard inputs are available: - - Left/Right arrows: Scroll through & display ranked genes - - Shift: Random Forest classifier marker gene identification of selected cells - - Enter: SAM weights-based approach to marker gene identification of selected cells - - x: Unselect all cells - - c: Select all cells - - v: Reset view - - a: Toggle expression spatial averaging ON/OFF - - Also note that clicking a cell will select/unselect all cells sharing its label.""", - disabled=True, - ) # layout={'width':'30%'}) - - amslider = widgets.Button( - description="Opacity", - tooltip="Changes the opacities of unselected points in the current plot.", - disabled=True, - layout={"width": "30%"}, - ) - aslider = widgets.FloatSlider( - value=0.05, - min=0.0, - max=1, - step=0.01, - disabled=False, - continuous_update=False, - orientation="horizontal", - readout=True, - readout_format="2f", - description="", - style={"description_width": "initial"}, - layout={"width": "70%"}, - ) - aslider.observe(self.change_alpha, "value") - - lmslider = widgets.Button( - description="Marker size", - tooltip="Changes the marker sizes in the current plot.", - disabled=True, - layout={"width": "30%"}, - ) - mslider = widgets.FloatSlider( - value=5, - min=0.1, - max=20, - step=0.1, - disabled=False, - continuous_update=False, - orientation="horizontal", - readout=True, - readout_format="2f", - description="", - style={"description_width": "initial"}, - 
layout={"width": "70%"}, - ) - mslider.observe(self.change_msize, "value") - - acc = widgets.Dropdown(value="Toggle cluster", options=["Toggle cluster"]) - acc.observe(self.pick_cells_dd, "value") - - self.cs_dict = {} - self.cs_dict["DP"] = dp - self.cs_dict["DPM"] = dpm - self.cs_dict["DA"] = da - self.cs_dict["DAM"] = dam - self.cs_dict["ACC"] = acc - self.cs_dict["DV"] = dv - self.cs_dict["DVM"] = dvm - self.cs_dict["IRM"] = irm - self.cs_dict["ISM"] = ism - self.cs_dict["SWS"] = sws - self.cs_dict["US"] = us - self.cs_dict["USA"] = usa - self.cs_dict["RES"] = res - self.cs_dict["LANN"] = lann - self.cs_dict["ANNO_NAME"] = anno_name - self.cs_dict["ANNO"] = anno - self.cs_dict["SHOW_COLORBAR"] = show_colorbar - self.cs_dict["LGSM"] = lgsm - self.cs_dict["GSM"] = gsm - self.cs_dict["AVG"] = avg - self.cs_dict["LSHG"] = lshg - self.cs_dict["SHG"] = shg - self.cs_dict["LOG"] = log - self.cs_dict["LGENES"] = lgenes - self.cs_dict["RGENES"] = rgenes - self.cs_dict["LTHR"] = lthr - self.cs_dict["THR"] = thr - self.cs_dict["LSF"] = lsf - self.cs_dict["SF"] = sf - self.cs_dict["AMSLIDER"] = amslider - self.cs_dict["ASLIDER"] = aslider - self.cs_dict["LMSLIDER"] = lmslider - self.cs_dict["MSLIDER"] = mslider - self.cs_dict["CLOSE"] = close - self.cs_dict["HOTKEYS"] = hotkeys - - return widgets.VBox( - [ - widgets.HBox([dp, dpm]), - widgets.HBox([da, dam, acc]), - widgets.HBox([dv, dvm]), - widgets.HBox([irm, ism, sws]), - widgets.HBox([us, usa, res]), - widgets.HBox([lann, anno_name, anno,]), - widgets.HBox([lgsm, gsm,]), - widgets.HBox([lshg, shg,]), - widgets.HBox([lgenes, rgenes]), - widgets.HBox([lthr, thr]), - widgets.HBox([amslider, aslider]), - widgets.HBox([show_colorbar, avg, log]), - widgets.HBox([lmslider, mslider]), - widgets.HBox([lsf, sf]), - widgets.HBox([close, hotkeys]), - ], - layout={"height": "95%", "width": "100%"}, - ) - - def reset_view(self, event): - i = self.stab.selected_index - self.log('Resetting view for tab {}'.format(i)) - 
self.create_plot(i, self.stab.get_title(i)) - self.marker_genes[i] = np.array(list(self.sams[i].adata.var_names))[ - np.argsort(-self.sams[i].adata.var["weights"].values) - ] - self.marker_genes_tt[i] = "Genes ranked by SAM weights." - self.cs_dict["LGENES"].set_trait("tooltip", self.marker_genes_tt[i]) - - def save_data(self, path): - path = path.value - self.log('Saving data from tab {} to {}'.format(self.stab.selected_index,path)) - if path != "": - if path.split(".")[-1] == "h5ad": - s = self.sams[self.stab.selected_index] - s.save_anndata(path) - elif ( - path.split(".")[-1] == "png" - or path.split(".")[-1] == "pdf" - or path.split(".")[-1] == "eps" - or path.split(".")[-1] == "jpg" - ): - if len(path.split("/")) > 1: - ut.create_folder("/".join(path.split("/")[:-1])) - self.stab.children[self.stab.selected_index].write_image(path) - - def show_expression(self, text): - if type(text) is not str: - gene = text.value - else: - gene = text - - s = self.sams[self.stab.selected_index] - if gene != "": - - try: - if not (gene in s.adata.var_names): - genes = ut.search_string( - np.array(list(s.adata.var_names)), gene, case_sensitive=True - )[0] - if genes is not -1: - gene = genes[0] - - if self.cs_dict["AVG"].value: - if "X_knn_avg" not in s.adata.layers.keys(): - s.dispersion_ranking_NN(save_avgs=True) - - x = s.adata[:, gene].layers["X_knn_avg"] - if sp.issparse(x): - a = x.A.flatten() - else: - a = x.flatten() - else: - x = s.adata[:, gene][s.adata.obs_names, :].X - if sp.issparse(x): - a = x.A.flatten() - else: - a = x.flatten() - - if self.GENE_KEY != "": - title = gene + "; " + str(s.adata.var[self.GENE_KEY].T[gene]) - else: - title = gene - - # self.select_all(None) - self.update_colors_expr(a, title) - self.log('Showing gene expression values for {} in tab {}'.format(title,self.stab.selected_index)) - except: - if self.cs_dict["AVG"].value: - if not (gene in s.adata.var_names): - with self.out: - print( - "X_knn_avg does not exist. 
Either run" - " sam.dispersion_ranking_NN() or uncheck" - " `avg` in the control panel." - ) - elif not (gene in s.adata.var_names): - with self.out: - print( - "Gene not found. Check your spelling and " "capitalization." - ) - - def get_similar_genes(self, txt): - gene = txt.value - if gene != "": - s = self.sams[self.stab.selected_index] - if "X_knn_avg" not in s.adata.layers.keys(): - s.dispersion_ranking_NN() - - genes = ut.search_string( - np.array(list(s.adata.var_names)), gene, case_sensitive=True - )[0] - if genes is not -1: - gene = genes[0] - else: - return - # quit - - markers = ut.find_corr_genes(s, gene).flatten() - _, i = np.unique(markers, return_index=True) - markers = markers[np.sort(i)] - self.marker_genes[self.stab.selected_index] = markers - - self.cs_dict["RGENES"].set_trait("value", 0) - self.marker_genes_tt[self.stab.selected_index] = ( - "Ranked genes from most to least spatially correlated with " - + gene - + "." - ) - self.cs_dict["LGENES"].set_trait( - "tooltip", self.marker_genes_tt[self.stab.selected_index] - ) - self.log('Getting genes with similar expression patterns to {}'.format(gene)) - self.show_expression(str(gene)) - - - def annotate_pop(self, text): - text = text.value - text_name = self.cs_dict["ANNO_NAME"].value - selected = self.selected[self.stab.selected_index] - selected_cells = self.selected_cells[self.stab.selected_index] - if text != "" and text_name != "" and selected.sum() != selected.size: - self.log(('Annotating cells with the {} label in the {} key.'.format(text,text_name),selected_cells)) - for it, s in enumerate(self.sams): - try: - x1 = np.array(list(s.adata.obs_names)) - - if text_name in list(s.adata.obs.keys()): - a = s.get_labels(text_name).copy().astype(" 0: - self.log(('Calculating marker genes using SW method in tab {}'.format(self.stab.selected_index),self.selected_cells[self.stab.selected_index])) - - if "X_knn_avg" not in s.adata.layers.keys(): - s.dispersion_ranking_NN(save_avgs=True) - l = 
s.adata.layers["X_knn_avg"] - m = l.mean(0).A.flatten() - ms = l[selected, :].mean(0).A.flatten() - lsub = l[selected, :] - lsub.data[:] = lsub.data ** 2 - ms2 = lsub.mean(0).A.flatten() - v = ms2 - 2 * ms * m + m ** 2 - wmu = np.zeros(v.size) - wmu[m > 0] = v[m > 0] / m[m > 0] - - self.marker_genes[self.stab.selected_index] = np.array( - list(s.adata.var_names[np.argsort(-wmu)]) - ) - if self.cs_dict["RGENES"].value != 0: - self.cs_dict["RGENES"].set_trait("value", 0) - else: - self.show_expression( - str(self.marker_genes[self.stab.selected_index][0]) - ) - self.marker_genes_tt[ - self.stab.selected_index - ] = "Ranked genes according to spatial SAM weights among selected cells." - self.cs_dict["LGENES"].set_trait( - "tooltip", self.marker_genes_tt[self.stab.selected_index] - ) - - def irm_genes(self, event): - selected = self.selected[self.stab.selected_index] - s = self.sams[self.stab.selected_index] - if not np.all(selected) and selected.sum() > 0: - self.log(('Calculating marker genes using RF method in tab {}'.format(self.stab.selected_index),self.selected_cells[self.stab.selected_index])) - a = np.zeros(s.adata.shape[0]) - a[selected] = 1 - markers, _ = s.identify_marker_genes_rf(labels=a, clusters=1) - self.marker_genes[self.stab.selected_index] = markers[1] - if self.cs_dict["RGENES"].value != 0: - self.cs_dict["RGENES"].set_trait("value", 0) - else: - self.show_expression( - str(self.marker_genes[self.stab.selected_index][0]) - ) - self.marker_genes_tt[ - self.stab.selected_index - ] = "Ranked genes according to random forest classification." 
- self.cs_dict["LGENES"].set_trait( - "tooltip", self.marker_genes_tt[self.stab.selected_index] - ) - - def gene_update(self, val): - val = val["new"] - markers = self.marker_genes[self.stab.selected_index] - - if int(val) < markers.size: - gene = markers[int(val)] - else: - gene = markers[-1] - self.cs_dict["SHG"].set_trait("value", gene) - self.show_expression(str(gene)) - - def update_dropdowns(self, i): - s = self.sams[i] - self.cs_dict["DPM"].options = [""] + list(s.adata.obsm.keys()) - self.cs_dict["DAM"].options = [""] + list(s.adata.obs.keys()) - self.cs_dict["DVM"].options = [""] + list(s.adata.var.keys()) - self.cs_dict["ACC"].options = self.dd_opts[i] - - def on_switch_tabs(self, event): - self.log('Switching to tab {}'.format(self.stab.selected_index)) - self.update_dropdowns(self.stab.selected_index) - self.cs_dict["LGENES"].set_trait( - "tooltip", self.marker_genes_tt[self.stab.selected_index] - ) - self.current_tab = self.stab.selected_index - self.current_sam = self.sams[self.current_tab] - for i in range(len(self.children)): - self.tab.set_title( - i, self.names[i] + " (" + str(self.stab.selected_index) + ")" - ) - - x = self.stab.children[self.stab.selected_index] - f = x.layout.xaxis.range - x.layout.xaxis.range=(-1,1) - x.layout.xaxis.range=f - - f = x.layout.yaxis.range - x.layout.yaxis.range=(-1,1) - x.layout.yaxis.range=f - - - - def display_var_annotation(self, event): - self.GENE_KEY = self.cs_dict["DVM"].value - self.log('Setting the var annotations to be from key {}'.format(self.GENE_KEY)) - - def display_annotation(self, event): - key = self.cs_dict["DAM"].value - self.log('Displaying cell annotations from obs key {}'.format(key)) - if key != "": - labels = np.array(list(self.sams[self.stab.selected_index].get_labels(key))) - - self.active_labels[self.stab.selected_index] = labels - self.update_colors_anno(labels) - - def update_colors_expr(self, a, title): - if self.cs_dict["LOG"].value: - a = np.log2(a + 1) - if 
self.cs_dict["SHOW_COLORBAR"].value: - showscale = True - else: - showscale = False - self.gene_expressions[self.stab.selected_index] = a - - f1 = self.stab.children[self.stab.selected_index] - f1.update_traces( - marker=dict( - color=a, - colorscale="spectral", - reversescale=True, - showscale=showscale, - colorbar_ticks="outside", - colorbar_tickmode="auto", - colorbar_title="", - opacity=1, - ) - ) - - f1.update_layout(hovermode="closest", title=title) - self.stab.children[self.stab.selected_index].data[0].text = list(a) - self.stab.children[self.stab.selected_index].data[0].hoverinfo = "text" - - slider = self.cs_dict["THR"] - slider.set_trait("min", 0) - slider.set_trait("max", a.max() + (a.max() - a.min()) / 100) - slider.set_trait("step", (a.max() - a.min()) / 100) - slider.set_trait("value", 0) - - def update_colors_anno(self, labels): - nlabels = np.unique(labels).size - title = self.cs_dict["DAM"].value.split("_clusters")[0] - - if issubclass(labels.dtype.type, np.number): - if issubclass(labels.dtype.type, np.float): - self.update_colors_expr(labels, title) - return - - if nlabels == 1: - x = "spectral" - elif nlabels <= 2: - x = cl.scales["3"]["qual"]["Paired"][:nlabels] - elif nlabels <= 11: - nlabels = str(nlabels) - x = cl.scales[nlabels]["qual"]["Paired"] - else: - nlabels = "11" - x = cl.scales[nlabels]["qual"]["Paired"] - x = cl.interp(x, int(nlabels)) - - lbls, inv = np.unique(labels, return_inverse=True) - self.dd_opts[self.stab.selected_index] = ["Toggle cluster"] + list(lbls) - self.cs_dict["ACC"].options = self.dd_opts[self.stab.selected_index] - - if issubclass(labels.dtype.type, np.character): - tickvals = np.arange(lbls.size) - ticktext = list(lbls) - else: - idx = np.round(np.linspace(0, len(lbls) - 1, 6)).astype(int) - tickvals = list(idx) - ticktext = lbls[tickvals] - - if self.cs_dict["SHOW_COLORBAR"].value: - showscale = True - else: - showscale = False - - f1 = self.stab.children[self.stab.selected_index] - f1.update_traces( - 
marker=dict( - color=inv, - colorscale=x, - colorbar_ticks="outside", - colorbar_tickmode="array", - colorbar_title="", - colorbar_tickvals=tickvals, - showscale=showscale, - colorbar_ticktext=ticktext, - opacity=1, - ) - ) - self.stab.children[self.stab.selected_index].layout.title = title - self.stab.children[self.stab.selected_index].data[0].text = list(lbls[inv]) - self.stab.children[self.stab.selected_index].data[0].hoverinfo = "text" - self.stab.children[self.stab.selected_index].layout.hovermode = "closest" - - slider = self.cs_dict["THR"] - slider.set_trait("min", slider.value) - slider.set_trait("max", slider.value) - - def threshold_selection(self, val): - val = val["new"] - self.log('Setting gene expression selection threshold to be {}'.format(val)) - s = self.sams[self.stab.selected_index] - - self.selected[self.stab.selected_index][:] = False - self.selected[self.stab.selected_index][ - self.gene_expressions[self.stab.selected_index] >= val - ] = True - self.selected_cells[self.stab.selected_index] = np.array( - list(s.adata.obs_names) - )[self.selected[self.stab.selected_index]] - self.stab.children[self.stab.selected_index].data[0].selectedpoints = np.where( - self.selected[self.stab.selected_index] - )[0] - - def rewire_cluster_slider(self, event): - x = self.rb_dict["CSLIDER"] - l = self.rb_dict["L"] - val = event["new"] - if val == "Kmeans cluster": - x.set_trait("min", 2) - x.set_trait("max", 200) - x.set_trait("value", 6) - x.set_trait("step", 1) - l.set_trait("value", "Kmeans 'k'") - - elif val == "Louvain cluster": - x.set_trait("min", 0.1) - x.set_trait("max", 10) - x.set_trait("value", 1) - x.set_trait("step", 0.1) - l.set_trait("value", "Louvain 'res'") - - elif val == "Density cluster": - x.set_trait("min", 0.1) - x.set_trait("max", 2) - x.set_trait("value", 0.5) - x.set_trait("step", 0.01) - l.set_trait("value", "Density 'eps'") - - elif val == "Hdbscan cluster": - l.set_trait("value", "Hdbscan 'N/A'") - - elif val == "Leiden cluster": - 
x.set_trait("min", 0.1) - x.set_trait("max", 10) - x.set_trait("value", 1) - x.set_trait("step", 0.1) - l.set_trait("value", "Leiden 'res'") - - def cluster_data(self, event): - s = self.sams[self.stab.selected_index] - val = self.rb_dict["CLM"].value - eps = self.rb_dict["CSLIDER"].value - self.log('Clustering data using the {} method with parameter {} in tab {}'.format(val,eps,self.stab.selected_index)) - if val == "Kmeans cluster": - s.kmeans_clustering(int(eps)) - - elif val == "Louvain cluster": - s.louvain_clustering(res=eps) - - elif val == "Density cluster": - s.density_clustering(eps=eps) - - elif val == "Hdbscan cluster": - s.hdbknn_clustering(k=int(eps)) - - elif val == "Leiden cluster": - s.leiden_clustering(res=eps) - - self.cs_dict["DAM"].options = [""] + list(s.adata.obs.keys()) - - def display_projection(self, event): - s = self.sams[self.stab.selected_index] - key = self.cs_dict["DPM"].value - self.log('Displaying projection from obsm key {} in tab {}'.format(key,self.stab.selected_index)) - if key != "": - X = s.adata.obsm[key][:, :2] - self.stab.children[self.stab.selected_index].data[0]["x"] = X[:, 0] - self.stab.children[self.stab.selected_index].data[0]["y"] = X[:, 1] - - def compute_projection(self, event): - i = self.stab.selected_index - s = self.sams[i] - val = self.rb_dict["CPM"].value - if val == "UMAP": - s.run_umap() - elif val == "t-SNE": - s.run_tsne() - elif val == "Diffusion map": - s.run_diff_map() - elif val == "Diffusion UMAP": - s.run_diff_umap() - self.log('Computing 2D projection using the {} method'.format(val)) - self.update_dropdowns(i) - - """ END CS INIT""" - - def select(self, trace, points, selector): - if np.array(points.point_inds).size > 0: - self.selected[self.stab.selected_index][np.array(points.point_inds)] = True - self.selected_cells[self.stab.selected_index] = np.array( - list( - self.sams[self.stab.selected_index].adata.obs_names[ - self.selected[self.stab.selected_index] - ] - ) - ) - trace.selectedpoints = 
list( - np.where(self.selected[self.stab.selected_index])[0] - ) - trace.unselected = {"marker": {"opacity": self.cs_dict["ASLIDER"].value}} - - def pick_cells_dd(self, txt): - if txt["new"] != "Toggle cluster": - al = str(txt["new"]) - sel = self.selected[self.stab.selected_index] - als = self.active_labels[self.stab.selected_index].astype("str") - ratio = sel[als == al].sum() / sel[als == al].size - if ratio >= 0.5: - sel[als == al] = False - else: - sel[als == al] = True - - self.selected_cells[self.stab.selected_index] = np.array( - list(self.sams[self.stab.selected_index].adata.obs_names[sel]) - ) - - self.stab.children[self.stab.selected_index].data[0].selectedpoints = list( - np.where(sel)[0] - ) - self.log('Toggling cluster {} in tab {}'.format(al,self.stab.selected_index)) - - self.cs_dict["ACC"].value = "Toggle cluster" - - def pick_cells(self, trace, points, selector): - tf = self.selected[self.stab.selected_index][points.point_inds[0]] - als = self.active_labels[self.stab.selected_index] - al = als[points.point_inds[0]] - - if tf: - self.selected[self.stab.selected_index][als == al] = False - else: - self.selected[self.stab.selected_index][als == al] = True - - self.selected_cells[self.stab.selected_index] = np.array( - list( - self.sams[self.stab.selected_index].adata.obs_names[ - self.selected[self.stab.selected_index] - ] - ) - ) - trace.selectedpoints = list( - np.where(self.selected[self.stab.selected_index])[0] - ) - - def change_alpha(self, val): - # m = self.stab.children[self.stab.selected_index].data[0].marker.copy() - # m['opacity'] = val['new'] - self.stab.children[self.stab.selected_index].data[0].unselected = { - "marker": {"opacity": val["new"]} - } - self.log('Changing alpha (opacity) to be {}'.format(val['new'])) - - def change_msize(self, val): - val = val["new"] - markers = self.stab.children[self.stab.selected_index].data[0].marker - markers.size = val - self.log('Changing marker size to be {}'.format(val)) - - def init_graph(self, 
trace): - trace.mode = "markers" - trace.marker.size = 5 - trace.on_selection(self.select) - trace.on_click(self.pick_cells) - - -def invalidArgs(func, argdict): - import inspect - args = inspect.getfullargspec(func).args - return set(argdict) - set(args) - diff --git a/samalg/utilities.py b/samalg/utilities.py deleted file mode 100644 index 7e2a518..0000000 --- a/samalg/utilities.py +++ /dev/null @@ -1,453 +0,0 @@ -import pandas as pd -import numpy as np -import scipy as sp -import os -import errno -from sklearn.decomposition import PCA -import umap.distances as dist -from sklearn.utils.extmath import svd_flip -from sklearn.utils import check_array, check_random_state -from scipy import sparse -import sklearn.utils.sparsefuncs as sf -from umap.umap_ import nearest_neighbors -__version__ = "0.8.7" - - -def find_corr_genes(sam, input_gene): - """Rank genes by their spatially averaged expression pattern correlations to - a desired gene. - - Parameters - ---------- - - sam - SAM - The analyzed SAM object - - input_gene - string - The gene ID with respect to which correlations will be computed. - - Returns - ------- - A ranked list of gene IDs based on correlation to the input gene. - """ - all_gene_names = np.array(list(sam.adata.var_names)) - - D_avg = sam.adata.layers["X_knn_avg"] - - input_gene = np.where(all_gene_names == input_gene)[0] - - if input_gene.size == 0: - print( - "Gene note found in the filtered dataset. Note " - "that genes are case sensitive." 
- ) - return - - pw_corr = generate_correlation_map(D_avg.T.A, D_avg[:, input_gene].T.A) - return all_gene_names[np.argsort(-pw_corr.flatten())] - -def _pca_with_sparse(X, npcs, solver='arpack', mu=None, seed=0, mu_axis=0): - random_state = check_random_state(seed) - np.random.set_state(random_state.get_state()) - random_init = np.random.rand(np.min(X.shape)) - X = check_array(X, accept_sparse=['csr', 'csc']) - - if mu is None: - if mu_axis == 0: - mu = X.mean(0).A.flatten()[None, :] - else: - mu = X.mean(1).A.flatten()[:, None] - - if mu_axis == 0: - mdot = mu.dot - mmat = mdot - mhdot = mu.T.dot - mhmat = mu.T.dot - Xdot = X.dot - Xmat = Xdot - XHdot = X.T.conj().dot - XHmat = XHdot - ones = np.ones(X.shape[0])[None, :].dot - - def matvec(x): - return Xdot(x) - mdot(x) - - def matmat(x): - return Xmat(x) - mmat(x) - - def rmatvec(x): - return XHdot(x) - mhdot(ones(x)) - - def rmatmat(x): - return XHmat(x) - mhmat(ones(x)) - - else: - mdot = mu.dot - mmat = mdot - mhdot = mu.T.dot - mhmat = mu.T.dot - Xdot = X.dot - Xmat = Xdot - XHdot = X.T.conj().dot - XHmat = XHdot - ones = np.ones(X.shape[1])[None, :].dot - - def matvec(x): - return Xdot(x) - mdot(ones(x)) - - def matmat(x): - return Xmat(x) - mmat(ones(x)) - - def rmatvec(x): - return XHdot(x) - mhdot(x) - - def rmatmat(x): - return XHmat(x) - mhmat(x) - - XL = sp.sparse.linalg.LinearOperator( - matvec=matvec, - dtype=X.dtype, - matmat=matmat, - shape=X.shape, - rmatvec=rmatvec, - rmatmat=rmatmat, - ) - - u, s, v = sp.sparse.linalg.svds(XL, solver=solver, k=npcs, v0=random_init) - u, v = svd_flip(u, v) - idx = np.argsort(-s) - v = v[idx, :] - - X_pca = (u * s)[:, idx] - ev = s[idx] ** 2 / (X.shape[0] - 1) - - total_var = sf.mean_variance_axis(X, axis=0)[1].sum() - ev_ratio = ev / total_var - - output = { - 'X_pca': X_pca, - 'variance': ev, - 'variance_ratio': ev_ratio, - 'components': v, - } - return output - - - -def 
nearest_neighbors_wrapper(X,n_neighbors=15,metric='correlation',metric_kwds={},angular=True,random_state=0): - random_state=np.random.RandomState(random_state) - return nearest_neighbors(X,n_neighbors,metric,metric_kwds,angular,random_state)[:2] - -def knndist(nnma): - x, y = nnma.nonzero() - data = nnma.data - knn = y.reshape((nnma.shape[0], nnma[0, :].data.size)) - val = data.reshape(knn.shape) - return knn, val - - -def save_figures(filename, fig_IDs=None, **kwargs): - """ - Save figures. - - Parameters - ---------- - filename - str - Name of output file - - fig_IDs - int, numpy.array, list, optional, default None - A list of open figure IDs or a figure ID that will be saved to a - pdf/png file respectively. - - **kwargs - - Extra keyword arguments passed into 'matplotlib.pyplot.savefig'. - - """ - import matplotlib.pyplot as plt - - if fig_IDs is not None: - if type(fig_IDs) is list: - savetype = "pdf" - else: - savetype = "png" - else: - savetype = "pdf" - - if savetype == "pdf": - from matplotlib.backends.backend_pdf import PdfPages - - if len(filename.split(".")) == 1: - filename = filename + ".pdf" - else: - filename = ".".join(filename.split(".")[:-1]) + ".pdf" - - pdf = PdfPages(filename) - - if fig_IDs is None: - figs = [plt.figure(n) for n in plt.get_fignums()] - else: - figs = [plt.figure(n) for n in fig_IDs] - - for fig in figs: - fig.savefig(pdf, format="pdf", **kwargs) - pdf.close() - elif savetype == "png": - plt.figure(fig_IDs).savefig(filename, **kwargs) - - -def weighted_PCA(mat, do_weight=True, npcs=None, solver="auto",seed = 0): - # mat = (mat - np.mean(mat, axis=0)) - if do_weight: - if min(mat.shape) >= 10000 and npcs is None: - print( - "More than 10,000 cells. Running with 'npcs' set to < 1000 is" - " recommended." 
- ) - - if npcs is None: - ncom = min(mat.shape) - else: - ncom = min((min(mat.shape), npcs)) - - pca = PCA(svd_solver=solver, n_components=ncom,random_state=check_random_state(seed)) - reduced = pca.fit_transform(mat) - scaled_eigenvalues = pca.explained_variance_ - scaled_eigenvalues = scaled_eigenvalues / scaled_eigenvalues.max() - reduced_weighted = reduced * scaled_eigenvalues[None, :] ** 0.5 - else: - pca = PCA(n_components=npcs, svd_solver=solver,random_state=check_random_state(seed)) - reduced = pca.fit_transform(mat) - if reduced.shape[1] == 1: - pca = PCA(n_components=2, svd_solver=solver,random_state=check_random_state(seed)) - reduced = pca.fit_transform(mat) - reduced_weighted = reduced - - return reduced_weighted, pca - - -def transform_wPCA(mat, pca): - mat = mat - pca.mean_ - reduced = mat.dot(pca.components_.T) - v = pca.explained_variance_ # .var(0) - scaled_eigenvalues = v / v.max() - reduced_weighted = np.array(reduced) * scaled_eigenvalues[None, :] ** 0.5 - return reduced_weighted - -def search_string(vec, s, case_sensitive=False, invert=False): - vec = np.array(vec) - - - if isinstance(s,list): - S = s - else: - S = [s] - - V=[]; M=[] - for s in S: - m = [] - if not case_sensitive: - s = s.lower() - for i in range(len(vec)): - if case_sensitive: - st = vec[i] - else: - st = vec[i].lower() - b = st.find(s) - if not invert and b != -1 or invert and b == -1: - m.append(i) - if len(m) > 0: - V.append(vec[np.array(m)]); M.append(np.array(m)) - if len(V)>0: - i = len(V) - if not invert: - V = np.concatenate(V); M = np.concatenate(M); - if i > 1: - ix = np.sort(np.unique(M,return_index=True)[1]) - V=V[ix]; M=M[ix]; - else: - for i in range(len(V)): - V[i]=list(set(V[i]).intersection(*V)) - V = vec[np.in1d(vec,np.unique(np.concatenate(V)))] - M = np.array([np.where(vec==x)[0][0] for x in V]) - return V,M - else: - return -1,-1 - - -def distance_matrix_error(dist1, dist2): - s = 0 - for k in range(dist1.shape[0]): - s += np.corrcoef(dist1[k, :], 
dist2[k, :])[0, 1] - return 1 - s / dist1.shape[0] - - -def generate_euclidean_map(A, B): - a = (A ** 2).sum(1).flatten() - b = (B ** 2).sum(1).flatten() - x = a[:, None] + b[None, :] - 2 * np.dot(A, B.T) - x[x < 0] = 0 - return np.sqrt(x) - - -def generate_correlation_map(x, y): - mu_x = x.mean(1) - mu_y = y.mean(1) - n = x.shape[1] - if n != y.shape[1]: - raise ValueError("x and y must " + "have the same number of timepoints.") - s_x = x.std(1, ddof=n - 1) - s_y = y.std(1, ddof=n - 1) - s_x[s_x == 0] = 1 - s_y[s_y == 0] = 1 - cov = np.dot(x, y.T) - n * np.dot(mu_x[:, None], mu_y[None, :]) - return cov / np.dot(s_x[:, None], s_y[None, :]) - - -def extract_annotation(cn, x, c="_"): - m = [] - if x is not None: - for i in range(cn.size): - f = cn[i].split(c) - x = min(len(f) - 1, x) - m.append(f[x]) - return np.array(m) - else: - ms = [] - ls = [] - for i in range(cn.size): - f = cn[i].split(c) - m = [] - for x in range(len(f)): - m.append(f[x]) - ms.append(m) - ls.append(len(m)) - ml = max(ls) - for i in range(len(ms)): - ms[i].extend([""] * (ml - len(ms[i]))) - if ml - len(ms[i]) > 0: - ms[i] = np.concatenate(ms[i]) - ms = np.vstack(ms) - MS = [] - for i in range(ms.shape[1]): - MS.append(ms[:, i]) - return MS - - -def isolate(dt, x1, x2, y1, y2): - return np.where( - np.logical_and( - np.logical_and(dt[:, 0] > x1, dt[:, 0] < x2), - np.logical_and(dt[:, 1] > y1, dt[:, 1] < y2), - ) - )[0] - - -def to_lower(y): - x = y.copy().flatten() - for i in range(x.size): - x[i] = x[i].lower() - return x - - -def to_upper(y): - x = y.copy().flatten() - for i in range(x.size): - x[i] = x[i].upper() - return x - - -def create_folder(path): - try: - os.makedirs(path) - except OSError as exception: - if exception.errno != errno.EEXIST: - raise - - -def convert_annotations(A): - x = np.unique(A) - y = np.zeros(A.size) - z = 0 - for i in x: - y[A == i] = z - z += 1 - - return y.astype("int") - - -def nearest_neighbors_hnsw(x,ef=200,M=48,n_neighbors = 100): - import hnswlib - labels 
= np.arange(x.shape[0]) - p = hnswlib.Index(space = 'cosine', dim = x.shape[1]) - p.init_index(max_elements = x.shape[0], ef_construction = ef, M = M) - p.add_items(x, labels) - p.set_ef(ef) - idx, dist = p.knn_query(x, k = n_neighbors) - return idx,dist - - -def calc_nnm(g_weighted, k, distance=None): - if g_weighted.shape[0] > 0: - if distance == 'cosine': - nnm, dists = nearest_neighbors_hnsw(g_weighted, n_neighbors=k) - else: - nnm, dists = nearest_neighbors_wrapper(g_weighted, n_neighbors=k, metric=distance) - EDM = gen_sparse_knn(nnm, dists) - EDM = EDM.tocsr() - return EDM - - -def compute_distances(A, dm): - if dm == "euclidean": - m = np.dot(A, A.T) - h = np.diag(m) - x = h[:, None] + h[None, :] - 2 * m - x[x < 0] = 0 - dist = np.sqrt(x) - elif dm == "correlation": - dist = 1 - np.corrcoef(A) - else: - dist = sp.spatial.distance.squareform(sp.spatial.distance.pdist(A, metric=dm)) - return dist - - -def dist_to_nn(d, K): # , offset = 0): - E = d.copy() - np.fill_diagonal(E, -1) - M = np.max(E) * 2 - x = np.argsort(E, axis=1)[:, :K] # offset:K+offset] - E[ - np.tile( - np.arange(E.shape[0]).reshape(E.shape[0], -1), (1, x.shape[1]) - ).flatten(), - x.flatten(), - ] = M - - E[E < M] = 0 - E[E > 0] = 1 - return E # ,x - - -def to_sparse_knn(D1, k): - for i in range(D1.shape[0]): - x = D1.data[D1.indptr[i] : D1.indptr[i + 1]] - idx = np.argsort(x) - if idx.size > k: - x[idx[:-k]] = 0 - D1.data[D1.indptr[i] : D1.indptr[i + 1]] = x - D1.eliminate_zeros() - return D1 - - -def gen_sparse_knn(knni, knnd, shape=None): - if shape is None: - shape = (knni.shape[0], knni.shape[0]) - - D1 = sp.sparse.lil_matrix(shape) - - D1[ - np.tile(np.arange(knni.shape[0])[:, None], (1, knni.shape[1])).flatten().astype('int32'), - knni.flatten().astype('int32'), - ] = knnd.flatten() - D1 = D1.tocsr() - return D1 diff --git a/setup.py b/setup.py deleted file mode 100644 index f206ba8..0000000 --- a/setup.py +++ /dev/null @@ -1,39 +0,0 @@ -from setuptools import setup, find_packages - 
-setup( - name="sam-algorithm", - version="1.0.2", - description="The Self-Assembling-Manifold algorithm", - long_description="The Self-Assembling-Manifold algorithm for analyzing single-cell RNA sequencing data.", - long_description_content_type="text/markdown", - url="https://github.com/atarashansky/self-assembling-manifold", - author="Alexander J. Tarashansky", - author_email="tarashan@stanford.edu", - keywords="scrnaseq analysis manifold reconstruction", - python_requires=">=3.6", - # py_modules=["SAM", "utilities", "SAMGUI"], - install_requires=[ - "numpy>=1.19.0", - "scipy>=1.3.1", - "pandas>1.0.0", - "scikit-learn>=0.23.1", - "packaging>=0.20.0", - "numba>=0.50.1", - "umap-learn>=0.4.6", - "dill", - "anndata>=0.7.4", - "h5py", - "harmonypy", - "hnswlib" - ], - extras_require={ - "louvain": ["louvain", "cython", "python-igraph"], - "leiden": ["leidenalg", "cython", "python-igraph"], - "hdbscan": ["hdbscan"], - "plot": ["ipythonwidgets", "jupyter", "plotly==4.0.0", "matplotlib"], - "scanpy": ["scanpy"], - }, - packages=find_packages(), - include_package_data=True, - zip_safe=False, -) diff --git a/src/samalg/__init__.py b/src/samalg/__init__.py new file mode 100644 index 0000000..3ab3eb4 --- /dev/null +++ b/src/samalg/__init__.py @@ -0,0 +1,66 @@ +"""SAM (Self-Assembling Manifold) algorithm for single-cell RNA sequencing analysis. + +SAM iteratively rescales the input gene expression matrix to emphasize +genes that are spatially variable along the intrinsic manifold of the data. + +Example +------- +>>> from samalg import SAM +>>> sam = SAM() +>>> sam.load_data("expression_matrix.h5ad") +>>> sam.preprocess_data() +>>> sam.run() +>>> sam.leiden_clustering() + +Copyright 2018, Alexander J. Tarashansky, All rights reserved. 
+""" + +from __future__ import annotations + +from ._logging import get_logger, set_verbosity, setup_logging +from .exceptions import ( + ClusteringError, + ConvergenceError, + DataNotLoadedError, + DimensionalityReductionError, + FileLoadError, + FileSaveError, + InvalidParameterError, + PreprocessingError, + ProcessingError, + SAMError, +) +from .sam import SAM +from .utilities import ( + calc_nnm, + find_corr_genes, + generate_correlation_map, + generate_euclidean_map, + nearest_neighbors_hnsw, + weighted_PCA, +) + +__version__ = "2.0.0" + +__all__ = [ + "SAM", + "ClusteringError", + "ConvergenceError", + "DataNotLoadedError", + "DimensionalityReductionError", + "FileLoadError", + "FileSaveError", + "InvalidParameterError", + "PreprocessingError", + "ProcessingError", + "SAMError", + "calc_nnm", + "find_corr_genes", + "generate_correlation_map", + "generate_euclidean_map", + "get_logger", + "nearest_neighbors_hnsw", + "set_verbosity", + "setup_logging", + "weighted_PCA", +] diff --git a/src/samalg/_logging.py b/src/samalg/_logging.py new file mode 100644 index 0000000..8b04332 --- /dev/null +++ b/src/samalg/_logging.py @@ -0,0 +1,123 @@ +"""Logging infrastructure for the SAM algorithm. + +This module provides a centralized logging configuration for the samalg package. +All logging throughout the package should use the logger obtained from get_logger(). +""" + +from __future__ import annotations + +import logging +import sys +from typing import TYPE_CHECKING + +if TYPE_CHECKING: + pass + +# Package-level logger name +LOGGER_NAME = "samalg" + +# Default format for log messages +DEFAULT_FORMAT = "%(asctime)s - %(name)s - %(levelname)s - %(message)s" +SIMPLE_FORMAT = "%(levelname)s: %(message)s" + + +def get_logger(name: str | None = None) -> logging.Logger: + """Get a logger instance for the samalg package. + + Parameters + ---------- + name : str | None, optional + The name for the logger. If None, returns the root samalg logger. 
+ If provided, returns a child logger (e.g., 'samalg.sam'). + + Returns + ------- + logging.Logger + A configured logger instance. + + Examples + -------- + >>> logger = get_logger() # Returns 'samalg' logger + >>> logger = get_logger('sam') # Returns 'samalg.sam' logger + """ + if name is None: + return logging.getLogger(LOGGER_NAME) + return logging.getLogger(f"{LOGGER_NAME}.{name}") + + +def setup_logging( + level: int | str = logging.INFO, + format_string: str | None = None, + stream: bool = True, +) -> None: + """Configure logging for the samalg package. + + Parameters + ---------- + level : int | str, optional + The logging level. Can be an integer (e.g., logging.DEBUG) or + a string (e.g., 'DEBUG'). Default is logging.INFO. + format_string : str | None, optional + The format string for log messages. If None, uses SIMPLE_FORMAT. + stream : bool, optional + Whether to add a StreamHandler to output to stderr. Default is True. + + Examples + -------- + >>> setup_logging(level='DEBUG') + >>> setup_logging(level=logging.WARNING, format_string='%(message)s') + """ + logger = get_logger() + + # Convert string level to int if needed + if isinstance(level, str): + level = getattr(logging, level.upper(), logging.INFO) + + logger.setLevel(level) + + # Remove existing handlers to avoid duplicates + logger.handlers.clear() + + if stream: + handler = logging.StreamHandler(sys.stderr) + handler.setLevel(level) + + formatter = logging.Formatter(format_string or SIMPLE_FORMAT) + handler.setFormatter(formatter) + + logger.addHandler(handler) + + +def set_verbosity(verbose: bool) -> None: + """Set the verbosity level of the samalg logger. + + Parameters + ---------- + verbose : bool + If True, set logging level to INFO. + If False, set logging level to WARNING. 
+ + Examples + -------- + >>> set_verbosity(True) # Show INFO messages + >>> set_verbosity(False) # Only show WARNING and above + """ + logger = get_logger() + if verbose: + logger.setLevel(logging.INFO) + else: + logger.setLevel(logging.WARNING) + + +# Initialize the logger with default settings +# This ensures the logger exists even if setup_logging is not called +_logger = get_logger() +if not _logger.handlers: + # Only add handler if none exist (avoids duplicate handlers) + _handler = logging.StreamHandler(sys.stderr) + _handler.setFormatter(logging.Formatter(SIMPLE_FORMAT)) + _logger.addHandler(_handler) + _logger.setLevel(logging.INFO) + +# Convenience: export the main logger directly +logger = get_logger() diff --git a/src/samalg/exceptions.py b/src/samalg/exceptions.py new file mode 100644 index 0000000..d041ee8 --- /dev/null +++ b/src/samalg/exceptions.py @@ -0,0 +1,193 @@ +"""Custom exceptions for the SAM algorithm. + +This module defines exception classes used throughout the samalg package +to provide clear, specific error messages for different failure modes. +""" + +from __future__ import annotations + + +class SAMError(Exception): + """Base exception for all SAM-related errors. + + All custom exceptions in the samalg package inherit from this class, + making it easy to catch any SAM-specific error. + + Examples + -------- + >>> try: + ... sam.run() + ... except SAMError as e: + ... print(f"SAM operation failed: {e}") + """ + + pass + + +class DataNotLoadedError(SAMError): + """Raised when an operation requires data that hasn't been loaded. + + This error is raised when attempting to perform analysis operations + (preprocessing, running SAM, etc.) before loading expression data. + + Examples + -------- + >>> sam = SAM() + >>> sam.run() # No data loaded yet + DataNotLoadedError: No data has been loaded. Use load_data() or pass data to constructor. 
+ """ + + def __init__(self, message: str | None = None) -> None: + if message is None: + message = "No data has been loaded. Use load_data() or pass data to the constructor." + super().__init__(message) + + +class PreprocessingError(SAMError): + """Raised when data preprocessing fails. + + This error indicates a problem during the preprocessing step, + such as invalid normalization parameters or data format issues. + + Examples + -------- + >>> sam.preprocess_data(norm='invalid') + PreprocessingError: Unknown normalization method: 'invalid' + """ + + pass + + +class ProcessingError(SAMError): + """Raised when a general processing operation fails. + + This is a general error for processing failures that don't fit + into more specific categories. + """ + + pass + + +class ClusteringError(SAMError): + """Raised when a clustering operation fails. + + This error is raised when clustering algorithms fail to complete, + typically due to invalid parameters or missing dependencies. + + Examples + -------- + >>> sam.leiden_clustering(res=-1) + ClusteringError: Resolution parameter must be positive. + """ + + pass + + +class DimensionalityReductionError(SAMError): + """Raised when dimensionality reduction fails. + + This error indicates problems with PCA, UMAP, t-SNE, or other + dimensionality reduction operations. + + Examples + -------- + >>> sam.run(npcs=10000) # More PCs than features + DimensionalityReductionError: Cannot compute 10000 PCs with only 5000 genes. + """ + + pass + + +class ConvergenceError(SAMError): + """Raised when the SAM algorithm fails to converge. + + This error is raised when the iterative SAM algorithm doesn't + reach the specified convergence threshold within max_iter iterations. + + Examples + -------- + >>> sam.run(max_iter=1, stopping_condition=1e-10) + ConvergenceError: SAM did not converge after 1 iterations. 
+ """ + + def __init__(self, iterations: int, final_error: float | None = None) -> None: + message = f"SAM did not converge after {iterations} iterations." + if final_error is not None: + message += f" Final error: {final_error:.6f}" + super().__init__(message) + self.iterations = iterations + self.final_error = final_error + + +class InvalidParameterError(SAMError): + """Raised when an invalid parameter value is provided. + + This error provides clear feedback about which parameter is invalid + and what values are acceptable. + + Examples + -------- + >>> sam.run(weight_mode='invalid') + InvalidParameterError: Invalid value for 'weight_mode': 'invalid'. + Expected one of: 'dispersion', 'variance', 'rms', 'combined' + """ + + def __init__( + self, + param_name: str, + value: object, + valid_values: list[str] | None = None, + reason: str | None = None, + ) -> None: + message = f"Invalid value for '{param_name}': {value!r}." + if valid_values: + valid_str = ", ".join(repr(v) for v in valid_values) + message += f" Expected one of: {valid_str}" + elif reason: + message += f" {reason}" + super().__init__(message) + self.param_name = param_name + self.value = value + self.valid_values = valid_values + + +class FileLoadError(SAMError): + """Raised when loading a file fails. + + This error provides context about what file couldn't be loaded + and why. + + Examples + -------- + >>> sam.load_data('nonexistent.h5ad') + FileLoadError: Failed to load file 'nonexistent.h5ad': File not found. + """ + + def __init__(self, filepath: str, reason: str | None = None) -> None: + message = f"Failed to load file '{filepath}'." + if reason: + message += f" {reason}" + super().__init__(message) + self.filepath = filepath + self.reason = reason + + +class FileSaveError(SAMError): + """Raised when saving a file fails. + + This error provides context about what file couldn't be saved + and why. 
+ + Examples + -------- + >>> sam.save_anndata('/readonly/path/data.h5ad') + FileSaveError: Failed to save file '/readonly/path/data.h5ad': Permission denied. + """ + + def __init__(self, filepath: str, reason: str | None = None) -> None: + message = f"Failed to save file '{filepath}'." + if reason: + message += f" {reason}" + super().__init__(message) + self.filepath = filepath + self.reason = reason diff --git a/src/samalg/py.typed b/src/samalg/py.typed new file mode 100644 index 0000000..e69de29 diff --git a/src/samalg/sam.py b/src/samalg/sam.py new file mode 100644 index 0000000..d231455 --- /dev/null +++ b/src/samalg/sam.py @@ -0,0 +1,2043 @@ +"""Self-Assembling Manifold (SAM) algorithm for single-cell RNA sequencing analysis. + +This module contains the main SAM class that implements iterative manifold-aware +feature weighting for single-cell analysis. + +Copyright 2018, Alexander J. Tarashansky, All rights reserved. +Email: +""" + +from __future__ import annotations + +import gc +import pickle +import time +import warnings +from typing import TYPE_CHECKING, Any, Literal + +import anndata +import numpy as np +import pandas as pd +import scipy.sparse as sp +import sklearn.manifold as man +import sklearn.utils.sparsefuncs as sf +from anndata import AnnData +from numba.core.errors import NumbaWarning +from sklearn.preprocessing import Normalizer + +from . import utilities as ut +from ._logging import get_logger +from .exceptions import DataNotLoadedError, InvalidParameterError + +if TYPE_CHECKING: + from collections.abc import Sequence + + from numpy.typing import NDArray + +# Suppress NumbaWarning +warnings.filterwarnings("ignore", category=NumbaWarning) + +# Get module logger +logger = get_logger("sam") + + +class SAM: + """Self-Assembling Manifolds single-cell RNA sequencing analysis tool. + + SAM iteratively rescales the input gene expression matrix to emphasize + genes that are spatially variable along the intrinsic manifold of the data. 
+ It outputs the gene weights, nearest neighbor matrix, and a 2D projection. + + Parameters + ---------- + counts : tuple | list | pd.DataFrame | AnnData | None + Input data in one of the following formats: + - tuple/list: (data, gene_names, cell_names) where data is sparse/dense matrix + - pd.DataFrame: cells x genes expression matrix + - AnnData: annotated data object + Only use this argument if you want to pass in preloaded data. + Otherwise use one of the load functions. + inplace : bool, optional + If True and counts is AnnData, use the object directly without copying. + Default is False. + + Attributes + ---------- + preprocess_args : dict + Dictionary of arguments used for the 'preprocess_data' function. + run_args : dict + Dictionary of arguments used for the 'run' function. + adata_raw : AnnData + An AnnData object containing the raw, unfiltered input data. + adata : AnnData + An AnnData object containing all processed data and SAM outputs. + + Examples + -------- + >>> sam = SAM() + >>> sam.load_data("expression_matrix.h5ad") + >>> sam.preprocess_data() + >>> sam.run() + >>> sam.leiden_clustering() + """ + + def __init__( + self, + counts: ( + tuple[sp.spmatrix | NDArray[np.floating[Any]], NDArray[Any], NDArray[Any]] + | list[Any] + | pd.DataFrame + | AnnData + | None + ) = None, + inplace: bool = False, + ) -> None: + self.run_args: dict[str, Any] = {} + self.preprocess_args: dict[str, Any] = {} + + if isinstance(counts, (tuple, list)): + raw_data, all_gene_names, all_cell_names = counts + if isinstance(raw_data, np.ndarray): + raw_data = sp.csr_matrix(raw_data) + + self.adata_raw = AnnData( + X=raw_data, + obs={"obs_names": all_cell_names}, + var={"var_names": all_gene_names}, + ) + + elif isinstance(counts, pd.DataFrame): + raw_data = sp.csr_matrix(counts.values) + all_gene_names = np.array(list(counts.columns.values)) + all_cell_names = np.array(list(counts.index.values)) + + self.adata_raw = AnnData( + X=raw_data, + obs={"obs_names": all_cell_names}, 
+ var={"var_names": all_gene_names}, + ) + + elif isinstance(counts, AnnData): + all_cell_names = np.array(list(counts.obs_names)) + all_gene_names = np.array(list(counts.var_names)) + if counts.is_view: + counts = counts.copy() + + if inplace: + self.adata_raw = counts + else: + self.adata_raw = counts.copy() + + elif counts is not None: + raise TypeError( + "'counts' must be either a tuple/list of " + "(data, gene IDs, cell IDs), a Pandas DataFrame of " + "cells x genes, or an AnnData object." + ) + + if counts is not None: + if np.unique(all_gene_names).size != all_gene_names.size: + self.adata_raw.var_names_make_unique() + if np.unique(all_cell_names).size != all_cell_names.size: + self.adata_raw.obs_names_make_unique() + + if inplace: + self.adata = self.adata_raw + else: + self.adata = self.adata_raw.copy() + + if "X_disp" not in self.adata_raw.layers.keys(): + self.adata.layers["X_disp"] = self.adata.X + + def preprocess_data( + self, + div: float = 1, + downsample: float = 0, + sum_norm: str | float | None = "cell_median", + norm: str | None = "log", + min_expression: float = 1, + thresh_low: float = 0.0, + thresh_high: float = 0.96, + thresh: float | None = None, + filter_genes: bool = True, + ) -> None: + """Log-normalizes and filters the expression data. + + Parameters + ---------- + div : float, optional + The factor by which the gene expression will be divided prior to + normalization. Default is 1. + downsample : float, optional + The factor by which to randomly downsample the data. If 0, the + data will not be downsampled. Default is 0. + sum_norm : str | float | None, optional + Library normalization method. Options: + - float: Normalize each cell to this total count + - 'cell_median': Normalize to median total count per cell + - 'gene_median': Normalize genes to median total count per gene + - None: No normalization + Default is 'cell_median'. + norm : str | None, optional + Data transformation method. 
Options: + - 'log': log2(x + 1) transformation + - 'ftt': Freeman-Tukey variance-stabilizing transformation + - 'asin': arcsinh transformation + - 'multinomial': Pearson residual transformation (experimental) + - None: No transformation + Default is 'log'. + min_expression : float, optional + Threshold above which a gene is considered expressed. Values below + this are set to zero. Default is 1. + thresh_low : float, optional + Keep genes expressed in greater than thresh_low*100% of cells. + Default is 0.0. + thresh_high : float, optional + Keep genes expressed in less than thresh_high*100% of cells. + Default is 0.96. + thresh : float | None, optional + If provided, sets thresh_low=thresh and thresh_high=1-thresh. + filter_genes : bool, optional + Whether to apply gene filtering. Default is True. + """ + if thresh is not None: + thresh_low = thresh + thresh_high = 1 - thresh + + # Load data - check this first before accessing adata + if not hasattr(self, "adata_raw"): + raise DataNotLoadedError() + + self.preprocess_args = { + "div": div, + "sum_norm": sum_norm, + "norm": norm, + "min_expression": min_expression, + "thresh_low": thresh_low, + "thresh_high": thresh_high, + "filter_genes": filter_genes, + } + + self.run_args = self.adata.uns.get("run_args", {}) + + D = self.adata_raw.X + self.adata = self.adata_raw.copy() + + D = self.adata.X + if isinstance(D, np.ndarray): + D = sp.csr_matrix(D, dtype="float32") + else: + if str(D.dtype) != "float32": + D = D.astype("float32") + D.sort_indices() + + if D.getformat() == "csc": + D = D.tocsr() + + # Sum-normalize + if sum_norm == "cell_median" and norm != "multinomial": + s = np.asarray(D.sum(1)).flatten() + sum_norm_val = np.median(s) + D = D.multiply(1 / s[:, None] * sum_norm_val).tocsr() + elif sum_norm == "gene_median" and norm != "multinomial": + s = np.asarray(D.sum(0)).flatten() + sum_norm_val = np.median(s[s > 0]) + s[s == 0] = 1 + D = D.multiply(1 / s[None, :] * sum_norm_val).tocsr() + elif sum_norm is not 
None and norm != "multinomial": + D = D.multiply(1 / np.asarray(D.sum(1)).flatten()[:, None] * sum_norm).tocsr() + + # Normalize + self.adata.X = D + if norm is None: + D.data[:] = D.data / div + + elif norm.lower() == "log": + D.data[:] = np.log2(D.data / div + 1) + + elif norm.lower() == "ftt": + D.data[:] = np.sqrt(D.data / div) + np.sqrt(D.data / div + 1) - 1 + + elif norm.lower() == "asin": + D.data[:] = np.arcsinh(D.data / div) + + elif norm.lower() == "multinomial": + ni = np.asarray(D.sum(1)).flatten() # cells + pj = np.asarray(D.sum(0) / D.sum()).flatten() # genes + col = D.indices + row = [] + for i in range(D.shape[0]): + row.append(i * np.ones(D.indptr[i + 1] - D.indptr[i])) + row = np.concatenate(row).astype("int32") + mu = sp.coo_matrix((ni[row] * pj[col], (row, col))).tocsr() + mu2 = mu.copy() + mu2.data[:] = mu2.data**2 + mu2 = mu2.multiply(1 / ni[:, None]) + mu.data[:] = (D.data - mu.data) / np.sqrt(mu.data - mu2.data) + + self.adata.X = mu + if sum_norm is None: + sum_norm = np.median(ni) + D = D.multiply(1 / ni[:, None] * sum_norm).tocsr() + D.data[:] = np.log2(D.data / div + 1) + + else: + D.data[:] = D.data / div + + # Zero-out low-expressed genes + idx = np.where(D.data <= min_expression)[0] + D.data[idx] = 0 + + # Filter genes + idx_genes = np.arange(D.shape[1]) + if filter_genes: + a, ct = np.unique(D.indices, return_counts=True) + c = np.zeros(D.shape[1]) + c[a] = ct + + keep = np.where( + np.logical_and(c / D.shape[0] > thresh_low, c / D.shape[0] <= thresh_high) + )[0] + + idx_genes = np.array(list(set(keep) & set(idx_genes)), dtype=np.intp) + + mask_genes = np.zeros(D.shape[1], dtype="bool") + mask_genes[idx_genes] = True + + self.adata.X = self.adata.X.multiply(mask_genes[None, :]).tocsr() + self.adata.X.eliminate_zeros() + self.adata.var["mask_genes"] = mask_genes + + if norm == "multinomial": + self.adata.layers["X_disp"] = D.multiply(mask_genes[None, :]).tocsr() + self.adata.layers["X_disp"].eliminate_zeros() + else: + 
self.adata.layers["X_disp"] = self.adata.X + + self.calculate_mean_var() + + self.adata.uns["preprocess_args"] = self.preprocess_args + self.adata.uns["run_args"] = self.run_args + + def calculate_mean_var(self, adata: AnnData | None = None) -> None: + """Calculate mean and variance for each gene. + + Parameters + ---------- + adata : AnnData | None, optional + The AnnData object to calculate statistics for. + If None, uses self.adata. + """ + if adata is None: + adata = self.adata + + if sp.issparse(adata.X): + mu, var = sf.mean_variance_axis(adata.X, axis=0) + else: + mu = adata.X.mean(0) + var = adata.X.var(0) + + adata.var["means"] = mu + adata.var["variances"] = var + + def get_avg_obsm(self, keym: str, keyl: str) -> NDArray[np.floating[Any]]: + """Get average obsm values grouped by label. + + Parameters + ---------- + keym : str + Key in adata.obsm to average. + keyl : str + Key in adata.obs containing labels for grouping. + + Returns + ------- + NDArray + Array of averaged values for each unique label. + """ + clu = self.get_labels_un(keyl) + cl = self.get_labels(keyl) + x = [] + for i in range(clu.size): + x.append(self.adata.obsm[keym][cl == clu[i]].mean(0)) + return np.vstack(x) + + def get_labels_un(self, key: str) -> NDArray[Any]: + """Get unique labels from obs. + + Parameters + ---------- + key : str + Key in adata.obs. + + Returns + ------- + NDArray + Array of unique labels. + """ + if key not in list(self.adata.obs.keys()): + logger.warning("Key '%s' does not exist in `obs`.", key) + return np.array([]) + return np.array(list(np.unique(self.adata.obs[key]))) + + def get_labels(self, key: str) -> NDArray[Any]: + """Get labels from obs. + + Parameters + ---------- + key : str + Key in adata.obs. + + Returns + ------- + NDArray + Array of labels. 
+ """ + if key not in list(self.adata.obs.keys()): + logger.warning("Key '%s' does not exist in `obs`.", key) + return np.array([]) + return np.array(list(self.adata.obs[key])) + + def get_cells(self, label: Any, key: str) -> NDArray[Any]: + """Retrieves cells of a particular annotation. + + Parameters + ---------- + label : Any + The annotation value to retrieve. + key : str + The key in `obs` from which to retrieve the annotation. + + Returns + ------- + NDArray + Array of cell names matching the label. + """ + if key not in list(self.adata.obs.keys()): + logger.warning("Key '%s' does not exist in `obs`.", key) + return np.array([]) + return np.array(list(self.adata.obs_names[np.array(list(self.adata.obs[key])) == label])) + + def load_data( + self, + filename: str, + transpose: bool = True, + save_sparse_file: str | None = None, + sep: str = ",", + calculate_avg: bool = False, + **kwargs: Any, + ) -> None: + """Load expression data from file. + + Parameters + ---------- + filename : str + Path to the data file. Supported formats: + - .csv/.txt: Tabular format (genes x cells by default) + - .h5ad: AnnData format + - .p: Pickle format + transpose : bool, optional + If True (default), assumes file is genes x cells. + Set to False if file is cells x genes. + save_sparse_file : str | None, optional + If provided, save processed data to this path (.h5ad format). + sep : str, optional + Delimiter for CSV/TXT files. Default is ','. + calculate_avg : bool, optional + If True and loading .h5ad with existing neighbors, perform + kNN averaging. Default is False. + **kwargs + Additional arguments passed to file loading functions. 
+ """ + if filename.split(".")[-1] == "p": + raw_data, all_cell_names, all_gene_names = pickle.load(open(filename, "rb")) + + if transpose: + raw_data = raw_data.T + if raw_data.getformat() == "csc": + logger.info("Converting sparse matrix to csr format...") + raw_data = raw_data.tocsr() + + save_sparse_file = None + elif filename.split(".")[-1] != "h5ad": + df = pd.read_csv(filename, sep=sep, index_col=0, **kwargs) + if transpose: + dataset = df.T + else: + dataset = df + + raw_data = sp.csr_matrix(dataset.values) + all_cell_names = np.array(list(dataset.index.values)) + all_gene_names = np.array(list(dataset.columns.values)) + + if filename.split(".")[-1] != "h5ad": + self.adata_raw = AnnData( + X=raw_data, + obs={"obs_names": all_cell_names}, + var={"var_names": all_gene_names}, + ) + + if np.unique(all_gene_names).size != all_gene_names.size: + self.adata_raw.var_names_make_unique() + if np.unique(all_cell_names).size != all_cell_names.size: + self.adata_raw.obs_names_make_unique() + + self.adata = self.adata_raw.copy() + self.adata.layers["X_disp"] = raw_data + + else: + self.adata = anndata.read_h5ad(filename, **kwargs) + if self.adata.raw is not None: + self.adata_raw = AnnData(X=self.adata.raw.X) + self.adata_raw.var_names = self.adata.var_names + self.adata_raw.obs_names = self.adata.obs_names + self.adata_raw.obs = self.adata.obs + + del self.adata.raw + + if ( + "X_knn_avg" not in self.adata.layers.keys() + and "connectivities" in self.adata.obsp.keys() + and calculate_avg + ): + self.dispersion_ranking_NN(save_avgs=True) + else: + self.adata_raw = self.adata + + if "X_disp" not in list(self.adata.layers.keys()): + self.adata.layers["X_disp"] = self.adata.X + save_sparse_file = None + + filename = ".".join(filename.split(".")[:-1]) + ".h5ad" + self.adata.uns["path_to_file"] = filename + self.adata_raw.uns["path_to_file"] = filename + + if save_sparse_file is not None: + if save_sparse_file.split(".")[-1] == "p": + self.save_sparse_data(save_sparse_file) 
+ elif save_sparse_file.split(".")[-1] == "h5ad": + self.save_anndata(save_sparse_file) + + def save_anndata(self, fname: str = "", save_knn: bool = False, **kwargs: Any) -> None: + """Save adata to an h5ad file. + + Parameters + ---------- + fname : str, optional + Output file path. If empty, uses path from adata.uns['path_to_file']. + save_knn : bool, optional + If True, include X_knn_avg layer. Default is False (layer can be large). + **kwargs + Additional arguments passed to AnnData.write_h5ad(). + """ + Xknn = None + if not save_knn: + if "X_knn_avg" in self.adata.layers: + Xknn = self.adata.layers["X_knn_avg"] + del self.adata.layers["X_knn_avg"] + + if fname == "": + if "path_to_file" not in self.adata.uns: + raise KeyError("Path to file not known.") + fname = self.adata.uns["path_to_file"] + + x = self.adata + x.raw = self.adata_raw + + # Fix weird issues when index name is an integer + for y in [ + x.obs.columns, + x.var.columns, + x.obs.index, + x.var.index, + x.raw.var.index, + x.raw.var.columns, + ]: + y.name = str(y.name) if y.name is not None else None + + x.write_h5ad(fname, **kwargs) + del x.raw + + if Xknn is not None: + self.adata.layers["X_knn_avg"] = Xknn + + def load_var_annotations( + self, aname: str | pd.DataFrame, sep: str = ",", key_added: str = "annotations" + ) -> None: + """Load gene annotations. + + Parameters + ---------- + aname : str | pd.DataFrame + Path to annotations file or DataFrame. First column should be gene IDs. + sep : str, optional + Delimiter for file. Default is ','. + key_added : str, optional + Unused parameter for backwards compatibility. + """ + if isinstance(aname, pd.DataFrame): + ann = aname + else: + ann = pd.read_csv(aname, sep=sep, index_col=0) + + for i in range(ann.shape[1]): + self.adata_raw.var[ann.columns[i]] = ann[ann.columns[i]] + self.adata.var[ann.columns[i]] = ann[ann.columns[i]] + + def load_obs_annotations(self, aname: str | pd.DataFrame, sep: str = ",") -> None: + """Load cell annotations. 
+ + Parameters + ---------- + aname : str | pd.DataFrame + Path to annotations file or DataFrame. First column should be cell IDs. + sep : str, optional + Delimiter for file. Default is ','. + """ + if isinstance(aname, pd.DataFrame): + ann = aname + else: + ann = pd.read_csv(aname, sep=sep, index_col=0) + + for i in range(ann.shape[1]): + self.adata_raw.obs[ann.columns[i]] = ann[ann.columns[i]] + self.adata.obs[ann.columns[i]] = ann[ann.columns[i]] + + def scatter( + self, + projection: str | NDArray[np.floating[Any]] | None = None, + c: str | NDArray[Any] | None = None, + colorspec: str | NDArray[Any] | None = None, + cmap: str = "rainbow", + linewidth: float = 0.0, + edgecolor: str = "k", + axes: Any | None = None, + colorbar: bool = True, + s: float = 10, + **kwargs: Any, + ) -> Any: + """Display a scatter plot. + + Parameters + ---------- + projection : str | NDArray | None, optional + Key in adata.obsm or 2D coordinates array. Default is UMAP. + c : str | NDArray | None, optional + Color data - key in adata.obs or array. + colorspec : str | NDArray | None, optional + Direct color specification. + cmap : str, optional + Colormap name. Default is 'rainbow'. + linewidth : float, optional + Marker edge width. Default is 0.0. + edgecolor : str, optional + Marker edge color. Default is 'k'. + axes : matplotlib.axes.Axes | None, optional + Existing axes to plot on. + colorbar : bool, optional + Whether to show colorbar. Default is True. + s : float, optional + Marker size. Default is 10. + **kwargs + Additional arguments passed to matplotlib.pyplot.scatter. + + Returns + ------- + matplotlib.axes.Axes + The axes object. 
+ """ + try: + import matplotlib.pyplot as plt + + if isinstance(projection, str): + if projection not in self.adata.obsm: + logger.error("Please create a projection first using run_umap or run_tsne") + return None + dt = self.adata.obsm[projection] + + elif projection is None: + if "X_umap" in self.adata.obsm: + dt = self.adata.obsm["X_umap"] + elif "X_tsne" in self.adata.obsm: + dt = self.adata.obsm["X_tsne"] + else: + logger.error("Please create either a t-SNE or UMAP projection first.") + return None + else: + dt = projection + + if axes is None: + plt.figure() + axes = plt.gca() + + if colorspec is not None: + axes.scatter( + dt[:, 0], + dt[:, 1], + s=s, + linewidth=linewidth, + edgecolor=edgecolor, + c=colorspec, + **kwargs, + ) + elif c is None: + axes.scatter( + dt[:, 0], + dt[:, 1], + s=s, + linewidth=linewidth, + edgecolor=edgecolor, + **kwargs, + ) + else: + if isinstance(c, str): + try: + c = self.get_labels(c) + except KeyError: + pass + + if (isinstance(c[0], str) or isinstance(c[0], np.str_)) and ( + isinstance(c, np.ndarray) or isinstance(c, list) + ): + i = ut.convert_annotations(c) + ui, ai = np.unique(i, return_index=True) + cax = axes.scatter( + dt[:, 0], + dt[:, 1], + c=i, + cmap=cmap, + s=s, + linewidth=linewidth, + edgecolor=edgecolor, + **kwargs, + ) + + if colorbar: + cbar = plt.colorbar(cax, ax=axes, ticks=ui) + cbar.ax.set_yticklabels(c[ai]) + else: + if not (isinstance(c, np.ndarray) or isinstance(c, list)): + colorbar = False + i = c + + # Only pass cmap if c is numeric data (not color specs) + scatter_kwargs: dict[str, Any] = { + "c": i, + "s": s, + "linewidth": linewidth, + "edgecolor": edgecolor, + **kwargs, + } + # Check if c is numeric array data suitable for colormapping + if isinstance(i, np.ndarray) and np.issubdtype(i.dtype, np.number): + scatter_kwargs["cmap"] = cmap + + cax = axes.scatter( + dt[:, 0], + dt[:, 1], + **scatter_kwargs, + ) + + if colorbar: + plt.colorbar(cax, ax=axes) + return axes + except ImportError: + 
logger.error("matplotlib not installed!") + return None + + def show_gene_expression( + self, gene: str, avg: bool = True, axes: Any | None = None, **kwargs: Any + ) -> tuple[Any, NDArray[np.floating[Any]]] | None: + """Display a gene's expression pattern. + + Parameters + ---------- + gene : str + Gene name to display. + avg : bool, optional + If True, use k-nearest-neighbor averaged expression. Default is True. + axes : matplotlib.axes.Axes | None, optional + Existing axes to plot on. + **kwargs + Additional arguments passed to scatter(). + + Returns + ------- + tuple[Axes, NDArray] | None + The axes object and expression values, or None if gene not found. + """ + all_gene_names = np.array(list(self.adata.var_names)) + cell_names = np.array(list(self.adata.obs_names)) + all_cell_names = np.array(list(self.adata_raw.obs_names)) + idx2 = np.where(np.isin(all_cell_names, cell_names))[0] + idx = np.where(all_gene_names == gene)[0] + name = gene + if idx.size == 0: + logger.warning( + "Gene not found in the filtered dataset. Note that genes are case sensitive." 
+ ) + return None + + if avg: + a = self.adata.layers["X_knn_avg"][:, idx].toarray().flatten() + if a.sum() == 0: + a = self.adata_raw.X[:, idx].toarray().flatten()[idx2] + norm = self.preprocess_args.get("norm", "log") + if norm is not None: + if norm.lower() == "log": + a = np.log2(a + 1) + elif norm.lower() == "ftt": + a = np.sqrt(a) + np.sqrt(a + 1) + elif norm.lower() == "asin": + a = np.arcsinh(a) + else: + a = self.adata_raw.X[:, idx].toarray().flatten()[idx2] + norm = self.preprocess_args.get("norm", "log") + if norm is not None: + if norm.lower() == "log": + a = np.log2(a + 1) + elif norm.lower() == "ftt": + a = np.sqrt(a) + np.sqrt(a + 1) + elif norm.lower() == "asin": + a = np.arcsinh(a) + + axes = self.scatter(c=a, axes=axes, **kwargs) + if axes is not None: + axes.set_title(name) + + return axes, a + + def dispersion_ranking_NN( + self, + nnm: sp.spmatrix | None = None, + num_norm_avg: int = 50, + weight_mode: Literal["dispersion", "variance", "rms", "combined"] = "combined", + save_avgs: bool = False, + adata: AnnData | None = None, + ) -> NDArray[np.float64]: + """Compute spatial dispersion factors for each gene. + + Parameters + ---------- + nnm : scipy.sparse.spmatrix | None, optional + Cell-to-cell nearest-neighbor matrix. If None, uses + adata.obsp['connectivities']. + num_norm_avg : int, optional + Number of top dispersions to average for normalization. Default is 50. + weight_mode : str, optional + Weight calculation method. One of 'dispersion', 'variance', 'rms', + 'combined'. Default is 'combined'. + save_avgs : bool, optional + If True, save kNN-averaged values to layers['X_knn_avg']. Default is False. + adata : AnnData | None, optional + AnnData object to use. If None, uses self.adata. + + Returns + ------- + NDArray[np.float64] + Vector of gene weights. 
+ """ + if adata is None: + adata = self.adata + + if nnm is None: + nnm = adata.obsp["connectivities"] + f = np.asarray(nnm.sum(1)) + f[f == 0] = 1 + D_avg = (nnm.multiply(1 / f)).dot(adata.layers["X_disp"]) + + if save_avgs: + adata.layers["X_knn_avg"] = D_avg.copy() + + if sp.issparse(D_avg): + mu, var = sf.mean_variance_axis(D_avg, axis=0) + if weight_mode == "rms": + D_avg.data[:] = D_avg.data**2 + mu, _ = sf.mean_variance_axis(D_avg, axis=0) + mu = mu**0.5 + + if weight_mode == "combined": + D_avg.data[:] = D_avg.data**2 + mu2, _ = sf.mean_variance_axis(D_avg, axis=0) + mu2 = mu2**0.5 + else: + mu = D_avg.mean(0) + var = D_avg.var(0) + if weight_mode == "rms": + mu = (D_avg**2).mean(0) ** 0.5 + if weight_mode == "combined": + mu2 = (D_avg**2).mean(0) ** 0.5 + + if not save_avgs: + del D_avg + gc.collect() + + if weight_mode in ("dispersion", "rms", "combined"): + dispersions = np.zeros(var.size) + dispersions[mu > 0] = var[mu > 0] / mu[mu > 0] + adata.var["spatial_dispersions"] = dispersions.copy() + + if weight_mode == "combined": + dispersions2 = np.zeros(var.size) + dispersions2[mu2 > 0] = var[mu2 > 0] / mu2[mu2 > 0] + + elif weight_mode == "variance": + dispersions = var + adata.var["spatial_variances"] = dispersions.copy() + else: + raise InvalidParameterError( + "weight_mode", + weight_mode, + valid_values=["dispersion", "variance", "rms", "combined"], + ) + + ma = np.sort(dispersions)[-num_norm_avg:].mean() + dispersions[dispersions >= ma] = ma + + weights = ((dispersions / dispersions.max()) ** 0.5).flatten() + + if weight_mode == "combined": + ma = np.sort(dispersions2)[-num_norm_avg:].mean() + dispersions2[dispersions2 >= ma] = ma + + weights2 = ((dispersions2 / dispersions2.max()) ** 0.5).flatten() + weights = np.vstack((weights, weights2)).max(0) + + return weights + + def run( + self, + max_iter: int = 10, + verbose: bool = True, + projection: Literal["umap", "tsne", "diff_umap"] | None = "umap", + stopping_condition: float = 1e-2, + 
num_norm_avg: int = 50, + k: int = 20, + distance: Literal["correlation", "euclidean", "cosine"] = "cosine", + preprocessing: Literal["StandardScaler", "Normalizer"] | None = "StandardScaler", + npcs: int = 150, + n_genes: int | None = 3000, + weight_PCs: bool = False, + sparse_pca: bool = False, + proj_kwargs: dict[str, Any] | None = None, + seed: int = 0, + weight_mode: Literal["dispersion", "variance", "rms", "combined"] = "rms", + components: NDArray[np.floating[Any]] | None = None, + batch_key: str | None = None, + ) -> None: + """Run the Self-Assembling Manifold algorithm. + + Parameters + ---------- + max_iter : int, optional + Maximum number of iterations. Default is 10. + verbose : bool, optional + If True, print progress. Default is True. + projection : str | None, optional + Projection method: 'umap', 'tsne', 'diff_umap', or None. Default is 'umap'. + stopping_condition : float, optional + RMSE threshold for convergence. Default is 1e-2. + num_norm_avg : int, optional + Top dispersions to average for normalization. Default is 50. + k : int, optional + Number of nearest neighbors. Default is 20. + distance : str, optional + Distance metric: 'correlation', 'euclidean', 'cosine'. Default is 'cosine'. + preprocessing : str | None, optional + Preprocessing method: 'StandardScaler', 'Normalizer', None. Default is 'StandardScaler'. + npcs : int, optional + Number of principal components. Default is 150. + n_genes : int | None, optional + Number of genes to use. Default is 3000. If None, uses all genes. + weight_PCs : bool, optional + Weight PCs by eigenvalues. Default is False. + sparse_pca : bool, optional + Use sparse PCA implementation. Default is False. + proj_kwargs : dict | None, optional + Additional arguments for projection. Default is None. + seed : int, optional + Random seed. Default is 0. + weight_mode : str, optional + Weight calculation mode. Default is 'rms'. + components : NDArray | None, optional + Pre-computed PCA components. Default is None. 
+ batch_key : str | None, optional + Key in obs for batch correction with Harmony. Default is None. + """ + if proj_kwargs is None: + proj_kwargs = {} + + D = self.adata.X + if k < 5: + k = 5 + if k > D.shape[0] - 1: + k = D.shape[0] - 2 + + if preprocessing not in ("StandardScaler", "Normalizer", None, "None"): + raise InvalidParameterError( + "preprocessing", + preprocessing, + valid_values=["StandardScaler", "Normalizer", None], + ) + if weight_mode not in ("dispersion", "variance", "rms", "combined"): + raise InvalidParameterError( + "weight_mode", + weight_mode, + valid_values=["dispersion", "variance", "rms", "combined"], + ) + + if self.adata.layers["X_disp"].min() < 0 and weight_mode == "dispersion": + logger.warning( + "`X_disp` layer contains negative values. Setting `weight_mode` to 'rms'." + ) + weight_mode = "rms" + + numcells = D.shape[0] + + if n_genes is None: + n_genes = self.adata.shape[1] + if not sparse_pca and numcells > 10000: + warnings.warn( + "All genes are being used. It is recommended " + "to set `sparse_pca=True` to satisfy memory " + "constraints for datasets with more than " + "10,000 cells. Setting `sparse_pca` to True." 
+ ) + sparse_pca = True + + if not sparse_pca: + n_genes = min(n_genes, (D.sum(0) > 0).sum()) + + self.run_args = { + "max_iter": max_iter, + "verbose": verbose, + "projection": projection, + "stopping_condition": stopping_condition, + "num_norm_avg": num_norm_avg, + "k": k, + "distance": distance, + "preprocessing": preprocessing, + "npcs": npcs, + "n_genes": n_genes, + "weight_PCs": weight_PCs, + "proj_kwargs": proj_kwargs, + "sparse_pca": sparse_pca, + "weight_mode": weight_mode, + "seed": seed, + "components": components, + } + self.adata.uns["run_args"] = self.run_args + + tinit = time.time() + np.random.seed(seed) + + if verbose: + logger.info("Running SAM algorithm") + + W = np.ones(D.shape[1]) + self.adata.var["weights"] = W + + old = np.zeros(W.size) + new = W + + i = 0 + err = ((new - old) ** 2).mean() ** 0.5 + + if max_iter < 5: + max_iter = 5 + + nnas = num_norm_avg + + while i < max_iter and err > stopping_condition: + conv = err + if verbose: + logger.info("Iteration: %d, Convergence: %.6f", i, conv) + + i += 1 + old = new + first = i == 1 + + W = self.calculate_nnm( + batch_key=batch_key, + n_genes=n_genes, + preprocessing=preprocessing, + npcs=npcs, + num_norm_avg=nnas, + weight_PCs=weight_PCs, + sparse_pca=sparse_pca, + weight_mode=weight_mode, + seed=seed, + components=components, + first=first, + ) + gc.collect() + new = W + err = ((new - old) ** 2).mean() ** 0.5 + self.adata.var["weights"] = W + + all_gene_names = np.array(list(self.adata.var_names)) + indices = np.argsort(-W) + ranked_genes = all_gene_names[indices] + + self.adata.uns["ranked_genes"] = ranked_genes + + if projection == "tsne": + if verbose: + logger.info("Computing the t-SNE embedding...") + self.run_tsne(**proj_kwargs) + elif projection == "umap": + if verbose: + logger.info("Computing the UMAP embedding...") + self.run_umap(seed=seed, **proj_kwargs) + elif projection == "diff_umap": + if verbose: + logger.info("Computing the diffusion UMAP embedding...") + 
self.run_diff_umap(**proj_kwargs) + + elapsed = time.time() - tinit + if verbose: + logger.info("Elapsed time: %.2f seconds", elapsed) + + def calculate_nnm( + self, + adata: AnnData | None = None, + batch_key: str | None = None, + g_weighted: NDArray[np.floating[Any]] | None = None, + n_genes: int = 3000, + preprocessing: str | None = "StandardScaler", + npcs: int = 150, + num_norm_avg: int = 50, + weight_PCs: bool = False, + sparse_pca: bool = False, + update_manifold: bool = True, + weight_mode: str = "dispersion", + seed: int = 0, + components: NDArray[np.floating[Any]] | None = None, + first: bool = False, + ) -> NDArray[np.float64] | tuple[NDArray[np.floating[Any]], NDArray[np.floating[Any]]]: + """Calculate nearest neighbor matrix and update weights. + + This is the core iteration step of the SAM algorithm. + + Parameters + ---------- + adata : AnnData | None + AnnData object to use. + batch_key : str | None + Key for batch correction. + g_weighted : NDArray | None + Pre-computed weighted coordinates. + n_genes : int + Number of genes to use. + preprocessing : str | None + Preprocessing method. + npcs : int + Number of PCs. + num_norm_avg : int + Normalization averaging. + weight_PCs : bool + Weight by eigenvalues. + sparse_pca : bool + Use sparse PCA. + update_manifold : bool + Update manifold structure. + weight_mode : str + Weight calculation mode. + seed : int + Random seed. + components : NDArray | None + Pre-computed components. + first : bool + Is this the first iteration. + + Returns + ------- + NDArray | tuple + Gene weights, or (PCs, weighted_coords) if not updating manifold. 
+ """ + if adata is None: + adata = self.adata + + numcells = adata.shape[0] + k = adata.uns["run_args"].get("k", 20) + distance = adata.uns["run_args"].get("distance", "correlation") + + D = adata.X + W = adata.var["weights"].values + + if "means" not in adata.var.keys() or "variances" not in adata.var.keys(): + self.calculate_mean_var(adata) + + if n_genes is None: + gkeep = np.arange(W.size) + else: + if first: + mu = np.array(list(adata.var["means"])) + var = np.array(list(adata.var["variances"])) + mu[mu == 0] = 1 + dispersions = var / mu + gkeep = np.sort(np.argsort(-dispersions)[:n_genes]) + else: + gkeep = np.sort(np.argsort(-W)[:n_genes]) + + if g_weighted is None: + if preprocessing == "Normalizer": + Ds = D[:, gkeep] + if sp.issparse(Ds) and not sparse_pca: + Ds = Ds.toarray() + + Ds = Normalizer().fit_transform(Ds) + + elif preprocessing == "StandardScaler": + if not sparse_pca: + Ds = D[:, gkeep] + if sp.issparse(Ds): + Ds = Ds.toarray() + + v = adata.var["variances"].values[gkeep] + m = adata.var["means"].values[gkeep] + v[v == 0] = 1 + Ds = (Ds - m) / v**0.5 + + Ds[Ds > 10] = 10 + Ds[Ds < -10] = -10 + else: + Ds = D[:, gkeep] + v = adata.var["variances"].values[gkeep] + v[v == 0] = 1 + Ds = Ds.multiply(1 / v**0.5).tocsr() + + else: + Ds = D[:, gkeep].toarray() + + if sp.issparse(Ds): + D_sub = Ds.multiply(W[gkeep]).tocsr() + else: + D_sub = Ds * (W[gkeep]) + + if components is None: + if not sparse_pca: + npcs = min(npcs, min((D.shape[0], gkeep.size))) + if numcells > 500: + g_weighted, pca = ut.weighted_PCA( + D_sub, + npcs=npcs, + do_weight=weight_PCs, + solver="auto", + seed=seed, + ) + else: + g_weighted, pca = ut.weighted_PCA( + D_sub, + npcs=npcs, + do_weight=weight_PCs, + solver="full", + seed=seed, + ) + components = pca.components_ + + else: + npcs = min(npcs, min((D.shape[0], gkeep.size)) - 1) + v = adata.var["variances"].values[gkeep] + v[v == 0] = 1 + m = adata.var["means"].values[gkeep] * W[gkeep] + if preprocessing == "StandardScaler": 
+ no = m / v**0.5 + else: + no = np.asarray(D_sub.mean(0)).flatten() + mean_correction = no + output = ut._pca_with_sparse(D_sub, npcs, mu=(no)[None, :], seed=seed) + components = output["components"] + g_weighted = output["X_pca"] + + if weight_PCs: + ev = output["variance"] + ev = ev / ev.max() + g_weighted = g_weighted * (ev**0.5) + else: + components = components[:, gkeep] + v = adata.var["variances"].values[gkeep] + v[v == 0] = 1 + m = adata.var["means"].values[gkeep] * W[gkeep] + if preprocessing == "StandardScaler": + ns = m / v**0.5 + else: + ns = np.asarray(D_sub.mean(0)).flatten() + mean_correction = ns + + if sp.issparse(D_sub): + g_weighted = D_sub.dot(components.T) - ns.flatten().dot(components.T) + else: + g_weighted = (D_sub - ns).dot(components.T) + if weight_PCs: + ev = g_weighted.var(0) + ev = ev / ev.max() + g_weighted = g_weighted * (ev**0.5) + + adata.varm["PCs"] = np.zeros(shape=(adata.n_vars, npcs)) + adata.varm["PCs"][gkeep] = components.T + adata.obsm["X_processed"] = D_sub + adata.uns["dimred_indices"] = gkeep + if sparse_pca: + mc = np.zeros(adata.shape[1]) + mc[gkeep] = mean_correction + adata.var["mean_correction"] = mc + + if batch_key is not None: + try: + import harmonypy + + harmony_out = harmonypy.run_harmony(g_weighted, adata.obs, batch_key, verbose=False) + g_weighted = harmony_out.Z_corr.T + except ImportError as err: + raise ImportError( + "harmonypy is required for batch correction. 
" + "Install it with: pip install harmonypy" + ) from err + + if update_manifold: + edm = ut.calc_nnm(g_weighted, k, distance) + # Use lil format temporarily for setdiag, then convert back to csr + edm_lil = edm.tolil() + edm_lil.setdiag(0) + adata.obsp["distances"] = edm_lil.tocsr() + + EDM = edm.copy() + EDM.data[:] = 1 + EDM_lil = EDM.tolil() + EDM_lil.setdiag(1) + EDM = EDM_lil.tocsr() + + adata.obsp["connectivities"] = EDM + + if distance in ("correlation", "cosine"): + edm.data[:] = 1 - edm.data + edm_lil = edm.tolil() + edm_lil.setdiag(1) + edm = edm_lil.tocsr() + edm.data[edm.data < 0] = 0.001 + adata.obsp["nnm"] = edm + else: + adata.obsp["nnm"] = EDM + W = self.dispersion_ranking_NN( + EDM, weight_mode=weight_mode, num_norm_avg=num_norm_avg, adata=adata + ) + adata.obsm["X_pca"] = g_weighted + return W + else: + logger.info("Not updating the manifold...") + PCs = np.zeros(shape=(adata.n_vars, npcs)) + PCs[gkeep] = components.T + return PCs, g_weighted + + def run_tsne( + self, + X: NDArray[np.floating[Any]] | None = None, + metric: str = "correlation", + **kwargs: Any, + ) -> NDArray[np.floating[Any]] | None: + """Compute t-SNE embedding. + + Parameters + ---------- + X : NDArray | None, optional + Input data. If None, uses X_pca from adata. + metric : str, optional + Distance metric. Default is 'correlation'. + **kwargs + Additional arguments passed to sklearn.manifold.TSNE. + + Returns + ------- + NDArray | None + t-SNE coordinates if X provided, None otherwise. + """ + if X is not None: + dt = man.TSNE(metric=metric, **kwargs).fit_transform(X) + return dt + + distance = self.adata.uns["run_args"].get("distance", "correlation") + dt = man.TSNE(metric=distance, **kwargs).fit_transform(self.adata.obsm["X_pca"]) + self.adata.obsm["X_tsne"] = dt + return None + + def run_umap( + self, + X: str | NDArray[np.floating[Any]] = "X_pca", + metric: str | None = None, + seed: int | None = None, + **kwargs: Any, + ) -> Any: + """Compute UMAP embedding. 
+ + Parameters + ---------- + X : str | NDArray, optional + Key in obsm or data array. Default is 'X_pca'. + metric : str | None, optional + Distance metric. If None, uses value from run_args. + seed : int | None, optional + Random seed. If None, uses value from run_args. + **kwargs + Additional arguments passed to umap.UMAP. + + Returns + ------- + umap.UMAP | tuple + UMAP object if using key, or (coords, UMAP) if using array. + """ + import umap + + if metric is None: + metric = self.adata.uns["run_args"].get("distance", "correlation") + + if seed is None: + seed = self.adata.uns["run_args"].get("seed", 0) + + if isinstance(X, str): + if X == "": + X_data = self.adata.X + else: + X_data = self.adata.obsm[X] + umap_obj = umap.UMAP(metric=metric, random_state=seed, **kwargs) + umap2d = umap_obj.fit_transform(X_data) + self.adata.obsm["X_umap"] = umap2d + return umap_obj + else: + umap_obj = umap.UMAP(metric=metric, random_state=seed, **kwargs) + dt = umap_obj.fit_transform(X) + return dt, umap_obj + + def run_diff_umap( + self, + use_rep: str = "X_pca", + metric: str = "euclidean", + n_comps: int = 15, + method: str = "gauss", + **kwargs: Any, + ) -> None: + """Compute diffusion UMAP embedding. + + Requires scanpy. + + Parameters + ---------- + use_rep : str, optional + Key in obsm to use. Default is 'X_pca'. + metric : str, optional + Distance metric. Default is 'euclidean'. + n_comps : int, optional + Number of diffusion components. Default is 15. + method : str, optional + Method for scanpy neighbors. Default is 'gauss'. + **kwargs + Additional arguments. 
+ """ + import scanpy.api as sc + + k = self.adata.uns["run_args"].get("k", 20) + distance = self.adata.uns["run_args"].get("distance", "correlation") + sc.pp.neighbors(self.adata, use_rep=use_rep, n_neighbors=k, metric=distance, method=method) + sc.tl.diffmap(self.adata, n_comps=n_comps) + sc.pp.neighbors( + self.adata, + use_rep="X_diffmap", + n_neighbors=k, + metric="euclidean", + method=method, + ) + + if "X_umap" in self.adata.obsm.keys(): + temp = self.adata.obsm["X_umap"].copy() + + sc.tl.umap(self.adata, min_dist=0.1, copy=False) + temp2 = self.adata.obsm["X_umap"] + self.adata.obsm["X_umap"] = temp + self.adata.obsm["X_diff_umap"] = temp2 + + def run_diff_map( + self, + use_rep: str = "X_pca", + metric: str = "euclidean", + n_comps: int = 15, + method: str = "gauss", + **kwargs: Any, + ) -> None: + """Compute diffusion map embedding. + + Requires scanpy. + + Parameters + ---------- + use_rep : str, optional + Key in obsm to use. Default is 'X_pca'. + metric : str, optional + Distance metric. Default is 'euclidean'. + n_comps : int, optional + Number of components. Default is 15. + method : str, optional + Method for neighbors. Default is 'gauss'. + **kwargs + Additional arguments. + """ + import scanpy.api as sc + + k = self.adata.uns["run_args"].get("k", 20) + distance = self.adata.uns["run_args"].get("distance", "correlation") + sc.pp.neighbors(self.adata, use_rep=use_rep, n_neighbors=k, metric=distance, method=method) + sc.tl.diffmap(self.adata, n_comps=n_comps + 1) + self.adata.obsm["X_diffmap"] = self.adata.obsm["X_diffmap"][:, 1:] + + def density_clustering( + self, + X: NDArray[np.floating[Any]] | None = None, + eps: float = 1, + metric: str = "euclidean", + **kwargs: Any, + ) -> NDArray[np.int64] | None: + """Perform DBSCAN clustering. + + Parameters + ---------- + X : NDArray | None, optional + Input coordinates. If None, uses X_umap. + eps : float, optional + DBSCAN epsilon parameter. Default is 1. + metric : str, optional + Distance metric. 
Default is 'euclidean'. + **kwargs + Additional arguments passed to DBSCAN. + + Returns + ------- + NDArray | None + Cluster labels if X provided, None otherwise. + """ + from sklearn.cluster import DBSCAN + + if X is None: + X = self.adata.obsm["X_umap"] + save = True + else: + save = False + + cl = DBSCAN(eps=eps, metric=metric, **kwargs).fit_predict(X) + k = self.adata.uns["run_args"].get("k", 20) + idx0 = np.where(cl != -1)[0] + idx1 = np.where(cl == -1)[0] + if idx1.size > 0 and idx0.size > 0: + xcmap = ut.generate_euclidean_map(X[idx0, :], X[idx1, :]) + knn = np.argsort(xcmap.T, axis=1)[:, :k] + nnm = np.zeros(xcmap.shape).T + nnm[ + np.tile(np.arange(knn.shape[0])[:, None], (1, knn.shape[1])).flatten(), + knn.flatten(), + ] = 1 + nnmc = np.zeros((nnm.shape[0], cl.max() + 1)) + for i in range(cl.max() + 1): + nnmc[:, i] = nnm[:, cl[idx0] == i].sum(1) + + cl[idx1] = np.argmax(nnmc, axis=1) + + if save: + self.adata.obs["dbscan_clusters"] = pd.Categorical(cl) + return None + return cl + + def clustering( + self, + X: sp.spmatrix | NDArray[np.floating[Any]] | None = None, + param: float | int | None = None, + method: Literal[ + "leiden", "leiden_sig", "louvain", "louvain_sig", "kmeans", "hdbscan", "dbscan" + ] = "leiden", + ) -> NDArray[np.int64] | None: + """Wrapper for various clustering algorithms. + + Parameters + ---------- + X : sparse matrix | NDArray | None, optional + Input data. Type depends on method. If None, uses internal data. + param : float | int | None, optional + Method-specific parameter. Resolution for leiden/louvain, + number of clusters for kmeans, eps for dbscan. + method : str, optional + Clustering method. Default is 'leiden'. + + Returns + ------- + NDArray | None + Cluster labels if X provided, None otherwise. 
+ """ + if method == "leiden": + if param is None: + param = 1 + return self.leiden_clustering(X=X, res=param, method="modularity") + elif method == "leiden_sig": + if param is None: + param = 1 + return self.leiden_clustering(X=X, res=param, method="significance") + elif method == "louvain": + if param is None: + param = 1 + return self.louvain_clustering(X=X, res=param, method="modularity") + elif method == "louvain_sig": + if param is None: + param = 1 + return self.louvain_clustering(X=X, res=param, method="significance") + elif method == "kmeans": + if param is None: + param = 6 + return self.kmeans_clustering(int(param), X=X)[0] + elif method == "hdbscan": + if param is None: + param = 25 + return self.hdbknn_clustering(npcs=int(param)) + elif method == "dbscan": + if param is None: + param = 0.5 + return self.density_clustering(eps=param) + return None + + def louvain_clustering( + self, + X: sp.spmatrix | None = None, + res: float = 1, + method: Literal["modularity", "significance"] = "modularity", + ) -> NDArray[np.int64] | None: + """Perform Louvain clustering. + + Requires louvain and igraph packages. + + Parameters + ---------- + X : sparse matrix | None, optional + Adjacency matrix. If None, uses connectivities. + res : float, optional + Resolution parameter. Default is 1. + method : str, optional + Optimization method. Default is 'modularity'. + + Returns + ------- + NDArray | None + Cluster labels if X provided, None otherwise. 
+ """ + if X is None: + X = self.adata.obsp["connectivities"] + save = True + else: + if not sp.isspmatrix_csr(X): + X = sp.csr_matrix(X) + save = False + + import igraph as ig + import louvain + + adjacency = X + sources, targets = adjacency.nonzero() + weights = adjacency[sources, targets] + if isinstance(weights, np.matrix): + weights = np.asarray(weights).flatten() + g = ig.Graph(directed=True) + g.add_vertices(adjacency.shape[0]) + g.add_edges(list(zip(sources, targets, strict=False))) + try: + g.es["weight"] = weights + except (ValueError, TypeError): + pass + + if method == "significance": + cl = louvain.find_partition(g, louvain.SignificanceVertexPartition) + else: + cl = louvain.find_partition( + g, louvain.RBConfigurationVertexPartition, resolution_parameter=res + ) + + if save: + if method == "modularity": + self.adata.obs["louvain_clusters"] = pd.Categorical(np.array(cl.membership)) + elif method == "significance": + self.adata.obs["louvain_sig_clusters"] = pd.Categorical(np.array(cl.membership)) + return None + return np.array(cl.membership) + + def kmeans_clustering( + self, + numc: int, + X: NDArray[np.floating[Any]] | None = None, + npcs: int = 25, + ) -> tuple[NDArray[np.int64], Any]: + """Perform k-means clustering. + + Parameters + ---------- + numc : int + Number of clusters. + X : NDArray | None, optional + Input coordinates. If None, uses X_pca. + npcs : int, optional + Unused parameter for backward compatibility. 
+ + Returns + ------- + tuple + (cluster_labels, kmeans_object) + """ + from sklearn.cluster import KMeans + + if X is None: + X = self.adata.obsm["X_pca"] + + km = KMeans(n_clusters=numc) + cl = km.fit_predict(Normalizer().fit_transform(X)) + + self.adata.obs["kmeans_clusters"] = pd.Categorical(cl) + return cl, km + + def leiden_clustering( + self, + X: sp.spmatrix | None = None, + res: float = 1, + method: Literal["modularity", "significance"] = "modularity", + seed: int = 0, + ) -> NDArray[np.int64] | None: + """Perform Leiden clustering. + + Requires leidenalg and igraph packages. + + Parameters + ---------- + X : sparse matrix | None, optional + Adjacency matrix. If None, uses connectivities. + res : float, optional + Resolution parameter. Default is 1. + method : str, optional + Optimization method. Default is 'modularity'. + seed : int, optional + Random seed. Default is 0. + + Returns + ------- + NDArray | None + Cluster labels if X provided, None otherwise. + """ + if X is None: + X = self.adata.obsp["connectivities"] + save = True + else: + if not sp.isspmatrix_csr(X): + X = sp.csr_matrix(X) + save = False + + import igraph as ig + import leidenalg + + adjacency = X + sources, targets = adjacency.nonzero() + weights = adjacency[sources, targets] + if isinstance(weights, np.matrix): + weights = np.asarray(weights).flatten() + g = ig.Graph(directed=True) + g.add_vertices(adjacency.shape[0]) + g.add_edges(list(zip(sources, targets, strict=False))) + try: + g.es["weight"] = weights + except (ValueError, TypeError): + pass + + if method == "significance": + cl = leidenalg.find_partition(g, leidenalg.SignificanceVertexPartition, seed=seed) + else: + cl = leidenalg.find_partition( + g, leidenalg.RBConfigurationVertexPartition, resolution_parameter=res, seed=seed + ) + + if save: + if method == "modularity": + self.adata.obs["leiden_clusters"] = pd.Categorical(np.array(cl.membership)) + elif method == "significance": + self.adata.obs["leiden_sig_clusters"] = 
pd.Categorical(np.array(cl.membership)) + return None + return np.array(cl.membership) + + def hdbknn_clustering( + self, + X: NDArray[np.floating[Any]] | None = None, + k: int | None = None, + npcs: int = 15, + **kwargs: Any, + ) -> NDArray[np.int64] | None: + """Perform HDBSCAN clustering. + + Requires hdbscan package. + + Parameters + ---------- + X : NDArray | None, optional + Input coordinates. If None, uses X_pca. + k : int | None, optional + Number of neighbors for unassigned cells. If None, uses run_args k. + npcs : int, optional + Unused parameter for backward compatibility. + **kwargs + Additional arguments passed to HDBSCAN. + + Returns + ------- + NDArray | None + Cluster labels if X provided, None otherwise. + """ + import hdbscan + + if X is None: + X = self.adata.obsm["X_pca"] + X = Normalizer().fit_transform(X) + save = True + else: + save = False + + if k is None: + k = self.adata.uns["run_args"].get("k", 20) + + hdb = hdbscan.HDBSCAN(metric="euclidean", **kwargs) + + cl = hdb.fit_predict(X) + + idx0 = np.where(cl != -1)[0] + idx1 = np.where(cl == -1)[0] + if idx1.size > 0 and idx0.size > 0: + xcmap = ut.generate_euclidean_map(X[idx0, :], X[idx1, :]) + knn = np.argsort(xcmap.T, axis=1)[:, :k] + nnm = np.zeros(xcmap.shape).T + nnm[ + np.tile(np.arange(knn.shape[0])[:, None], (1, knn.shape[1])).flatten(), + knn.flatten(), + ] = 1 + nnmc = np.zeros((nnm.shape[0], cl.max() + 1)) + for i in range(cl.max() + 1): + nnmc[:, i] = nnm[:, cl[idx0] == i].sum(1) + + cl[idx1] = np.argmax(nnmc, axis=1) + + if save: + self.adata.obs["hdbscan_clusters"] = pd.Categorical(cl) + return None + return cl + + def identify_marker_genes_rf( + self, + labels: str | NDArray[Any] | None = None, + clusters: int | str | Sequence[int | str] | None = None, + n_genes: int = 4000, + ) -> tuple[dict[Any, NDArray[Any]], dict[Any, NDArray[np.floating[Any]]]]: + """Identify marker genes using random forest classification. 
+ + Parameters + ---------- + labels : str | NDArray | None, optional + Cluster labels or key in obs. If None, auto-detects clusters. + clusters : int | str | Sequence | None, optional + Specific cluster(s) to analyze. If None, analyzes all. + n_genes : int, optional + Number of genes for classifier training. Default is 4000. + + Returns + ------- + tuple + (markers dict, scores dict) mapping cluster IDs to gene arrays. + """ + if labels is None: + keys = np.array(list(self.adata.obs_keys())) + matches = ut.search_string(keys, "_clusters") + if matches[0] == -1: + logger.error( + "Please generate cluster labels first or set the 'labels' keyword argument." + ) + return {}, {} + lbls = self.get_labels(matches[0][0]) + elif isinstance(labels, str): + lbls = self.get_labels(labels) + else: + lbls = labels + + from sklearn.ensemble import RandomForestClassifier + + markers: dict[Any, NDArray[Any]] = {} + markers_scores: dict[Any, NDArray[np.floating[Any]]] = {} + if clusters is None: + lblsu = np.unique(lbls) + else: + lblsu = np.unique(clusters) + + indices = np.argsort(-self.adata.var["weights"].values) + X = self.adata.layers["X_disp"][:, indices[:n_genes]].toarray() + for K in range(lblsu.size): + y = np.zeros(lbls.size) + + y[lbls == lblsu[K]] = 1 + + clf = RandomForestClassifier(n_estimators=100, max_depth=None, random_state=0) + + clf.fit(X, y) + + idx = np.argsort(-clf.feature_importances_) + + markers[lblsu[K]] = self.adata.uns["ranked_genes"][idx] + markers_scores[lblsu[K]] = clf.feature_importances_[idx] + + if clusters is None: + if isinstance(labels, str): + self.adata.uns["rf_" + labels] = markers + else: + self.adata.uns["rf"] = markers + + return markers, markers_scores + + def identify_marker_genes_sw( + self, + labels: str | NDArray[Any] | None = None, + clusters: int | str | Sequence[int | str] | None = None, + inplace: bool = True, + ) -> pd.DataFrame | None: + """Identify marker genes using spatial dispersion weights. 
+ + Parameters + ---------- + labels : str | NDArray | None, optional + Cluster labels or key in obs. + clusters : int | str | Sequence | None, optional + Specific cluster(s) to analyze. + inplace : bool, optional + If True, stores scores in var. Default is True. + + Returns + ------- + pd.DataFrame | None + DataFrame of scores if inplace=False, None otherwise. + """ + if labels is None: + keys = np.array(list(self.adata.obs_keys())) + matches = ut.search_string(keys, "_clusters") + if matches[0] == -1: + logger.error( + "Please generate cluster labels first or set the 'labels' keyword argument." + ) + return None + lbls = self.get_labels(matches[0][0]) + elif isinstance(labels, str): + lbls = self.get_labels(labels) + else: + lbls = labels + + markers_scores = [] + if clusters is None: + lblsu = np.unique(lbls) + else: + lblsu = np.unique(clusters) + + if "X_knn_avg" not in list(self.adata.layers.keys()): + logger.info("Performing kNN-averaging...") + self.dispersion_ranking_NN(save_avgs=True) + layer = self.adata.layers["X_knn_avg"] + m = np.asarray(layer.mean(0)).flatten() + cells = np.array(list(self.adata.obs_names)) + for K in range(lblsu.size): + selected = np.where(np.isin(cells, self.get_cells(lblsu[K], labels)))[0] + ms = np.asarray(layer[selected, :].mean(0)).flatten() + lsub = layer[selected, :] + lsub.data[:] = lsub.data**2 + ms2 = np.asarray(lsub.mean(0)).flatten() + v = ms2 - 2 * ms * m + m**2 + wmu = np.zeros(v.size) + wmu[m > 0] = v[m > 0] / m[m > 0] + markers_scores.append(wmu) + A = pd.DataFrame( + data=np.vstack(markers_scores), index=lblsu, columns=self.adata.var_names + ).T + if inplace: + A.columns = labels + ";;" + A.columns.astype("str").astype("object") + for Ac in A.columns: + self.adata.var[Ac] = A[Ac] + return None + return A + + def identify_marker_genes_ratio( + self, labels: str | NDArray[Any] | None = None + ) -> dict[Any, NDArray[Any]]: + """Identify marker genes using SAM-weighted expression ratio. 
+ + Parameters + ---------- + labels : str | NDArray | None, optional + Cluster labels or key in obs. + + Returns + ------- + dict + Mapping cluster IDs to ranked gene arrays. + """ + if labels is None: + keys = np.array(list(self.adata.obs_keys())) + matches = ut.search_string(keys, "_clusters") + if matches[0] == -1: + logger.error( + "Please generate cluster labels first or set the 'labels' keyword argument." + ) + return {} + lbls = self.get_labels(matches[0][0]) + elif isinstance(labels, str): + lbls = self.get_labels(labels) + else: + lbls = labels + + all_gene_names = np.array(list(self.adata.var_names)) + + markers: dict[Any, NDArray[Any]] = {} + + s = np.array(self.adata.layers["X_disp"].sum(0)).flatten() + lblsu = np.unique(lbls) + for i in lblsu: + d = np.array(self.adata.layers["X_disp"][lbls == i, :].sum(0)).flatten() + rat = np.zeros(d.size) + rat[s > 0] = d[s > 0] ** 2 / s[s > 0] * self.adata.var["weights"].values[s > 0] + x = np.argsort(-rat) + markers[i] = all_gene_names[x[:]] + + self.adata.uns["marker_genes_ratio"] = markers + + return markers + + def save(self, fn: str) -> None: + """Save the SAM object to a pickle file. + + Parameters + ---------- + fn : str + Output file path. + """ + import dill + + if len(fn.split(".pkl")) == 1: + fn = fn + ".pkl" + self.path_to_file = fn + d = {} + for k in self.__dict__.keys(): + d[k] = self.__dict__[k] + with open(fn, "wb") as f: + dill.dump(d, f) + + def load(self, fn: str) -> None: + """Load a SAM object from a pickle file. + + Parameters + ---------- + fn : str + Input file path. + """ + import dill + + if len(fn.split(".pkl")) == 1: + fn = fn + ".pkl" + with open(fn, "rb") as f: + self.__dict__ = dill.load(f) diff --git a/src/samalg/utilities.py b/src/samalg/utilities.py new file mode 100644 index 0000000..7c86035 --- /dev/null +++ b/src/samalg/utilities.py @@ -0,0 +1,844 @@ +"""Utility functions for the SAM algorithm. 
+ +This module provides helper functions for PCA, nearest neighbor computation, +gene correlation analysis, and other common operations. +""" + +from __future__ import annotations + +import errno +import os +from typing import TYPE_CHECKING, Any + +import numpy as np +import scipy as sp +import sklearn.utils.sparsefuncs as sf +from scipy import sparse +from sklearn.decomposition import PCA +from sklearn.utils import check_array, check_random_state +from sklearn.utils.extmath import svd_flip +from umap.umap_ import nearest_neighbors + +from ._logging import get_logger + +if TYPE_CHECKING: + from numpy.typing import NDArray + + from .sam import SAM + +__version__ = "2.0.0" + +# Get module logger +logger = get_logger("utilities") + + +def find_corr_genes(sam: SAM, input_gene: str) -> NDArray[Any] | None: + """Rank genes by correlation of spatially averaged expression patterns. + + Parameters + ---------- + sam : SAM + The analyzed SAM object. + input_gene : str + The gene ID with respect to which correlations will be computed. + + Returns + ------- + NDArray | None + A ranked list of gene IDs based on correlation to the input gene, + or None if the gene is not found. + """ + all_gene_names = np.array(list(sam.adata.var_names)) + + D_avg = sam.adata.layers["X_knn_avg"] + + input_gene_idx = np.where(all_gene_names == input_gene)[0] + + if input_gene_idx.size == 0: + logger.warning( + "Gene not found in the filtered dataset. Note that genes are case sensitive." + ) + return None + + pw_corr = generate_correlation_map(D_avg.T.toarray(), D_avg[:, input_gene_idx].T.toarray()) + return all_gene_names[np.argsort(-pw_corr.flatten())] + + +def _pca_with_sparse( + X: sparse.spmatrix, + npcs: int, + solver: str = "arpack", + mu: NDArray[np.floating[Any]] | None = None, + seed: int = 0, + mu_axis: int = 0, +) -> dict[str, NDArray[np.floating[Any]]]: + """Perform PCA on sparse matrices using iterative SVD. + + Parameters + ---------- + X : sparse.spmatrix + Input sparse matrix. 
+ npcs : int + Number of principal components. + solver : str, optional + SVD solver to use. Default is 'arpack'. + mu : NDArray | None, optional + Pre-computed mean. If None, computed from X. + seed : int, optional + Random seed. Default is 0. + mu_axis : int, optional + Axis along which mean was computed. Default is 0. + + Returns + ------- + dict + Dictionary with keys 'X_pca', 'variance', 'variance_ratio', 'components'. + """ + random_state = check_random_state(seed) + np.random.set_state(random_state.get_state()) + random_init = np.random.rand(np.min(X.shape)) + X = check_array(X, accept_sparse=["csr", "csc"]) + + if mu is None: + if mu_axis == 0: + mu = np.asarray(X.mean(0)).flatten()[None, :] + else: + mu = np.asarray(X.mean(1)).flatten()[:, None] + + if mu_axis == 0: + mdot = mu.dot + mmat = mdot + mhdot = mu.T.dot + mhmat = mu.T.dot + Xdot = X.dot + Xmat = Xdot + XHdot = X.T.conj().dot + XHmat = XHdot + ones = np.ones(X.shape[0])[None, :].dot + + def matvec(x: NDArray[Any]) -> NDArray[Any]: + return Xdot(x) - mdot(x) + + def matmat(x: NDArray[Any]) -> NDArray[Any]: + return Xmat(x) - mmat(x) + + def rmatvec(x: NDArray[Any]) -> NDArray[Any]: + return XHdot(x) - mhdot(ones(x)) + + def rmatmat(x: NDArray[Any]) -> NDArray[Any]: + return XHmat(x) - mhmat(ones(x)) + + else: + mdot = mu.dot + mmat = mdot + mhdot = mu.T.dot + mhmat = mu.T.dot + Xdot = X.dot + Xmat = Xdot + XHdot = X.T.conj().dot + XHmat = XHdot + ones = np.ones(X.shape[1])[None, :].dot + + def matvec(x: NDArray[Any]) -> NDArray[Any]: + return Xdot(x) - mdot(ones(x)) + + def matmat(x: NDArray[Any]) -> NDArray[Any]: + return Xmat(x) - mmat(ones(x)) + + def rmatvec(x: NDArray[Any]) -> NDArray[Any]: + return XHdot(x) - mhdot(x) + + def rmatmat(x: NDArray[Any]) -> NDArray[Any]: + return XHmat(x) - mhmat(x) + + XL = sp.sparse.linalg.LinearOperator( + matvec=matvec, + dtype=X.dtype, + matmat=matmat, + shape=X.shape, + rmatvec=rmatvec, + rmatmat=rmatmat, + ) + + u, s, v = sp.sparse.linalg.svds(XL, 
def nearest_neighbors_wrapper(
    X: NDArray[np.floating[Any]],
    n_neighbors: int = 15,
    metric: str = "correlation",
    metric_kwds: dict[str, Any] | None = None,
    angular: bool = True,
    random_state: int = 0,
) -> tuple[NDArray[np.int64], NDArray[np.floating[Any]]]:
    """Thin wrapper around UMAP's ``nearest_neighbors``.

    Parameters
    ----------
    X : NDArray
        Input data matrix.
    n_neighbors : int, optional
        Number of neighbors. Default is 15.
    metric : str, optional
        Distance metric. Default is 'correlation'.
    metric_kwds : dict | None, optional
        Additional metric arguments.
    angular : bool, optional
        Use angular distance. Default is True.
    random_state : int, optional
        Random seed. Default is 0.

    Returns
    -------
    tuple
        (indices, distances) arrays.
    """
    kwds = {} if metric_kwds is None else metric_kwds
    rng = np.random.RandomState(random_state)
    # UMAP returns (indices, distances, rp-forest); callers only need two.
    return nearest_neighbors(X, n_neighbors, metric, kwds, angular, rng)[:2]


def knndist(
    nnma: sparse.spmatrix,
) -> tuple[NDArray[np.int64], NDArray[np.floating[Any]]]:
    """Extract k-nearest-neighbor indices and distances from a sparse matrix.

    Parameters
    ----------
    nnma : sparse.spmatrix
        Sparse nearest-neighbor matrix with the same number of stored
        entries in every row.

    Returns
    -------
    tuple
        (indices, distances) arrays, one row per sample.
    """
    _, cols = nnma.nonzero()
    # Every row is assumed to hold the same neighbor count as row 0.
    per_row = nnma[0, :].data.size
    knn = cols.reshape((nnma.shape[0], per_row))
    val = nnma.data.reshape(knn.shape)
    return knn, val


def save_figures(
    filename: str,
    fig_IDs: int | list[int] | None = None,
    **kwargs: Any,
) -> None:
    """Save matplotlib figures to file.

    Parameters
    ----------
    filename : str
        Output filename.
    fig_IDs : int | list[int] | None, optional
        Figure IDs to save. A single int saves that figure as PNG; a list
        (or None, meaning all open figures) bundles everything into a PDF.
    **kwargs
        Additional arguments passed to matplotlib's ``savefig``.
    """
    import matplotlib.pyplot as plt

    single_png = fig_IDs is not None and not isinstance(fig_IDs, list)
    if single_png:
        plt.figure(fig_IDs).savefig(filename, **kwargs)
        return

    from matplotlib.backends.backend_pdf import PdfPages

    # Force a .pdf extension, replacing any existing one.
    parts = filename.split(".")
    filename = filename + ".pdf" if len(parts) == 1 else ".".join(parts[:-1]) + ".pdf"

    pdf = PdfPages(filename)
    ids = plt.get_fignums() if fig_IDs is None else fig_IDs
    for fig in (plt.figure(n) for n in ids):
        fig.savefig(pdf, format="pdf", **kwargs)
    pdf.close()
def weighted_PCA(
    mat: NDArray[np.floating[Any]],
    do_weight: bool = True,
    npcs: int | None = None,
    solver: str = "auto",
    seed: int = 0,
) -> tuple[NDArray[np.floating[Any]], PCA]:
    """Perform PCA with optional eigenvalue weighting.

    Parameters
    ----------
    mat : NDArray
        Input data matrix.
    do_weight : bool, optional
        If True, weight PCs by eigenvalues. Default is True.
    npcs : int | None, optional
        Number of components. If None, uses min(mat.shape).
    solver : str, optional
        SVD solver. Default is 'auto'.
    seed : int, optional
        Random seed. Default is 0.

    Returns
    -------
    tuple
        (reduced_weighted, pca_object)
    """
    if not do_weight:
        pca = PCA(n_components=npcs, svd_solver=solver, random_state=check_random_state(seed))
        reduced = pca.fit_transform(mat)
        if reduced.shape[1] == 1:
            # A single component breaks downstream 2-D assumptions; redo with 2.
            pca = PCA(n_components=2, svd_solver=solver, random_state=check_random_state(seed))
            reduced = pca.fit_transform(mat)
        return reduced, pca

    if min(mat.shape) >= 10000 and npcs is None:
        logger.warning(
            "More than 10,000 cells. Running with 'npcs' set to < 1000 is recommended."
        )

    n_comp = min(mat.shape) if npcs is None else min(min(mat.shape), npcs)
    pca = PCA(svd_solver=solver, n_components=n_comp, random_state=check_random_state(seed))
    reduced = pca.fit_transform(mat)
    # Scale each PC by the square root of its max-normalized eigenvalue.
    scaled = pca.explained_variance_ / pca.explained_variance_.max()
    return reduced * scaled[None, :] ** 0.5, pca


def transform_wPCA(mat: NDArray[np.floating[Any]], pca: PCA) -> NDArray[np.floating[Any]]:
    """Transform data using a fitted weighted PCA model.

    Parameters
    ----------
    mat : NDArray
        Input data matrix.
    pca : PCA
        Fitted PCA object.

    Returns
    -------
    NDArray
        Transformed and eigenvalue-weighted data.
    """
    centered = mat - pca.mean_
    projected = centered.dot(pca.components_.T)
    scaled = pca.explained_variance_ / pca.explained_variance_.max()
    return np.array(projected) * scaled[None, :] ** 0.5


def search_string(
    vec: NDArray[Any] | list[str],
    s: str | list[str],
    case_sensitive: bool = False,
    invert: bool = False,
) -> tuple[NDArray[Any] | int, NDArray[np.int64] | int]:
    """Search for strings matching (or not matching) substring patterns.

    Parameters
    ----------
    vec : NDArray | list
        Array of strings to search.
    s : str | list
        Substring pattern(s) to search for.
    case_sensitive : bool, optional
        Whether the search is case-sensitive. Default is False.
    invert : bool, optional
        If True, return non-matching strings. Default is False.

    Returns
    -------
    tuple
        (matching_strings, indices) or (-1, -1) if no matches.
    """
    vec = np.array(vec)
    patterns = s if isinstance(s, list) else [s]

    hit_strs: list[NDArray[Any]] = []
    hit_idx: list[NDArray[np.int64]] = []
    for pat in patterns:
        if not case_sensitive:
            pat = pat.lower()
        found = []
        for i in range(len(vec)):
            entry = vec[i] if case_sensitive else vec[i].lower()
            # "hit" means substring present XOR inverted search.
            if (entry.find(pat) != -1) != invert:
                found.append(i)
        if found:
            hit_strs.append(vec[np.array(found)])
            hit_idx.append(np.array(found))

    if not hit_strs:
        return -1, -1

    if not invert:
        V_arr = np.concatenate(hit_strs)
        M_arr = np.concatenate(hit_idx)
        if len(hit_strs) > 1:
            # Deduplicate across patterns, keeping first-occurrence order.
            keep = np.sort(np.unique(M_arr, return_index=True)[1])
            V_arr = V_arr[keep]
            M_arr = M_arr[keep]
        return V_arr, M_arr

    # Inverted search keeps only strings missing *every* pattern.
    for j in range(len(hit_strs)):
        hit_strs[j] = list(set(hit_strs[j]).intersection(*hit_strs))
    V_arr = vec[np.isin(vec, np.unique(np.concatenate(hit_strs)))]
    M_arr = np.array([np.where(vec == x)[0][0] for x in V_arr])
    return V_arr, M_arr


def distance_matrix_error(
    dist1: NDArray[np.floating[Any]], dist2: NDArray[np.floating[Any]]
) -> float:
    """Compute a correlation-based error between two distance matrices.

    Parameters
    ----------
    dist1 : NDArray
        First distance matrix.
    dist2 : NDArray
        Second distance matrix.

    Returns
    -------
    float
        1 minus the mean per-row correlation (0 for identical matrices).
    """
    n = dist1.shape[0]
    total = sum(np.corrcoef(dist1[k, :], dist2[k, :])[0, 1] for k in range(n))
    return 1 - total / n
+ """ + a = (A**2).sum(1).flatten() + b = (B**2).sum(1).flatten() + x = a[:, None] + b[None, :] - 2 * np.dot(A, B.T) + x[x < 0] = 0 + return np.sqrt(x) + + +def generate_correlation_map( + x: NDArray[np.floating[Any]], y: NDArray[np.floating[Any]] +) -> NDArray[np.floating[Any]]: + """Compute pairwise correlations between two sets of vectors. + + Parameters + ---------- + x : NDArray + First set of vectors (n x d). + y : NDArray + Second set of vectors (m x d). + + Returns + ------- + NDArray + Correlation matrix (n x m). + """ + mu_x = x.mean(1) + mu_y = y.mean(1) + n = x.shape[1] + if n != y.shape[1]: + raise ValueError("x and y must have the same number of timepoints.") + s_x = x.std(1, ddof=n - 1) + s_y = y.std(1, ddof=n - 1) + s_x[s_x == 0] = 1 + s_y[s_y == 0] = 1 + cov = np.dot(x, y.T) - n * np.dot(mu_x[:, None], mu_y[None, :]) + return cov / np.dot(s_x[:, None], s_y[None, :]) + + +def extract_annotation( + cn: NDArray[Any], + x: int | None, + c: str = "_", +) -> NDArray[Any] | list[NDArray[Any]]: + """Extract annotations from cell names by splitting on delimiter. + + Parameters + ---------- + cn : NDArray + Array of cell names. + x : int | None + Index of annotation field to extract. If None, returns all fields. + c : str, optional + Delimiter character. Default is '_'. + + Returns + ------- + NDArray | list + Extracted annotations. 
+ """ + m = [] + if x is not None: + for i in range(cn.size): + f = cn[i].split(c) + x = min(len(f) - 1, x) + m.append(f[x]) + return np.array(m) + else: + ms: list[list[str]] = [] + ls = [] + for i in range(cn.size): + f = cn[i].split(c) + m_inner = [] + for field_x in range(len(f)): + m_inner.append(f[field_x]) + ms.append(m_inner) + ls.append(len(m_inner)) + ml = max(ls) + for i in range(len(ms)): + ms[i].extend([""] * (ml - len(ms[i]))) + if ml - len(ms[i]) > 0: + ms[i] = list(np.concatenate(ms[i])) + ms_arr = np.vstack(ms) + MS = [] + for i in range(ms_arr.shape[1]): + MS.append(ms_arr[:, i]) + return MS + + +def isolate( + dt: NDArray[np.floating[Any]], x1: float, x2: float, y1: float, y2: float +) -> NDArray[np.int64]: + """Get indices of points within a rectangular region. + + Parameters + ---------- + dt : NDArray + 2D coordinates (n x 2). + x1, x2 : float + X-axis bounds. + y1, y2 : float + Y-axis bounds. + + Returns + ------- + NDArray + Indices of points within the region. + """ + return np.where( + np.logical_and( + np.logical_and(dt[:, 0] > x1, dt[:, 0] < x2), + np.logical_and(dt[:, 1] > y1, dt[:, 1] < y2), + ) + )[0] + + +def to_lower(y: NDArray[Any]) -> NDArray[Any]: + """Convert string array to lowercase. + + Parameters + ---------- + y : NDArray + Array of strings. + + Returns + ------- + NDArray + Lowercase strings. + """ + x = y.copy().flatten() + for i in range(x.size): + x[i] = x[i].lower() + return x + + +def to_upper(y: NDArray[Any]) -> NDArray[Any]: + """Convert string array to uppercase. + + Parameters + ---------- + y : NDArray + Array of strings. + + Returns + ------- + NDArray + Uppercase strings. + """ + x = y.copy().flatten() + for i in range(x.size): + x[i] = x[i].upper() + return x + + +def create_folder(path: str) -> None: + """Create a directory if it doesn't exist. + + Parameters + ---------- + path : str + Directory path to create. 
+ """ + try: + os.makedirs(path) + except OSError as exception: + if exception.errno != errno.EEXIST: + raise + + +def convert_annotations(A: NDArray[Any]) -> NDArray[np.int64]: + """Convert categorical annotations to integer codes. + + Parameters + ---------- + A : NDArray + Array of categorical values. + + Returns + ------- + NDArray + Integer codes. + """ + x = np.unique(A) + y = np.zeros(A.size) + z = 0 + for i in x: + y[i == A] = z + z += 1 + + return y.astype("int") + + +def nearest_neighbors_hnsw( + x: NDArray[np.floating[Any]], + ef: int = 200, + M: int = 48, + n_neighbors: int = 100, +) -> tuple[NDArray[np.int64], NDArray[np.floating[Any]]]: + """Compute approximate nearest neighbors using HNSW algorithm. + + Parameters + ---------- + x : NDArray + Input data matrix. + ef : int, optional + HNSW ef parameter. Default is 200. + M : int, optional + HNSW M parameter. Default is 48. + n_neighbors : int, optional + Number of neighbors. Default is 100. + + Returns + ------- + tuple + (indices, distances) arrays. + """ + import hnswlib + + labels = np.arange(x.shape[0]) + p = hnswlib.Index(space="cosine", dim=x.shape[1]) + p.init_index(max_elements=x.shape[0], ef_construction=ef, M=M) + p.add_items(x, labels) + p.set_ef(ef) + idx, dist = p.knn_query(x, k=n_neighbors) + return idx, dist + + +def calc_nnm( + g_weighted: NDArray[np.floating[Any]], + k: int, + distance: str | None = None, +) -> sparse.csr_matrix: + """Calculate k-nearest neighbor matrix. + + Parameters + ---------- + g_weighted : NDArray + Input coordinates. + k : int + Number of neighbors. + distance : str | None, optional + Distance metric. If 'cosine', uses HNSW. + + Returns + ------- + sparse.csr_matrix + Sparse k-NN matrix with distances. 
+ """ + if g_weighted.shape[0] > 0: + if distance == "cosine": + nnm, dists = nearest_neighbors_hnsw(g_weighted, n_neighbors=k) + else: + nnm, dists = nearest_neighbors_wrapper(g_weighted, n_neighbors=k, metric=distance) + EDM = gen_sparse_knn(nnm, dists) + EDM = EDM.tocsr() + return EDM + + +def compute_distances(A: NDArray[np.floating[Any]], dm: str) -> NDArray[np.floating[Any]]: + """Compute pairwise distance matrix. + + Parameters + ---------- + A : NDArray + Input data matrix. + dm : str + Distance metric ('euclidean', 'correlation', or scipy metric). + + Returns + ------- + NDArray + Square distance matrix. + """ + if dm == "euclidean": + m = np.dot(A, A.T) + h = np.diag(m) + x = h[:, None] + h[None, :] - 2 * m + x[x < 0] = 0 + dist = np.sqrt(x) + elif dm == "correlation": + dist = 1 - np.corrcoef(A) + else: + dist = sp.spatial.distance.squareform(sp.spatial.distance.pdist(A, metric=dm)) + return dist + + +def dist_to_nn(d: NDArray[np.floating[Any]], K: int) -> NDArray[np.floating[Any]]: + """Convert distance matrix to binary k-NN adjacency matrix. + + Parameters + ---------- + d : NDArray + Square distance matrix. + K : int + Number of neighbors. + + Returns + ------- + NDArray + Binary adjacency matrix. + """ + E = d.copy() + np.fill_diagonal(E, -1) + M = np.max(E) * 2 + x = np.argsort(E, axis=1)[:, :K] + E[ + np.tile(np.arange(E.shape[0]).reshape(E.shape[0], -1), (1, x.shape[1])).flatten(), + x.flatten(), + ] = M + + E[E < M] = 0 + E[E > 0] = 1 + return E + + +def to_sparse_knn(D1: sparse.csr_matrix, k: int) -> sparse.csr_matrix: + """Sparsify matrix to keep only k nearest neighbors per row. + + Parameters + ---------- + D1 : sparse.csr_matrix + Input sparse matrix. + k : int + Number of neighbors to keep. + + Returns + ------- + sparse.csr_matrix + Sparsified matrix. 
+ """ + for i in range(D1.shape[0]): + x = D1.data[D1.indptr[i] : D1.indptr[i + 1]] + idx = np.argsort(x) + if idx.size > k: + x[idx[:-k]] = 0 + D1.data[D1.indptr[i] : D1.indptr[i + 1]] = x + D1.eliminate_zeros() + return D1 + + +def gen_sparse_knn( + knni: NDArray[np.int64], + knnd: NDArray[np.floating[Any]], + shape: tuple[int, int] | None = None, +) -> sparse.csr_matrix: + """Generate sparse k-NN matrix from indices and distances. + + Parameters + ---------- + knni : NDArray + k-NN indices (n x k). + knnd : NDArray + k-NN distances (n x k). + shape : tuple | None, optional + Output shape. If None, uses (n, n). + + Returns + ------- + sparse.csr_matrix + Sparse k-NN matrix. + """ + if shape is None: + shape = (knni.shape[0], knni.shape[0]) + + D1 = sp.sparse.lil_matrix(shape) + + D1[ + np.tile(np.arange(knni.shape[0])[:, None], (1, knni.shape[1])).flatten().astype("int32"), + knni.flatten().astype("int32"), + ] = knnd.flatten() + D1 = D1.tocsr() + return D1 diff --git a/test/test_sam.py b/test/test_sam.py deleted file mode 100644 index b2f2c3a..0000000 --- a/test/test_sam.py +++ /dev/null @@ -1,25 +0,0 @@ -#!/usr/bin/env python3 -# -*- coding: utf-8 -*- -""" -author: Alexander Tarashansky -date: 10/01/2018 -content: Tests for SAM. 
-""" -# Modules -from samalg import SAM - - -# Script -if __name__ == "__main__": - - sam = SAM() - sam.load_data("example_data/darmanis_data.csv.gz") - sam.load_obs_annotations("example_data/darmanis_ann.csv") - sam.preprocess_data() - sam.run(projection=None) - sam.run_umap() - sam.run_tsne() - sam.kmeans_clustering(4) - sam.identify_marker_genes_ratio() - sam.identify_marker_genes_rf() - umap_obj = sam.umap_obj # checking to see if umap_obj is stored in sam object diff --git a/tests/__init__.py b/tests/__init__.py new file mode 100644 index 0000000..91207cb --- /dev/null +++ b/tests/__init__.py @@ -0,0 +1 @@ +"""Tests for the samalg package.""" diff --git a/tests/conftest.py b/tests/conftest.py new file mode 100644 index 0000000..dd4169d --- /dev/null +++ b/tests/conftest.py @@ -0,0 +1,165 @@ +"""Pytest fixtures for samalg tests.""" + +from __future__ import annotations + +import numpy as np +import pytest +import scipy.sparse as sp +from anndata import AnnData + + +@pytest.fixture +def random_seed() -> int: + """Provide a consistent random seed for reproducibility.""" + return 42 + + +@pytest.fixture +def small_expression_matrix(random_seed: int) -> tuple[sp.csr_matrix, np.ndarray, np.ndarray]: + """Create a small test expression matrix. 
+ + Returns + ------- + tuple + (sparse_matrix, gene_names, cell_names) + """ + np.random.seed(random_seed) + n_cells = 100 + n_genes = 200 + + # Create sparse count matrix with realistic properties + # Most entries are 0, some genes highly expressed + data = np.random.negative_binomial(n=5, p=0.3, size=(n_cells, n_genes)).astype(np.float32) + # Make it sparse (set low values to 0) + data[data < 2] = 0 + sparse_data = sp.csr_matrix(data) + + gene_names = np.array([f"Gene_{i}" for i in range(n_genes)]) + cell_names = np.array([f"Cell_{i}" for i in range(n_cells)]) + + return sparse_data, gene_names, cell_names + + +@pytest.fixture +def medium_expression_matrix(random_seed: int) -> tuple[sp.csr_matrix, np.ndarray, np.ndarray]: + """Create a medium-sized test expression matrix. + + Returns + ------- + tuple + (sparse_matrix, gene_names, cell_names) + """ + np.random.seed(random_seed) + n_cells = 500 + n_genes = 1000 + + # Create sparse count matrix with more realistic expression patterns + # Higher n parameter to ensure more genes are expressed + data = np.random.negative_binomial(n=10, p=0.5, size=(n_cells, n_genes)).astype(np.float32) + # Less aggressive sparsification to keep more genes + sparse_data = sp.csr_matrix(data) + + gene_names = np.array([f"Gene_{i}" for i in range(n_genes)]) + cell_names = np.array([f"Cell_{i}" for i in range(n_cells)]) + + return sparse_data, gene_names, cell_names + + +@pytest.fixture +def simple_adata(small_expression_matrix: tuple) -> AnnData: + """Create a simple AnnData object for testing. + + Parameters + ---------- + small_expression_matrix : tuple + Fixture providing expression matrix data. + + Returns + ------- + AnnData + Annotated data object. 
+ """ + X, gene_names, cell_names = small_expression_matrix + adata = AnnData(X=X) + adata.var_names = gene_names + adata.obs_names = cell_names + return adata + + +@pytest.fixture +def adata_with_clusters(simple_adata: AnnData, random_seed: int) -> AnnData: + """Create an AnnData object with cluster annotations. + + Parameters + ---------- + simple_adata : AnnData + Fixture providing basic AnnData. + random_seed : int + Random seed for reproducibility. + + Returns + ------- + AnnData + AnnData with cluster labels. + """ + np.random.seed(random_seed) + n_cells = simple_adata.n_obs + n_clusters = 4 + + # Assign random cluster labels + clusters = np.random.choice([f"Cluster_{i}" for i in range(n_clusters)], size=n_cells) + simple_adata.obs["leiden_clusters"] = clusters + + return simple_adata + + +@pytest.fixture +def dense_matrix(random_seed: int) -> np.ndarray: + """Create a dense matrix for PCA testing. + + Parameters + ---------- + random_seed : int + Random seed for reproducibility. + + Returns + ------- + np.ndarray + Dense matrix with structure. + """ + np.random.seed(random_seed) + n_samples = 100 + n_features = 50 + + # Create matrix with some structure (for PCA to find) + # 3 underlying components + components = np.random.randn(3, n_features) + loadings = np.random.randn(n_samples, 3) + noise = np.random.randn(n_samples, n_features) * 0.1 + + return loadings @ components + noise + + +@pytest.fixture +def knn_test_data(random_seed: int) -> np.ndarray: + """Create data for k-NN testing with clear clusters. + + Parameters + ---------- + random_seed : int + Random seed for reproducibility. + + Returns + ------- + np.ndarray + Data with 3 clear clusters. 
+ """ + np.random.seed(random_seed) + n_per_cluster = 30 + + # Create 3 well-separated clusters + cluster1 = np.random.randn(n_per_cluster, 10) + np.array([5, 0, 0, 0, 0, 0, 0, 0, 0, 0]) + cluster2 = np.random.randn(n_per_cluster, 10) + np.array([0, 5, 0, 0, 0, 0, 0, 0, 0, 0]) + cluster3 = np.random.randn(n_per_cluster, 10) + np.array([0, 0, 5, 0, 0, 0, 0, 0, 0, 0]) + + return np.vstack([cluster1, cluster2, cluster3]) diff --git a/tests/integration/__init__.py b/tests/integration/__init__.py new file mode 100644 index 0000000..6fe9c77 --- /dev/null +++ b/tests/integration/__init__.py @@ -0,0 +1 @@ +"""Integration tests for samalg.""" diff --git a/tests/integration/test_sam_workflow.py b/tests/integration/test_sam_workflow.py new file mode 100644 index 0000000..23ec400 --- /dev/null +++ b/tests/integration/test_sam_workflow.py @@ -0,0 +1,278 @@ +"""Integration tests for the SAM workflow.""" + +from __future__ import annotations + +import numpy as np +import pytest +import scipy.sparse as sp +from anndata import AnnData + +from samalg import SAM +from samalg.exceptions import DataNotLoadedError + + +class TestSAMInitialization: + """Tests for SAM object initialization.""" + + def test_init_empty(self) -> None: + """Test creating an empty SAM object.""" + sam = SAM() + + assert not hasattr(sam, "adata") + assert sam.run_args == {} + assert sam.preprocess_args == {} + + def test_init_with_tuple( + self, small_expression_matrix: tuple[sp.csr_matrix, np.ndarray, np.ndarray] + ) -> None: + """Test initialization with tuple of (data, genes, cells).""" + X, genes, cells = small_expression_matrix + sam = SAM(counts=(X, genes, cells)) + + assert hasattr(sam, "adata") + assert hasattr(sam, "adata_raw") + assert sam.adata.n_obs == len(cells) + assert sam.adata.n_vars == len(genes) + + def test_init_with_anndata(self, simple_adata: AnnData) -> None: + """Test initialization with AnnData object.""" + sam = SAM(counts=simple_adata) + + assert hasattr(sam, "adata") + assert 
sam.adata.n_obs == simple_adata.n_obs + assert sam.adata.n_vars == simple_adata.n_vars + + def test_init_inplace(self, simple_adata: AnnData) -> None: + """Test inplace initialization doesn't copy.""" + sam = SAM(counts=simple_adata, inplace=True) + + # Should be the same object + assert sam.adata_raw is simple_adata + + def test_init_not_inplace(self, simple_adata: AnnData) -> None: + """Test non-inplace initialization copies data.""" + sam = SAM(counts=simple_adata, inplace=False) + + # Should be a different object + assert sam.adata_raw is not simple_adata + + +class TestPreprocessing: + """Tests for data preprocessing.""" + + def test_preprocess_no_data(self) -> None: + """Test preprocessing without loaded data raises error.""" + sam = SAM() + + with pytest.raises(DataNotLoadedError): + sam.preprocess_data() + + def test_preprocess_default( + self, small_expression_matrix: tuple[sp.csr_matrix, np.ndarray, np.ndarray] + ) -> None: + """Test preprocessing with default parameters.""" + X, genes, cells = small_expression_matrix + sam = SAM(counts=(X, genes, cells)) + sam.preprocess_data() + + # Check preprocessing was stored + assert "preprocess_args" in sam.adata.uns + assert sam.preprocess_args["norm"] == "log" + assert sam.preprocess_args["sum_norm"] == "cell_median" + + def test_preprocess_custom_norm( + self, small_expression_matrix: tuple[sp.csr_matrix, np.ndarray, np.ndarray] + ) -> None: + """Test preprocessing with custom normalization.""" + X, genes, cells = small_expression_matrix + sam = SAM(counts=(X, genes, cells)) + sam.preprocess_data(norm="ftt", sum_norm=None) + + assert sam.preprocess_args["norm"] == "ftt" + assert sam.preprocess_args["sum_norm"] is None + + def test_preprocess_filters_genes( + self, small_expression_matrix: tuple[sp.csr_matrix, np.ndarray, np.ndarray] + ) -> None: + """Test that gene filtering works.""" + X, genes, cells = small_expression_matrix + sam = SAM(counts=(X, genes, cells)) + sam.preprocess_data(filter_genes=True, 
thresh_low=0.1, thresh_high=0.9) + + # Should have filtered some genes (mask_genes should have False values) + assert "mask_genes" in sam.adata.var + # Some genes should be filtered out + n_kept = sam.adata.var["mask_genes"].sum() + assert n_kept < len(genes) + + def test_preprocess_stores_layers( + self, small_expression_matrix: tuple[sp.csr_matrix, np.ndarray, np.ndarray] + ) -> None: + """Test that preprocessing creates X_disp layer.""" + X, genes, cells = small_expression_matrix + sam = SAM(counts=(X, genes, cells)) + sam.preprocess_data() + + assert "X_disp" in sam.adata.layers + + +class TestSAMRun: + """Tests for running the SAM algorithm.""" + + def test_run_basic( + self, medium_expression_matrix: tuple[sp.csr_matrix, np.ndarray, np.ndarray] + ) -> None: + """Test basic SAM run.""" + X, genes, cells = medium_expression_matrix + sam = SAM(counts=(X, genes, cells)) + sam.preprocess_data(filter_genes=False, min_expression=0) + sam.run(max_iter=2, projection=None, verbose=False) + + # Check outputs + assert "weights" in sam.adata.var + assert "X_pca" in sam.adata.obsm + assert "connectivities" in sam.adata.obsp + assert "ranked_genes" in sam.adata.uns + + def test_run_with_umap( + self, medium_expression_matrix: tuple[sp.csr_matrix, np.ndarray, np.ndarray] + ) -> None: + """Test SAM run with UMAP projection.""" + X, genes, cells = medium_expression_matrix + sam = SAM(counts=(X, genes, cells)) + sam.preprocess_data(filter_genes=False, min_expression=0) + sam.run(max_iter=2, projection="umap", verbose=False) + + assert "X_umap" in sam.adata.obsm + assert sam.adata.obsm["X_umap"].shape == (len(cells), 2) + + def test_run_stores_args( + self, medium_expression_matrix: tuple[sp.csr_matrix, np.ndarray, np.ndarray] + ) -> None: + """Test that run arguments are stored.""" + X, genes, cells = medium_expression_matrix + sam = SAM(counts=(X, genes, cells)) + sam.preprocess_data(filter_genes=False, min_expression=0) + sam.run(max_iter=3, k=15, npcs=50, projection=None, 
verbose=False) + + assert sam.run_args["max_iter"] == 3 + assert sam.run_args["k"] == 15 + assert sam.run_args["npcs"] == 50 + + def test_run_weight_modes( + self, medium_expression_matrix: tuple[sp.csr_matrix, np.ndarray, np.ndarray] + ) -> None: + """Test different weight modes.""" + X, genes, cells = medium_expression_matrix + + for mode in ["dispersion", "variance", "rms", "combined"]: + sam = SAM(counts=(X, genes, cells)) + sam.preprocess_data(filter_genes=False, min_expression=0) + sam.run(max_iter=2, weight_mode=mode, projection=None, verbose=False) + + assert "weights" in sam.adata.var + # Weights should be between 0 and 1 + weights = sam.adata.var["weights"].values + assert np.all(weights >= 0) and np.all(weights <= 1) + + +class TestClustering: + """Tests for clustering methods.""" + + def test_leiden_clustering( + self, medium_expression_matrix: tuple[sp.csr_matrix, np.ndarray, np.ndarray] + ) -> None: + """Test Leiden clustering.""" + pytest.importorskip("leidenalg") + pytest.importorskip("igraph") + + X, genes, cells = medium_expression_matrix + sam = SAM(counts=(X, genes, cells)) + sam.preprocess_data(filter_genes=False, min_expression=0) + sam.run(max_iter=2, projection=None, verbose=False) + sam.leiden_clustering(res=1.0) + + assert "leiden_clusters" in sam.adata.obs + + def test_kmeans_clustering( + self, medium_expression_matrix: tuple[sp.csr_matrix, np.ndarray, np.ndarray] + ) -> None: + """Test k-means clustering.""" + X, genes, cells = medium_expression_matrix + sam = SAM(counts=(X, genes, cells)) + sam.preprocess_data(filter_genes=False, min_expression=0) + sam.run(max_iter=2, projection=None, verbose=False) + cl, _km = sam.kmeans_clustering(numc=5) + + assert "kmeans_clusters" in sam.adata.obs + assert len(np.unique(cl)) == 5 + + +class TestDispersionRanking: + """Tests for dispersion ranking.""" + + def test_dispersion_ranking( + self, medium_expression_matrix: tuple[sp.csr_matrix, np.ndarray, np.ndarray] + ) -> None: + """Test dispersion 
ranking NN.""" + X, genes, cells = medium_expression_matrix + sam = SAM(counts=(X, genes, cells)) + sam.preprocess_data(filter_genes=False, min_expression=0) + sam.run(max_iter=2, projection=None, verbose=False) + + weights = sam.dispersion_ranking_NN(weight_mode="dispersion") + + assert len(weights) == sam.adata.n_vars + assert np.all(weights >= 0) and np.all(weights <= 1) + + def test_dispersion_saves_averages( + self, medium_expression_matrix: tuple[sp.csr_matrix, np.ndarray, np.ndarray] + ) -> None: + """Test that save_avgs creates X_knn_avg layer.""" + X, genes, cells = medium_expression_matrix + sam = SAM(counts=(X, genes, cells)) + sam.preprocess_data(filter_genes=False, min_expression=0) + sam.run(max_iter=2, projection=None, verbose=False) + + sam.dispersion_ranking_NN(save_avgs=True) + + assert "X_knn_avg" in sam.adata.layers + + +class TestHelperMethods: + """Tests for helper methods.""" + + def test_get_labels(self, adata_with_clusters: AnnData) -> None: + """Test get_labels method.""" + sam = SAM(counts=adata_with_clusters) + + labels = sam.get_labels("leiden_clusters") + + assert len(labels) == sam.adata.n_obs + + def test_get_labels_un(self, adata_with_clusters: AnnData) -> None: + """Test get_labels_un returns unique labels.""" + sam = SAM(counts=adata_with_clusters) + + unique_labels = sam.get_labels_un("leiden_clusters") + + assert len(unique_labels) == 4 # We created 4 clusters + + def test_get_cells(self, adata_with_clusters: AnnData) -> None: + """Test get_cells retrieves cells with specific label.""" + sam = SAM(counts=adata_with_clusters) + + cells = sam.get_cells("Cluster_0", "leiden_clusters") + + # All returned cells should have the right label + for cell in cells: + idx = np.where(sam.adata.obs_names == cell)[0][0] + assert sam.adata.obs["leiden_clusters"].iloc[idx] == "Cluster_0" + + def test_get_labels_missing_key(self, simple_adata: AnnData) -> None: + """Test get_labels with missing key returns empty array.""" + sam = 
SAM(counts=simple_adata) + + labels = sam.get_labels("nonexistent_key") + + assert len(labels) == 0 diff --git a/tests/unit/__init__.py b/tests/unit/__init__.py new file mode 100644 index 0000000..7cc8f76 --- /dev/null +++ b/tests/unit/__init__.py @@ -0,0 +1 @@ +"""Unit tests for samalg.""" diff --git a/tests/unit/test_utilities.py b/tests/unit/test_utilities.py new file mode 100644 index 0000000..f45cb05 --- /dev/null +++ b/tests/unit/test_utilities.py @@ -0,0 +1,246 @@ +"""Unit tests for samalg.utilities module.""" + +from __future__ import annotations + +import numpy as np +import scipy.sparse as sp + +from samalg import utilities as ut + + +class TestWeightedPCA: + """Tests for weighted_PCA function.""" + + def test_basic_pca(self, dense_matrix: np.ndarray) -> None: + """Test basic PCA functionality.""" + reduced, pca = ut.weighted_PCA(dense_matrix, do_weight=False, npcs=10) + + assert reduced.shape[0] == dense_matrix.shape[0] + assert reduced.shape[1] == 10 + assert hasattr(pca, "components_") + assert pca.n_components_ == 10 + + def test_weighted_pca(self, dense_matrix: np.ndarray) -> None: + """Test weighted PCA with eigenvalue scaling.""" + reduced, pca = ut.weighted_PCA(dense_matrix, do_weight=True, npcs=10) + + assert reduced.shape[0] == dense_matrix.shape[0] + assert reduced.shape[1] == 10 + # Weighted variance should be normalized + eigenvalues = pca.explained_variance_ + assert eigenvalues[0] >= eigenvalues[-1] + + def test_pca_npcs_limit(self, dense_matrix: np.ndarray) -> None: + """Test that npcs is correctly bounded.""" + max_npcs = min(dense_matrix.shape) + reduced, _pca = ut.weighted_PCA(dense_matrix, npcs=max_npcs + 100) + + assert reduced.shape[1] <= max_npcs + + def test_pca_reproducibility(self, dense_matrix: np.ndarray) -> None: + """Test PCA is reproducible with same seed.""" + reduced1, _ = ut.weighted_PCA(dense_matrix, npcs=5, seed=42) + reduced2, _ = ut.weighted_PCA(dense_matrix, npcs=5, seed=42) + + 
np.testing.assert_array_almost_equal(np.abs(reduced1), np.abs(reduced2)) + + +class TestSparsePCA: + """Tests for sparse PCA function.""" + + def test_sparse_pca_basic(self, random_seed: int) -> None: + """Test sparse PCA on sparse matrix.""" + np.random.seed(random_seed) + n, m = 100, 50 + data = np.random.randn(n, m) + data[data < 0.5] = 0 + sparse_data = sp.csr_matrix(data) + + result = ut._pca_with_sparse(sparse_data, npcs=10) + + assert "X_pca" in result + assert "variance" in result + assert "components" in result + assert result["X_pca"].shape == (n, 10) + assert result["components"].shape == (10, m) + + def test_sparse_pca_variance_ordering(self, random_seed: int) -> None: + """Test that variance is in descending order.""" + np.random.seed(random_seed) + data = np.random.randn(100, 50) + data[data < 0.5] = 0 + sparse_data = sp.csr_matrix(data) + + result = ut._pca_with_sparse(sparse_data, npcs=10) + variance = result["variance"] + + # Variance should be in descending order + assert np.all(variance[:-1] >= variance[1:]) + + +class TestNearestNeighbors: + """Tests for nearest neighbor functions.""" + + def test_calc_nnm_basic(self, knn_test_data: np.ndarray) -> None: + """Test basic k-NN computation.""" + nnm = ut.calc_nnm(knn_test_data, k=5, distance="euclidean") + + assert sp.issparse(nnm) + assert nnm.shape[0] == nnm.shape[1] == knn_test_data.shape[0] + + def test_calc_nnm_cosine(self, knn_test_data: np.ndarray) -> None: + """Test k-NN with cosine distance (HNSW).""" + nnm = ut.calc_nnm(knn_test_data, k=5, distance="cosine") + + assert sp.issparse(nnm) + assert nnm.shape[0] == knn_test_data.shape[0] + + def test_gen_sparse_knn(self) -> None: + """Test sparse k-NN matrix generation.""" + n_samples, k = 10, 3 + indices = np.tile(np.arange(k)[None, :], (n_samples, 1)) + distances = np.random.rand(n_samples, k) + + sparse_nnm = ut.gen_sparse_knn(indices, distances) + + assert sp.issparse(sparse_nnm) + assert sparse_nnm.shape == (n_samples, n_samples) + + 
class TestDistanceFunctions:
    """Tests for distance computation functions."""

    def test_euclidean_map(self) -> None:
        """Test Euclidean distance computation."""
        A = np.array([[0, 0], [1, 0], [0, 1]])
        B = np.array([[0, 0], [2, 0]])

        dist = ut.generate_euclidean_map(A, B)

        assert dist.shape == (3, 2)
        np.testing.assert_almost_equal(dist[0, 0], 0.0)  # A[0] to B[0]
        np.testing.assert_almost_equal(dist[1, 0], 1.0)  # A[1] to B[0]
        np.testing.assert_almost_equal(dist[0, 1], 2.0)  # A[0] to B[1]

    def test_correlation_map(self) -> None:
        """Test correlation computation."""
        x = np.array([[1, 2, 3], [4, 5, 6]])
        y = np.array([[1, 2, 3], [-1, -2, -3]])

        corr = ut.generate_correlation_map(x, y)

        assert corr.shape == (2, 2)
        # Perfect positive correlation
        np.testing.assert_almost_equal(corr[0, 0], 1.0, decimal=5)
        # Perfect negative correlation
        np.testing.assert_almost_equal(corr[0, 1], -1.0, decimal=5)


class TestSearchString:
    """Tests for string search function."""

    def test_basic_search(self) -> None:
        """Test basic string search."""
        vec = np.array(["Gene_A", "Gene_B", "Other_C", "gene_d"])

        matches, indices = ut.search_string(vec, "Gene")

        assert len(matches) == 3  # Gene_A, Gene_B, gene_d (case insensitive)
        assert len(indices) == 3

    def test_case_sensitive_search(self) -> None:
        """Test case-sensitive search."""
        vec = np.array(["Gene_A", "Gene_B", "gene_c"])

        matches, _indices = ut.search_string(vec, "Gene", case_sensitive=True)

        assert len(matches) == 2  # Only Gene_A, Gene_B
        assert "gene_c" not in matches

    def test_inverted_search(self) -> None:
        """Test inverted (non-matching) search."""
        vec = np.array(["Gene_A", "Gene_B", "Other_C"])

        matches, _indices = ut.search_string(vec, "Gene", invert=True)

        assert len(matches) == 1
        assert matches[0] == "Other_C"

    def test_no_matches(self) -> None:
        """Test search with no matches."""
        vec = np.array(["Gene_A", "Gene_B"])

        matches, indices = ut.search_string(vec, "NotFound")

        # search_string signals "no match" with -1 sentinels rather than
        # empty arrays
        assert matches == -1
        assert indices == -1


class TestConvertAnnotations:
    """Tests for annotation conversion."""

    def test_basic_conversion(self) -> None:
        """Test converting string labels to integers."""
        labels = np.array(["A", "B", "A", "C", "B"])

        codes = ut.convert_annotations(labels)

        # NOTE(review): exact int64 dtype is platform-dependent for some
        # integer-producing APIs (e.g. int32 on Windows) — confirm
        # convert_annotations pins int64 on all platforms
        assert codes.dtype == np.int64
        assert len(np.unique(codes)) == 3
        # Same labels should have same codes
        assert codes[0] == codes[2]  # Both "A"
        assert codes[1] == codes[4]  # Both "B"

    def test_numeric_labels(self) -> None:
        """Test with numeric labels."""
        labels = np.array([1, 2, 1, 3, 2])

        codes = ut.convert_annotations(labels)

        assert codes.dtype == np.int64
        assert len(np.unique(codes)) == 3


class TestDistToNN:
    """Tests for distance to nearest neighbor conversion."""

    def test_basic_dist_to_nn(self) -> None:
        """Test basic distance matrix to k-NN conversion."""
        # Simple 4x4 distance matrix
        dist = np.array([[0, 1, 2, 3], [1, 0, 1.5, 2.5], [2, 1.5, 0, 1], [3, 2.5, 1, 0]])

        nn = ut.dist_to_nn(dist, K=2)

        assert nn.shape == dist.shape
        # Each row should have K neighbors (value 1)
        assert np.sum(nn[0]) == 2
        assert np.sum(nn[1]) == 2


class TestHelperFunctions:
    """Tests for miscellaneous helper functions."""

    def test_to_lower(self) -> None:
        """Test lowercase conversion."""
        arr = np.array(["GENE_A", "Gene_B", "gene_c"])
        result = ut.to_lower(arr)

        assert np.all(result == np.array(["gene_a", "gene_b", "gene_c"]))

    def test_to_upper(self) -> None:
        """Test uppercase conversion."""
        arr = np.array(["gene_a", "Gene_B", "GENE_C"])
        result = ut.to_upper(arr)

        assert np.all(result == np.array(["GENE_A", "GENE_B", "GENE_C"]))

    def test_isolate(self) -> None:
        """Test point isolation in rectangular region."""
        coords = np.array([[0, 0], [1, 1], [5, 5], [10, 10]])

        # Get points in region (0.5, 6) x (0.5, 6)
        idx = ut.isolate(coords, 0.5, 6, 0.5, 6)

        assert len(idx) == 2
        assert 1 in idx  # (1, 1)
        assert 2 in idx  # (5, 5)