Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@ dependencies = [
"scipy",
"seaborn",
"session-info2",
"tmtoolkit",
]
optional-dependencies.dev = [ "pre-commit", "twine>=4.0.2" ]
optional-dependencies.doc = [
Expand Down
7 changes: 2 additions & 5 deletions src/tfmindi/tl/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,14 +2,11 @@

from tfmindi.tl.cluster import cluster_seqlets
from tfmindi.tl.patterns import create_patterns
from tfmindi.tl.topic_modeling import (
evaluate_topic_models,
run_topic_modeling,
)
from tfmindi.tl.topic_modeling import add_topic_modeling_result, run_topic_modeling

__all__ = [
"cluster_seqlets",
"create_patterns",
"run_topic_modeling",
"evaluate_topic_models",
"add_topic_modeling_result",
]
235 changes: 81 additions & 154 deletions src/tfmindi/tl/topic_modeling.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,63 +2,28 @@

from __future__ import annotations

import math

import lda
import pandas as pd
from anndata import AnnData


def loglikelihood(nzw, ndz, alpha, eta):
"""Calculate log-likelihood of LDA model parameters (from pycisTopic)."""
D = ndz.shape[0]
n_topics = ndz.shape[1]
vocab_size = nzw.shape[1]

const_prior = (n_topics * math.lgamma(alpha) - math.lgamma(alpha * n_topics)) * D
const_ll = (vocab_size * math.lgamma(eta) - math.lgamma(eta * vocab_size)) * n_topics

# calculate log p(w|z)
topic_ll = 0
for k in range(n_topics):
sum = eta * vocab_size
for w in range(vocab_size):
if nzw[k, w] > 0:
topic_ll = math.lgamma(nzw[k, w] + eta)
sum += nzw[k, w]
topic_ll -= math.lgamma(sum)

# calculate log p(z)
doc_ll = 0
for d in range(D):
sum = alpha * n_topics
for k in range(n_topics):
if ndz[d, k] > 0:
doc_ll = math.lgamma(ndz[d, k] + alpha)
sum += ndz[d, k]
doc_ll -= math.lgamma(sum)

ll = doc_ll - const_prior + topic_ll - const_ll
return ll
import lda # type: ignore
import pandas as pd # type: ignore
from anndata import AnnData # type: ignore
from tmtoolkit.topicmod import tm_lda # type: ignore


def run_topic_modeling(
adata: AnnData,
n_topics: int = 40,
alpha: float = 50,
eta: float = 0.1,
n_iter: int = 150,
n_topics: int | list[int],
alpha: float | list[float] = 0.04,
eta: float | list[float] = 0.1,
n_iter: int = 2_000,
random_state: int = 123,
filter_unknown: bool = True,
) -> None:
) -> tuple[list, pd.DataFrame]:
"""
Discover co-occurring motif patterns using topic modeling on region-level data.

This function performs the following steps:
1. Group seqlets by genomic regions using stored coordinates
2. Create region-cluster count matrix from leiden assignments
3. Fit LDA model to discover topics (co-occurring cluster patterns)
4. Store fitted model and results in adata.uns and adata.obsm

Parameters
----------
Expand All @@ -70,35 +35,34 @@ def run_topic_modeling(
- adata.obs["start"]: Seqlet start positions
- adata.obs["end"]: Seqlet end positions
- adata.obs["cluster_dbd"]: DBD annotations per cluster (optional)

n_topics
Number of topics to discover

alpha
Dirichlet prior for document-topic distribution

eta
Dirichlet prior for topic-word distribution

n_iter
Number of LDA iterations

random_state
Random seed for reproducibility

filter_unknown
Whether to filter out seqlets with unknown DBD annotations

For the following parameters either a list of values or a single value is accepted:
- n_topics
- alpha
- eta
Multiple models will be fitted to cover all parameters specified by the list of values.

Returns
-------
None
Results are stored in adata:
- adata.uns['topic_modeling']: Model, parameters, and all topic-related matrices

Examples
--------
>>> import tfmindi as tm
>>> # adata with clustering results
>>> tm.tl.run_topic_modeling(adata, n_topics=40)
>>> print(f"Discovered {adata.uns['topic_modeling']['params']['n_topics']} topics")
>>> print(f"Region-topic matrix shape: {adata.obsm['X_topics'].shape}")
>>> # Now can plot directly from adata
>>> tm.pl.dbd_topic_heatmap(adata)
>>> tm.pl.region_topic_tsne(adata)
list of topic modeling results and count table
"""
# Check required columns
required_cols = ["leiden", "example_idx", "start", "end"]
Expand Down Expand Up @@ -127,21 +91,65 @@ def run_topic_modeling(
count_table.index.name = "region_id"
count_table.columns.name = "cluster"

print(f"Count matrix shape: {count_table.shape} (regions × clusters)")
print(f"Count matrix shape: {count_table.shape} (regions x clusters)")

# Fit LDA model
print(f"Fitting LDA model with {n_topics} topics...")

model = lda.LDA(
n_topics=n_topics,
n_iter=n_iter,
random_state=random_state,
alpha=alpha / n_topics, # Normalize alpha by n_topics
eta=eta,
constant_params: dict[str, int | float] = {"n_iter": n_iter, "random_state": random_state}
variable_params: dict[str, list[int] | list[float]] = {}

if isinstance(n_topics, list):
variable_params["n_topics"] = n_topics
else:
constant_params["n_topics"] = n_topics

if isinstance(alpha, list):
variable_params["alpha"] = alpha
else:
constant_params["alpha"] = alpha

if isinstance(eta, list):
variable_params["eta"] = eta
else:
constant_params["eta"] = eta

print("Fitting model")
print("\tConstant parameters:")
for _param, _val in constant_params.items():
print(f"\t\t{_param}={_val}")
print("\tVariable parameters:")
for _param, _val in variable_params.items(): # type: ignore
print(f"\t\t{_param}={_val}")

# fit multiple models in parallel
topic_modeling_results = tm_lda.evaluate_topic_models(
data=count_table.values,
varying_parameters=variable_params,
constant_parameters=constant_params,
metric=tm_lda.AVAILABLE_METRICS,
return_models=True,
)

model.fit(count_table.values)
return topic_modeling_results, count_table


def add_topic_modeling_result(adata: AnnData, model: lda.LDA, count_table: pd.DataFrame) -> None:
"""Add topic model to AnnData object.

Parameters
----------
adata
AnnData object.

model
LDA model.

count_table
seqlet by region count table.

Returns
-------
Stores topic modeling result in adata.uns["topic_modeling"]
"""
# Create region-topic matrix
region_topic = pd.DataFrame(
model.doc_topic_, index=count_table.index.values, columns=[f"Topic_{x + 1}" for x in range(model.n_topics)]
Expand All @@ -158,12 +166,11 @@ def run_topic_modeling(
adata.uns["topic_modeling"] = {
"model": model,
"params": {
"n_topics": n_topics,
"alpha": alpha,
"eta": eta,
"n_iter": n_iter,
"random_state": random_state,
"filter_unknown": filter_unknown,
"n_topics": model.n_topics,
"alpha": model.alpha,
"eta": model.eta,
"n_iter": model.n_iter,
"random_state": model.random_state,
},
"count_matrix": count_table,
"topic_cluster_matrix": topic_cluster,
Expand All @@ -174,83 +181,3 @@ def run_topic_modeling(
adata.uns["topic_modeling"]["region_topic_matrix"] = region_topic

print("Stored topic modeling results in adata.uns['topic_modeling']")


def evaluate_topic_models(
adata: AnnData,
n_topics_range: list[int] | None = None,
alpha: float = 50,
eta: float = 0.1,
n_iter: int = 150,
random_state: int = 123,
**kwargs,
) -> dict[int, float]:
"""
Evaluate multiple topic models to find optimal number of topics.

Parameters
----------
adata
AnnData object with cluster assignments and genomic coordinates
n_topics_range
List of topic numbers to evaluate (default: [10, 15, 20, 25, 30, 35, 40, 50])
alpha
Dirichlet prior for document-topic distribution (default: 50)
eta
Dirichlet prior for topic-word distribution (default: 0.1)
n_iter
Number of LDA iterations (default: 150)
random_state
Random seed for reproducibility (default: 123)
**kwargs
Additional arguments passed to run_topic_modeling

Returns
-------
Mapping of n_topics to log-likelihood scores

Note: The best-performing model is automatically stored in adata

Examples
--------
>>> import tfmindi as tm
>>> # Evaluate different numbers of topics
>>> scores = tm.tl.evaluate_topic_models(adata, n_topics_range=[10, 20, 30, 40])
>>> best_n_topics = max(scores, key=scores.get)
>>> print(f"Best number of topics: {best_n_topics}")
>>> # Best model is already stored in adata for plotting
"""
if n_topics_range is None:
n_topics_range = [10, 15, 20, 25, 30, 35, 40, 50]

print(f"Evaluating {len(n_topics_range)} different topic models...")

model_to_ll = {}
best_model_info = None
best_ll = -float("inf")

for n_topics in n_topics_range:
print(f"Training model with {n_topics} topics...")
run_topic_modeling(
adata, n_topics=n_topics, alpha=alpha, eta=eta, n_iter=n_iter, random_state=random_state, **kwargs
)

# Get the model that was just stored
model = adata.uns["topic_modeling"]["model"]
ll = loglikelihood(model.nzw_, model.ndz_, alpha / n_topics, eta)
model_to_ll[n_topics] = ll
print(f"Model with {n_topics} topics: log-likelihood = {ll:.2f}")

# Track the best model
if ll > best_ll:
best_ll = ll
best_model_info = n_topics

# Ensure the best model is stored in adata (rerun if needed)
if best_model_info != n_topics: # If best model isn't the last one trained
print(f"Storing best model with {best_model_info} topics...")
run_topic_modeling(
adata, n_topics=best_model_info, alpha=alpha, eta=eta, n_iter=n_iter, random_state=random_state, **kwargs
)

return model_to_ll
Loading