Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions .readthedocs.yaml
Original file line number Diff line number Diff line change
@@ -1,4 +1,6 @@
version: 2
sphinx:
configuration: docs/conf.py

build:
os: "ubuntu-20.04"
Expand Down
4 changes: 4 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -43,3 +43,7 @@ $ pip install .
## References

[Bravo Gonzalez-Blas, C. & De Winter, S. *et al.* (2022). SCENIC+: single-cell multiomic inference of enhancers and gene regulatory networks](https://www.biorxiv.org/content/10.1101/2022.08.19.504505v1)

## license

The SCENIC+ [license](https://github.com/aertslab/scenicplus/blob/main/LICENCE.txt) covers [SCENIC+](https://github.com/aertslab/scenicplus), [pycisTarget](https://github.com/aertslab/pycistarget), [pycisTopic](https://github.com/aertslab/pycisTopic) and the [cisTarget dabases](https://resources.aertslab.org/cistarget/).
18 changes: 15 additions & 3 deletions src/scenicplus/cli/commands.py
100644 → 100755
Original file line number Diff line number Diff line change
Expand Up @@ -932,6 +932,7 @@ def calculate_auc(
eRegulons_fname: pathlib.Path,
multiome_mudata_fname: pathlib.Path,
out_file: pathlib.Path,
temp_dir: pathlib.Path,
n_cpu: int = 1):
"""
Calculate eRegulon enrichment scores.
Expand All @@ -944,19 +945,30 @@ def calculate_auc(
Path to multiome MuData file.
out_file : pathlib.Path
Path to store output.
temp_dir: pathlib.Path
Temporary directory for parallel processing.
n_cpu : int
Number of parallel processes to run.

"""
from scenicplus.eregulon_enrichment import score_eRegulons

if temp_dir is not None:
if type(temp_dir) == str:
temp_dir = pathlib.Path(temp_dir)
if not temp_dir.exists():
Warning(f"{temp_dir} does not exist, creating it.")
os.makedirs(temp_dir)
log.info("Reading data.")
mdata = mudata.read(multiome_mudata_fname.__str__())
mdata = mudata.read(multiome_mudata_fname.__str__(), backed=True)
eRegulons = pd.read_table(eRegulons_fname)

log.info("Calculating enrichment scores.")
gene_region_AUC = score_eRegulons(
eRegulons=eRegulons,
gex_mtx=mdata["scRNA"].to_df(),
acc_mtx=mdata["scATAC"].to_df(),
gex_mtx=mdata["scRNA"],
acc_mtx=mdata["scATAC"],
temp_dir=temp_dir.__str__(),
n_cpu=n_cpu)
mdata_AUC = mudata.MuData(
{
Expand Down
5 changes: 5 additions & 0 deletions src/scenicplus/cli/scenicplus.py
100644 → 100755
Original file line number Diff line number Diff line change
Expand Up @@ -1003,6 +1003,7 @@ def aucell(arg):
eRegulons_fname=arg.eRegulon_fname,
multiome_mudata_fname=arg.multiome_mudata_fname,
out_file=arg.aucell_out_fname,
temp_dir=arg.temp_dir,
n_cpu=arg.n_cpu)
parser.set_defaults(func=aucell)
# Required arguments
Expand All @@ -1018,6 +1019,10 @@ def aucell(arg):
"--aucell_out_fname", dest="aucell_out_fname",
action="store", type=pathlib.Path, required=True,
help="Path to store enrichment scores (.h5mu).")
parser.add_argument(
"--temp_dir", dest="temp_dir",
action="store", type=pathlib.Path, required=True,
help="Path to temp dir.")
# Optional arguments
parser.add_argument(
"--n_cpu", dest="n_cpu",
Expand Down
119 changes: 98 additions & 21 deletions src/scenicplus/eregulon_enrichment.py
100644 → 100755
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,9 @@
from typing import Literal, Dict, List, Optional
import numpy as np
import pandas as pd
import anndata as ad
import joblib
import pathlib
from dataclasses import dataclass
from scenicplus.scenicplus_class import SCENICPLUS

Expand Down Expand Up @@ -79,34 +82,108 @@ def get_eRegulons_as_signatures(
lambda x: list(set(x))).to_dict()
return {"Gene_based": gene_signatures, "Region_based": region_signatures}

def _rank_and_enrich(
cell_df: pd.DataFrame,
signatures: Dict[str, List[str]],
auc_threshold: float = 0.05,
normalize: bool = False,
n_cpu: int = 1) -> pd.DataFrame:
"""
Compute enrichment AUC scores for signatures in a cell x feature dataframe
Nb. expects a dataframe to maintain pycistopic compatibility in signature_enrichment().
"""

ranking = rank_data(cell_df)
auc = signature_enrichment(
ranking,
signatures,
enrichment_type='gene',
auc_threshold=auc_threshold,
normalize=normalize,
n_cpu=1 # force single-threaded inside worker
)

return auc

def _chunk_scoring(
adata: ad.AnnData,
signatures: Dict[str, List[str]],
temp_dir: pathlib.Path,
chunk_size: int = 1000,
auc_threshold: float = 0.05,
normalize: bool = False,
n_cpu: int = 1) -> pd.DataFrame:
"""
Helper function that splits AnnData into chunks of cells to score in parallel.
N.b. that the chunks are turned into a dense dataframe
Increasing the chunk_size increases speed but also memory usage.
"""
# loop over sets of chunk_size cells - to avoid unsparsing the full
# cell x feature matrix when creating ranks during _rank_and_enrich()
all_cells = adata.obs_names
cell_chunks = [all_cells[i:i + chunk_size] for i in range(0, len(all_cells), chunk_size)]

# get signature enrichment scores in parallel
# concatenate in dataframe
AUC_list: List[auc] = joblib.Parallel(
n_jobs=n_cpu,
temp_folder=temp_dir
)(
joblib.delayed(
_rank_and_enrich
)(
adata[cell_chunk].to_df().copy(),
signatures,
auc_threshold=auc_threshold,
normalize=normalize
)
for cell_chunk in cell_chunks
)
AUC_scores = pd.concat(AUC_list)
# match order of indices
AUC_scores = AUC_scores.loc[adata.obs_names]

return AUC_scores

def score_eRegulons(
eRegulons: pd.DataFrame,
gex_mtx: pd.DataFrame,
acc_mtx: pd.DataFrame,
gex_mtx: ad.AnnData,
acc_mtx: ad.AnnData,
temp_dir: pathlib.Path,
auc_threshold: float = 0.05,
normalize: bool = False,
n_cpu: int = 1
) -> Dict[str, pd.DataFrame]:
chunk_size: int = 1000,
n_cpu: int = 1) -> Dict[str, pd.DataFrame]:
"""
"""
eRegulon_signatures = get_eRegulons_as_signatures(eRegulons=eRegulons)
gex_ranking = rank_data(gex_mtx)
acc_ranking = rank_data(acc_mtx)
gex_AUC = signature_enrichment(
gex_ranking,
eRegulon_signatures["Gene_based"],
enrichment_type='gene',
auc_threshold=auc_threshold,
normalize=normalize,
n_cpu=n_cpu)
acc_AUC = signature_enrichment(
acc_ranking,
eRegulon_signatures["Region_based"],
enrichment_type='gene',
auc_threshold=auc_threshold,
normalize=normalize,
n_cpu=n_cpu)
return {"Gene_based": gex_AUC, "Region_based": acc_AUC}

# get eRegulon enrichment scores for gene expression
gex_AUC = _chunk_scoring(
adata = gex_mtx,
signatures = eRegulon_signatures["Gene_based"],
temp_dir = temp_dir,
chunk_size = chunk_size,
auc_threshold = auc_threshold,
normalize = normalize,
n_cpu = n_cpu
)

# get eRegulon enrichment scores for accessibility
acc_AUC = _chunk_scoring(
adata = acc_mtx,
signatures = eRegulon_signatures["Region_based"],
temp_dir = temp_dir,
chunk_size = chunk_size,
auc_threshold = auc_threshold,
normalize = normalize,
n_cpu = n_cpu
)

return {
"Gene_based": gex_AUC,
"Region_based": acc_AUC
}

def binarize_AUC(scplus_obj: SCENICPLUS,
auc_key: Optional[str] = 'eRegulon_AUC',
Expand Down
8 changes: 8 additions & 0 deletions src/scenicplus/snakemake/Snakefile
100644 → 100755
Original file line number Diff line number Diff line change
Expand Up @@ -391,13 +391,18 @@ rule AUCell_direct:
multiome_mudata_fname=config["output_data"]["combined_GEX_ACC_mudata"],
output:
aucell_direct_out_fname=config["output_data"]["AUCell_direct"]
params:
temp_dir=lambda wildcards: config["params_general"]["temp_dir"]
threads: config["params_general"]["n_cpu"]
shell:
"""
echo "Snakemake is passing sample: {params.temp_dir}"

scenicplus grn_inference AUCell \
--eRegulon_fname {input.eRegulons_direct} \
--multiome_mudata_fname {input.multiome_mudata_fname} \
--aucell_out_fname {output.aucell_direct_out_fname} \
--temp_dir {params.temp_dir} \
--n_cpu {threads}
"""

Expand All @@ -407,13 +412,16 @@ rule AUCell_extended:
multiome_mudata_fname=config["output_data"]["combined_GEX_ACC_mudata"],
output:
aucell_extended_out_fname=config["output_data"]["AUCell_extended"]
params:
temp_dir=lambda wildcards: config["params_general"]["temp_dir"]
threads: config["params_general"]["n_cpu"]
shell:
"""
scenicplus grn_inference AUCell \
--eRegulon_fname {input.eRegulons_extended} \
--multiome_mudata_fname {input.multiome_mudata_fname} \
--aucell_out_fname {output.aucell_extended_out_fname} \
--temp_dir {params.temp_dir} \
--n_cpu {threads}
"""

Expand Down