Skip to content
Open
Show file tree
Hide file tree
Changes from 17 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
25 changes: 20 additions & 5 deletions rdock-utils/pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -6,10 +6,12 @@ description = "Utilities for working with RDock and operating on SD files"
requires-python = ">=3.10.0"

[tool.setuptools.dynamic]
dependencies = {file = ["requirements.txt"]}
optional-dependencies = { dev = {file = ["requirements-dev.txt"]} }
dependencies = { file = ["requirements.txt"] }
optional-dependencies = { dev = { file = ["requirements-dev.txt"] } }

[project.scripts]
rbhtfinder = "rdock_utils.rbhtfinder:main"
rbhtfinder_old = "rdock_utils.rbhtfinder_original_copy:main"
sdfield = "rdock_utils.sdfield:main"
sdrmsd_old = "rdock_utils.sdrmsd_original:main"
sdrmsd = "rdock_utils.sdrmsd.main:main"
Expand All @@ -26,11 +28,16 @@ Repository = "https://github.com/CBDD/rDock.git"
[tool.ruff]
line-length = 119
target-version = "py312"
exclude = [".git", "__pycache__", "rdock_utils/sdrmsd_original.py", "rdock_utils/sdtether_original.py"]
exclude = [
".git",
"__pycache__",
"rdock_utils/sdrmsd_original.py",
"rdock_utils/sdtether_original.py",
]

[tool.ruff.lint]
select = ["E4", "E7", "E9", "F", "I"]
ignore = ["E231","E501","E203"]
ignore = ["E231", "E501", "E203"]

[tool.ruff.format]
quote-style = "double"
Expand Down Expand Up @@ -67,4 +74,12 @@ no_implicit_reexport = false

strict_equality = true

exclude = ["build/*", "rdock_utils/sdrmsd_original.py", "tests/", "rdock_utils/sdtether_original.py"]
exclude = [
"build/*",
"rdock_utils/sdrmsd_original.py",
"tests/",
"rdock_utils/sdtether_original.py",
"rdock_utils/rbhtfinder_original_copy.py",
]

plugins = "numpy.typing.mypy_plugin"
22 changes: 21 additions & 1 deletion rdock-utils/rdock_utils/common/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,11 +2,21 @@
from .SDFParser import FastSDMol, molecules_with_progress_log, read_molecules, read_molecules_from_all_inputs
from .superpose3d import MolAlignmentData, Superpose3D, update_coordinates
from .types import (
Array1DFloat,
Array1DInt,
Array1DStr,
Array2DFloat,
Array3DFloat,
AtomsMapping,
AutomorphismRMSD,
ColumnNamesArray,
CoordsArray,
FilterCombination,
FloatArray,
InputData,
Matrix3x3,
MinScoreIndices,
SDReportArray,
SingularValueDecomposition,
Superpose3DResult,
Vector3D,
Expand All @@ -25,11 +35,21 @@
"MolAlignmentData",
"Superpose3D",
# -- types --
"Array1DFloat",
"Array1DInt",
"Array1DStr",
"Array2DFloat",
"Array3DFloat",
"AtomsMapping",
"AutomorphismRMSD",
"ColumnNamesArray",
"CoordsArray",
"FilterCombination",
"FloatArray",
"AtomsMapping",
"InputData",
"Matrix3x3",
"MinScoreIndices",
"SDReportArray",
"SingularValueDecomposition",
"Superpose3DResult",
"Vector3D",
Expand Down
39 changes: 27 additions & 12 deletions rdock-utils/rdock_utils/common/types.py
Original file line number Diff line number Diff line change
@@ -1,23 +1,38 @@
from typing import Any

import numpy
import numpy as np
import numpy.typing as npt
Copy link
Copy Markdown
Contributor

@ggutierrez-sunbright ggutierrez-sunbright Jul 28, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I know the standard/recommendation for numpy imports is to give it the np alias, but I'd rather have the full name. It's not like it saves a big amount of characters, and it is less "clear" / "pythonic".

This one is a purely stylistic choice, just the same way I'd name a variable velocity_x, instead of just vx, for anything other than a prototype.

(same thing with pandas, matplotlib.pyplot, etc)


FloatArray = numpy.ndarray[Any, numpy.dtype[numpy.float64]]
CoordsArray = numpy.ndarray[Any, numpy.dtype[numpy.float64]]
# TODO: Review common types for all rdock_utils scripts
# SDRMSD types
FloatArray = np.ndarray[Any, np.dtype[np.float64]]
CoordsArray = np.ndarray[Any, np.dtype[np.float64]]
AutomorphismRMSD = tuple[float, CoordsArray | None]
Vector3D = numpy.ndarray[Any, numpy.dtype[numpy.float64]]
Matrix3x3 = numpy.ndarray[Any, numpy.dtype[numpy.float64]]
Vector3D = np.ndarray[Any, np.dtype[np.float64]]
Matrix3x3 = np.ndarray[Any, np.dtype[np.float64]]
SingularValueDecomposition = tuple[Matrix3x3, Vector3D, Matrix3x3]
Superpose3DResult = tuple[CoordsArray, float, Matrix3x3]
AtomsMapping = tuple[tuple[int, int], ...]

## Shape support for type hinting is not yet avaialable in numpy
## let's keep this as a guide for numpy 2.0 release
# FloatArray = numpy.ndarray[Literal["N"], numpy.dtype[float]]
# BoolArray = numpy.ndarray[Literal["N"], numpy.dtype[bool]]
# CoordsArray = numpy.ndarray[Literal["N", 3], numpy.dtype[float]]
# RBHTFinder types
SDReportArray = np.ndarray[list[int | str | float], np.dtype[np.object_]]
Array1DFloat = npt.NDArray[np.float_]
Array2DFloat = npt.NDArray[np.float_]
Array3DFloat = npt.NDArray[np.float_]
Array1DStr = npt.NDArray[np.str_]
Array1DInt = npt.NDArray[np.int_]
ColumnNamesArray = Array1DStr | list[str]
InputData = tuple[SDReportArray, ColumnNamesArray]
MinScoreIndices = dict[int, Array1DInt]
FilterCombination = tuple[float, float]

## Shape support for type hinting is not yet avaialable in np
## let's keep this as a guide for np 2.0 release
# FloatArray = np.ndarray[Literal["N"], np.dtype[float]]
# BoolArray = np.ndarray[Literal["N"], np.dtype[bool]]
# CoordsArray = np.ndarray[Literal["N", 3], np.dtype[float]]
# AutomorphismRMSD = tuple[float, CoordsArray | None]
# Vector3D = numpy.ndarray[Literal[3], numpy.dtype[float]]
# Matrix3x3 = numpy.ndarray[Literal[3, 3], numpy.dtype[float]]
# Vector3D = np.ndarray[Literal[3], np.dtype[float]]
# Matrix3x3 = np.ndarray[Literal[3, 3], np.dtype[float]]
# SingularValueDecomposition = tuple[Matrix3x3, Vector3D, Matrix3x3]
# Superpose3DResult = tuple[CoordsArray, float, Matrix3x3]
Empty file.
12 changes: 12 additions & 0 deletions rdock-utils/rdock_utils/rbhtfinder/main.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
from .parser import get_config
from .rbhtfinder import RBHTFinder


def main(argv: list[str] | None = None) -> None:
config = get_config(argv)
rbhtfinder = RBHTFinder(config)
rbhtfinder.run()


if __name__ == "__main__":
main()
141 changes: 141 additions & 0 deletions rdock-utils/rdock_utils/rbhtfinder/parser.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,141 @@
import argparse
from dataclasses import dataclass

from .schemas import Filter


@dataclass
class RBHTFinderConfig:
input: str
output: str
threshold: str | None
name: int
filters: list[Filter]
validation: int
header: bool
max_time: float
min_percentage: float

def __post_init__(self) -> None:
self.filters = self.get_parsed_filters()

def get_parsed_filters(self) -> list[Filter]:
filter_args: list[str] = self.filters # type: ignore
parsed_filters = [self._parse_filter(arg) for arg in filter_args]
# sort filters by step at which they are applied
parsed_filters.sort(key=lambda filter: filter.steps)
return parsed_filters

@staticmethod
def _parse_filter(argument: str) -> Filter:
parsed_filter = Filter()

for item in argument.split(","):
key, value = item.split("=")
setattr(parsed_filter, key, float(value) if key in ("interval", "min", "max") else int(value))
# User inputs with 1-based numbering whereas python uses 0-based
parsed_filter.column -= 1
parsed_filter.interval = parsed_filter.interval or 1.0
return parsed_filter


def get_parser() -> argparse.ArgumentParser:
description = """
Estimate the results and computation time of an rDock high-throughput protocol.

Steps:
1. Perform exhaustive docking of a small representative part of the entire library.
2. Store the result of sdreport -t from that exhaustive docking run in a file
<sdreport_file>, which will be the input of this script.
3. Run rbhtfinder, specifying -i <sdreport_file> and an arbitrary number of filters
using the -f option, for example, "-f column=6,steps=5,min=0.5,max=1.0,interval=0.1".
This example would simulate the effect of applying thresholds on column 6 after 5 poses
have been generated, for values between 0.5 and 1.0 (i.e., 0.5, 0.6, 0.7, 0.8, 0.9, 1.0).
More than one threshold can be specified, e.g.,
"-f column=4,steps=5,min=-12,max=-10,interval=1 column=4,steps=15,min=-16,max=-15,interval=1"
will test the following combinations of thresholds on column 4:
5 -10 15 -15
5 -11 15 -15
5 -12 15 -15
5 -10 15 -16
5 -11 15 -16
5 -12 15 -16
The number of combinations will increase very rapidly, the more filters are used and the
larger the range of values specified for each. It may be sensible to run rbhtfinder several
times to explore the effects of various filters independently.

Output:
The output of the program consists of the following columns:
FILTER1 NSTEPS1 THR1 PERC1 TOP500_SCORE.INTER ENRICH_SCORE.INTER TIME
SCORE.INTER 5 -13.00 6.04 72.80 12.05 0.0500
SCORE.INTER 5 -12.00 9.96 82.80 8.31 0.0500
The four columns are repeated for each filter specified with the -f option:
name of the column on which the filter is applied (FILTER1),
number of steps at which the threshold is applied (NSTEPS1),
value of the threshold (THR1)
and the percentage of poses which pass this filter (PERC1).
Additional filters (FILTER2, FILTER3 etc.) are listed in the order that they are applied
(i.e., by NSTEPS).

The final columns provide some overall statistics for the combination of thresholds
specified in a row. TOP500_SCORE.INTER gives the percentage of the top-scoring 500 poses,
measured by SCORE.INTER, from the whole of <sdreport_file> which are retained after the
thresholds are applied. This can be contrasted with the final PERC column. The higher the
ratio (the 'enrichment factor'), the better the combination of thresholds. If thresholds are
applied on multiple columns, this column will be duplicated for each, e.g. TOP500_SCORE.INTER
and TOP500_SCORE.RESTR will give the percentage of the top-scoring poses retained for both of
these scoring methods. The exact number of poses used for this validation can be changed from
the default 500 using the --validation flag.
ENRICH_SCORE.INTER gives the enrichment factor as a quick rule-of-thumb to assess the best
choice of thresholds. The final column TIME provides an estimate of the time taken to perform
docking, as a proportion of the time taken for exhaustive docking. This value should be below
0.1.

After a combination of thresholds has been selected, they need to be encoded into a threshold
file which rDock can use as an input. rbhtfinder attempts to help with this task by
automatically selecting a combination and writing a threshold file. The combination chosen is
that which provides the highest enrichment factor, after all options with a TIME value over
0.1 are excluded. This choice should not be blindly followed, so the threshold file should be
considered a template that the user modifies as needed.

Requirements:
rbhtfinder requires NumPy. Installation of Pandas is recommended, but optional; if Pandas is
not available, loading the input file for calculations will be considerably slower.
"""
input_help = "Input from sdreport (tabular separated format)."
output_help = "Output file for report on threshold combinations."
threshold_help = "Threshold file used by rDock as input."
name_help = "Index of column containing the molecule name (0 indexed). Default is 1."
filter_help = "Filter to apply, e.g. column=4,steps=5,min=-10,max=-15,interval=1 will test applying a filter to column 4 after generation of 5 poses, with threshold values between -10 and -15 tested. The variables column, steps, min and max must all be specified; interval defaults to 1 if not given."
validation_help = "Top-scoring N molecules from input to use for validating threshold combinations. Default 500."
header_help = "Specify if the input file from sdreport contains a header line with column names. If not, output files will describe columns using indices, e.g. COL4, COL5."
max_time_help = "Maximum value for time to use when autogenerating a high-throughput protocol - default is 0.1, i.e. 10%% of the time exhaustive docking would take."
min_perc_help = "Minimum value for the estimated final percentage of compounds to use when autogenerating a high-throughput protocol - default is 1."

parser = argparse.ArgumentParser(description=description, formatter_class=argparse.RawTextHelpFormatter)
parser.add_argument("-i", "--input", help=input_help, type=str, required=True)
parser.add_argument("-o", "--output", help=output_help, type=str, required=True)
parser.add_argument("-t", "--threshold", help=threshold_help, type=str)
parser.add_argument("-n", "--name", type=int, default=1, help=name_help)
parser.add_argument("-f", "--filters", nargs="+", type=str, help=filter_help, required=True) # Review 'required'
parser.add_argument("-v", "--validation", type=int, default=500, help=validation_help)
parser.add_argument("--header", action="store_true", help=header_help)
parser.add_argument("--max-time", type=float, default=0.1, help=max_time_help)
parser.add_argument("--min-perc", type=float, default=1.0, help=min_perc_help)
return parser


def get_config(argv: list[str] | None = None) -> RBHTFinderConfig:
parser = get_parser()
args = parser.parse_args(argv)
return RBHTFinderConfig(
input=args.input,
output=args.output,
threshold=args.threshold,
name=args.name,
filters=args.filters,
validation=args.validation,
header=args.header,
max_time=args.max_time,
min_percentage=args.min_perc,
)
Loading