Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
35 commits
Select commit Hold shift + click to select a range
786adb5
add reading hd examiner peptide pool
Jhsmit Dec 9, 2025
0d6a223
prepare for identify by path
Jhsmit Dec 9, 2025
9a3964c
add protein filter field
Jhsmit Dec 10, 2025
448479b
refactor loader to reader, rework formats
Jhsmit Dec 10, 2025
e29079c
remove combined schema
Jhsmit Dec 10, 2025
09464ff
fix peptide pool reader and add tests
Jhsmit Dec 11, 2025
019e24c
add kingfisher HD examiner example file
Jhsmit Dec 11, 2025
561e230
refactor peptide pool reader
Jhsmit Dec 11, 2025
103810c
refactor: update format handling and improve peptide loading function…
Jhsmit Dec 11, 2025
961d9f7
example of only reading files
Jhsmit Dec 11, 2025
e9f380a
fix loading datasets example
Jhsmit Dec 11, 2025
627f780
add protein field to docs
Jhsmit Dec 11, 2025
5fa2458
add hd examiner peptides files
Jhsmit Dec 11, 2025
74eb719
cast exposure can also raise ValueError
Jhsmit Dec 11, 2025
6e0749e
group by protein and state such that dataframes can be aggregated in …
Jhsmit Dec 15, 2025
95d7895
fix hd examiner format identification
Jhsmit Dec 15, 2025
9461f2a
aggregate replicate, cluster and charge count
Jhsmit Dec 15, 2025
c19c43d
add summary function
Jhsmit Dec 15, 2025
916b973
add uptake summary table
Jhsmit Dec 15, 2025
e362e0f
add load method to formats
Jhsmit Dec 16, 2025
72c45e8
fix format names
Jhsmit Dec 16, 2025
5ba3ff9
add uptake summary converter
Jhsmit Dec 16, 2025
3adf0fe
renew test data with n_charges column
Jhsmit Dec 16, 2025
f45d70f
expand docs on n_charges
Jhsmit Dec 16, 2025
ef846aa
add function to find offset between structure and peptides
Jhsmit Dec 16, 2025
6fac61b
update docstring and allow selecting mutiple columns on join
Jhsmit Dec 16, 2025
4dd0861
make publication title required
Jhsmit Dec 16, 2025
25e9973
support loading from .zip files
Jhsmit Dec 16, 2025
6e67f95
allow loading form dir also when there is only 1 dataset present
Jhsmit Dec 16, 2025
37e40c5
delete comment
Jhsmit Dec 16, 2025
9c778e1
pass exception when format is not a string
Jhsmit Dec 17, 2025
c2e2b78
formatting
Jhsmit Dec 17, 2025
4e7ebae
remove unused code from zip example
Jhsmit Dec 17, 2025
4d9494e
check for stringIO type, format file
Jhsmit Dec 17, 2025
5b8c221
add loading .zip test
Jhsmit Dec 17, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 9 additions & 0 deletions docs/fields.md
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,12 @@ residue number of the last amino acid in the peptide
### sequence (str)
fasta sequence of the peptide

### protein (str)
protein name or identifier

HDExaminer name: Protein
DynamX name: Protein

### state (str)
state label

Expand Down Expand Up @@ -93,6 +99,9 @@ These fields are derived from other fields defined in the above sections.
added after data aggregation
Total number of replicates that were aggregated together

### n_charges
Total number of different charge states that were aggregated together

### n_clusters
added after data aggregation
Total number of isotopic clusters that were aggregated together. When replicates include multiple isotopic clusters (different charge states), this value will be larger than n_replicates.
Expand Down
290 changes: 290 additions & 0 deletions docs/hd_examiner_files/HDX export file test.csv

Large diffs are not rendered by default.

19 changes: 19 additions & 0 deletions docs/hd_examiner_formats.md
Original file line number Diff line number Diff line change
Expand Up @@ -159,6 +159,25 @@ FD control: 'MAX' (older version)
Comments:


### Kingfisher HD examiner example

File: HDX export file test.csv
Source: https://github.com/juan2089/Kingfisher-HDX/blob/Kingfisher-v1.1/www/HDX%20export%20file%20test.csv

Columns:
The first line is a header with exposure times.

The second line has the column names, starting with:
'State,Protein,Start,End,Sequence,Search RT,Charge,Max D,'

Followed by repeating blocks of:
'Start RT,End RT,#D,%D,#D right,%D right,Score,Conf,'
Format: (almost!) HD examiner summary file

This is an HD examiner 'peptide pool' file



## HD Examiner manual on exporting data

**Peptide Pool Results / Uptake Summary Table**
Expand Down
2 changes: 1 addition & 1 deletion examples/from_hxms_file.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
from typing import Optional

from hdxms_datasets.database import populate_known_ids, submit_dataset
from hdxms_datasets.loader import (
from hdxms_datasets.reader import (
read_hxms,
)
from hdxms_datasets.models import (
Expand Down
17 changes: 17 additions & 0 deletions examples/from_zip_file.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
"""Example: load a dataset directly from a .zip archive."""

from pathlib import Path

from hdxms_datasets import load_dataset

# Example dataset packaged as a single zip file
fname = "HDX_3BAE2080.zip"

# %%
# The zipped example dataset lives in the repository's tests/datasets directory
test_pth = Path(__file__).parent.parent / "tests"
database_dir = test_pth / "datasets"

# Passing the path of the .zip file should load the dataset from the archive
dataset = load_dataset(database_dir / fname)

print(dataset.states)

# %%
2 changes: 1 addition & 1 deletion examples/load_local_dynamx_cluster.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,7 @@
plot_peptides(selected, domain=(0, 1), value="frac_max_uptake")

# %%
peptides = dataset.states[0].peptides[0]
peptides = dataset.states[0].peptides[0].load()
StructureView(dataset.structure).peptide_coverage(peptides)

# %%
Expand Down
10 changes: 5 additions & 5 deletions examples/load_local_dynamx_state.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,9 +44,9 @@
# load the partially deuterated peptides
df = state.peptides[0].load(
convert=True,
aggregate=True,
# sort_rows=True,
# sort_columns=True,
aggregate=None, # dynamx state data is already aggregated
sort_rows=True,
sort_columns=True,
)
print(df.columns)
# > ['start', 'end', 'sequence', 'state', 'exposure', 'centroid_mz', 'rt', 'rt_sd', 'uptake', 'uptake_sd']
Expand Down Expand Up @@ -112,12 +112,12 @@
# %%
# show a single peptide
start, end = processed["start", "end"].row(10)
view = StructureView(dataset.structure).color_peptide(start, end, chain=["A"])
view = StructureView(dataset.structure).color_peptide(start, end)
view

# %%
# select a set of peptides for further visualization
peptides = dataset.states[0].peptides[0]
peptides = dataset.states[0].peptides[0].load()

# %%
# show regions of the structure that are covered by peptides
Expand Down
1 change: 0 additions & 1 deletion examples/load_local_hdexaminer.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,6 @@
selected = processed.filter(nw.col("exposure") == exposure_value)
plot_peptides(selected.to_polars(), value="frac_max_uptake", domain=(0, 1))
# %%
# %%

peptides = dataset.states[0].peptides[0]
StructureView(dataset.structure).peptide_coverage(selected)
34 changes: 34 additions & 0 deletions examples/read_files.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
# %%
# Example: read individual data files and convert them to the open-hdx format.

from pathlib import Path

from hdxms_datasets import identify_format

# %%

# directory containing this example script; data paths below are relative to it
cwd = Path(__file__).parent

# %%

# read a hxms file
f = cwd / "test_data" / "ecDHFR" / "ecDHFR_2025-09-23_APO.hxms"

# look up the format specification matching this file
fmt_spec = identify_format(f)
# read to dataframe
df = fmt_spec.read(f)

# convert to open-hdx format
df_converted = fmt_spec.convert(df)
# NOTE(review): presumably unwraps to the backend-native dataframe so it is
# displayed when this cell is run interactively - confirm
df_converted.to_native()

# %%
# read a DynamX file
f = cwd / "test_data" / "ecSecB" / "ecSecB_apo.csv"
# look up the format specification matching this file
fmt_spec = identify_format(f)
# read to dataframe
df = fmt_spec.read(f)

# convert to open-hdx format
df_converted = fmt_spec.convert(df)
df_converted.to_native()
# %%
4 changes: 4 additions & 0 deletions examples/test_data/ecDHFR/notes.md
Original file line number Diff line number Diff line change
Expand Up @@ -4,3 +4,7 @@ the HXMS file format manuscript.
Correct source:
https://www.biorxiv.org/content/10.1101/2025.10.14.682397v1.supplementary-material



ecDHFR tutorial.csv
Source: https://huggingface.co/spaces/glasgow-lab/PFLink
5 changes: 4 additions & 1 deletion hdxms_datasets/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@

from hdxms_datasets.__version__ import __version__
from hdxms_datasets.database import DataBase, RemoteDataBase, load_dataset, submit_dataset
from hdxms_datasets.loader import load_peptides, read_csv
from hdxms_datasets.formats import identify_format
from hdxms_datasets.models import (
Author,
DatasetMetadata,
Expand All @@ -18,8 +18,10 @@
aggregate,
apply_filters,
compute_uptake_metrics,
load_peptides,
merge_peptides,
)
from hdxms_datasets.reader import read_csv
from hdxms_datasets.utils import verify_sequence

__all__ = [
Expand All @@ -44,4 +46,5 @@
"apply_filters",
"aggregate",
"verify_sequence",
"identify_format",
]
70 changes: 64 additions & 6 deletions hdxms_datasets/convert.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,7 @@ def from_dynamx_state(dynamx_df: nw.DataFrame) -> nw.DataFrame:
Convert a DynamX state DataFrame to OpenHDX format.
"""
column_mapping = {
# TODO add Protein
"State": "state",
"Exposure": "exposure",
"Start": "start",
Expand Down Expand Up @@ -79,10 +80,11 @@ def convert_rt(rt_str: str) -> float:
return mean


def cast_exposure(df):
def cast_exposure(df: nw.DataFrame) -> nw.DataFrame:
"""Tries to cast the exposure column to float"""
try:
df = df.with_columns(nw.col("exposure").str.strip_chars("s").cast(nw.Float64))
except InvalidOperationError:
except (InvalidOperationError, ValueError, AttributeError):
pass
return df

Expand All @@ -100,12 +102,19 @@ def _fmt_extra_columns(columns: list[str] | dict[str, str] | str | None) -> dict
raise ValueError("additional_columns must be a list or dict, not {}".format(type(columns)))


def from_hdexaminer(
def from_hdexaminer_all_results(
hd_examiner_df: nw.DataFrame,
extra_columns: list[str] | dict[str, str] | str | None = None,
) -> nw.DataFrame:
"""
Convert an HDExaminer DataFrame to OpenHDX format.
Convert an HDExaminer 'All results' exported DataFrame to OpenHDX format.

To export as all results (from HDExaminer documentation):

To export all tables to a .csv file, switch to the Analysis View, then select any experiment.
Select “Tools”, then “Export”, then “All Results Tables…” or right-click on the results table
and select “Export All Tables…”. Specify a filename. HDExaminer will save the combined tables
to that file.

Args:
hd_examiner_df: DataFrame in HDExaminer format.
Expand All @@ -116,7 +125,7 @@ def from_hdexaminer(
A DataFrame in OpenHDX format.

"""
from hdxms_datasets.loader import BACKEND
from hdxms_datasets.reader import BACKEND

column_mapping = {
"Protein State": "state",
Expand All @@ -139,19 +148,68 @@ def from_hdexaminer(
column_mapping.update(cols)
column_order.extend(cols.values())

# TODO: parse to two columns, start_rt, end_rt
rt_values = [convert_rt(rt_str) for rt_str in hd_examiner_df["Actual RT"]]
rt_series = nw.new_series(values=rt_values, name="rt", backend=BACKEND)

df = (
hd_examiner_df.rename(column_mapping)
.with_columns([centroid_mass, rt_series])
.select(column_order)
.sort(by=["state", "exposure", "start", "end", "replicate"])
.sort(
by=["state", "exposure", "start", "end", "replicate"]
) # TODO sort by protein first (if available), take from global var
)

return cast_exposure(df)


def from_hdexaminer_peptide_pool(df: nw.DataFrame) -> nw.DataFrame:
    """Convert an HDExaminer 'peptide pool' DataFrame to OpenHDX format.

    The HDExaminer columns are renamed to their OpenHDX field names and only
    those renamed columns are kept, in the order listed in the mapping below.
    Rows are returned in their incoming order (no sorting is applied).

    Args:
        df: DataFrame with HDExaminer peptide pool column names.

    Returns:
        A DataFrame in OpenHDX format, passed through `cast_exposure`.

    """
    # HDExaminer column name -> OpenHDX field name.
    # The insertion order of this dict defines the output column order.
    renames = {
        "State": "state",
        "Exposure": "exposure",
        "Start": "start",
        "End": "end",
        "Sequence": "sequence",
        "Charge": "charge",
        "#D": "uptake",
        "Start RT": "start_rt",
        "End RT": "end_rt",
        "Search RT": "search_rt",
    }
    openhdx_columns = list(renames.values())

    converted = df.rename(renames).select(openhdx_columns)
    return cast_exposure(converted)


def from_hdexaminer_uptake_summary(df: nw.DataFrame) -> nw.DataFrame:
    """Convert an HDExaminer 'uptake summary' DataFrame to OpenHDX format.

    The HDExaminer columns are renamed to their OpenHDX field names and only
    those renamed columns are kept, in the order listed in the mapping below.
    Rows are returned in their incoming order (no sorting is applied).

    Args:
        df: DataFrame with HDExaminer uptake summary column names.

    Returns:
        A DataFrame in OpenHDX format, passed through `cast_exposure`.

    """
    # HDExaminer column name -> OpenHDX field name.
    # The insertion order of this dict defines the output column order.
    # NOTE: 'Peptide Mass' is not mapped (open question in the original code).
    renames = {
        "Protein": "protein",
        "Protein State": "state",
        "Start": "start",
        "End": "end",
        "Deut Time (sec)": "exposure",
        "Sequence": "sequence",
        "#D": "uptake",
        "RT (min)": "rt",
        "#Rep": "n_replicates",
    }
    openhdx_columns = list(renames.values())

    converted = df.rename(renames).select(openhdx_columns)
    return cast_exposure(converted)


def from_hxms(
hxms_df: nw.DataFrame,
extra_columns: list[str] | dict[str, str] | str | None = "sequence",
Expand Down
Loading
Loading