Skip to content
Open
Show file tree
Hide file tree
Changes from 43 commits
Commits
Show all changes
46 commits
Select commit Hold shift + click to select a range
1a40fd7
wip
lotif Jan 8, 2026
1d18580
wip
lotif Jan 8, 2026
e42e630
WIP moving forward with the ensemble attack code changes
lotif Jan 13, 2026
a46a010
WIP adding training and synthesizing code
lotif Jan 13, 2026
30c0ed3
More info on readme
lotif Jan 14, 2026
9464962
More ctgan changes
lotif Feb 23, 2026
e5c8fda
Adding the split data code
lotif Feb 24, 2026
8f10678
More config changes and bug fixes
lotif Feb 24, 2026
077d909
Removing ids dynamically
lotif Feb 25, 2026
b711fbd
Working!
lotif Feb 25, 2026
efdde68
Merge branch 'main' into marcelo/ensamble-ctgan
lotif Mar 3, 2026
1a38af2
Fixing indent on config file and adding some more information to the …
lotif Mar 3, 2026
af4f04e
Adding test attack model code
lotif Mar 4, 2026
5afb774
Small bug fixes
lotif Mar 5, 2026
e4ec793
Updates to readme and config file values
lotif Mar 5, 2026
1c13126
Small changes on configs and script bug fixes
lotif Mar 5, 2026
4e9a8c9
Adding the compute attack success script and fixing minor issues
lotif Mar 5, 2026
d83aabf
Cr by CodeRabbit and Sara
lotif Mar 9, 2026
a198fe9
Reducing the amount of training samples to 20k
lotif Mar 9, 2026
0416dbc
Merge branch 'main' into marcelo/ensamble-ctgan
lotif Mar 9, 2026
e69b07e
Change function name to avoid pytest thinking it's a test
lotif Mar 9, 2026
579d0f3
Merge remote-tracking branch 'origin/marcelo/ensamble-ctgan' into mar…
lotif Mar 9, 2026
5fa4fef
Fixing test assertions
lotif Mar 9, 2026
8b6bf10
Merge branch 'main' into marcelo/ensamble-ctgan
lotif Mar 9, 2026
a9369f6
Making population_all_with_challenge.csv into a constant and adding a…
lotif Mar 13, 2026
163bba8
Addressing last comments by Fatemeh
lotif Mar 16, 2026
bf805c1
Merge branch 'main' into marcelo/ensamble-ctgan
lotif Mar 16, 2026
ecab1e2
WIP adding model runner class
lotif Mar 16, 2026
dda8c5e
Merge branch 'main' into marcelo/support-attack-models
lotif Mar 18, 2026
38a20b5
working first refactor
lotif Mar 18, 2026
ac1a0bf
train attack model working
lotif Mar 18, 2026
2c3fa1e
Adding changes for the test model script
lotif Mar 19, 2026
ca87ac3
Linter changes
lotif Mar 19, 2026
cfb4ded
Merge branch 'main' into marcelo/support-attack-models
lotif Mar 19, 2026
c42ee6e
Fixing mypy and ruff
lotif Mar 19, 2026
093b0e4
Tests passing
lotif Mar 19, 2026
d50ff39
renaming model to models
lotif Mar 19, 2026
7135924
Small bug fix
lotif Mar 19, 2026
082ea7c
Bringing back the config json saving function against my will
lotif Mar 19, 2026
94da62e
one more bug fix
lotif Mar 19, 2026
cc2cb81
Fixing the test
lotif Mar 19, 2026
fe78e34
One more refactor to make things simpler.
lotif Mar 19, 2026
26f88f6
CR by Coderabbit
lotif Mar 19, 2026
5137a87
Fixing a bug on the amount of shadow model samples to generate
lotif Mar 20, 2026
068d936
CR by David
lotif Mar 23, 2026
e0007e8
Merge branch 'main' into marcelo/support-attack-models
emersodb Mar 25, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,7 @@ site/
# Test artifacts
tests/integration/attacks/tartan_federer/assets/tabddpm_models/**/challenge_label_predictions.csv
tests/integration/attacks/tartan_federer/assets/tartan_federer_attack_results
tests/integration/attacks/ensemble/assets/workspace

# Training Logs
*.err
Expand Down
22 changes: 18 additions & 4 deletions examples/ensemble_attack/run_attack.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
provided resources and data.
"""

import json
from logging import INFO
from pathlib import Path

Expand All @@ -13,7 +14,8 @@
import examples.ensemble_attack.run_shadow_model_training as shadow_pipeline
from examples.ensemble_attack.real_data_collection import COLLECTED_DATA_FILE_NAME, collect_population_data_ensemble
from midst_toolkit.attacks.ensemble.data_utils import load_dataframe
from midst_toolkit.attacks.ensemble.process_split_data import process_split_data
from midst_toolkit.attacks.ensemble.models import EnsembleAttackTabDDPMModelRunner, EnsembleAttackTabDDPMTrainingConfig
from midst_toolkit.attacks.ensemble.process_split_data import PROCESSED_TRAIN_DATA_FILE_NAME, process_split_data
from midst_toolkit.common.logger import log
from midst_toolkit.common.random import set_all_random_seeds

Expand Down Expand Up @@ -79,12 +81,24 @@ def main(config: DictConfig) -> None:
if config.pipeline.run_shadow_model_training:
df_master_challenge_train = load_dataframe(
Path(config.data_paths.processed_attack_data_path),
"master_challenge_train.csv",
PROCESSED_TRAIN_DATA_FILE_NAME,
)
shadow_data_paths = shadow_pipeline.run_shadow_model_training(config, df_master_challenge_train)

with open(config.shadow_training.training_json_config_paths.training_config_path, "r") as file:
training_config = EnsembleAttackTabDDPMTrainingConfig(**json.load(file))
training_config.fine_tuning_diffusion_iterations = (
config.shadow_training.fine_tuning_config.fine_tune_diffusion_iterations
)
training_config.fine_tuning_classifier_iterations = (
config.shadow_training.fine_tuning_config.fine_tune_classifier_iterations
)

model_runner = EnsembleAttackTabDDPMModelRunner(training_config=training_config)
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Perhaps you've already thought of this, but should the code above be part of the base for the ModelRunner? That is, should lines 87-94 actually happen inside that class rather than in the attack script here?

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This would also slightly simplify the process of subbing out the model, since you would just need to sub the runner class instead of both the runner and the config class? I might be missing a complexity though.

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Not sure if I understood your idea, but I thought maybe if I pass the config dictionary to the init of the model runner class we would be able to skip making the config. Is that it?

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Sort of. My thought was that you could simply have the EnsembleAttackTabDDPMModelRunner init take a path to the configuration file. Then you could load the file and do all of the steps to properly construct the EnsembleAttackTabDDPMTrainingConfig object within the runner class. That way a user doesn't have to do that themselves.

It's possible I'm missing something where that would be a bad idea though 🙂

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Let me know if my explanation of what I was trying to suggest isn't clear. We can talk about it together.


shadow_data_paths = shadow_pipeline.run_shadow_model_training(model_runner, config, df_master_challenge_train)
shadow_data_paths = [Path(path) for path in shadow_data_paths]

target_model_synthetic_path = shadow_pipeline.run_target_model_training(config)
target_model_synthetic_path = shadow_pipeline.run_target_model_training(model_runner, config)

if config.pipeline.run_metaclassifier_training:
if not config.pipeline.run_shadow_model_training:
Expand Down
14 changes: 10 additions & 4 deletions examples/ensemble_attack/run_metaclassifier_training.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,12 @@
from examples.ensemble_attack.real_data_collection import COLLECTED_DATA_FILE_NAME
from midst_toolkit.attacks.ensemble.blending import BlendingPlusPlus, MetaClassifierType
from midst_toolkit.attacks.ensemble.data_utils import load_dataframe
from midst_toolkit.attacks.ensemble.process_split_data import (
PROCESSED_TEST_DATA_FILE_NAME,
PROCESSED_TEST_LABELS_FILE_NAME,
PROCESSED_TRAIN_DATA_FILE_NAME,
PROCESSED_TRAIN_LABELS_FILE_NAME,
)
from midst_toolkit.common.logger import log


Expand All @@ -32,20 +38,20 @@ def run_metaclassifier_training(
# Load the processed data splits.
df_meta_train = load_dataframe(
Path(config.data_paths.processed_attack_data_path),
"master_challenge_train.csv",
PROCESSED_TRAIN_DATA_FILE_NAME,
)

# y_meta_train consists of binary labels (0s and 1s) indicating whether each row in df_meta_train
# belongs to the target model's training set.
y_meta_train = np.load(
Path(config.data_paths.processed_attack_data_path) / "master_challenge_train_labels.npy",
Path(config.data_paths.processed_attack_data_path) / PROCESSED_TRAIN_LABELS_FILE_NAME,
)
df_meta_test = load_dataframe(
Path(config.data_paths.processed_attack_data_path),
"master_challenge_test.csv",
PROCESSED_TEST_DATA_FILE_NAME,
)
y_meta_test = np.load(
Path(config.data_paths.processed_attack_data_path) / "master_challenge_test_labels.npy",
Path(config.data_paths.processed_attack_data_path) / PROCESSED_TEST_LABELS_FILE_NAME,
)

# Three sets of shadow models are trained separately and their paths are provided here.
Expand Down
69 changes: 23 additions & 46 deletions examples/ensemble_attack/run_shadow_model_training.py
Original file line number Diff line number Diff line change
@@ -1,37 +1,31 @@
import shutil
from logging import INFO
from pathlib import Path
from typing import cast

import pandas as pd
from omegaconf import DictConfig

from examples.ensemble_attack.real_data_collection import COLLECTED_DATA_FILE_NAME
from midst_toolkit.attacks.ensemble.data_utils import load_dataframe
from midst_toolkit.attacks.ensemble.rmia.shadow_model_training import (
train_three_sets_of_shadow_models,
)
from midst_toolkit.attacks.ensemble.shadow_model_utils import (
ModelType,
TrainingResult,
save_additional_training_config,
train_or_fine_tune_and_synthesize_with_ctgan,
train_tabddpm_and_synthesize,
from examples.ensemble_attack.real_data_collection import (
COLLECTED_DATA_FILE_NAME,
)
from midst_toolkit.common.config import ClavaDDPMTrainingConfig, CTGANTrainingConfig
from midst_toolkit.attacks.ensemble.data_utils import load_dataframe
from midst_toolkit.attacks.ensemble.models import EnsembleAttackModelRunner
from midst_toolkit.attacks.ensemble.rmia.shadow_model_training import train_three_sets_of_shadow_models
from midst_toolkit.attacks.ensemble.shadow_model_utils import update_and_save_training_config
from midst_toolkit.common.logger import log


DEFAULT_TABLE_NAME = "trans"
DEFAULT_ID_COLUMN_NAME = "trans_id"
DEFAULT_MODEL_TYPE = ModelType.TABDDPM


def run_target_model_training(config: DictConfig) -> Path:
def run_target_model_training(model_runner: EnsembleAttackModelRunner, config: DictConfig) -> Path:
"""
Function to run the target model training for RMIA attack.

Args:
model_runner: The model runner to be used for training the target model.
Should be an instance of a subclass of `EnsembleAttackModelRunner`.
config: Configuration object set in config.yaml.

Returns:
Expand All @@ -54,11 +48,6 @@ def run_target_model_training(config: DictConfig) -> Path:

target_folder = target_model_output_path / "target_model"

model_type = DEFAULT_MODEL_TYPE
if "model_name" in config.shadow_training:
model_type = ModelType(config.shadow_training.model_name)
log(INFO, f"Training target model with model type: {model_type.value}")

target_folder.mkdir(parents=True, exist_ok=True)
shutil.copyfile(
target_training_json_config_paths.table_domain_file_path,
Expand All @@ -68,30 +57,16 @@ def run_target_model_training(config: DictConfig) -> Path:
target_training_json_config_paths.dataset_meta_file_path,
target_folder / "dataset_meta.json",
)
configs, save_dir = save_additional_training_config(

configs = update_and_save_training_config(
config=model_runner.training_config,
data_dir=target_folder,
training_config_json_path=Path(target_training_json_config_paths.training_config_path),
final_config_json_path=target_folder / f"{table_name}.json", # Path to the new json
experiment_name="trained_target_model",
model_type=model_type,
)
model_runner.training_config = configs

train_result: TrainingResult
if model_type == ModelType.TABDDPM:
train_result = train_tabddpm_and_synthesize(
train_set=df_real_data,
configs=cast(ClavaDDPMTrainingConfig, configs),
save_dir=save_dir,
synthesize=True,
number_of_points_to_synthesize=config.shadow_training.number_of_points_to_synthesize,
)
elif model_type == ModelType.CTGAN:
train_result = train_or_fine_tune_and_synthesize_with_ctgan(
dataset=df_real_data,
configs=cast(CTGANTrainingConfig, configs),
save_dir=save_dir,
synthesize=True,
)
train_result = model_runner.train_or_fine_tune_and_synthesize(dataset=df_real_data, synthesize=True)
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is a much needed change!


# To train the attack model (metaclassifier), we only need to save target's synthetic data,
# and not the entire target model's training result object.
Expand All @@ -105,11 +80,17 @@ def run_target_model_training(config: DictConfig) -> Path:
return target_model_synthetic_path


def run_shadow_model_training(config: DictConfig, df_challenge_train: pd.DataFrame) -> list[Path]:
def run_shadow_model_training(
model_runner: EnsembleAttackModelRunner,
config: DictConfig,
df_challenge_train: pd.DataFrame,
) -> list[Path]:
"""
Function to run the shadow model training for RMIA attack.

Args:
model_runner: The model runner to be used for training the shadow models.
Should be an instance of `EnsembleAttackModelRunner`.
config: Configuration object set in config.yaml.
df_challenge_train: DataFrame containing the data that is used to train RMIA shadow models.

Expand All @@ -130,10 +111,7 @@ def run_shadow_model_training(config: DictConfig, df_challenge_train: pd.DataFra
# Population data is used to pre-train some of the shadow models.
df_population_with_challenge = load_dataframe(Path(config.data_paths.population_path), data_file_name)

model_type = DEFAULT_MODEL_TYPE
if "model_name" in config.shadow_training:
model_type = ModelType(config.shadow_training.model_name)
log(INFO, f"Training shadow models with model type: {model_type.value}")
log(INFO, f"Training shadow models with model runner: {model_runner}")

# Make sure master challenge train and population data have the id column.
assert id_column_name in df_challenge_train.columns, (
Expand All @@ -146,6 +124,7 @@ def run_shadow_model_training(config: DictConfig, df_challenge_train: pd.DataFra
# ``master_challenge_df`` is used for fine-tuning for half of the shadow models.
# For the other half of the shadow models, only ``master_challenge_df`` is used for training.
first_set_result_path, second_set_result_path, third_set_result_path = train_three_sets_of_shadow_models(
model_runner=model_runner,
population_data=df_population_with_challenge,
master_challenge_data=df_challenge_train,
shadow_models_output_path=Path(config.shadow_training.shadow_models_output_path),
Expand All @@ -157,9 +136,7 @@ def run_shadow_model_training(config: DictConfig, df_challenge_train: pd.DataFra
# ``4 * n_models_per_set`` total shadow models.
n_models_per_set=4, # 4 based on the original code, must be even
n_reps=12, # Number of repetitions of challenge points in each shadow model training set. `12` based on the original code
number_of_points_to_synthesize=config.shadow_training.number_of_points_to_synthesize,
random_seed=config.random_seed,
model_type=model_type,
)
log(
INFO,
Expand Down
71 changes: 56 additions & 15 deletions examples/ensemble_attack/test_attack_model.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,12 @@
from examples.ensemble_attack.run_shadow_model_training import run_shadow_model_training
from midst_toolkit.attacks.ensemble.blending import BlendingPlusPlus, MetaClassifierType
from midst_toolkit.attacks.ensemble.data_utils import load_dataframe
from midst_toolkit.attacks.ensemble.models import (
EnsembleAttackModelRunner,
EnsembleAttackTabDDPMModelRunner,
EnsembleAttackTabDDPMTrainingConfig,
)
from midst_toolkit.attacks.ensemble.process_split_data import PROCESSED_TRAIN_DATA_FILE_NAME
from midst_toolkit.common.logger import log
from midst_toolkit.common.random import set_all_random_seeds
from midst_toolkit.models.clavaddpm.train import get_df_without_id
Expand Down Expand Up @@ -87,7 +93,11 @@ def extract_primary_id_column(
return data_frame[id_column_name]


def run_rmia_shadow_training(config: DictConfig, df_challenge: pd.DataFrame) -> list[dict[str, list[Any]]]:
def run_rmia_shadow_training(
model_runner: EnsembleAttackModelRunner,
config: DictConfig,
df_challenge: pd.DataFrame,
) -> list[dict[str, list[Any]]]:
"""
Three sets of shadow models will be trained as a part of this attack.
Note that shadow models need to be trained on the collection of challenge points once and used
Expand All @@ -96,14 +106,16 @@ def run_rmia_shadow_training(config: DictConfig, df_challenge: pd.DataFrame) ->
of the shadow models, and these shadow models are used to attack all target models.

Args:
config: Configuration object set in ``experiments_config.yaml``.
model_runner: The model runner to be used for training the shadow models.
Should be an instance of `EnsembleAttackModelRunner`.
config: Configuration object set in config.yaml.
df_challenge: DataFrame containing the challenge data points for shadow model training.

Return:
A list containing three dictionaries, each representing a collection of shadow
models with their training data and generated synthetic outputs.
"""
shadow_model_paths = run_shadow_model_training(config, df_challenge_train=df_challenge)
shadow_model_paths = run_shadow_model_training(model_runner, config, df_challenge_train=df_challenge)

assert len(shadow_model_paths) == 3, "For testing, meta classifier needs the path to three sets of shadow models."

Expand Down Expand Up @@ -198,7 +210,7 @@ def collect_challenge_and_train_data(
# Load master challenge train data
df_master_train = load_dataframe(
processed_attack_data_path,
"master_challenge_train.csv",
PROCESSED_TRAIN_DATA_FILE_NAME,
)
log(
INFO,
Expand Down Expand Up @@ -254,12 +266,17 @@ def select_challenge_data_for_training(
return df_challenge


def train_rmia_shadows_for_test_phase(config: DictConfig) -> list[dict[str, list[Any]]]:
def train_rmia_shadows_for_test_phase(
model_runner: EnsembleAttackModelRunner,
config: DictConfig,
) -> list[dict[str, list[Any]]]:
"""
Function to train RMIA shadow models for the testing phase using the dataset containing
challenge data points.

Args:
model_runner: The model runner to be used for training the shadow models.
Should be an instance of `EnsembleAttackModelRunner`.
config: Configuration object set in ``experiments_config.yaml``.

Returns:
Expand All @@ -279,7 +296,7 @@ def train_rmia_shadows_for_test_phase(config: DictConfig) -> list[dict[str, list
)
df_master_train = load_dataframe(
processed_attack_data_path,
"master_challenge_train.csv",
PROCESSED_TRAIN_DATA_FILE_NAME,
)
else:
# If challenge data does not exist, collect it from the cluster
Expand All @@ -292,15 +309,10 @@ def train_rmia_shadows_for_test_phase(config: DictConfig) -> list[dict[str, list
# Load the challenge dataframe for training RMIA shadow models.
rmia_training_choice = RmiaTrainingDataChoice(config.target_model.attack_rmia_shadow_training_data_choice)
df_challenge = select_challenge_data_for_training(rmia_training_choice, df_challenge_experiment, df_master_train)
return run_rmia_shadow_training(config, df_challenge=df_challenge)
return run_rmia_shadow_training(model_runner, config, df_challenge=df_challenge)


# TODO: Perform inference on all the target models sequentially in a single run instead of running this script
# multiple times. For more information, refer to https://app.clickup.com/t/868h4xk86
@hydra.main(config_path="configs", config_name="experiment_config", version_base=None)
def run_metaclassifier_testing(
config: DictConfig,
) -> None:
def run_metaclassifier_testing(model_runner: EnsembleAttackModelRunner, config: DictConfig) -> None:
"""
Function to run the attack on a single target model using a trained metaclassifier.
Note that RMIA shadow models need to be trained for every new set of target models on
Expand All @@ -313,6 +325,8 @@ def run_metaclassifier_testing(
Test prediction probabilities are saved to the specified attack result path in the config.

Args:
model_runner: The model runner to be used for testing the metaclassifier.
Should be an instance of `EnsembleAttackModelRunner`.
config: Configuration object set in ``experiments_config.yaml``.
"""
log(
Expand Down Expand Up @@ -382,7 +396,7 @@ def run_metaclassifier_testing(

if not models_exists:
log(INFO, "Shadow models for testing phase do not exist. Training RMIA shadow models...")
shadow_data_collection = train_rmia_shadows_for_test_phase(config)
shadow_data_collection = train_rmia_shadows_for_test_phase(model_runner, config)

else:
log(INFO, "All shadow models for testing phase found. Using existing RMIA shadow models...")
Expand Down Expand Up @@ -427,5 +441,32 @@ def run_metaclassifier_testing(
save_results(attack_results_path, metaclassifier_model_name, probabilities, pred_score)


# TODO: Perform inference on all the target models sequentially in a single run instead of running this script
# multiple times. For more information, refer to https://app.clickup.com/t/868h4xk86
@hydra.main(config_path="configs", config_name="experiment_config", version_base=None)
def run_metaclassifier_testing_with_tabddpm(config: DictConfig) -> None:
"""
Run the attack on a single target model using a trained metaclassifier.
RMIA shadow models will be trained using the TabDDPM model.

Args:
config: Configuration object set in config.yaml.
"""
log(INFO, "Running metaclassifier testing with TabDDPM...")

with open(config.shadow_training.training_json_config_paths.training_config_path, "r") as file:
training_config = EnsembleAttackTabDDPMTrainingConfig(**json.load(file))
training_config.fine_tuning_diffusion_iterations = (
config.shadow_training.fine_tuning_config.fine_tune_diffusion_iterations
)
training_config.fine_tuning_classifier_iterations = (
config.shadow_training.fine_tuning_config.fine_tune_classifier_iterations
)

model_runner = EnsembleAttackTabDDPMModelRunner(training_config=training_config)
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Similar comment here about config processing.


run_metaclassifier_testing(model_runner, config)


if __name__ == "__main__":
run_metaclassifier_testing()
run_metaclassifier_testing_with_tabddpm()
Loading