VectorInstitute · lotif · Jan 8, 2026 · Jan 8, 2026 · Jan 13, 2026 · Jan 13, 2026
diff --git a/.gitignore b/.gitignore
@@ -47,6 +47,7 @@ site/
 # Test artifacts
 tests/integration/attacks/tartan_federer/assets/tabddpm_models/**/challenge_label_predictions.csv
 tests/integration/attacks/tartan_federer/assets/tartan_federer_attack_results
+tests/integration/attacks/ensemble/assets/workspace
 
 # Training Logs
 *.err

diff --git a/examples/ensemble_attack/run_attack.py b/examples/ensemble_attack/run_attack.py
@@ -3,6 +3,7 @@
 provided resources and data.
 """
 
+import json
 from logging import INFO
 from pathlib import Path
 
@@ -13,7 +14,8 @@
 import examples.ensemble_attack.run_shadow_model_training as shadow_pipeline
 from examples.ensemble_attack.real_data_collection import COLLECTED_DATA_FILE_NAME, collect_population_data_ensemble
 from midst_toolkit.attacks.ensemble.data_utils import load_dataframe
-from midst_toolkit.attacks.ensemble.process_split_data import process_split_data
+from midst_toolkit.attacks.ensemble.models import EnsembleAttackTabDDPMModelRunner, EnsembleAttackTabDDPMTrainingConfig
+from midst_toolkit.attacks.ensemble.process_split_data import PROCESSED_TRAIN_DATA_FILE_NAME, process_split_data
 from midst_toolkit.common.logger import log
 from midst_toolkit.common.random import set_all_random_seeds
 
@@ -79,12 +81,24 @@ def main(config: DictConfig) -> None:
     if config.pipeline.run_shadow_model_training:
         df_master_challenge_train = load_dataframe(
             Path(config.data_paths.processed_attack_data_path),
-            "master_challenge_train.csv",
+            PROCESSED_TRAIN_DATA_FILE_NAME,
         )
-        shadow_data_paths = shadow_pipeline.run_shadow_model_training(config, df_master_challenge_train)
+
+        with open(config.shadow_training.training_json_config_paths.training_config_path, "r") as file:
+            training_config = EnsembleAttackTabDDPMTrainingConfig(**json.load(file))
+        training_config.fine_tuning_diffusion_iterations = (
+            config.shadow_training.fine_tuning_config.fine_tune_diffusion_iterations
+        )
+        training_config.fine_tuning_classifier_iterations = (
+            config.shadow_training.fine_tuning_config.fine_tune_classifier_iterations
+        )
+
+        model_runner = EnsembleAttackTabDDPMModelRunner(training_config=training_config)
+
+        shadow_data_paths = shadow_pipeline.run_shadow_model_training(model_runner, config, df_master_challenge_train)
         shadow_data_paths = [Path(path) for path in shadow_data_paths]
 
-        target_model_synthetic_path = shadow_pipeline.run_target_model_training(config)
+        target_model_synthetic_path = shadow_pipeline.run_target_model_training(model_runner, config)
 
     if config.pipeline.run_metaclassifier_training:
         if not config.pipeline.run_shadow_model_training:

diff --git a/examples/ensemble_attack/run_metaclassifier_training.py b/examples/ensemble_attack/run_metaclassifier_training.py
@@ -9,6 +9,12 @@
 from examples.ensemble_attack.real_data_collection import COLLECTED_DATA_FILE_NAME
 from midst_toolkit.attacks.ensemble.blending import BlendingPlusPlus, MetaClassifierType
 from midst_toolkit.attacks.ensemble.data_utils import load_dataframe
+from midst_toolkit.attacks.ensemble.process_split_data import (
+    PROCESSED_TEST_DATA_FILE_NAME,
+    PROCESSED_TEST_LABELS_FILE_NAME,
+    PROCESSED_TRAIN_DATA_FILE_NAME,
+    PROCESSED_TRAIN_LABELS_FILE_NAME,
+)
 from midst_toolkit.common.logger import log
 
 
@@ -32,20 +38,20 @@ def run_metaclassifier_training(
     # Load the processed data splits.
     df_meta_train = load_dataframe(
         Path(config.data_paths.processed_attack_data_path),
-        "master_challenge_train.csv",
+        PROCESSED_TRAIN_DATA_FILE_NAME,
     )
 
     # y_meta_train consists of binary labels (0s and 1s) indicating whether each row in df_meta_train
     # belongs to the target model's training set.
     y_meta_train = np.load(
-        Path(config.data_paths.processed_attack_data_path) / "master_challenge_train_labels.npy",
+        Path(config.data_paths.processed_attack_data_path) / PROCESSED_TRAIN_LABELS_FILE_NAME,
     )
     df_meta_test = load_dataframe(
         Path(config.data_paths.processed_attack_data_path),
-        "master_challenge_test.csv",
+        PROCESSED_TEST_DATA_FILE_NAME,
     )
     y_meta_test = np.load(
-        Path(config.data_paths.processed_attack_data_path) / "master_challenge_test_labels.npy",
+        Path(config.data_paths.processed_attack_data_path) / PROCESSED_TEST_LABELS_FILE_NAME,
     )
 
     # Three sets of shadow models are trained separately and their paths are provided here.

diff --git a/examples/ensemble_attack/run_shadow_model_training.py b/examples/ensemble_attack/run_shadow_model_training.py
@@ -1,37 +1,31 @@
 import shutil
 from logging import INFO
 from pathlib import Path
-from typing import cast
 
 import pandas as pd
 from omegaconf import DictConfig
 
-from examples.ensemble_attack.real_data_collection import COLLECTED_DATA_FILE_NAME
-from midst_toolkit.attacks.ensemble.data_utils import load_dataframe
-from midst_toolkit.attacks.ensemble.rmia.shadow_model_training import (
-    train_three_sets_of_shadow_models,
-)
-from midst_toolkit.attacks.ensemble.shadow_model_utils import (
-    ModelType,
-    TrainingResult,
-    save_additional_training_config,
-    train_or_fine_tune_and_synthesize_with_ctgan,
-    train_tabddpm_and_synthesize,
+from examples.ensemble_attack.real_data_collection import (
+    COLLECTED_DATA_FILE_NAME,
 )
-from midst_toolkit.common.config import ClavaDDPMTrainingConfig, CTGANTrainingConfig
+from midst_toolkit.attacks.ensemble.data_utils import load_dataframe
+from midst_toolkit.attacks.ensemble.models import EnsembleAttackModelRunner
+from midst_toolkit.attacks.ensemble.rmia.shadow_model_training import train_three_sets_of_shadow_models
+from midst_toolkit.attacks.ensemble.shadow_model_utils import update_and_save_training_config
 from midst_toolkit.common.logger import log
 
 
 DEFAULT_TABLE_NAME = "trans"
 DEFAULT_ID_COLUMN_NAME = "trans_id"
-DEFAULT_MODEL_TYPE = ModelType.TABDDPM
 
 
-def run_target_model_training(config: DictConfig) -> Path:
+def run_target_model_training(model_runner: EnsembleAttackModelRunner, config: DictConfig) -> Path:
     """
     Function to run the target model training for RMIA attack.
 
     Args:
+        model_runner: The model runner to be used for training the target model.
+            Should be an instance of a subclass of `EnsembleAttackModelRunner`.
         config: Configuration object set in config.yaml.
 
     Returns:
@@ -54,11 +48,6 @@ def run_target_model_training(config: DictConfig) -> Path:
 
     target_folder = target_model_output_path / "target_model"
 
-    model_type = DEFAULT_MODEL_TYPE
-    if "model_name" in config.shadow_training:
-        model_type = ModelType(config.shadow_training.model_name)
-    log(INFO, f"Training target model with model type: {model_type.value}")
-
     target_folder.mkdir(parents=True, exist_ok=True)
     shutil.copyfile(
         target_training_json_config_paths.table_domain_file_path,
@@ -68,30 +57,16 @@ def run_target_model_training(config: DictConfig) -> Path:
         target_training_json_config_paths.dataset_meta_file_path,
         target_folder / "dataset_meta.json",
     )
-    configs, save_dir = save_additional_training_config(
+
+    configs = update_and_save_training_config(
+        config=model_runner.training_config,
         data_dir=target_folder,
-        training_config_json_path=Path(target_training_json_config_paths.training_config_path),
         final_config_json_path=target_folder / f"{table_name}.json",  # Path to the new json
         experiment_name="trained_target_model",
-        model_type=model_type,
     )
+    model_runner.training_config = configs
 
-    train_result: TrainingResult
-    if model_type == ModelType.TABDDPM:
-        train_result = train_tabddpm_and_synthesize(
-            train_set=df_real_data,
-            configs=cast(ClavaDDPMTrainingConfig, configs),
-            save_dir=save_dir,
-            synthesize=True,
-            number_of_points_to_synthesize=config.shadow_training.number_of_points_to_synthesize,
-        )
-    elif model_type == ModelType.CTGAN:
-        train_result = train_or_fine_tune_and_synthesize_with_ctgan(
-            dataset=df_real_data,
-            configs=cast(CTGANTrainingConfig, configs),
-            save_dir=save_dir,
-            synthesize=True,
-        )
+    train_result = model_runner.train_or_fine_tune_and_synthesize(dataset=df_real_data, synthesize=True)
 
     # To train the attack model (metaclassifier), we only need to save target's synthetic data,
     # and not the entire target model's training result object.
@@ -105,11 +80,17 @@ def run_target_model_training(config: DictConfig) -> Path:
     return target_model_synthetic_path
 
 
-def run_shadow_model_training(config: DictConfig, df_challenge_train: pd.DataFrame) -> list[Path]:
+def run_shadow_model_training(
+    model_runner: EnsembleAttackModelRunner,
+    config: DictConfig,
+    df_challenge_train: pd.DataFrame,
+) -> list[Path]:
     """
     Function to run the shadow model training for RMIA attack.
 
     Args:
+        model_runner: The model runner to be used for training the shadow models.
+            Should be an instance of `EnsembleAttackModelRunner`.
         config: Configuration object set in config.yaml.
         df_challenge_train: DataFrame containing the data that is used to train RMIA shadow models.
 
@@ -130,10 +111,7 @@ def run_shadow_model_training(config: DictConfig, df_challenge_train: pd.DataFra
     # Population data is used to pre-train some of the shadow models.
     df_population_with_challenge = load_dataframe(Path(config.data_paths.population_path), data_file_name)
 
-    model_type = DEFAULT_MODEL_TYPE
-    if "model_name" in config.shadow_training:
-        model_type = ModelType(config.shadow_training.model_name)
-    log(INFO, f"Training shadow models with model type: {model_type.value}")
+    log(INFO, f"Training shadow models with model runner: {model_runner}")
 
     # Make sure master challenge train and population data have the id column.
     assert id_column_name in df_challenge_train.columns, (
@@ -146,6 +124,7 @@ def run_shadow_model_training(config: DictConfig, df_challenge_train: pd.DataFra
     # ``master_challenge_df`` is used for fine-tuning for half of the shadow models.
     # For the other half of the shadow models, only ``master_challenge_df`` is used for training.
     first_set_result_path, second_set_result_path, third_set_result_path = train_three_sets_of_shadow_models(
+        model_runner=model_runner,
         population_data=df_population_with_challenge,
         master_challenge_data=df_challenge_train,
         shadow_models_output_path=Path(config.shadow_training.shadow_models_output_path),
@@ -157,9 +136,7 @@ def run_shadow_model_training(config: DictConfig, df_challenge_train: pd.DataFra
         # ``4 * n_models_per_set`` total shadow models.
         n_models_per_set=4,  # 4 based on the original code, must be even
         n_reps=12,  # Number of repetitions of challenge points in each shadow model training set. `12` based on the original code
-        number_of_points_to_synthesize=config.shadow_training.number_of_points_to_synthesize,
         random_seed=config.random_seed,
-        model_type=model_type,
     )
     log(
         INFO,

diff --git a/examples/ensemble_attack/test_attack_model.py b/examples/ensemble_attack/test_attack_model.py
@@ -21,6 +21,12 @@
 from examples.ensemble_attack.run_shadow_model_training import run_shadow_model_training
 from midst_toolkit.attacks.ensemble.blending import BlendingPlusPlus, MetaClassifierType
 from midst_toolkit.attacks.ensemble.data_utils import load_dataframe
+from midst_toolkit.attacks.ensemble.models import (
+    EnsembleAttackModelRunner,
+    EnsembleAttackTabDDPMModelRunner,
+    EnsembleAttackTabDDPMTrainingConfig,
+)
+from midst_toolkit.attacks.ensemble.process_split_data import PROCESSED_TRAIN_DATA_FILE_NAME
 from midst_toolkit.common.logger import log
 from midst_toolkit.common.random import set_all_random_seeds
 from midst_toolkit.models.clavaddpm.train import get_df_without_id
@@ -87,7 +93,11 @@ def extract_primary_id_column(
     return data_frame[id_column_name]
 
 
-def run_rmia_shadow_training(config: DictConfig, df_challenge: pd.DataFrame) -> list[dict[str, list[Any]]]:
+def run_rmia_shadow_training(
+    model_runner: EnsembleAttackModelRunner,
+    config: DictConfig,
+    df_challenge: pd.DataFrame,
+) -> list[dict[str, list[Any]]]:
     """
     Three sets of shadow models will be trained as a part of this attack.
     Note that shadow models need to be trained on the collection of challenge points once and used
@@ -96,14 +106,16 @@ def run_rmia_shadow_training(config: DictConfig, df_challenge: pd.DataFrame) ->
     of the shadow models, and these shadow models are used to attack all target models.
 
     Args:
-        config: Configuration object set in ``experiments_config.yaml``.
+        model_runner: The model runner to be used for training the shadow models.
+            Should be an instance of `EnsembleAttackModelRunner`.
+        config: Configuration object set in ``experiment_config.yaml``.
         df_challenge: DataFrame containing the challenge data points for shadow model training.
 
     Return:
         A list containing three dictionaries, each representing a collection of shadow
             models with their training data and generated synthetic outputs.
     """
-    shadow_model_paths = run_shadow_model_training(config, df_challenge_train=df_challenge)
+    shadow_model_paths = run_shadow_model_training(model_runner, config, df_challenge_train=df_challenge)
 
     assert len(shadow_model_paths) == 3, "For testing, meta classifier needs the path to three sets of shadow models."
 
@@ -198,7 +210,7 @@ def collect_challenge_and_train_data(
     # Load master challenge train data
     df_master_train = load_dataframe(
         processed_attack_data_path,
-        "master_challenge_train.csv",
+        PROCESSED_TRAIN_DATA_FILE_NAME,
     )
     log(
         INFO,
@@ -254,12 +266,17 @@ def select_challenge_data_for_training(
     return df_challenge
 
 
-def train_rmia_shadows_for_test_phase(config: DictConfig) -> list[dict[str, list[Any]]]:
+def train_rmia_shadows_for_test_phase(
+    model_runner: EnsembleAttackModelRunner,
+    config: DictConfig,
+) -> list[dict[str, list[Any]]]:
     """
     Function to train RMIA shadow models for the testing phase using the dataset containing
     challenge data points.
 
     Args:
+        model_runner: The model runner to be used for training the shadow models.
+            Should be an instance of `EnsembleAttackModelRunner`.
         config: Configuration object set in ``experiments_config.yaml``.
 
     Returns:
@@ -279,7 +296,7 @@ def train_rmia_shadows_for_test_phase(config: DictConfig) -> list[dict[str, list
         )
         df_master_train = load_dataframe(
             processed_attack_data_path,
-            "master_challenge_train.csv",
+            PROCESSED_TRAIN_DATA_FILE_NAME,
         )
     else:
         # If challenge data does not exist, collect it from the cluster
@@ -292,15 +309,10 @@ def train_rmia_shadows_for_test_phase(config: DictConfig) -> list[dict[str, list
     # Load the challenge dataframe for training RMIA shadow models.
     rmia_training_choice = RmiaTrainingDataChoice(config.target_model.attack_rmia_shadow_training_data_choice)
     df_challenge = select_challenge_data_for_training(rmia_training_choice, df_challenge_experiment, df_master_train)
-    return run_rmia_shadow_training(config, df_challenge=df_challenge)
+    return run_rmia_shadow_training(model_runner, config, df_challenge=df_challenge)
 
 
-# TODO: Perform inference on all the target models sequentially in a single run instead of running this script
-# multiple times. For more information, refer to https://app.clickup.com/t/868h4xk86
-@hydra.main(config_path="configs", config_name="experiment_config", version_base=None)
-def run_metaclassifier_testing(
-    config: DictConfig,
-) -> None:
+def run_metaclassifier_testing(model_runner: EnsembleAttackModelRunner, config: DictConfig) -> None:
     """
     Function to run the attack on a single target model using a trained metaclassifier.
     Note that RMIA shadow models need to be trained for every new set of target models on
@@ -313,6 +325,8 @@ def run_metaclassifier_testing(
     Test prediction probabilities are saved to the specified attack result path in the config.
 
     Args:
+        model_runner: The model runner used to train the RMIA shadow models for the testing phase
+            when they do not already exist. Should be an instance of `EnsembleAttackModelRunner`.
         config: Configuration object set in ``experiments_config.yaml``.
     """
     log(
@@ -382,7 +396,7 @@ def run_metaclassifier_testing(
 
     if not models_exists:
         log(INFO, "Shadow models for testing phase do not exist. Training RMIA shadow models...")
-        shadow_data_collection = train_rmia_shadows_for_test_phase(config)
+        shadow_data_collection = train_rmia_shadows_for_test_phase(model_runner, config)
 
     else:
         log(INFO, "All shadow models for testing phase found. Using existing RMIA shadow models...")
@@ -427,5 +441,32 @@ def run_metaclassifier_testing(
     save_results(attack_results_path, metaclassifier_model_name, probabilities, pred_score)
 
 
+# TODO: Perform inference on all the target models sequentially in a single run instead of running this script
+# multiple times. For more information, refer to https://app.clickup.com/t/868h4xk86
+@hydra.main(config_path="configs", config_name="experiment_config", version_base=None)
+def run_metaclassifier_testing_with_tabddpm(config: DictConfig) -> None:
+    """
+    Hydra entry point: run the metaclassifier attack on a single target model with a TabDDPM model runner.
+    The runner's training config is loaded from JSON, then given the fine-tuning iteration counts from `config`.
+
+    Args:
+        config: Hydra configuration composed from ``configs/experiment_config.yaml``.
+    """
+    log(INFO, "Running metaclassifier testing with TabDDPM...")
+
+    with open(config.shadow_training.training_json_config_paths.training_config_path, "r") as file:
+        training_config = EnsembleAttackTabDDPMTrainingConfig(**json.load(file))
+    training_config.fine_tuning_diffusion_iterations = (
+        config.shadow_training.fine_tuning_config.fine_tune_diffusion_iterations
+    )
+    training_config.fine_tuning_classifier_iterations = (
+        config.shadow_training.fine_tuning_config.fine_tune_classifier_iterations
+    )
+
+    model_runner = EnsembleAttackTabDDPMModelRunner(training_config=training_config)
+
+    run_metaclassifier_testing(model_runner, config)
+
+
 if __name__ == "__main__":
-    run_metaclassifier_testing()
+    run_metaclassifier_testing_with_tabddpm()