diff --git a/.gitignore b/.gitignore index 7ddc2c6d67..2d48ddac55 100644 --- a/.gitignore +++ b/.gitignore @@ -106,3 +106,4 @@ bench_input bench_output bench_save bench_work +/models diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index a2f9527976..23eb8c39fe 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -1,3 +1,5 @@ +include: ci/.gitlab-setup.yml + stages: - sync - build @@ -17,6 +19,9 @@ variables: CPU_CORES: description: "Select number of CPU cores and test workers" value: "32" + CPU_CORES_BENCH: + description: "Select number of CPU cores for benchmark runs" + value: "8" PARALLEL_JOBS: description: "Number of parallel Slurm array jobs per Benchmark job" value: "1" @@ -30,15 +35,14 @@ variables: description: "Optional QoS option (include --qos, e.g., --qos express)" value: "" MANUAL_CFG_PATH: - description: "Use this config file instead of configs stored in the repo. Path must be accessible to runner" + description: "Name (in ci/cfg/) or path (relative to LOCAL_CFG_DIR) of benchmarking config to run" value: "" workflow: name: '$PIPELINE_NAME' rules: - # Run pipeline for GitHub PRs to dev or main (does not support PRs from forks) + # Run pipeline for GitHub PRs to dev (does not support PRs from forks) - if: $CI_PIPELINE_SOURCE == "external_pull_request_event" && $CI_EXTERNAL_PULL_REQUEST_TARGET_BRANCH_NAME == "dev" - - if: $CI_PIPELINE_SOURCE == "external_pull_request_event" && $CI_EXTERNAL_PULL_REQUEST_TARGET_BRANCH_NAME == "main" # Run pipeline for pushes to dev or main - if: $CI_COMMIT_BRANCH == "dev" || $CI_COMMIT_BRANCH == "main" # Run pipeline if manually triggered via API or web GUI @@ -67,38 +71,6 @@ Sync finn-dev: - git pull upstream dev - git push origin finn-dev -.n2_setup_general: &n2_setup_general - - module load lang/Python/3.10.4-GCCcore-11.3.0 - - module load devel/Autoconf/2.71-GCCcore-11.3.0 - - module load lang/Bison/3.8.2-GCCcore-11.3.0 - - module load lang/flex/2.6.4-GCCcore-11.3.0 - - module load compiler/GCC/11.3.0 - - module load 
lib/pybind11/2.9.2-GCCcore-11.3.0 - - module load devel/Boost/1.79.0-GCC-11.3.0 - - module load lib/fmt/9.1.0-GCCcore-11.3.0 - - ulimit -s unlimited # Increase stack size limit - -.n2_setup_xilinx_2022_2: &n2_setup_xilinx_2022_2 - - module load fpga - - module load xilinx/xrt/2.14 # includes Vitis/Vivado 2022.2 - # module load will set PLATFORM_REPO_PATHS to one specific platform, revert to top-level PLATFORM_PATH - - export PLATFORM_REPO_PATHS=$PLATFORM_PATH - -.n2_setup_xilinx_2024_2: &n2_setup_xilinx_2024_2 - - module load fpga - - module load xilinx/xrt/2.14 # includes Vitis/Vivado 2022.2 - - module swap xilinx/vitis/24.2 # switch to Vitis/Vivado 2024.2 - # module load will set PLATFORM_REPO_PATHS to one specific platform, revert to top-level PLATFORM_PATH - - export PLATFORM_REPO_PATHS=$PLATFORM_PATH - -.setup_venv_from_whl: &setup_venv_from_whl - # Move everything to working directory (e.g., RAMdisk) - - cp -dfR . $PATH_WORKDIR - - cd $PATH_WORKDIR - # Create fresh virtual environment and install finn-plus from .whl (artifact) - - python3 -m venv finn-plus-venv - - finn-plus-venv/bin/pip install dist/*.whl - Build: id_tokens: CI_JOB_JWT: @@ -113,8 +85,8 @@ Build: # Otherwise run - when: always before_script: - - *n2_setup_general - - *n2_setup_xilinx_2022_2 + - !reference [.n2_setup_general, before_script] + - !reference [.n2_setup_xilinx_2022_2, before_script] # Install current version of Poetry - python3 -m venv poetry-install - poetry-install/bin/pip install poetry @@ -151,6 +123,9 @@ FINN Test Suite 2022.2: # Do not run if test suite has been deselected - if: $TEST_SUITE == "none" when: never + # Do not run for PRs to dev (run only for pushes) + - if: $CI_PIPELINE_SOURCE == "external_pull_request_event" && $CI_EXTERNAL_PULL_REQUEST_TARGET_BRANCH_NAME == "dev" + when: never # Always run, as long as there was no prior failure - when: on_success cache: @@ -159,13 +134,10 @@ FINN Test Suite 2022.2: paths: - deps variables: - GIT_STRATEGY: empty # Do not pull 
repository, use PyPI installation instead + GIT_STRATEGY: empty # Do not pull repository, install from wheel (artifact) instead SCHEDULER_PARAMETERS: "-A $PROJECT_ACCOUNT -p $SLURM_PARTITION -t $SLURM_TIMEOUT $SLURM_QOS --nodes 1 --ntasks 1 --cpus-per-task $CPU_CORES --exclusive" PYTEST_PARALLEL: "$CPU_CORES" - before_script: - - *n2_setup_general - - *n2_setup_xilinx_2022_2 - - *setup_venv_from_whl + extends: .setup_full_2022_2 script: # Launch additional monitoring - $JOB_MONITORING_DIR/monitor.sh $JOB_MONITORING_DIR/$CI_PIPELINE_ID/$HOSTNAME.log & @@ -182,8 +154,71 @@ FINN Test Suite 2022.2: junit: reports/*.xml FINN Test Suite 2024.2: - extends: FINN Test Suite 2022.2 - before_script: - - *n2_setup_general - - *n2_setup_xilinx_2024_2 - - *setup_venv_from_whl + extends: + - FINN Test Suite 2022.2 + - .setup_full_2024_2 + rules: + # Do not run on a schedule + - if: $CI_PIPELINE_SOURCE == "schedule" + when: never + # Do not run if test suite has been deselected + - if: $TEST_SUITE == "none" + when: never + # Always run, as long as there was no prior failure + - when: on_success + +Bench (Manual): + stage: test + rules: + # Do not run on a schedule + - if: $CI_PIPELINE_SOURCE == "schedule" + when: never + - if: $MANUAL_CFG_PATH != "" + trigger: + include: ci/.gitlab-bench.yml + strategy: depend + forward: + pipeline_variables: true + variables: + PARENT_PIPELINE_ID: $CI_PIPELINE_ID + BENCH_CFG: "manual" + +Bench (Basic): + stage: test + rules: + # Do not run on a schedule + - if: $CI_PIPELINE_SOURCE == "schedule" + when: never + - if: $MANUAL_CFG_PATH == "" + trigger: + include: ci/.gitlab-bench.yml + strategy: depend + forward: + pipeline_variables: true + variables: + PARENT_PIPELINE_ID: $CI_PIPELINE_ID + parallel: + matrix: + - BENCH_CFG: [regression_basic] + +Bench (Extended): + stage: test + rules: + # Do not run on a schedule + - if: $CI_PIPELINE_SOURCE == "schedule" + when: never + # Do not run for PRs to dev (run only for pushes) + - if: $CI_PIPELINE_SOURCE 
== "external_pull_request_event" && $CI_EXTERNAL_PULL_REQUEST_TARGET_BRANCH_NAME == "dev" + when: never + - if: $MANUAL_CFG_PATH == "" + trigger: + include: ci/.gitlab-bench.yml + strategy: depend + forward: + pipeline_variables: true + variables: + PARENT_PIPELINE_ID: $CI_PIPELINE_ID + PARALLEL_JOBS: "4" + parallel: + matrix: + - BENCH_CFG: [regression_extended, microbenchmark_basic] diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 048a3becda..10ff4d4415 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -43,6 +43,7 @@ repos: - id: check-merge-conflict - id: check-xml - id: check-yaml + args: ['--unsafe'] - id: debug-statements exclude: '^src/finn/builder/build_dataflow.py$' - id: end-of-file-fixer diff --git a/ci/.gitlab-bench.yml b/ci/.gitlab-bench.yml new file mode 100644 index 0000000000..6ddeb11858 --- /dev/null +++ b/ci/.gitlab-bench.yml @@ -0,0 +1,81 @@ +include: ci/.gitlab-setup.yml + +stages: + - build + - measure + - collect + +variables: + BENCH_CFG: + description: "Select config, usually provided by parent pipeline" + value: "" + +workflow: + name: "bench_$BENCH_CFG" + +FINN Build: + id_tokens: + CI_JOB_JWT: + aud: https://git.uni-paderborn.de + stage: build + needs: + - job: Build + pipeline: $PARENT_PIPELINE_ID + variables: + SCHEDULER_PARAMETERS: "-A $PROJECT_ACCOUNT -p $SLURM_PARTITION -t $SLURM_TIMEOUT $SLURM_QOS --nodes 1 --ntasks 1 --cpus-per-task $CPU_CORES_BENCH --exclusive --array 0-$( expr $PARALLEL_JOBS - 1 )" + NUM_DEFAULT_WORKERS: "$CPU_CORES_BENCH" + extends: .setup_full_2022_2 + script: + # Launch additional monitoring + - $JOB_MONITORING_DIR/monitor.sh $JOB_MONITORING_DIR/$CI_PIPELINE_ID/$HOSTNAME.log & + # Launch benchmarking script via FINN CLI, includes deps update and environment preparation + # TODO: cache dvc pull + - | + source finn-plus-venv/bin/activate + dvc pull + finn bench --dependency-path ./deps --build-path $FINN_BUILD_DIR --num-workers $CPU_CORES_BENCH --bench_config $BENCH_CFG + 
cache: + key: $CI_COMMIT_SHA + policy: pull + paths: + - deps + artifacts: + name: "build_artifacts" + when: always + paths: + - build_artifacts/ + +Measurement: + id_tokens: + CI_JOB_JWT: + aud: https://git.uni-paderborn.de + stage: measure + tags: + - board + rules: + # Also run on failure of previous tasks to measure partial results + - when: always + script: + # Run as root and activate the PYNQ venv manually to use PYNQ outside of the typical Jupyter environment + - sudo bash -c "source /etc/profile.d/pynq_venv.sh && export XILINX_XRT=/usr && python ci/measure.py" + artifacts: + name: "measurement_artifacts" + when: always + paths: + - measurement_artifacts/ + +Result Collection: + id_tokens: + CI_JOB_JWT: + aud: https://git.uni-paderborn.de + stage: collect + tags: + - image_build + rules: + # Also run on failure of previous tasks to collect partial results + - when: always + script: + # pulling models seems to be needed for dvclive to save experiments, even though they are not used or modified + - dvc pull + - python3.10 ci/collect.py + - dvc exp push -f -j 4 -r push git@github.com:eki-project/finn-plus.git diff --git a/ci/.gitlab-setup.yml b/ci/.gitlab-setup.yml new file mode 100644 index 0000000000..5dad320a34 --- /dev/null +++ b/ci/.gitlab-setup.yml @@ -0,0 +1,49 @@ +# This file defines some basic scripts used to setup the FINN environment on the runner + +.n2_setup_general: + before_script: + - module load lang/Python/3.10.4-GCCcore-11.3.0 + - module load devel/Autoconf/2.71-GCCcore-11.3.0 + - module load lang/Bison/3.8.2-GCCcore-11.3.0 + - module load lang/flex/2.6.4-GCCcore-11.3.0 + - module load compiler/GCC/11.3.0 + - module load lib/pybind11/2.9.2-GCCcore-11.3.0 + - module load devel/Boost/1.79.0-GCC-11.3.0 + - module load lib/fmt/9.1.0-GCCcore-11.3.0 + - ulimit -s unlimited # Increase stack size limit + +.n2_setup_xilinx_2022_2: + before_script: + - module load fpga + - module load xilinx/xrt/2.14 # includes Vitis/Vivado 2022.2 + # module load will 
set PLATFORM_REPO_PATHS to one specific platform, revert to top-level PLATFORM_PATH + - export PLATFORM_REPO_PATHS=$PLATFORM_PATH + +.n2_setup_xilinx_2024_2: + before_script: + - module load fpga + - module load xilinx/xrt/2.14 # includes Vitis/Vivado 2022.2 + - module swap xilinx/vitis/24.2 # switch to Vitis/Vivado 2024.2 + # module load will set PLATFORM_REPO_PATHS to one specific platform, revert to top-level PLATFORM_PATH + - export PLATFORM_REPO_PATHS=$PLATFORM_PATH + +.setup_venv_from_whl: + before_script: + # Move everything to working directory (e.g., RAMdisk) + - cp -dfR . $PATH_WORKDIR + - cd $PATH_WORKDIR + # Create fresh virtual environment and install finn-plus from .whl (artifact) + - python3 -m venv finn-plus-venv + - finn-plus-venv/bin/pip install dist/*.whl + +.setup_full_2022_2: + before_script: + - !reference [.n2_setup_general, before_script] + - !reference [.n2_setup_xilinx_2022_2, before_script] + - !reference [.setup_venv_from_whl, before_script] + +.setup_full_2024_2: + before_script: + - !reference [.n2_setup_general, before_script] + - !reference [.n2_setup_xilinx_2024_2, before_script] + - !reference [.setup_venv_from_whl, before_script] diff --git a/ci/cfg/live_fifosizing.yml b/ci/cfg/live_fifosizing.yml new file mode 100644 index 0000000000..f121bacf6d --- /dev/null +++ b/ci/cfg/live_fifosizing.yml @@ -0,0 +1,50 @@ +[ + # Real models + { + "dut": ["vgg10"], + "live_fifo_sizing": [True], + "generate_outputs": [["bitfile", "pynq_driver", "deployment_package"]] + }, + { + "dut": ["mobilenetv1"], + "live_fifo_sizing": [True], + "generate_outputs": [["bitfile", "pynq_driver", "deployment_package"]] + }, + { + "dut": ["resnet50"], + "live_fifo_sizing": [True], + "generate_outputs": [["bitfile", "pynq_driver", "deployment_package"]] + }, + + # Synthetic non-linear models + { + "dut": ["synthetic_nonlinear"], + "dim": [64], + "kernel_size": [5], + "ch": [8], + "simd": [8], + "pe": [8], + "parallel_window": [1], + + "lb_num_layers": [1], + 
"rb_num_layers": [4, 8, 16], + + "live_fifo_sizing": [True], + "generate_outputs": [["bitfile", "pynq_driver", "deployment_package"]] + }, + { + "dut": ["synthetic_nonlinear"], + "dim": [64], + "kernel_size": [5], + "ch": [8], + "simd": [1], + "pe": [1], + "parallel_window": [0], + + "lb_num_layers": [1], + "rb_num_layers": [4, 8, 16], + + "live_fifo_sizing": [True], + "generate_outputs": [["bitfile", "pynq_driver", "deployment_package"]] + } +] diff --git a/ci/cfg/microbenchmark_basic.yml b/ci/cfg/microbenchmark_basic.yml new file mode 100644 index 0000000000..e9a102e51c --- /dev/null +++ b/ci/cfg/microbenchmark_basic.yml @@ -0,0 +1,48 @@ +[ + # MVAU Test + { + "dut": ["mvau"], + "idt": ["INT4","INT2"], + "wdt": ["INT4"], + "act": ["INT4"], + + "sparsity_type": ["none"], + "sparsity_amount": [0], + + "nhw": [[1,32,32]], + "mw": [64], + "mh": [64], + "sf": [-1], + "nf": [-1], + "m": [1], + + "mem_mode": ["internal_embedded"], + "ram_style": ["distributed"], + "ram_style_thr": ["distributed"], + + "dut_duplication": [1], + + "generate_outputs": [["estimate_reports", "stitched_ip", "rtlsim_performance", "out_of_context_synth", "bitfile", "pynq_driver", "deployment_package"]] + }, + + # Transformer Dummy + { + "dut": ["transformer"], + "seed": [12], + + "calibration_passes": [32], + + "model_num_heads": [1], + "model_num_layers": [1], + "model_bias":[true], + "model_emb_dim": [32], + "model_mlp_dim": [192], + "model_seq_len": [64], + "model_bits": [2], + "model_norm": ["none"], + "model_mask": ["none"], + "model_positional_encoding": ["binary"], + + "generate_outputs": [["estimate_reports", "bitfile", "pynq_driver", "deployment_package"]] + } +] diff --git a/ci/cfg/regression_basic.yml b/ci/cfg/regression_basic.yml new file mode 100644 index 0000000000..9a7604fe19 --- /dev/null +++ b/ci/cfg/regression_basic.yml @@ -0,0 +1,10 @@ +[ + { + "dut": ["vgg10"], + "generate_outputs": [["estimate_reports", "bitfile", "pynq_driver", "deployment_package"]] + }, + { + "dut": 
["mobilenetv1"], + "generate_outputs": [["estimate_reports", "bitfile", "pynq_driver", "deployment_package"]] + } +] diff --git a/ci/cfg/regression_extended.yml b/ci/cfg/regression_extended.yml new file mode 100644 index 0000000000..a95dfa06d8 --- /dev/null +++ b/ci/cfg/regression_extended.yml @@ -0,0 +1,48 @@ +[ + # ResNet-50 + { + "dut": ["resnet50"], + "board": ["U280"], + "synth_clk_period_ns": [4], + "rtlsim_batch_size": [3], + # no deployment package because Alveo deployment is not yet supported by CI + "generate_outputs": [["estimate_reports", "rtlsim_performance", "stitched_ip", "out_of_context_synth", "bitfile"]] + }, + + # 4x GPT Transformer models (currently disabled due to streamlining issues!) + # { + # "dut": ["transformer"], + # "seed": [12], + # "model_dir": ["models/gpt_a_6b_gpt2-s256-t2048-l2-h4-e256", + # "models/gpt_b_4b_gpt2-s256-t2048-l2-h4-e256", + # "models/gpt_c_gpt2-s512-t2048-l2-h4-e512", + # "models/gpt_d_gpt2-s256-t2048-l1-h2-e256"], + # "board": ["U280"], + # "synth_clk_period_ns": [10], + # "generate_outputs": [["estimate_reports", "stitched_ip", "out_of_context_synth"]] + # } + + # 5x RadioML Transformer models + { + "dut": ["transformer"], + "seed": [12], + "model_dir": ["models/rml_transformer_0", + "models/rml_transformer_a", + "models/rml_transformer_b", + "models/rml_transformer_c", + "models/rml_transformer_d",], + "board": ["RFSoC2x2"], + "synth_clk_period_ns": [10], + "generate_outputs": [["estimate_reports", "bitfile", "pynq_driver", "deployment_package"]] + }, + + # 1x RadioML Conformer model + { + "dut": ["transformer"], + "seed": [12], + "model_dir": ["models/rml_conformer"], + "board": ["RFSoC2x2"], + "synth_clk_period_ns": [10], + "generate_outputs": [["estimate_reports", "bitfile", "pynq_driver", "deployment_package"]] + } +] diff --git a/ci/collect.py b/ci/collect.py new file mode 100644 index 0000000000..c7042abf25 --- /dev/null +++ b/ci/collect.py @@ -0,0 +1,413 @@ +import json +import os +import shutil +from 
dvclive.live import Live + + +def delete_dir_contents(dir): + for filename in os.listdir(dir): + file_path = os.path.join(dir, filename) + try: + if os.path.isfile(file_path) or os.path.islink(file_path): + os.unlink(file_path) + elif os.path.isdir(file_path): + shutil.rmtree(file_path) + except Exception as e: + print("Failed to delete %s. Reason: %s" % (file_path, e)) + + +def log_dvc_metric(live, prefix, name, value): + # sanitize '/' in name because DVC uses it to nest metrics (which we do via prefix) + live.log_metric(prefix + name.replace("/", "-"), value, plot=False) + + +def open_json_report(id, report_name): + # look in both, build & measurement, artifacts + path1 = os.path.join("build_artifacts", "runs_output", "run_%d" % (id), "reports", report_name) + path2 = os.path.join( + "measurement_artifacts", "runs_output", "run_%d" % (id), "reports", report_name + ) + if os.path.isfile(path1): + with open(path1, "r") as f: + report = json.load(f) + return report + elif os.path.isfile(path2): + with open(path2, "r") as f: + report = json.load(f) + return report + else: + return None + + +def log_all_metrics_from_report(id, live, report_name, prefix=""): + report = open_json_report(id, report_name) + if report: + for key in report: + log_dvc_metric(live, prefix, key, report[key]) + + +def log_metrics_from_report(id, live, report_name, keys, prefix=""): + report = open_json_report(id, report_name) + if report: + for key in keys: + if key in report: + log_dvc_metric(live, prefix, key, report[key]) + + +def log_nested_metrics_from_report(id, live, report_name, key_top, keys, prefix=""): + report = open_json_report(id, report_name) + if report: + if key_top in report: + for key in keys: + if key in report[key_top]: + log_dvc_metric(live, prefix, key, report[key_top][key]) + + +if __name__ == "__main__": + # Go through all runs found in the artifacts and log their results to DVC + run_dir_list = os.listdir(os.path.join("build_artifacts", "runs_output")) + 
print("Looking for runs in build artifacts") + run_ids = [] + for run_dir in run_dir_list: + if run_dir.startswith("run_"): + run_id = int(run_dir[4:]) + run_ids.append(run_id) + run_ids.sort() + print("Found %d runs" % len(run_ids)) + + follow_up_bench_cfg = list() + # Prepare (local) output directory where follow-up bench configs will be stored + output_cfg_dir = os.path.join( + os.environ.get("LOCAL_CFG_DIR_STORE"), "lfs", "CI_" + os.environ.get("CI_PIPELINE_ID") + ) + output_folding_dir = os.path.join(output_cfg_dir, "folding") + output_cfg_path = os.path.join(output_cfg_dir, "follow-up.json") + + for id in run_ids: + print("Processing run %d" % id) + experiment_name = "CI_" + os.environ.get("CI_PIPELINE_ID") + "_" + str(id) + experiment_msg = ( + "[CI] " + + os.environ.get("CI_PIPELINE_NAME") + + " (" + + os.environ.get("CI_PIPELINE_ID") + + "_" + + str(id) + + ")" + ) + # TODO: cache images once we switch to a cache provider that works with DVC Studio + with Live(exp_name=experiment_name, exp_message=experiment_msg, cache_images=False) as live: + # PARAMS + # input parameters logged by benchmarking infrastructure + metadata_bench = open_json_report(id, "metadata_bench.json") + params = {"params": metadata_bench["params"]} + live.log_params(params) + + # optional metadata logged by builder + metadata_builder = open_json_report(id, "metadata_builder.json") + if metadata_builder: + metadata = { + "metadata": { + "tool_version": metadata_builder["tool_version"], + } + } + live.log_params(metadata) + + # optional dut_info.json (additional information generated during model generation) + dut_info_report = open_json_report(id, "dut_info.json") + if dut_info_report: + dut_info = {"dut_info": dut_info_report} + live.log_params(dut_info) + + # METRICS + # TODO: for microbenchmarks, only summarize results for target node (surrounding SDP?) 
+ # TODO: make all logs consistent (at generation), e.g., BRAM vs BRAM18 vs BRAM36) + + # status + status = metadata_bench["status"] + if status == "ok": + # mark as failed if either bench or builder indicates failure + if metadata_builder: + status_builder = metadata_builder["status"] + if status_builder == "failed": + status = "failed" + log_dvc_metric(live, "", "status", status) + + # verification steps + if "output" in metadata_bench: + if "builder_verification" in metadata_bench["output"]: + log_dvc_metric( + live, + "", + "verification", + metadata_bench["output"]["builder_verification"]["verification"], + ) + + # estimate_layer_resources.json + log_nested_metrics_from_report( + id, + live, + "estimate_layer_resources.json", + "total", + [ + "LUT", + "DSP", + "BRAM_18K", + "URAM", + ], + prefix="estimate/resources/", + ) + + # estimate_layer_resources_hls.json + log_nested_metrics_from_report( + id, + live, + "estimate_layer_resources_hls.json", + "total", + [ + "LUT", + "FF", + "DSP", + "DSP48E", + "DSP58E", # TODO: aggregate/unify DSP reporting + "BRAM_18K", + "URAM", + ], + prefix="hls_estimate/resources/", + ) + + # estimate_network_performance.json + log_metrics_from_report( + id, + live, + "estimate_network_performance.json", + [ + "critical_path_cycles", + "max_cycles", + "max_cycles_node_name", + "estimated_throughput_fps", + "estimated_latency_ns", + ], + prefix="estimate/performance/", + ) + + # rtlsim_performance.json + log_metrics_from_report( + id, + live, + "rtlsim_performance.json", + [ + "N", + "TIMEOUT", + "latency_cycles", + "cycles", + "fclk[mhz]", + "throughput[images/s]", + "stable_throughput[images/s]", + # add INPUT_DONE, OUTPUT_DONE, number transactions? 
+ ], + prefix="rtlsim/performance/", + ) + + # fifo_sizing.json + log_metrics_from_report( + id, live, "fifo_sizing.json", ["total_fifo_size_kB"], prefix="fifosizing/" + ) + + # stitched IP DCP synth resource report + log_nested_metrics_from_report( + id, + live, + "post_synth_resources_dcp.json", + "(top)", + [ + "LUT", + "FF", + "SRL", + "DSP", + "BRAM_18K", + "BRAM_36K", + "URAM", + ], + prefix="synth(dcp)/resources/", + ) + + # stitched IP DCP synth resource breakdown + # TODO: generalize to all build flows and bitfile synth + layer_categories = ["MAC", "Eltwise", "Thresholding", "FIFO", "DWC", "SWG", "Other"] + for category in layer_categories: + log_nested_metrics_from_report( + id, + live, + "res_breakdown_build_output.json", + category, + [ + "LUT", + "FF", + "SRL", + "DSP", + "BRAM_18K", + "BRAM_36K", + "URAM", + ], + prefix="synth(dcp)/resources(breakdown)/" + category + "/", + ) + + # ooc_synth_and_timing.json (OOC synth / step_out_of_context_synthesis) + log_metrics_from_report( + id, + live, + "ooc_synth_and_timing.json", + [ + "LUT", + "LUTRAM", + "FF", + "DSP", + "BRAM", + "BRAM_18K", + "BRAM_36K", + "URAM", + ], + prefix="synth(ooc)/resources/", + ) + log_metrics_from_report( + id, + live, + "ooc_synth_and_timing.json", + [ + "WNS", + "fmax_mhz", + # add TNS? what is "delay"? 
+ ], + prefix="synth(ooc)/timing/", + ) + + # post_synth_resources.json (shell synth / step_synthesize_bitfile) + log_nested_metrics_from_report( + id, + live, + "post_synth_resources.json", + "(top)", + [ + "LUT", + "FF", + "SRL", + "DSP", + "BRAM_18K", + "BRAM_36K", + "URAM", + ], + prefix="synth/resources/", + ) + + # post synth timing report + # TODO: only exported as post_route_timing.rpt, not .json + + # instrumentation measurement + log_all_metrics_from_report( + id, live, "measured_performance.json", prefix="measurement/performance/" + ) + + # IODMA validation accuracy + log_metrics_from_report( + id, + live, + "validation.json", + [ + "top-1_accuracy", + ], + prefix="measurement/validation/", + ) + + # power measurement + # TODO + + # live fifosizing report + graph png + log_metrics_from_report( + id, + live, + "fifo_sizing_report.json", + [ + "error", + "fifo_size_total_kB", + ], + prefix="fifosizing/live/", + ) + + image = os.path.join( + "measurement_artifacts", + "runs_output", + "run_%d" % (id), + "reports", + "fifo_sizing_graph.png", + ) + if os.path.isfile(image): + live.log_image("fifosizing_pass_1", image) + + # time_per_step.json + log_metrics_from_report(id, live, "time_per_step.json", ["total_build_time"]) + + # ARTIFACTS + # Log build reports as they come from GitLab artifacts, + # but copy them to a central dir first so all runs share the same path + run_report_dir1 = os.path.join( + "build_artifacts", "runs_output", "run_%d" % (id), "reports" + ) + run_report_dir2 = os.path.join( + "measurement_artifacts", "runs_output", "run_%d" % (id), "reports" + ) + dvc_report_dir = "reports" + os.makedirs(dvc_report_dir, exist_ok=True) + delete_dir_contents(dvc_report_dir) + if os.path.isdir(run_report_dir1): + shutil.copytree(run_report_dir1, dvc_report_dir, dirs_exist_ok=True) + if os.path.isdir(run_report_dir2): + shutil.copytree(run_report_dir2, dvc_report_dir, dirs_exist_ok=True) + live.log_artifact(dvc_report_dir) + + # Prepare benchmarking config 
for follow-up runs after live FIFO-sizing + folding_config_lfs_path = os.path.join( + "measurement_artifacts", + "runs_output", + "run_%d" % (id), + "reports", + "folding_config_lfs.json", + ) + if os.path.isfile(folding_config_lfs_path): + # Copy folding config produced by live FIFO-sizing + output_folding_path = os.path.join(output_folding_dir, experiment_name + ".json") + os.makedirs(output_folding_dir, exist_ok=True) + print( + "Saving lfs-generated folding config of this run to use in future builds: %s" + % output_folding_path + ) + shutil.copy(folding_config_lfs_path, output_folding_path) + + # Create benchmarking config + metadata_bench = open_json_report(id, "metadata_bench.json") + configuration = dict() + for key in metadata_bench["params"]: + # wrap in list + configuration[key] = [metadata_bench["params"][key]] + # overwrite FIFO-related params + import_folding_path = os.path.join( + os.environ.get("LOCAL_CFG_DIR"), + "lfs", + "CI_" + os.environ.get("CI_PIPELINE_ID"), + "folding", + experiment_name + ".json", + ) + configuration["live_fifo_sizing"] = [False] + configuration["auto_fifo_depths"] = [False] + configuration["target_fps"] = ["None"] + configuration["folding_config_file"] = [import_folding_path] + + follow_up_bench_cfg.append(configuration) + + # Save aggregated benchmarking config for follow-up job + if follow_up_bench_cfg: + print("Saving follow-up bench config for lfs: %s" % output_cfg_path) + with open(output_cfg_path, "w") as f: + json.dump(follow_up_bench_cfg, f, indent=2) + + print("Done") diff --git a/ci/measure.py b/ci/measure.py new file mode 100644 index 0000000000..42db938d33 --- /dev/null +++ b/ci/measure.py @@ -0,0 +1,95 @@ +import os +import shutil +import subprocess +import sys + + +def delete_dir_contents(dir): + for filename in os.listdir(dir): + file_path = os.path.join(dir, filename) + try: + if os.path.isfile(file_path) or os.path.islink(file_path): + os.unlink(file_path) + elif os.path.isdir(file_path): + 
shutil.rmtree(file_path) + except Exception as e: + print("Failed to delete %s. Reason: %s" % (file_path, e)) + + +if __name__ == "__main__": + exit_code = 0 + print("Looking for deployment packages in artifacts..") + # Find deployment packages from artifacts + artifacts_in_dir = os.path.join("build_artifacts", "runs_output") + artifacts_out_dir = os.path.join("measurement_artifacts", "runs_output") + for run in os.listdir(artifacts_in_dir): + run_in_dir = os.path.join(artifacts_in_dir, run) + run_out_dir = os.path.join(artifacts_out_dir, run) + reports_dir = os.path.join(run_out_dir, "reports") + deploy_archive = os.path.join(run_in_dir, "deploy.zip") + extract_dir = "measurement" + if os.path.isfile(deploy_archive): + print("Found deployment package in %s, extracting.." % run_in_dir) + + # Extract to temporary dir + shutil.unpack_archive(deploy_archive, extract_dir) + + # Run driver + print("Running driver..") + # run validate.py (from IODMA driver) if present, otherwise driver.py (instrumentation) + # TODO: unify IODMA/instrumentation shell & driver + if os.path.isfile(f"{extract_dir}/driver/validate.py"): + result = subprocess.run( + [ + "python", + f"{extract_dir}/driver/validate.py", + "--bitfile", + f"{extract_dir}/bitfile/finn-accel.bit", + "--settingsfile", + f"{extract_dir}/driver/settings.json", + "--reportfile", + f"{extract_dir}/validation.json", + "--dataset_root", + "/home/xilinx/datasets", # TODO: env var + ] + ) + else: + result = subprocess.run( + [ + "python", + f"{extract_dir}/driver/driver.py", + "--bitfile", + f"{extract_dir}/bitfile/finn-accel.bit", + "--settingsfile", + f"{extract_dir}/driver/settings.json", + "--reportfile", + f"{extract_dir}/measured_performance.json", + ] + ) + if result.returncode != 0: + print("Driver reported error!") + exit_code = 1 + else: + print("Driver finished successfully.") + + # Copy results back to artifact directory + for report in [ + "measured_performance.json", + "fifo_sizing_report.json", + 
"fifo_depth_export.json", + "fifo_sizing_graph.png", + "folding_config_lfs.json", + "validation.json", + ]: + report_path = os.path.join(extract_dir, report) + if os.path.isfile(report_path): + print("Copying %s to %s" % (report_path, reports_dir)) + os.makedirs(reports_dir, exist_ok=True) + shutil.copy(report_path, reports_dir) + + print("Clearing temporary directory..") + # Clear temporary dir + delete_dir_contents(extract_dir) + print("Done.") + print("Processed all deployment packages.") + sys.exit(exit_code) diff --git a/models.dvc b/models.dvc new file mode 100644 index 0000000000..35b5292128 --- /dev/null +++ b/models.dvc @@ -0,0 +1,6 @@ +outs: +- md5: 20c3f996d17ef035c8189c0d0ac44cf6.dir + size: 203029833 + nfiles: 42 + hash: md5 + path: models diff --git a/notebooks/advanced/4_advanced_builder_settings.ipynb b/notebooks/advanced/4_advanced_builder_settings.ipynb index 1e544cf513..73ae7f555c 100644 --- a/notebooks/advanced/4_advanced_builder_settings.ipynb +++ b/notebooks/advanced/4_advanced_builder_settings.ipynb @@ -964,7 +964,7 @@ "source": [ "import json\n", "\n", - "with open(build_dir+\"/output_pre_and_post_proc/auto_folding_config.json\", 'r') as json_file:\n", + "with open(build_dir+\"/output_pre_and_post_proc/report/auto_folding_config.json\", 'r') as json_file:\n", " folding_config = json.load(json_file)\n", "\n", "print(json.dumps(folding_config, indent=1))" @@ -1035,7 +1035,7 @@ "metadata": {}, "outputs": [], "source": [ - "with open(build_dir+\"/output_pre_and_post_proc/auto_folding_config.json\", 'r') as json_file:\n", + "with open(build_dir+\"/output_pre_and_post_proc/report/auto_folding_config.json\", 'r') as json_file:\n", " folding_config = json.load(json_file)\n", "\n", "# Set all ram_style to LUT RAM\n", diff --git a/notebooks/end2end_example/cybersecurity/3-build-accelerator-with-finn.ipynb b/notebooks/end2end_example/cybersecurity/3-build-accelerator-with-finn.ipynb index 7a23a3628e..39ae1dd5f6 100644 --- 
a/notebooks/end2end_example/cybersecurity/3-build-accelerator-with-finn.ipynb +++ b/notebooks/end2end_example/cybersecurity/3-build-accelerator-with-finn.ipynb @@ -323,7 +323,7 @@ "source": [ "assert os.path.exists(rtlsim_output_dir + \"/report/ooc_synth_and_timing.json\")\n", "assert os.path.exists(rtlsim_output_dir + \"/report/rtlsim_performance.json\")\n", - "assert os.path.exists(rtlsim_output_dir + \"/final_hw_config.json\")" + "assert os.path.exists(rtlsim_output_dir + \"/report/final_hw_config.json\")" ] }, { @@ -410,7 +410,7 @@ "metadata": {}, "outputs": [], "source": [ - "! cat {rtlsim_output_dir}/final_hw_config.json" + "! cat {rtlsim_output_dir}/report/final_hw_config.json" ] }, { diff --git a/src/finn/benchmarking/bench.py b/src/finn/benchmarking/bench.py new file mode 100644 index 0000000000..1a478a466c --- /dev/null +++ b/src/finn/benchmarking/bench.py @@ -0,0 +1,242 @@ +import itertools +import json +import onnxruntime as ort +import os +import sys +import time +import traceback +import yaml + +from finn.benchmarking.bench_base import bench +from finn.benchmarking.dut.mvau import bench_mvau +from finn.benchmarking.dut.synthetic_nonlinear import bench_synthetic_nonlinear +from finn.benchmarking.dut.transformer import bench_transformer +from finn.benchmarking.util import delete_dir_contents + +# Register custom bench subclasses that offer more control than YAML-based flow +dut = dict() +dut["mvau"] = bench_mvau +dut["synthetic_nonlinear"] = bench_synthetic_nonlinear +dut["transformer"] = bench_transformer + + +class PrefixPrinter(object): + """ + Create a custom stream handler that adds a prefix + """ + + def __init__(self, prefix, originalstream): + self.console = originalstream + self.prefix = prefix + self.linebuf = "" + + def write(self, buf): + for line in buf.rstrip().splitlines(): + self.console.write(f"[{self.prefix}] " + line + "\n") + + def flush(self): + self.console.flush() + + +def start_bench_run(config_name): + exit_code = 0 + # Attempt 
def start_bench_run(config_name):
    """Execute the benchmark runs assigned to this (Slurm array) task.

    Loads a YAML benchmark config, expands it into all parameter
    combinations (grid search), selects the subset of runs assigned to
    this array task, and executes each run via the registered bench class
    for its DUT.

    Args:
        config_name: Name of a config in ci/cfg/ ("manual" selects the
            config named by MANUAL_CFG_PATH), or a direct file path for
            local (non-Slurm) test runs.

    Returns:
        0 if all runs succeeded, 1 if any run failed or no DUT was
        specified, None on early abort (missing config file or this
        array task has no runs assigned).

    Fix vs. previous version: a run that raised an exception or was
    skipped could additionally be appended to ``successful_runs`` (or a
    doubly-failed run appended twice to ``failed_runs``) because the
    builder-log check ran unconditionally. Each run is now recorded in
    exactly one of successful/skipped/failed. stdout/stderr restoration
    is also moved to a ``finally`` block so the prefix wrapper can never
    leak past a run.
    """
    exit_code = 0

    # Attempt to work around onnxruntime issue on Slurm-managed clusters:
    # See https://github.com/microsoft/onnxruntime/issues/8313
    # This seems to happen only when assigned CPU cores are not contiguous
    _default_session_options = ort.capi._pybind_state.get_default_session_options()

    def get_default_session_options_new():
        _default_session_options.inter_op_num_threads = 1
        _default_session_options.intra_op_num_threads = 1
        return _default_session_options

    ort.capi._pybind_state.get_default_session_options = get_default_session_options_new

    try:
        # Launched via SLURM, expect additional CI env vars
        job_id = int(os.environ["SLURM_JOB_ID"])
        # original experiment dir (before potential copy to ramdisk):
        # experiment_dir = os.environ.get("EXPERIMENT_DIR")
        experiment_dir = os.environ.get("CI_PROJECT_DIR")
        save_dir = os.path.join(
            os.environ.get("LOCAL_ARTIFACT_DIR"),
            "CI_" + os.environ.get("CI_PIPELINE_ID") + "_" + os.environ.get("CI_PIPELINE_NAME"),
        )
        work_dir = os.environ["PATH_WORKDIR"]

        # Gather benchmarking configs
        if config_name == "manual":
            # First check if the repo contains a config with this name (in ci/cfg/*)
            config_path = os.path.join("ci", "cfg", os.environ.get("MANUAL_CFG_PATH") + ".yml")
            if not os.path.exists(config_path):
                # Otherwise look in LOCAL_CFG_DIR for the filename
                config_path = os.path.join(
                    os.environ.get("LOCAL_CFG_DIR"), os.environ.get("MANUAL_CFG_PATH")
                )
        else:
            config_path = os.path.join("ci", "cfg", config_name + ".yml")
        print("Job launched with SLURM ID: %d" % (job_id))
    except KeyError:
        # Launched without SLURM, assume test run on local machine
        job_id = 0
        experiment_dir = "bench_output/" + time.strftime("%d_%H_%M")
        save_dir = "bench_save/" + time.strftime("%d_%H_%M")
        work_dir = "bench_work"
        os.makedirs(work_dir, exist_ok=True)
        delete_dir_contents(work_dir)
        config_path = config_name  # expect caller to provide direct path to a single config file
        print("Local test job launched without SLURM")

    try:
        # Launched as SLURM job array
        array_id = int(os.environ["SLURM_ARRAY_JOB_ID"])
        task_id = int(os.environ["SLURM_ARRAY_TASK_ID"])
        task_count = int(os.environ["SLURM_ARRAY_TASK_COUNT"])
        print(
            "Launched as job array (Array ID: %d, Task ID: %d, Task count: %d)"
            % (array_id, task_id, task_count)
        )
    except KeyError:
        # Launched as single (SLURM or non-SLURM) job
        array_id = job_id
        task_id = 0
        task_count = 1
        print("Launched as single job")

    # Prepare result directory
    artifacts_dir = os.path.join(experiment_dir, "build_artifacts")
    os.makedirs(artifacts_dir, exist_ok=True)
    print("Collecting results in path: %s" % artifacts_dir)

    # Prepare local save dir for large artifacts (e.g., build output, tmp dir dump for debugging)
    os.makedirs(save_dir, exist_ok=True)
    print("Saving additional artifacts in path: %s" % save_dir)

    # Load config
    print("Loading config %s" % (config_path))
    if os.path.exists(config_path):
        with open(config_path, "r") as f:
            config = yaml.load(f, Loader=yaml.SafeLoader)
    else:
        print("ERROR: config file not found")
        return

    # Expand all specified config combinations (gridsearch)
    config_expanded = []
    for param_set in config:
        param_set_expanded = list(
            dict(zip(param_set.keys(), x)) for x in itertools.product(*param_set.values())
        )
        config_expanded.extend(param_set_expanded)

    # Save config (only first job of array) for logging purposes
    if task_id == 0:
        with open(os.path.join(artifacts_dir, "bench_config.json"), "w") as f:
            json.dump(config, f, indent=2)
        with open(os.path.join(artifacts_dir, "bench_config_exp.json"), "w") as f:
            json.dump(config_expanded, f, indent=2)

    # Determine which runs this job will work on (strided assignment across the array)
    total_runs = len(config_expanded)
    if total_runs <= task_count:
        if task_id < total_runs:
            selected_runs = [task_id]
        else:
            return
    else:
        selected_runs = list(range(task_id, total_runs, task_count))
    print(
        "STARTING JOB %d. IT WILL PERFORM %d OUT OF %d TOTAL RUNS"
        % (task_id, len(selected_runs), total_runs)
    )

    # Run benchmark
    successful_runs = []
    skipped_runs = []
    failed_runs = []
    for run, run_id in enumerate(selected_runs):
        print(
            "STARTING RUN %d/%d (ID %d OF %d TOTAL RUNS)"
            % (run + 1, len(selected_runs), run_id, total_runs)
        )

        params = config_expanded[run_id]
        print("RUN %d PARAMETERS: %s" % (run_id, str(params)))

        log_dict = {"run_id": run_id, "task_id": task_id, "params": params}

        # Create bench object for respective DUT
        if "dut" in params:
            if params["dut"] in dut:
                bench_object = dut[params["dut"]](
                    params, task_id, run_id, work_dir, artifacts_dir, save_dir
                )
            else:
                # If no custom bench subclass is defined, fall back to base class,
                # expect DUT-specific YAML definition instead
                bench_object = bench(params, task_id, run_id, work_dir, artifacts_dir, save_dir)
        else:
            print("ERROR: NO DUT SPECIFIED")
            return 1

        # Wrap stdout/stderr with an additional prefix to identify the run in the live console
        original_stdout = sys.stdout
        original_stderr = sys.stderr
        sys.stdout = PrefixPrinter("RUN %d (%s)" % (run_id, params["dut"]), sys.stdout)
        sys.stderr = PrefixPrinter("RUN %d (%s)" % (run_id, params["dut"]), sys.stderr)
        result = None
        run_error = None
        try:
            result = bench_object.run()
        except Exception:
            # Capture the traceback here; exc info is cleared once the handler exits
            run_error = traceback.format_exc()
        finally:
            # Always restore the real streams, even if the run raised
            sys.stdout = original_stdout
            sys.stderr = original_stderr

        if run_error is not None:
            log_dict["status"] = "failed"
            print("BENCH RUN %d FAILED WITH EXCEPTION: %s" % (run_id, run_error))
            failed_runs.append(run_id)
            exit_code = 1
        elif result == "skipped":
            log_dict["status"] = "skipped"
            print("BENCH RUN %d SKIPPED" % run_id)
            skipped_runs.append(run_id)
        else:
            log_dict["status"] = "ok"

        log_dict["output"] = bench_object.output_dict

        # examine status reported by builder (which catches all exceptions before they reach us)
        # we could also fail the pipeline if functional verification fails (TODO)
        # Only consult the builder log for runs that completed normally, so each
        # run is counted in exactly one of successful/skipped/failed.
        if log_dict["status"] == "ok":
            builder_log_path = os.path.join(bench_object.report_dir, "metadata_builder.json")
            if os.path.isfile(builder_log_path):
                with open(builder_log_path, "r") as f:
                    builder_log = json.load(f)
                if builder_log["status"] == "failed":
                    print("BENCH RUN %d FAILED (BUILDER REPORTED FAILURE)" % run_id)
                    failed_runs.append(run_id)
                    exit_code = 1
                else:
                    print("BENCH RUN %d COMPLETED (BUILDER REPORTED SUCCESS)" % run_id)
                    successful_runs.append(run_id)
            else:
                print("BENCH RUN %d COMPLETED" % run_id)
                successful_runs.append(run_id)

        # log metadata of this run to its own report directory
        log_path = os.path.join(bench_object.report_dir, "metadata_bench.json")
        with open(log_path, "w") as f:
            json.dump(log_dict, f, indent=2)

        # save GitLab artifacts of this run (e.g., reports and deployment package)
        bench_object.save_artifacts_collection()
        # save local artifacts of this run (e.g., full build dir, detailed debug info)
        bench_object.save_local_artifacts_collection()

    print("STOPPING JOB %d (of %d total jobs)" % (task_id, task_count))
    print("JOB %d SUCCESSFUL RUNS: %s" % (task_id, successful_runs))
    print("JOB %d SKIPPED RUNS: %s" % (task_id, skipped_runs))
    print("JOB %d FAILED RUNS: %s" % (task_id, failed_runs))
    return exit_code
def start_test_batch_fast(results_path, project_path, run_target, pairs):
    """Generate and run a Vivado power-report batch for several switching settings.

    Writes a tcl script that opens the given project/run and appends one
    power-report section per (toggle_rate, static_prob) pair, invokes Vivado
    in batch mode via a small shell wrapper, and converts each resulting XML
    report to JSON next to it.

    Args:
        results_path: Directory where Vivado writes the .xml reports and
            where the .json conversions are placed.
        project_path: Path to the Vivado project to open.
        run_target: Name of the run to open (substituted for $RUN$).
        pairs: Iterable of (toggle_rate, static_prob) tuples.
    """
    # Prepare tcl script: one report section per requested pair. Placeholders
    # in the freshly appended template section are substituted each iteration.
    script = template_open.replace("$PROJ_PATH$", project_path)
    script = script.replace("$RUN$", run_target)
    for toggle_rate, static_prob in pairs:
        script = script + template_single_test
        script = script.replace("$TOGGLE_RATE$", str(toggle_rate))
        script = script.replace("$STATIC_PROB$", str(static_prob))
        script = script.replace("$REPORT_PATH$", results_path)
        script = script.replace("$REPORT_NAME$", f"{toggle_rate}_{static_prob}")
    tcl_path = os.path.join(os.getcwd(), "power_report.tcl")
    with open(tcl_path, "w") as tcl_file:
        tcl_file.write(script)

    # Prepare bash wrapper that invokes Vivado in batch mode.
    # Fix: the file handle was previously also named `script`, shadowing the
    # tcl script string above.
    bash_script = os.path.join(os.getcwd(), "report_power.sh")
    with open(bash_script, "w") as sh_file:
        sh_file.write("#!/bin/bash \n")
        sh_file.write(f"vivado -mode batch -source {tcl_path}\n")

    # Run script (blocking); subprocess.run replaces Popen + communicate
    subprocess.run(["bash", bash_script])

    # Parse results: convert each XML power report to JSON
    for toggle_rate, static_prob in pairs:
        power_report_dict = power_xml_to_dict(f"{results_path}/{toggle_rate}_{static_prob}.xml")
        power_report_json = f"{results_path}/{toggle_rate}_{static_prob}.json"
        with open(power_report_json, "w") as json_file:
            json.dump(power_report_dict, json_file, indent=2)
def sim_power_report(results_path, project_path, in_width, out_width, dtype_width, sim_duration_ns):
    """Run a simulation-based (SAIF-annotated) Vivado power report.

    Generates a random-stimulus Verilog testbench and a tcl script that
    simulates the implemented design for sim_duration_ns, dumps switching
    activity to a SAIF file, and produces a power report, which is then
    converted from XML to JSON.

    Args:
        results_path: Directory for the resulting sim.xml / sim.json reports.
        project_path: Path to the Vivado project to open.
        in_width: Input stream width substituted into the testbench.
        out_width: Output stream width substituted into the testbench.
        dtype_width: Element bit width; also bounds the random stimulus range.
        sim_duration_ns: Simulation duration in nanoseconds (truncated to int).
    """
    cwd = os.getcwd()

    # Prepare tcl script
    script = template_open.replace("$PROJ_PATH$", project_path)
    script = script.replace("$RUN$", "impl_1")
    script = script + template_sim_power
    script = script.replace("$TB_FILE_PATH$", cwd + "/switching_simulation_tb.v")
    script = script.replace("$SAIF_FILE_PATH$", cwd + "/switching.saif")
    script = script.replace("$SIM_DURATION_NS$", str(int(sim_duration_ns)))
    script = script.replace("$REPORT_PATH$", results_path)
    script = script.replace("$REPORT_NAME$", "sim")
    with open(cwd + "/power_report.tcl", "w") as tcl_file:
        tcl_file.write(script)

    # Prepare testbench with matching stream widths and random stimulus range
    testbench = template_switching_simulation_tb.replace("$INSTREAM_WIDTH$", str(in_width))
    testbench = testbench.replace("$OUTSTREAM_WIDTH$", str(out_width))
    testbench = testbench.replace("$DTYPE_WIDTH$", str(dtype_width))
    testbench = testbench.replace(
        "$RANDOM_FUNCTION$", "$urandom_range(0, {max})".format(max=2**dtype_width - 1)
    )
    with open(cwd + "/switching_simulation_tb.v", "w") as tb_file:
        tb_file.write(testbench)

    # Prepare shell wrapper.
    # Fix: the file handle was previously named `script`, shadowing the tcl
    # script string above (same issue as in start_test_batch_fast).
    bash_script = cwd + "/report_power.sh"
    with open(bash_script, "w") as sh_file:
        sh_file.write("#!/bin/bash \n")
        sh_file.write(f"vivado -mode batch -source {cwd}/power_report.tcl\n")

    # Run Vivado (blocking); subprocess.run replaces Popen + communicate
    subprocess.run(["bash", bash_script])

    # Parse results: convert the XML power report to JSON
    power_report_dict = power_xml_to_dict(f"{results_path}/sim.xml")
    with open(f"{results_path}/sim.json", "w") as json_file:
        json.dump(power_report_dict, json_file, indent=2)
    def __init__(self, params, task_id, run_id, work_dir, artifacts_dir, save_dir, debug=True):
        """Set up one benchmark run: defaults, target platform, and artifact dirs.

        Args:
            params: Expanded parameter dict for this run; mutated in place to
                record derived defaults (clock period, board, shell flow, ...).
            task_id: Index of this Slurm array task.
            run_id: Global index of this run in the expanded config.
            work_dir: Scratch directory (e.g., RAMdisk) for build files.
            artifacts_dir: Directory for pipeline (GitLab) artifacts.
            save_dir: Directory for large local-only artifacts.
            debug: If True, the whole FINN_BUILD_DIR is saved as a local
                artifact for debugging.

        Side effects: clears FINN_BUILD_DIR and the buildflow scratch dir,
        and creates the report directory.
        """
        super().__init__()
        self.params = params
        self.task_id = task_id
        self.run_id = run_id
        self.work_dir = work_dir
        self.artifacts_dir = artifacts_dir
        self.save_dir = save_dir
        self.debug = debug

        # Setup some basic global default configuration
        # TODO: clean up or remove these attributes
        if "synth_clk_period_ns" in params:
            self.clock_period_ns = params["synth_clk_period_ns"]
        else:
            self.clock_period_ns = 10  # default target clock period in ns
        self.params["synth_clk_period_ns"] = self.clock_period_ns

        # TODO: do not allow multiple targets in a single bench job due to measurement?
        if "board" in params:
            self.board = params["board"]
        else:
            self.board = "RFSoC2x2"  # default target board
        self.params["board"] = self.board

        # Resolve the FPGA part: explicit param wins, else look up by board
        if "part" in params:
            self.part = params["part"]
        elif self.board in part_map:
            self.part = part_map[self.board]
        else:
            raise Exception("No part specified for board %s" % self.board)

        # Alveo boards go through the Vitis flow, everything else through Zynq
        if self.board in alveo_part_map:
            self.params["shell_flow_type"] = build_cfg.ShellFlowType.VITIS_ALVEO
            self.params["vitis_platform"] = alveo_default_platform[self.board]
        else:
            self.params["shell_flow_type"] = build_cfg.ShellFlowType.VIVADO_ZYNQ

        # Load custom (= non build_dataflow_config) parameters from topology-specific .yml
        custom_params = [
            "model_dir",  # used to setup onnx/npy input
            "model_path",  # used to setup onnx/npy input
            # model-gen parameters, such as seed, simd, pe, etc.
            # TODO: separate these more cleanly from builder options
        ]

        dut_yaml_name = self.params["dut"] + ".yml"
        dut_path = os.path.join(os.path.dirname(__file__), "dut", dut_yaml_name)
        if os.path.isfile(dut_path):
            with open(dut_path, "r") as f:
                dut_cfg = yaml.load(f, Loader=yaml.SafeLoader)
            # Only whitelisted custom keys are copied into the run params
            for key in dut_cfg:
                if key in custom_params:
                    self.params[key] = dut_cfg[key]

        # Clear FINN tmp build dir before every run
        print("Clearing FINN BUILD DIR ahead of run")
        delete_dir_contents(os.environ["FINN_BUILD_DIR"])

        # Initialize dictionary to collect all benchmark results
        # TODO: remove completely or only use for meta data,
        # actual results go into run-specific .json files within /report
        self.output_dict = {}

        # Inputs (e.g., ONNX model, golden I/O pair, folding config, etc.)
        self.build_inputs = {}

        # Collect tuples of (name, source path, archive?) to save as pipeline artifacts
        self.artifacts_collection = []

        # Collect tuples of (name, source path, archive?) to save as local artifacts
        self.local_artifacts_collection = []
        if self.debug:
            # Save entire FINN_BUILD_DIR
            # TODO: add option to only save upon error/exception
            self.local_artifacts_collection.append(
                ("debug_finn_tmp", os.environ["FINN_BUILD_DIR"], True)
            )

        # SETUP
        # Use a temporary dir for buildflow-related files (next to FINN_BUILD_DIR)
        # Ensure it exists but is empty (clear potential artifacts from previous runs)
        tmp_buildflow_dir = os.path.join(self.work_dir, "buildflow")
        os.makedirs(tmp_buildflow_dir, exist_ok=True)
        delete_dir_contents(tmp_buildflow_dir)
        self.build_inputs["build_dir"] = os.path.join(
            tmp_buildflow_dir, "build_output"
        )  # TODO remove in favor of self.build_dir
        self.build_dir = os.path.join(tmp_buildflow_dir, "build_output")
        self.report_dir = os.path.join(self.build_dir, "report")
        os.makedirs(self.report_dir, exist_ok=True)

        # Save full build dir as local artifact
        self.local_artifacts_collection.append(("build_output", self.build_dir, False))
        # Save reports and deployment package as pipeline artifacts
        self.artifacts_collection.append(("reports", self.report_dir, False))
        self.artifacts_collection.append(
            ("reports", os.path.join(self.build_dir, "build_dataflow.log"), False)
        )
        self.artifacts_collection.append(("deploy", os.path.join(self.build_dir, "deploy"), True))
to save as local artifacts + self.local_artifacts_collection = [] + if self.debug: + # Save entire FINN_BUILD_DIR + # TODO: add option to only save upon error/exception + self.local_artifacts_collection.append( + ("debug_finn_tmp", os.environ["FINN_BUILD_DIR"], True) + ) + + # SETUP + # Use a temporary dir for buildflow-related files (next to FINN_BUILD_DIR) + # Ensure it exists but is empty (clear potential artifacts from previous runs) + tmp_buildflow_dir = os.path.join(self.work_dir, "buildflow") + os.makedirs(tmp_buildflow_dir, exist_ok=True) + delete_dir_contents(tmp_buildflow_dir) + self.build_inputs["build_dir"] = os.path.join( + tmp_buildflow_dir, "build_output" + ) # TODO remove in favor of self.build_dir + self.build_dir = os.path.join(tmp_buildflow_dir, "build_output") + self.report_dir = os.path.join(self.build_dir, "report") + os.makedirs(self.report_dir, exist_ok=True) + + # Save full build dir as local artifact + self.local_artifacts_collection.append(("build_output", self.build_dir, False)) + # Save reports and deployment package as pipeline artifacts + self.artifacts_collection.append(("reports", self.report_dir, False)) + self.artifacts_collection.append( + ("reports", os.path.join(self.build_dir, "build_dataflow.log"), False) + ) + self.artifacts_collection.append(("deploy", os.path.join(self.build_dir, "deploy"), True)) + + def save_artifact(self, target_path, source_path, archive=False): + if os.path.isdir(source_path): + if archive: + os.makedirs(os.path.dirname(target_path), exist_ok=True) + shutil.make_archive(target_path, "zip", source_path) + else: + os.makedirs(target_path, exist_ok=True) + copytree(source_path, target_path, dirs_exist_ok=True) + elif os.path.isfile(source_path): + os.makedirs(target_path, exist_ok=True) + shcopy(source_path, target_path) + + def save_artifacts_collection(self): + # this should be called upon successful or failed completion of a run + for name, source_path, archive in self.artifacts_collection: + 
    def save_artifacts_collection(self):
        """Copy all registered pipeline artifacts to the GitLab artifacts dir."""
        # this should be called upon successful or failed completion of a run
        for name, source_path, archive in self.artifacts_collection:
            target_path = os.path.join(
                self.artifacts_dir, "runs_output", "run_%d" % (self.run_id), name
            )
            self.save_artifact(target_path, source_path, archive)

    def save_local_artifacts_collection(self):
        """Copy all registered large/local artifacts to the local save dir."""
        # this should be called upon successful or failed completion of a run
        for name, source_path, archive in self.local_artifacts_collection:
            target_path = os.path.join(self.save_dir, name, "run_%d" % (self.run_id))
            self.save_artifact(target_path, source_path, archive)

    # must be defined by subclass
    def step_export_onnx(self):
        """Generate the input ONNX model; no-op in the base class."""
        pass

    # can be overwritten by subclass if setup is too complex for YAML definition
    def step_build_setup(self):
        """Build the DataflowBuildConfig from the DUT's YAML definition.

        Raises:
            Exception: If no <dut>.yml exists next to this module under dut/.
        """
        dut_yaml_name = self.params["dut"] + ".yml"
        dut_path = os.path.join(os.path.dirname(__file__), "dut", dut_yaml_name)
        if os.path.isfile(dut_path):
            with open(dut_path, "r") as f:
                return DataflowBuildConfig.from_yaml(f)
        else:
            raise Exception("No DUT-specific YAML build definition found")

    # defaults to normal build flow, may be overwritten by subclass
    def run(self):
        """Entry point for one benchmark run; returns "skipped" or None."""
        return self.steps_full_build_flow()

    def step_parse_builder_output(self, build_dir):
        """Extract the functional-verification verdict from the builder output.

        Scans build_dir/verification_output for .npy dumps whose filenames end
        in the step's status string and records an overall success/fail verdict
        in self.output_dict. Does nothing if no verification output exists.
        """
        # TODO: output as .json or even add as new build step
        # CHECK FOR VERIFICATION STEP SUCCESS
        if os.path.exists(os.path.join(build_dir, "verification_output")):
            # Collect all verification output filenames
            outputs = glob.glob(os.path.join(build_dir, "verification_output/*.npy"))
            # Extract the verification status for each verification output by matching
            # to the SUCCESS string contained in the filename
            # NOTE(review): an empty outputs list yields all(...) == True and thus
            # reports "success" — confirm this is intended.
            status = all([out.split("_")[-1].split(".")[0] == "SUCCESS" for out in outputs])

            # Construct a dictionary reporting the verification status as string
            self.output_dict["builder_verification"] = {
                "verification": {True: "success", False: "fail"}[status]
            }
            # TODO: mark job as failed if verification fails?
    def steps_full_build_flow(self):
        """Default benchmark sequence: obtain a model, then run the FINN builder.

        Resolves the input ONNX model (pre-supplied dir, explicit path, or
        generated by the DUT subclass), assembles the build config, overlays
        run-specific parameters, runs build_dataflow_cfg, and parses the
        builder output.

        Returns:
            "skipped" if the DUT could not generate a model for the given
            parameters, otherwise None.
        """
        # Default step sequence for benchmarking a full FINN builder flow
        # MODEL CREATION/IMPORT
        # TODO: track fixed input onnx models with DVC
        if "model_dir" in self.params:
            # input ONNX model and verification input/output pairs are provided
            model_dir = self.params["model_dir"]
            self.build_inputs["onnx_path"] = os.path.join(model_dir, "model.onnx")
            self.build_inputs["input_npy_path"] = os.path.join(model_dir, "inp.npy")
            self.build_inputs["output_npy_path"] = os.path.join(model_dir, "out.npy")
        elif "model_path" in self.params:
            self.build_inputs["onnx_path"] = self.params["model_path"]
        else:
            # input ONNX model (+ optional I/O pair for verification) will be generated
            self.build_inputs["onnx_path"] = os.path.join(
                self.build_inputs["build_dir"], "model_export.onnx"
            )
            if self.step_export_onnx(self.build_inputs["onnx_path"]) == "skipped":
                # microbenchmarks might skip because no model can be generated for given params
                return "skipped"

        # BUILD SETUP
        # Initialize from YAML (default) or custom script (if dedicated subclass is defined)
        cfg = self.step_build_setup()

        # Set some global defaults (could still be overwritten by run-specific YAML)
        cfg.output_dir = self.build_inputs["build_dir"]
        # enable extra performance optimizations (physopt)
        # TODO: check OMX synth strategy again!
        cfg.vitis_opt_strategy = build_cfg.VitisOptStrategy.PERFORMANCE_BEST
        cfg.verbose = True
        cfg.console_log_level = "ERROR"
        cfg.enable_build_pdb_debug = False
        # cfg.stitched_ip_gen_dcp = False  # only needed for further manual integration
        cfg.force_python_rtlsim = False
        cfg.split_large_fifos = True
        cfg.save_intermediate_models = True  # Save the intermediate model graphs
        cfg.verify_save_full_context = True  # Output full context dump for verification steps
        cfg.enable_instrumentation = True
        # rtlsim_use_vivado_comps  # TODO ?
        # cfg.default_swg_exception
        # cfg.large_fifo_mem_style

        # Overwrite build config settings with run-specific YAML build definition
        # TODO: warn/error if there are unrecognized options set?
        # Any run param whose name matches a cfg attribute overrides the default
        for key in self.params:
            if hasattr(cfg, key):
                setattr(cfg, key, self.params[key])

        # Default of 1M cycles is insufficient for MetaFi (6M) and RN-50 (2.5M)
        # TODO: make configurable or set on pipeline level?
        os.environ["LIVENESS_THRESHOLD"] = "10000000"

        # BUILD
        build.build_dataflow_cfg(self.build_inputs["onnx_path"], cfg)

        # ANALYSIS
        self.step_parse_builder_output(self.build_inputs["build_dir"])
    def _make_single_mvau_model(
        self,
        W,
        numInputVectors,
        pe,
        simd,
        m,
        wdt,
        idt,
        odt,
        T=None,
        tdt=None,
        mem_mode="const",
        ram_style="auto",
        ram_style_thresholds="auto",
    ):
        """Build a single-node MVAU_hls ONNX model around weight matrix W.

        Args:
            W: Weight matrix of shape (MW, MH); MW/MH are derived from it.
            numInputVectors: [N] for dense or [N, H, W] for conv inputs.
            pe, simd, m: Folding/parallelism attributes for the node.
            wdt, idt, odt: FINN DataTypes for weights, input, and output.
            T: Optional threshold matrix; if None, noActivation is set.
            tdt: DataType of the thresholds (required when T is given).
            mem_mode, ram_style, ram_style_thresholds: Node implementation
                attributes passed through to the custom op.

        Returns:
            ModelWrapper with minimized weight/accumulator bit widths.
        """
        mw = W.shape[0]
        mh = W.shape[1]

        # there are two ways to implement bipolar weights and inputs for
        # MatrixVectorActivation:
        # - specify their datatypes as such
        # - specify their datatypes as BINARY as use binaryXnorMode
        if wdt == DataType["BIPOLAR"] and idt == DataType["BIPOLAR"]:
            # we'll internally convert weights/inputs to binary and specify the
            # datatypes as such, and also set the binaryXnorMode attribute to 1
            export_wdt = DataType["BINARY"]
            export_idt = DataType["BINARY"]
            binary_xnor_mode = 1
        else:
            export_wdt = wdt
            export_idt = idt
            binary_xnor_mode = 0

        # numInputVectors for dense = [N]
        # numInputVectors for conv = [N, H, W]
        inp = helper.make_tensor_value_info("inp", TensorProto.FLOAT, numInputVectors + [mw])
        outp = helper.make_tensor_value_info("outp", TensorProto.FLOAT, numInputVectors + [mh])
        if T is not None:
            # thresholds present: activation output with bias ActVal
            no_act = 0
            node_inp_list = ["inp", "weights", "thresh"]
            if odt == DataType["BIPOLAR"]:
                actval = 0
            else:
                actval = odt.min()
        else:
            # no thresholds
            node_inp_list = ["inp", "weights"]
            actval = 0
            no_act = 1
        mvau_node = helper.make_node(
            "MVAU_hls",  # TODO: add rtl support (configurable as param)
            node_inp_list,
            ["outp"],
            domain="finn.custom_op.fpgadataflow.hls",
            backend="fpgadataflow",
            MW=mw,
            MH=mh,
            SIMD=simd,
            PE=pe,
            M=m,
            numInputVectors=numInputVectors,
            inputDataType=export_idt.name,
            weightDataType=export_wdt.name,
            outputDataType=odt.name,
            ActVal=actval,
            binaryXnorMode=binary_xnor_mode,
            noActivation=no_act,
            resType="lut",
            mem_mode=mem_mode,
            ram_style=ram_style,
            ram_style_thresholds=ram_style_thresholds,
            runtime_writeable_weights=0,
        )

        graph = helper.make_graph(
            nodes=[mvau_node], name="mvau_graph", inputs=[inp], outputs=[outp]
        )
        model = qonnx_make_model(graph, producer_name="mvau-model")
        model = ModelWrapper(model)

        model.set_tensor_datatype("inp", idt)
        model.set_tensor_datatype("outp", odt)
        model.set_tensor_datatype("weights", wdt)
        # model.set_tensor_shape("weights", (channels, 1, k_h, k_w)) from VVAU
        if binary_xnor_mode:
            # convert bipolar to binary
            model.set_initializer("weights", (W + 1) / 2)
        else:
            model.set_initializer("weights", W)
        if T is not None:
            model.set_tensor_datatype("thresh", tdt)
            model.set_initializer("thresh", T)

        # Minimize weight & accumulator width to obtain realistic resource consumption
        # model = model.transform(InferShapes())
        model = model.transform(MinimizeWeightBitWidth())
        model = model.transform(MinimizeAccumulatorWidth())
        model = model.transform(InferDataTypes())

        return model
    def step_export_onnx(self, onnx_export_path):
        """Generate the MVAU microbenchmark model and save it to onnx_export_path.

        Derives SIMD/PE from the folding params, generates (optionally
        sparsified) random weights and thresholds, builds the single-node
        MVAU model, logs model statistics to report/dut_info.json, and saves
        the ONNX file.

        Returns:
            "skipped" if the parameter combination is infeasible (invalid
            folding, inconsistent sparsity settings), otherwise None.
        """
        # Read params
        idt = self.params["idt"]
        wdt = self.params["wdt"]
        act = self.params["act"]

        numInputVectors = self.params["nhw"]
        mw = self.params["mw"]
        mh = self.params["mh"]
        sf = self.params["sf"]
        nf = self.params["nf"]
        m = self.params["m"]

        mem_mode = self.params["mem_mode"]
        ram_style = self.params["ram_style"]
        ram_style_thr = self.params["ram_style_thr"]

        output_dict = {}

        # convert string to FINN DataType
        idt = DataType[idt]
        wdt = DataType[wdt]
        if act is not None:
            act = DataType[act]

        # Determine and log folding (sf/nf of -1 select maximum folding)
        if sf == -1:
            sf = mw
        simd = mw // sf
        if nf == -1:
            nf = mh
        pe = mh // nf
        if mw % simd != 0 or mh % pe != 0:
            print("Invalid simd/pe configuration, skipping")
            return "skipped"
        if m > 1 and (simd != mw or pe != mh):
            print("M > 1 not possible for non-max simd/pe, skipping")
            return "skipped"
        output_dict["simd"] = simd
        output_dict["pe"] = pe

        # Generate weights (fixed seed for reproducible benchmark inputs)
        np.random.seed(123456)  # TODO: verify or switch to modern numpy random generation

        W = gen_finn_dt_tensor(wdt, (mw, mh))

        if "sparsity_type" in self.params:
            sparsity_type = self.params["sparsity_type"]
        else:
            sparsity_type = "none"

        if sparsity_type == "none":
            # no sparsity requested: reject a contradictory non-zero amount
            if "sparsity_amount" in self.params:
                if self.params["sparsity_amount"] > 0:
                    print("sparsity amount > 0 not applicable for none sparsity, skipping")
                    return "skipped"
        else:
            if self.params["sparsity_amount"] == 0:
                print("sparsity amount = 0 not applicable for selected sparsity, skipping")
                return "skipped"
            # apply the requested sparsity pattern to W
            if sparsity_type == "unstructured":
                # zero a random subset of individual weights
                idx = np.random.choice(
                    mw * mh, size=int(self.params["sparsity_amount"] * mw * mh), replace=False
                )
                W = np.reshape(W, -1)
                W[idx] = 0.0
                W = np.reshape(W, (mw, mh))
            elif sparsity_type == "rows_random":
                idx_mw = np.random.choice(
                    mw, size=int(self.params["sparsity_amount"] * mw), replace=False
                )
                W[idx_mw, :] = 0.0
            elif sparsity_type == "cols_random":
                idx_mh = np.random.choice(
                    mh, size=int(self.params["sparsity_amount"] * mh), replace=False
                )
                W[:, idx_mh] = 0.0
            elif sparsity_type == "rows_regular":
                # zero every k-th row to reach the requested amount
                if self.params["sparsity_amount"] == 0.25:
                    idx_mw = np.arange(0, mw, step=4)
                elif self.params["sparsity_amount"] == 0.5:
                    idx_mw = np.arange(0, mw, step=2)
                elif self.params["sparsity_amount"] == 0.75:
                    idx_mw = np.concatenate(
                        (
                            np.arange(0, mw, step=4),
                            np.arange(1, mw, step=4),
                            np.arange(2, mw, step=4),
                        )
                    )
                else:
                    print("regular sparsity only applicable for amount 0.25/0.5/0.75, skipping")
                    return "skipped"
                W[idx_mw, :] = 0.0
            elif sparsity_type == "cols_regular":
                if self.params["sparsity_amount"] == 0.25:
                    idx_mh = np.arange(0, mh, step=4)
                elif self.params["sparsity_amount"] == 0.5:
                    idx_mh = np.arange(0, mh, step=2)
                elif self.params["sparsity_amount"] == 0.75:
                    idx_mh = np.concatenate(
                        (
                            np.arange(0, mh, step=4),
                            np.arange(1, mh, step=4),
                            np.arange(2, mh, step=4),
                        )
                    )
                else:
                    print("regular sparsity only applicable for amount 0.25/0.5/0.75, skipping")
                    return "skipped"
                W[:, idx_mh] = 0.0

            else:
                print("ERROR: unknown sparsity type")
                raise Exception("ERROR: unknown sparsity type")

        # TODO: implement enforce option which prevents naturally occurring sparsity
        # params["sparsity_enforce"]
        # TODO: implement distribution option which selects between uniform/normal/??
        # params["sparsity_distribution"]

        # log resulting sparsity statistics
        # could be higher than selected due to naturally occurring sparsity
        num_zeros = (W == 0).sum()
        num_ones = (W == 1).sum() + (W == -1).sum()
        num_p2 = 0
        # count weights that are (negative) powers of two ("easy" multiplications)
        for w in np.nditer(W):
            if w != 0 and w != 1 and w != -1:
                if w > 0:
                    if math.log2(w).is_integer():
                        num_p2 = num_p2 + 1
                else:
                    if math.log2(-w).is_integer():
                        num_p2 = num_p2 + 1
        output_dict["zero_weights"] = round(num_zeros / W.size, 2)
        output_dict["easy_weights"] = round((num_zeros + num_ones + num_p2) / W.size, 2)

        # Generate thresholds
        if act is None:
            # no activation, produce accumulators
            T = None
            tdt = None
            if wdt == DataType["BIPOLAR"] and idt == DataType["BIPOLAR"]:
                odt = DataType["UINT32"]
            else:
                odt = DataType["INT32"]
        else:
            odt = act
            # set range for threshold values according to worst-case accumulator range
            # (not weight value specific)
            # this could result in some thresholds being clipped by MinimizeAccumulatorWidth
            # lower_range = calculate_matvec_accumulator_range(wdt.min() * np.ones_like(W), idt)
            # upper_range = calculate_matvec_accumulator_range(wdt.max() * np.ones_like(W), idt)
            # acc_min = min(min(lower_range), min(upper_range))
            # acc_max = max(max(lower_range), max(upper_range))
            # set range for threshold values according to actual accumulator range
            # for the generated weights
            (acc_min, acc_max) = calculate_matvec_accumulator_range(W, idt)
            n_steps = act.get_num_possible_values() - 1
            T = np.random.randint(acc_min, acc_max - 1, (mh, n_steps)).astype(np.float32)
            # provide non-decreasing thresholds
            T = np.sort(T, axis=1)
            # generate thresholds for activation
            if wdt == DataType["BIPOLAR"] and idt == DataType["BIPOLAR"]:
                tdt = DataType["UINT32"]
                # bias thresholds to be positive
                T = np.ceil((T + mw) / 2)
                assert (T >= 0).all()
            else:
                tdt = DataType["INT32"]

        # Create model
        model = self._make_single_mvau_model(
            W,
            numInputVectors,
            pe,
            simd,
            m,
            wdt,
            idt,
            odt,
            T,
            tdt,
            mem_mode,
            ram_style,
            ram_style_thr,
        )
        model = model.transform(GiveUniqueNodeNames())
        # node = model.get_nodes_by_op_type("MVAU_hls")[0]
        # inst = getCustomOp(node)

        # display results of analysis passes only for the first occurence of this op type
        self.target_node = "MVAU_hls"

        # log additional info about the generated model (e.g. SIMD/PE or sparsity)
        with open(self.build_inputs["build_dir"] + "/report/dut_info.json", "w") as f:
            json.dump(output_dict, f, indent=2)

        # TODO: also generate golden I/O pair for further verification steps
        model.save(onnx_export_path)
SIMD/PE or sparsity) + with open(self.build_inputs["build_dir"] + "/report/dut_info.json", "w") as f: + json.dump(output_dict, f, indent=2) + + # TODO: also generate golden I/O pair for further verification steps + model.save(onnx_export_path) + + def step_build_setup(self): + # create build config for synthetic microbenchmark models + cfg = build_cfg.DataflowBuildConfig( + # manual folding + target_fps=None, + steps=[ + "step_create_dataflow_partition", + "step_minimize_bit_width", + "step_generate_estimate_reports", + "step_hw_codegen", + "step_hw_ipgen", + "step_create_stitched_ip", + "step_measure_rtlsim_performance", + "step_out_of_context_synthesis", + "step_synthesize_bitfile", + "step_make_driver", + "step_deployment_package", + ], + ) + return cfg diff --git a/src/finn/benchmarking/dut/resnet50.yml b/src/finn/benchmarking/dut/resnet50.yml new file mode 100644 index 0000000000..c8779e5654 --- /dev/null +++ b/src/finn/benchmarking/dut/resnet50.yml @@ -0,0 +1,26 @@ +model_path: models/resnet50/resnet50_w1a2_exported.onnx +folding_config_file: models/resnet50/U250_folding_config_live_fifo.json +specialize_layers_config_file: models/resnet50/U250_specialize_layers.json +vitis_floorplan_file: models/resnet50/floorplan_resnet50.json + +steps: + - finn.builder.custom_step_library.resnet.step_resnet50_tidy # Custom step + - finn.builder.custom_step_library.resnet.step_resnet50_streamline # Custom step + - finn.builder.custom_step_library.resnet.step_resnet50_convert_to_hw # Custom step + - step_create_dataflow_partition + - step_specialize_layers + - step_apply_folding_config + - step_minimize_bit_width + - step_generate_estimate_reports + - step_set_fifo_depths + - step_hw_codegen + - step_hw_ipgen + - step_create_stitched_ip + - step_measure_rtlsim_performance + - step_out_of_context_synthesis + - step_synthesize_bitfile + - step_make_driver + - step_deployment_package + +# folding config comes with FIFO sizes +auto_fifo_depths: False diff --git 
a/src/finn/benchmarking/dut/synthetic_nonlinear.py b/src/finn/benchmarking/dut/synthetic_nonlinear.py new file mode 100644 index 0000000000..ff33436976 --- /dev/null +++ b/src/finn/benchmarking/dut/synthetic_nonlinear.py @@ -0,0 +1,288 @@ +import numpy as np +from onnx import TensorProto, helper +from qonnx.core.datatype import DataType +from qonnx.core.modelwrapper import ModelWrapper +from qonnx.custom_op.general.im2col import compute_conv_output_dim +from qonnx.transformation.general import ( + GiveRandomTensorNames, + GiveReadableTensorNames, + GiveUniqueNodeNames, + GiveUniqueParameterTensors, +) +from qonnx.transformation.infer_data_layouts import InferDataLayouts +from qonnx.transformation.infer_datatypes import InferDataTypes +from qonnx.transformation.infer_shapes import InferShapes +from qonnx.transformation.merge_onnx_models import MergeONNXModels +from qonnx.util.basic import gen_finn_dt_tensor, qonnx_make_model + +import finn.builder.build_dataflow_config as build_cfg +from finn.benchmarking.bench_base import bench + +from finn.util.basic import make_build_dir + + +def generate_random_threshold_values( + data_type, num_input_channels, num_steps, narrow=False, per_tensor=False +): + if per_tensor: + num_input_channels = 1 + if narrow: + num_steps -= 1 + + return np.random.randint( + data_type.min(), + data_type.max() + 1, + (num_input_channels, num_steps), + ).astype(np.float32) + + +def sort_thresholds_increasing(thresholds): + return np.sort(thresholds, axis=1) + + +def make_conv_building_block(ifm_dim, ch, kernel_size, simd, pe, parallel_window=0): + # hardcoded parameters + idt = DataType["UINT4"] + wdt = DataType["UINT4"] + odt = DataType["UINT4"] + tdt = DataType["UINT32"] + stride = 1 + in_ch = out_ch = ch # input channel = output channel for stacking + # pad so that input dim = output dim for stacking (only supports odd kernel_size for now) + pad = int(np.floor(kernel_size / 2)) + + total_pad = 2 * pad + out_feature_dim = 
compute_conv_output_dim(ifm_dim, kernel_size, stride, total_pad) + weights_shape = [in_ch * kernel_size * kernel_size, out_ch] + thresholds_shape = [1, odt.get_num_possible_values() - 1] + input_shape = [1, ifm_dim, ifm_dim, in_ch] + padding_out_shape = [1, ifm_dim + total_pad, ifm_dim + total_pad, in_ch] + inpgen_out_shape = [1, out_feature_dim, out_feature_dim, in_ch * kernel_size * kernel_size] + output_shape = [1, out_feature_dim, out_feature_dim, out_ch] + + assert input_shape == output_shape, "ERROR: Conv layer dimensions not stackable" + + padding_config = {} + padding_config["domain"] = "finn.custom_op.fpgadataflow.rtl" + padding_config["backend"] = "fpgadataflow" + padding_config["ImgDim"] = [ifm_dim, ifm_dim] + padding_config["NumChannels"] = in_ch + padding_config["SIMD"] = simd + padding_config["Padding"] = [pad, pad, pad, pad] + padding_config["inputDataType"] = idt.name + + inpgen_config = {} + inpgen_config["domain"] = "finn.custom_op.fpgadataflow.rtl" + inpgen_config["backend"] = "fpgadataflow" + inpgen_config["ConvKernelDim"] = [kernel_size, kernel_size] + inpgen_config["IFMChannels"] = in_ch + inpgen_config["IFMDim"] = [ifm_dim + total_pad, ifm_dim + total_pad] + inpgen_config["OFMDim"] = [ifm_dim, ifm_dim] + inpgen_config["inputDataType"] = idt.name + inpgen_config["outputDataType"] = idt.name + inpgen_config["SIMD"] = simd + inpgen_config["parallel_window"] = parallel_window + inpgen_config["Stride"] = [stride, stride] + inpgen_config["Dilation"] = [1, 1] + + mvau_config = {} + mvau_config["domain"] = "finn.custom_op.fpgadataflow.hls" + mvau_config["backend"] = "fpgadataflow" + mvau_config["numInputVectors"] = [1, ifm_dim, ifm_dim] + mvau_config["MW"] = in_ch * kernel_size * kernel_size + mvau_config["MH"] = in_ch + mvau_config["SIMD"] = simd if parallel_window == 0 else simd * kernel_size * kernel_size + mvau_config["PE"] = pe + mvau_config["resType"] = "lut" + mvau_config["mem_mode"] = "internal_embedded" # internal_decoupled + 
mvau_config["inputDataType"] = idt.name + mvau_config["weightDataType"] = wdt.name + mvau_config["outputDataType"] = odt.name + + top_in = helper.make_tensor_value_info("top_in", TensorProto.FLOAT, input_shape) + top_out = helper.make_tensor_value_info("top_out", TensorProto.FLOAT, output_shape) + value_info = [ + helper.make_tensor_value_info("weights", TensorProto.FLOAT, weights_shape), + helper.make_tensor_value_info("thresholds", TensorProto.FLOAT, thresholds_shape), + helper.make_tensor_value_info("padding_out", TensorProto.FLOAT, padding_out_shape), + helper.make_tensor_value_info("inpgen_out", TensorProto.FLOAT, inpgen_out_shape), + ] + + modelproto = qonnx_make_model( + helper.make_graph( + name="building_block", + inputs=[top_in], + outputs=[top_out], + value_info=value_info, + nodes=[ + helper.make_node("FMPadding_rtl", ["top_in"], ["padding_out"], **padding_config), + helper.make_node( + "ConvolutionInputGenerator_rtl", + ["padding_out"], + ["inpgen_out"], + **inpgen_config, + ), + helper.make_node( + "MVAU_hls", ["inpgen_out", "weights", "thresholds"], ["top_out"], **mvau_config + ), + ], + ) + ) + + model = ModelWrapper(modelproto) + model.set_tensor_datatype("top_in", idt) + model.set_tensor_layout("top_in", ["N", "H", "W", "C"]) + model.set_tensor_datatype("top_out", odt) + model.set_tensor_datatype("weights", wdt) + model.set_tensor_datatype("thresholds", tdt) + + weights = gen_finn_dt_tensor(wdt, weights_shape) + # TODO: thresholds are all the same + thresholds = generate_random_threshold_values( + tdt, out_ch, odt.get_num_possible_values() - 1, False, True + ) + thresholds = sort_thresholds_increasing(thresholds) + + model.set_initializer("weights", weights) + model.set_initializer("thresholds", thresholds) + + model = model.transform(GiveUniqueNodeNames()) + model = model.transform(InferShapes()) + model = model.transform(InferDataTypes()) + + return model + + +def combine_blocks(lb, rb, ifm_dim, ch, pe): + # assumes left branch (lb) and right 
branch (rb) each have a + # single (dynamic) input/output with the same shape + + # to avoid mix-ups, start by giving all tensors random names + lb = lb.transform(GiveRandomTensorNames()) + rb = rb.transform(GiveRandomTensorNames()) + # erase all node names to avoid conflict + for n in lb.graph.node: + n.name = "" + for n in rb.graph.node: + n.name = "" + + lb_input = lb.graph.input[0] + lb_output = lb.graph.output[0] + rb_input = rb.graph.input[0] + rb_output = rb.graph.output[0] + + top_in = helper.make_tensor_value_info("top_in", TensorProto.FLOAT, [1, ifm_dim, ifm_dim, ch]) + top_out = helper.make_tensor_value_info("top_out", TensorProto.FLOAT, [1, ifm_dim, ifm_dim, ch]) + + dup_config = {} + dup_config["domain"] = "finn.custom_op.fpgadataflow.hls" + dup_config["backend"] = "fpgadataflow" + dup_config["numInputVectors"] = [1, ifm_dim, ifm_dim] + dup_config["NumChannels"] = ch + dup_config["PE"] = pe + dup_config["NumOutputStreams"] = 2 + dup_config["inputDataType"] = lb.get_tensor_datatype(lb_input.name).name + # We always need to set outFIFODepths explictly for DuplicateStreams + # because it has no default value that corresponds automatically to NumOutputStreams + dup_config["outFIFODepths"] = [2] * 2 + + add_config = {} + add_config["domain"] = "finn.custom_op.fpgadataflow.hls" + add_config["backend"] = "fpgadataflow" + add_config["numInputVectors"] = [1, ifm_dim, ifm_dim] + add_config["NumChannels"] = ch + add_config["PE"] = pe + add_config["inputDataType"] = lb.get_tensor_datatype(lb_output.name).name + + nodes_lb = [node for node in lb.graph.node] + nodes_rb = [node for node in rb.graph.node] + nodes_new = ( + nodes_lb + + nodes_rb + + [ + helper.make_node( + "DuplicateStreams_hls", ["top_in"], [lb_input.name, rb_input.name], **dup_config + ), + helper.make_node( + "AddStreams_hls", [lb_output.name, rb_output.name], ["top_out"], **add_config + ), + ] + ) + + value_info_lb = [x for x in lb.graph.value_info] + value_info_rb = [x for x in 
rb.graph.value_info] + value_info_new = value_info_lb + value_info_rb + [lb_input, lb_output, rb_input, rb_output] + + initializer_lb = [x for x in lb.graph.initializer] + initializer_rb = [x for x in rb.graph.initializer] + initializer_new = initializer_lb + initializer_rb + modelproto = qonnx_make_model( + helper.make_graph( + name="branching_model", + inputs=[top_in], + outputs=[top_out], + value_info=value_info_new, + nodes=nodes_new, + ) + ) + + model = ModelWrapper(modelproto) + model.set_tensor_datatype("top_in", lb.get_tensor_datatype(lb_input.name)) + model.set_tensor_layout("top_in", lb.get_tensor_layout(lb_input.name)) + for i in initializer_new: + model.graph.initializer.append(i) + + # tidy-up + model = model.transform(InferShapes()) + model = model.transform(InferDataTypes()) + model = model.transform(InferDataLayouts()) + model = model.transform(GiveUniqueNodeNames()) + model = model.transform(GiveUniqueParameterTensors()) + model = model.transform(GiveReadableTensorNames()) + return model + + +class bench_synthetic_nonlinear(bench): + def step_export_onnx(self, onnx_export_path): + np.random.seed(0) + tmp_output_dir = make_build_dir("test_fifosizing") + + # TODO: allow manual folding/fifo config as input + # TODO: how to determine rtlsim_n automatically? 
+ + # conv parameters + dim = self.params["dim"] + kernel_size = self.params["kernel_size"] + ch = self.params["ch"] + simd = self.params["simd"] + pe = self.params["pe"] + parallel_window = self.params["parallel_window"] + + lb = None + for i in range(self.params["lb_num_layers"]): + new_block = make_conv_building_block( + dim, ch, kernel_size=kernel_size, simd=simd, pe=pe, parallel_window=parallel_window + ) + lb = new_block if lb is None else lb.transform(MergeONNXModels(new_block)) + lb.save(tmp_output_dir + "/lb.onnx") + + rb = None + for i in range(self.params["rb_num_layers"]): + new_block = make_conv_building_block( + dim, ch, kernel_size=kernel_size, simd=simd, pe=pe, parallel_window=parallel_window + ) + rb = new_block if rb is None else rb.transform(MergeONNXModels(new_block)) + rb.save(tmp_output_dir + "/rb.onnx") + + model = combine_blocks(lb, rb, dim, ch, pe=4) + model.save(onnx_export_path) + + def step_build_setup(self): + # create build config for synthetic test models + + cfg = build_cfg.DataflowBuildConfig( + # manual folding + target_fps=None, + ) + + return cfg diff --git a/src/finn/benchmarking/dut/transformer.py b/src/finn/benchmarking/dut/transformer.py new file mode 100644 index 0000000000..83002ef418 --- /dev/null +++ b/src/finn/benchmarking/dut/transformer.py @@ -0,0 +1,974 @@ +# Adapted from Christoph's attention-dummy repository + +# PyTorch base package: Math and Tensor Stuff +import json +import numpy as np +import random +import torch +from brevitas.export import export_qonnx + +# Brevitas: Quantized versions of PyTorch layers +from brevitas.nn import ( + QuantEltwiseAdd, + QuantIdentity, + QuantLinear, + QuantMultiheadAttention, + QuantReLU, +) + +# Brevitas wrapper around PyTorch tensors adding quantization information +from brevitas.quant_tensor import QuantTensor +from qonnx.core.modelwrapper import ModelWrapper + +# Range information structure for seeding the range analysis for converting +# quantized activations to 
MultiThreshold +from qonnx.util.range_analysis import RangeInfo + +# Progressbar +from tqdm import trange + +# FINN dataflow builder +import finn.builder.build_dataflow_config as build_cfg +from finn.benchmarking.bench_base import bench + +# Custom build steps required to streamline and convert the attention operator +from finn.builder.custom_step_library.transformer import ( + node_by_node_cppsim, + prepare_graph, + set_fifo_depths, + set_target_parallelization, + step_apply_folding_config, + step_convert_attention_to_hw, + step_convert_depth_wise_to_hw, + step_convert_elementwise_binary_to_hw, + step_convert_lookup_to_hw, + step_convert_split_concat_to_hw, + step_replicate_streams, + step_streamline, +) + + +# ADAPTED FROM utils.py +# Seeds all relevant random number generators to the same seed for +# reproducibility +def seed(s): + random.seed(s) + np.random.seed(s) + torch.manual_seed(s) + + +# ADAPTED FROM model.py +# Derives a weight quantizer from the brevitas bases leaving bit-width and +# signedness configurable +def weight_quantizer(bits, _signed=True): + # Brevitas quantizer base classes + from brevitas.inject.enum import RestrictValueType + from brevitas.quant.base import MaxStatsScaling, NarrowIntQuant + from brevitas.quant.solver import WeightQuantSolver + + # Derive a Quantizer from the brevitas bases + class Quantizer(NarrowIntQuant, MaxStatsScaling, WeightQuantSolver): + # Configure the quantization bit-width + bit_width = bits + # Signedness of the quantization output + signed = _signed + # Per tensor quantization, not per channel + scaling_per_output_channel = False + # What is this? Copied from PerTensorFloatScaling* + # Probably restricts the scale to be floating-point? 
+ restrict_scaling_type = RestrictValueType.FP + + # Return the derived quantizer configuration + return Quantizer + + +# Derives a bias quantizer from the brevitas bases leaving bit-width and +# signedness configurable +def bias_quantizer(bits, _signed=True): + # Brevitas quantizer base classes + from brevitas.quant import IntBias + + # Derive a Quantizer from the brevitas bases + class Quantizer(IntBias): + # Configure the quantization bit-width + bit_width = bits + # Signedness of the quantization output + signed = _signed + # Do not require the bit-width to be adjusted to fit the accumulator to + # which the bias is added + requires_input_bit_width = False + + # Return the derived quantizer configuration + return Quantizer + + +# Derives an activation quantizer from the brevitas bases leaving bit-width and +# signedness configurable +def act_quantizer(bits, _signed=True): + # Brevitas quantizer base classes + from brevitas.inject.enum import RestrictValueType + from brevitas.quant.base import IntQuant, ParamFromRuntimePercentileScaling + from brevitas.quant.solver import ActQuantSolver + + # Derive a Quantizer from the brevitas bases + class Quantizer(IntQuant, ParamFromRuntimePercentileScaling, ActQuantSolver): + # Configure the quantization bit-width + bit_width = bits + # Signedness of the quantization output + signed = _signed + # Per tensor quantization, not per channel + scaling_per_output_channel = False + # What is this? Copied from PerTensorFloatScaling* + # Probably restricts the scale to be floating-point? 
+ restrict_scaling_type = RestrictValueType.FP + + # Return the derived quantizer configuration + return Quantizer + + +# Gets the normalization layer from configuration key +def get_norm(key, normalized_shape): + # Transposes Sequence and Embedding dimensions + class Transpose(torch.nn.Module): + # Forward pass transposing the feature map + def forward(self, x): # noqa: May be static + # Transpose the last two dimensions of batch x seq x emb layout + return torch.transpose(x, dim0=-1, dim1=-2) + + # Dictionary mapping keys to supported normalization layer implementations + norms = { + # PyTorch default layer normalization. Needs to know the shape of the + # feature map to be normalized + "layer-norm": torch.nn.LayerNorm( + # Note: Disable affine parameters as potential negative scale causes + # streamlining issues later + normalized_shape=normalized_shape, + elementwise_affine=False, + ), + # PyTorch default 1-dimensional batch normalization. Needs to transpose + # embedding and sequence dimension to normalized over the embedding + # dimension, which is expected to be second. + "batch-norm": torch.nn.Sequential( + # Note: Disable affine parameters as potential negative scale causes + # streamlining issues later + Transpose(), + torch.nn.LazyBatchNorm1d(affine=False), + Transpose(), + ), + # No normalization by a PyTorch built-in identity layer. Should not + # appear in the graph. + "none": torch.nn.Identity(), + } + + # Select the normalization layer by key + return norms[key] + + +# Gets the attention mask from configuration key and shape +def get_mask(key, length): + # Dictionary mapping keys to supported normalization layer implementations + masks = { + # No attention mask + "none": None, + # Generate the upper triangular mask for causal attention + "causal": torch.nn.Transformer.generate_square_subsequent_mask(length), + # Square matrix with entries randomly set to -inf or 0.0 with 50% + # probability each + "random": torch.where( # noqa: Confused by types? 
+ torch.rand(length, length) > 0.5, -torch.inf, 0.0 + ), + } + # Select the mask type by key + return masks[key] + + +# Single-layer scaled dot-product attention block with MLP and normalization +class TransformerBlock(torch.nn.Module): + # Initializes the model and registers the module parameters + def __init__(self, num_heads, emb_dim, mlp_dim, seq_len, bias, norm, mask, bits): + # Initialize the PyTorch Module superclass + super().__init__() + + # Input quantizer to the scaled dot-product attention operations, shared + # by queries, keys and values inputs. It is important to have this + # quantizer separate and not preceding the fork node of the residual + # branches to avoid consecutive quantizers in the skip branch. + # Note: For some reason it seems not to be possible to use the + # in_proj_input_quant of the attention operator + self.sdp_input_quant = QuantIdentity( + # Quantize at the output + act_quant=act_quantizer(bits, _signed=True), + # Pass quantization information on to the next layer. + return_quant_tensor=True, + ) + # Quantized scaled dot-product attention operator + self.sdp = QuantMultiheadAttention( + # Size of the embedding dimension (input and output) + embed_dim=emb_dim, + # Number of attention heads + num_heads=num_heads, + # Enable a bias added to the input and output projections + bias=bias, + # Layout of the inputs: + # Batch x Sequence x Embedding (batch-first, True) + # Sequence x Batch x Embedding (batch-second, False) + batch_first=True, + # If query, key and value input are the same, packed input + # projections use a single, large linear projection to produce + # the actual query, key and value inputs. Otherwise, use + # separate linear projections on each individual input. + packed_in_proj=False, + # Brevitas has this as an unsigned quantizer by default, but + # finn can only handle signed quantizer + attn_output_weights_quant=act_quantizer(bits, _signed=True), + # Insert an additional quantizer in front ot the softmax. 
In our + # finn custom-op, this will be matched to the quantizer + # following the query and key matmul. + # Note: Disable to prevent the quantizer from tripping over -inf + # from the attention mask + softmax_input_quant=None, + # Quantize the input projections weights as configured + in_proj_weight_quant=weight_quantizer(bits, _signed=True), + # Quantize the bias of the input projections as configured + in_proj_bias_quant=bias_quantizer(bits, _signed=True), + # No quantization in front of the input projections as this is + # either done by a standalone quantizer preceding the whole block + in_proj_input_quant=None, + # Quantize the output projections weights as configured + out_proj_weight_quant=weight_quantizer(bits, _signed=True), + # Quantize the bias of the output projections as configured + out_proj_bias_quant=bias_quantizer(bits, _signed=True), + # Quantize the input to the output projection as configured + out_proj_input_quant=act_quantizer(bits, _signed=True), + # Quantizer the key after projections as configured + k_transposed_quant=act_quantizer(bits, _signed=True), + # Quantize the queries after projections as configured + q_scaled_quant=act_quantizer(bits, _signed=True), + # Quantize the values after projection as configured + v_quant=act_quantizer(bits, _signed=True), + # No output quantization for now, as stacking multiple layers + # results in multiple multi-thresholds in succession + out_proj_output_quant=None, + # Return the quantization parameters so the next layer can + # quantize the bias + return_quant_tensor=True, + ) + # Residual branch addition skipping over the attention layer + self.residual_sdp = QuantEltwiseAdd( + # Shared input activation quantizer such that the scales at both + # input branches are identical. This allows floating point scale + # factor to be streamlined past the add-node. + input_quant=act_quantizer(bits, _signed=True), + # Disable the output quantizer after the add operation. 
Output of + # the add will have one more bit than the inputs, which is probably + # fine and does not require re-quantization. + output_quant=None, + # Pass quantization information on to the next layer. + return_quant_tensor=True, + ) + # Normalization following the attention layer + self.norm_sdp = torch.nn.Sequential( + # Select the normalization layer implementation + get_norm(key=norm, normalized_shape=emb_dim), + # No quantizer to avoid consecutive quantizer in the MLP residual + # branch. See input quantizer in front of the first MLP layer. + ) + + # Quantized MLP following the scaled dot-product attention + self.mlp = torch.nn.Sequential( + # Quantize the inputs to the MLP block. Placed here to not have this + # at the input of the residual branch. + QuantIdentity( + # Quantize at the output + act_quant=act_quantizer(bits, _signed=True), + # Pass quantization information on to the next layer. + return_quant_tensor=True, + ), + # First mlp layer projecting to the mlp dimension + QuantLinear( + # Inputs have the size of the attention embedding dimension + emb_dim, + # Project to the configured mlp dimension, which is typically + # larger than the embedding dimension + mlp_dim, + # Enable the learned bias vector + bias=bias, + # Quantize weights to the same representation as all other + # layers + weight_quant=weight_quantizer(bits, _signed=True), + # Quantize the bias to the same representation as all other + # layers + bias_quant=bias_quantizer(bits, _signed=True), + # No input quantizer as this is directly preceded by a + # standalone quantizer + input_quant=None, + # Not output quantizer as this is directly followed by a + # quantized ReLU activation taking care of quantization + output_quant=None, + # Return the quantization parameters so the next layer can + # quantize the bias + return_quant_tensor=True, + ), + # Use the ReLU activation function instead of the more commonly used + # GELU, as the latter is not mapped easily to hardware with FINN + 
QuantReLU( + # Note: ReLU must be quantized to unsigned representation + act_quant=act_quantizer(bits, _signed=False), + # Return the quantization parameters so the next layer can + # quantize the bias + return_quant_tensor=True, + ), + # Second mlp layer projecting back to the embedding dimension + QuantLinear( + # Inputs have the configured mlp dimension, which is typically + # larger than the embedding dimension + mlp_dim, + # Project back to the size of the attention embedding dimension + emb_dim, + # Enable the learned bias vector + bias=bias, + # Quantize weights to the same representation as all other + # layers + weight_quant=weight_quantizer(bits, _signed=True), + # Quantize the bias to the same representation as all other + # layers + bias_quant=bias_quantizer(bits, _signed=True), + # No input quantizer as the inputs are already quantized by the + # preceding ReLU layer + input_quant=None, + # Not output quantizer as this is directly followed by a + # quantized element-wise addition taking care of quantization + output_quant=None, + # Pass quantization information on to the next layer. + return_quant_tensor=True, + ), + ) + # Residual branch addition skipping over the MLP layer + self.residual_mlp = QuantEltwiseAdd( + # Shared input activation quantizer such that the scales at both + # input branches are identical. This allows floating point scale + # factor to be streamlined past the add-node. + input_quant=act_quantizer(bits, _signed=True), + # Disable the output quantizer after the add operation. Output of + # the add will have one more bit than the inputs, which is probably + # fine and does not require re-quantization. + output_quant=None, + # Pass quantization information on to the next layer. + # Note: Not for the last layer to allow this to be combined with + # standard pytorch calls like .detach() or .numpy(), which are + # not directly available on QuantTensor. 
+ return_quant_tensor=True, + ) + # Normalization following the attention layer + self.norm_mlp = torch.nn.Sequential( + # Select the normalization layer implementation + get_norm(key=norm, normalized_shape=emb_dim), + # No quantizer to avoid consecutive quantizer in the SDP residual + # branch + ) + # Generate the attention mask according to configuration + self.mask = get_mask(mask, seq_len) + + # Forward pass through the transformer block + def forward(self, x): + # Move the mask to the same device as the input, just in case... + mask = self.mask.to(x.device) if self.mask is not None else None + # Quantize the input to the attention block + q = self.sdp_input_quant(x) + # Scaled dot-product attention with residual branch and normalization + x = self.norm_sdp(self.residual_sdp(x, self.sdp(q, q, q, attn_mask=mask)[0])) + # MLP layer with residual branch and normalization + return self.norm_mlp(self.residual_mlp(x, self.mlp(x))) + + +# Quantized sinusoidal positional encoding layer +class QuantSinusoidalPositionalEncoding(torch.nn.Module): + # Initializes the model and registers the module parameters + def __init__(self, input_quant, output_quant, return_quant_tensor): + # Initialize the PyTorch Module superclass + super().__init__() + # Adds the quantized input and positional encoding + self.add = QuantEltwiseAdd( + # Input quantization to be applied to the input as well as the + # positional encodings + input_quant=input_quant, + # Quantize the outputs after adding input and positional encoding + output_quant=output_quant, + # Returns quantization information to the next layer + return_quant_tensor=return_quant_tensor, + ) + + # Forward pass adding positional encoding to the input tensor + def forward(self, x): + # Get the size of the inputs to dynamically generate encodings of the + # same size + _, seq, emb = x.shape + # Start by enumerating all steps of the sequence + i = torch.as_tensor([[n] for n in range(seq)]) + # Scale factor adjusting the 
frequency/wavelength of the sinusoid + # depending on the embedding dimension index + f = torch.as_tensor([1e4 ** -(i / emb) for i in range(0, emb, 2)]) + # Prepare empty positional encoding tensor of the same size as the input + pos = torch.empty(seq, emb) + # Fill the positional encoding with alternating sine and cosine waves + pos[:, 0::2] = torch.sin(f * i) + pos[:, 1::2] = torch.cos(f * i) + # Move the encoding tensor to the same device as the input tensor + pos = pos.to(x.device, dtype=x.dtype) + # Add the quantized encoding to the quantized input + return self.add(x, pos) + + +# Quantized learned positional encoding layer +class QuantLearnedPositionalEncoding(torch.nn.Module): + # Initializes the model and registers the module parameters + def __init__(self, seq_len, emb_dim, input_quant, output_quant, return_quant_tensor): + # Initialize the PyTorch Module superclass + super().__init__() + # Adds the quantized input and positional encoding + self.add = QuantEltwiseAdd( + # Input quantization to be applied to the input as well as the + # positional encodings + input_quant=input_quant, + # Quantize the outputs after adding input and positional encoding + output_quant=output_quant, + # Returns quantization information to the next layer + return_quant_tensor=return_quant_tensor, + ) + # Register a parameter tensor representing the not quantized positional + # encoding + self.pos = torch.nn.Parameter(torch.empty(seq_len, emb_dim)) + # Reset/Initialize the parameter tensor + self.reset_parameters() + + # Resets/Initializes the positional encoding parameter tensor + def reset_parameters(self): + # Initialize the positional encoding from a normal distribution with + # zero mean and unit standard deviation + torch.nn.init.normal_(self.pos, mean=0, std=1) + + # Forward pass adding positional encoding to the input tensor + def forward(self, x): + # Add the quantized encoding to the quantized input + return self.add(x, self.pos) + + +# Lazy version of the learned 
encoding not requiring input dimensions at +# initialization, inferring these at the first forward pass +class LazyQuantLearnedPositionalEncoding( + torch.nn.modules.lazy.LazyModuleMixin, QuantLearnedPositionalEncoding # noqa +): + # Once initialized, this will become a QuantLearnedPositionalEncoding as + # defined above + cls_to_become = QuantLearnedPositionalEncoding + # Parameter tensor of the QuantLearnedPositionalEncoding is uninitialized + pos: torch.nn.UninitializedParameter + + # Initializes the model and registers the module parameters + def __init__(self, input_quant, output_quant, return_quant_tensor): + # Initialize the quantizer parts of QuantLearnedPositionalEncoding, + # leaving the dimensions empty + super().__init__(0, 0, input_quant, output_quant, return_quant_tensor) + # Register an uninitialized parameter tensor for the positional encoding + self.pos = torch.nn.UninitializedParameter() + + # Resets/Initializes the positional encoding parameter tensor + def reset_parameters(self): + # If this has already been initialized, delegate to the actual + # implementation + if not self.has_uninitialized_params(): + super().reset_parameters() + + # Initializes/Materializes the uninitialized parameter tensor given some + # sample input tensor to infer the dimensions + def initialize_parameters(self, x): + # Only materialize the parameter tensor if it is not yet initialized + if self.has_uninitialized_params(): + # Do not accumulate gradient information from initialization + with torch.no_grad(): + # Get the size of the inputs to generate encodings of the same + # size + _, seq, emb = x.shape + # Materialize the positional encoding parameter tensor + self.pos.materialize((seq, emb)) + # Properly initialize the parameters by resetting the values + self.reset_parameters() + + +# Quantized binary positional encoding layer +class QuantBinaryPositionalEncoding(torch.nn.Module): + # Initializes the model and registers the module parameters + def __init__(self, 
input_quant, output_quant, return_quant_tensor): + # Initialize the PyTorch Module superclass + super().__init__() + # Adds the quantized input and positional encoding + self.add = QuantEltwiseAdd( + # Input quantization to be applied to the input as well as the + # positional encodings + input_quant=input_quant, + # Quantize the outputs after adding input and positional encoding + output_quant=output_quant, + # Returns quantization information to the next layer + return_quant_tensor=return_quant_tensor, + ) + + # Forward pass adding positional encoding to the input tensor + def forward(self, x): + # Get the size of the inputs to dynamically generate encodings of the + # same size + _, seq, emb = x.shape + # Binary positional encoding fills the embedding dimension with the bit + # pattern corresponding to the position in the sequence + pos = torch.as_tensor([[(n & (1 << bit)) >> bit for bit in range(emb)] for n in range(seq)]) + # Move the encoding tensor to the same device as the input tensor + pos = pos.to(x.device, dtype=x.dtype) + # Add the quantized encoding tp the quantized input + # Note: Convert encoding to bipolar representation + return self.add(x, 2 * pos - 1) + + +# Gets the positional encoding layer from configuration key, quantizers and +# shape +def get_positional_encoding(key, input_quant, output_quant, return_quant_tensor): + # Dictionary mapping keys to supported normalization layer implementations + masks = { + # No positional encoding + "none": QuantIdentity(act_quant=input_quant, return_quant_tensor=return_quant_tensor), + # Fixed, sinusoidal positional encoding according to Vaswani et al. 
with + # added quantizers + "sinusoidal": QuantSinusoidalPositionalEncoding( + input_quant, output_quant, return_quant_tensor + ), + # Fixed, binary positional encoding with quantizers + "binary": QuantBinaryPositionalEncoding(input_quant, output_quant, return_quant_tensor), + # Learned positional encoding with quantizers + "learned": LazyQuantLearnedPositionalEncoding( + input_quant, output_quant, return_quant_tensor + ), + } + # Select the positional encoding type by key + return masks[key] + + +# Unpacks the standard PyTorch tensor from a brevitas QuantTensor +def unpack_from_quant(tensor: torch.Tensor | QuantTensor): + # If this is a QuantTensor we can extract the wrapped tensor + if isinstance(tensor, QuantTensor): + # The underlying tensor is wrapped as the value attribute + return tensor.value + # Assume this is already a plain PyTorch tensor + return tensor + + +# Dummy transformer encoder model +class DummyTransformer(torch.nn.Module): + # Initializes the model and registers the module parameters + def __init__( + self, + # Number of layers of attention blocks + num_layers, + # Number of attention heads per block + num_heads, + # Size of embedding dimension going into/out of the attention block + emb_dim, + # Size of MLP dimension in each attention block + mlp_dim, + # Length of the input sequence, i.e., context size + seq_len, + # Enables bias term added to Linear layers + bias, + # Quantization bit-width: For now all layers are quantized to the + # same bit-width + bits, + # Type of normalization layer to use in the transformer blocks + # Options are: layer-norm, batch-norm and none + norm="none", + # Type of attention mask to use + # Options are: none, causal or const + mask="none", + # Type of positional encoding to use at the input + # Options are: none, sinusoidal, binary, learned + positional_encoding="none", + ): + # Initialize the PyTorch Module superclass + super().__init__() + + # Positional encoding layer at the input + self.pos = 
get_positional_encoding( + # Select the implementation by configuration key + key=positional_encoding, + # Quantize the inputs to the positional encoding to the same + # bit-width as the input + input_quant=act_quantizer(bits, _signed=True), + # Quantize the sum of input and positional encoding to the same + # bit-width as the input + output_quant=None, + # Pass quantization information on to the next layer + return_quant_tensor=True, + ) + + # Sequence of num_layers transformer encoder blocks + self.encoder = torch.nn.Sequential( + *[ + TransformerBlock(num_heads, emb_dim, mlp_dim, seq_len, bias, norm, mask, bits) + for _ in range(num_layers) + ] + ) + + # Model forward pass taking an input sequence and returning a single set of + # class probabilities + def forward(self, x): + # Add positional encoding to the input and feed through the encoder + # stack + # Note: Get the wrapped value out of the QuantTensor to have only a + # single output from the model. + return unpack_from_quant(self.encoder(self.pos(x))) + + +# ADAPTED FROM export.py + + +# Check whether a layer is a normalization layer of some supported type +def is_norm_layer(module): + # Set of normalization layer (bases) which maybe need to be patched + norm_layers = { + # All BatchNorm and InstanceNorm variants derive from this baseclass + torch.nn.modules.batchnorm._NormBase, # noqa: Access to _NormBase + # LayerNorm has a unique implementation + torch.nn.LayerNorm, + } + # Check the module against all supported norm layer types + return any(isinstance(module, norm) for norm in norm_layers) + + +# Fixes export issues of normalization layers with disabled affine parameters. +# Somehow the export to ONNX trips when it encounters the weight and bias tensor +# to be 'None'. 
+def patch_non_affine_norms(model: torch.nn.Module): # noqa: Shadows model + # Iterate all modules in the model container + for name, module in model.named_modules(): + # If the module is a normalization layer it might require patching the + # affine parameters + if is_norm_layer(module): + # Check whether affine scale parameters are missing + if hasattr(module, "weight") and module.weight is None: + # There need to be running statistics to patch the scales + if hasattr(module, "running_var"): + # Patch the affine bias by all 1 tensor of the same shape, + # type and device as the running variance + module.weight = torch.nn.Parameter(torch.ones_like(module.running_var)) + # Check whether affine bias parameters are missing + if hasattr(module, "bias") and module.bias is None: + # There need to be running statistics to patch the scales + if hasattr(module, "running_mean"): + # Patch the affine bias by all 0 tensor of the same shape, + # type and device as the running mean + module.bias = torch.nn.Parameter(torch.zeros_like(module.running_var)) + # Return the patched model container + return model + + +template_folding_yaml = """ +# Per operator type default configurations +defaults: + # Scaled dot-product attention head implemented via HLS + ScaledDotProductAttention_hls: + # Type of memory to be used for internal buffer storage + # Options: auto, block, distributed, ultra + ram_style: block + # Type of memory to be used for threshold storage + # Options: auto, block, distributed + ram_style_thresholds: block + # Type of memory to be used fo the attention mask (if present) + # Options: auto, block, distributed + ram_style_mask: block + # Resource type to be used for implementing multiplications/MACs + # Options: auto, lut or dsp + mac_resource: lut + # Addition of two inputs (constants or streamed) implemented via HLS + ElementwiseAdd_hls: + # Type of memory to be used for internal buffer storage and/or constant + # parameter tensors + # Options: auto, block, 
distributed, ultra + ram_style: distributed + # Matrix vector activation unit implemented via HLS + MVAU_hls: + # Resource type to be used for implementing multiplications/MACs + # Options: auto, lut or dsp + resType: dsp + # Memory mode for weight storage + # Options: internal_embedded, internal_decoupled, external + mem_mode: internal_decoupled + # Type of memory to be used for weight storage if "internal_decoupled" + # Options: auto, block, distributed, ultra + ram_style: block + # Type of memory to be used for threshold storage + # Options: auto, block, distributed + ram_style_thresholds: block + # Makes weights writeable through AXI-lite interface at runtime + runtime_writeable_weights: 0 + # Matrix vector activation unit implemented via RTL + MVAU_rtl: + # Resource type to be used for implementing multiplications/MACs + # Options: auto, lut or dsp + # Note: RTL MVAU currently does not support LUT-based implementation + resType: dsp + # Memory mode for weight storage + # Options: internal_embedded, internal_decoupled, external + mem_mode: internal_decoupled + # Type of memory to be used for weight storage if "internal_decoupled" + # Options: auto, block, distributed, ultra + ram_style: block + # Makes weights writeable through AXI-lite interface at runtime + runtime_writeable_weights: 0 + # Multi-thresholds implemented via HLS (applies to standalone thresholds) + Thresholding_hls: + # Memory mode for threshold storage + # Options: internal_embedded, internal_decoupled + mem_mode: internal_decoupled + # Type of memory to be used for threshold storage if "internal_decoupled" + # Options: distributed, block + ram_style: distributed + # Makes thresholds writeable through AXI-lite interface at runtime + runtime_writeable_weights: 0 + # Multi-thresholds implemented via RTL (applies to standalone thresholds) + Thresholding_rtl: + # Decides to use BRAM, URAM or LUTs for threshold memory, depending on the + # depth of the thresholds + # Note: This combination forces 
"distributed" LUT implementation + depth_trigger_uram: 2147483647 # "infinity" + depth_trigger_bram: 2147483647 # "infinity" + # # Note: This combination forces "block" RAM implementation + # depth_trigger_uram: 0 + # depth_trigger_bram: 1 + # # Note: This combination forces "ultra" RAM implementation + # depth_trigger_uram: 1 + # depth_trigger_bram: 0 + # # Note: This combination is equivalent to "auto" + # depth_trigger_uram: 0 + # depth_trigger_bram: 0 + # Makes thresholds writeable through AXI-lite interface at runtime + runtime_writeable_weights: 0 + # FIFO implemented via RTL (there is no HLS FIFO implementation in FINN) + StreamingFIFO_rtl: + # RTL vs. IPI implementation of FIFOs + # Options: rtl, vivado + impl_style: rtl + # Resource type for FIFOs when impl_style is vivado + # Options: auto, block, distributed, ultra + ram_style: distributed + # Individual, named node-specific configurations here + # ... +""" + + +class bench_transformer(bench): + def step_export_onnx(self, output_onnx_path): + # Generates a dummy transformer block, + # not used for actual models (RadioML, GPT, etc.) 
+ + # Load the parameters file + # params = dvc.api.params_show("params.yaml") + # Seed all RNGs + seed(self.params["seed"]) + # Make PyTorch behave deterministically if possible + torch.use_deterministic_algorithms(mode=True, warn_only=True) + # Create a model instance from the configuration parameters + # model = DummyTransformer(**params["model"]) + model = DummyTransformer( + num_layers=self.params["model_num_layers"], + num_heads=self.params["model_num_heads"], + emb_dim=self.params["model_emb_dim"], + mlp_dim=self.params["model_mlp_dim"], + seq_len=self.params["model_seq_len"], + bias=self.params["model_bias"], + bits=self.params["model_bits"], + norm=self.params["model_norm"], + mask=self.params["model_mask"], + positional_encoding=self.params["model_positional_encoding"], + ) + + # Get the configured sequence length and embedding dimension to generate + # test inputs + seq, dim = self.params["model_seq_len"], self.params["model_emb_dim"] + # No gradient accumulation for calibration passes required + with torch.no_grad(): + # Check whether GPU training is available and select the appropriate + # device + device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + # Move the model to the training device + model = model.to(device) + # Multiple passes of calibration might be necessary for larger/deep + # models + for _ in trange(0, self.params["calibration_passes"], desc="calibrating"): + # Pass random data through the model to "calibrate" dummy quantizer. + # Large batch to have more calibration samples. Otherwise, there is + # too much deviation between this calibration and the verification + # samples. 
+ model(torch.rand(128, seq, dim, device=device)) + # Move the model back to the CPU + model = model.cpu() + # Prevent export issue for missing affine normalization parameters + model = patch_non_affine_norms(model) + # Switch model to evaluation mode to have it fixed for export + model = model.eval() + # Sample random input tensor in batch-first layout + x = torch.rand(1, seq, dim) + # Compute attention output + o = model(x) + # Save the input and output data for verification purposes later + np.save("inp.npy", x.detach().numpy()) + np.save("out.npy", o.detach().numpy()) + self.build_inputs["input_npy_path"] = "inp.npy" + self.build_inputs["output_npy_path"] = "out.npy" + # Export the model graph to QONNX + # export_qonnx(model, (x,), "attention.onnx", **self.params["export"]) + export_qonnx(model, (x,), output_onnx_path, opset_version=14, do_constant_folding=True) + + def step_build_setup(self): + # with open("params.yaml") as file: + # params = yaml.safe_load(file) + # Seed all RNGs + seed(self.params["seed"]) + # Extract sequence length and embedding dimension from parameters + if "model_seq_len" in self.params and "model_emb_dim" in self.params: + # for dummy Transformer DUT + seq_len, emb_dim = self.params["model_seq_len"], self.params["model_emb_dim"] + else: + # for real input models + inp_shape = np.load(self.build_inputs["input_npy_path"]).shape + if len(inp_shape) == 3: + # for RadioML Transformers + _, seq_len, emb_dim = inp_shape + else: + # for GPTs (why is this different?) + model = ModelWrapper(self.build_inputs["onnx_path"]) + _, seq_len, emb_dim = model.get_tensor_shape( + "/emb_add/input_quant/export_handler/Quant_output_0" + ) + + # Read the input value range information for the dataset from the parameters + # Note: Consider calibrating this on the fly from the dataset + value_range = [-100, +100] # params["build"]["range"] # TODO: make configurable? 
+ input_range = tuple(np.array([value_range]).T) + # Construct the seed range information of the input tensor + range_info = RangeInfo(shape=(1, seq_len, emb_dim), range=input_range) + + # Prepare config files + # TODO: make configurable + # TODO: log intermediate files such as inp.npy, folding.yaml, + # or specialize_layers.jon as artifacts, maybe create in unique temp dirs + specialize_layers_dict = { + "Defaults": {"preferred_impl_style": ["rtl", ["MVAU", "Thresholding"]]}, + "": {"preferred_impl_style": ""}, + } + with open("specialize_layers.json", "w") as f: + json.dump(specialize_layers_dict, f, indent=2) + with open("folding.yaml", "w") as f: + f.write(template_folding_yaml) + + # Create a configuration for building the scaled dot-product attention + # operator to a hardware accelerator + cfg = build_cfg.DataflowBuildConfig( + folding_config_file="folding.yaml", + specialize_layers_config_file="specialize_layers.json", + standalone_thresholds=True, + max_multithreshold_bit_width=16, + mvau_wwidth_max=2048, + verify_steps=[ + # Verify the model after converting to the FINN onnx dialect + build_cfg.VerificationStepType.QONNX_TO_FINN_PYTHON, + # Verify the model again using python mode after the default + # streamlining step + build_cfg.VerificationStepType.STREAMLINED_PYTHON, + # Verify the model again after tidy up transformations, right before + # converting to HLS + build_cfg.VerificationStepType.TIDY_UP_PYTHON, + # Verify the model after generating C++ HLS and applying folding + # only inserted if live FIFO-sizing is off: + # build_cfg.VerificationStepType.FOLDED_HLS_CPPSIM, + # No RTL Simulation support for now + ], + # File with test inputs for verification + verify_input_npy=self.build_inputs["input_npy_path"], + # File with expected test outputs for verification + verify_expected_output_npy=self.build_inputs["output_npy_path"], + # Build steps to execute + steps=[ + # Prepares the QONNX graph to be consumed by FINN: Cleanup, lowering + # and Quant to 
MultiThreshold conversion + prepare_graph(range_info=range_info), + # Unified exhaustive streamlining of complex model topologies + # including attention, residuals and splits + step_streamline, + # conversion of the scaled dot-product attention pattern to + # hardware, including cleanup and data layout squeezing + step_convert_attention_to_hw, + # Convert the elementwise binary operations to hardware operators. + # These include for example adding residual branches and positional + # encoding + step_convert_elementwise_binary_to_hw, + # Convert Lookup layers, e.g., token embedding, to hardware custom + # operators + step_convert_lookup_to_hw, + # Convert Split and Concat operators to hardware, e.g., splits + # contained in the GLU activation + step_convert_split_concat_to_hw, + # Convert depth-wise convolution MatMuls to VVUs + step_convert_depth_wise_to_hw, + # Properly replicate the stream feeding the query, key and value + # projections + step_replicate_streams, + # Convert most other layers supported by FINN to HW operators + "step_convert_to_hw", + # Specialize HW layer implementations as either HLS or RTL + "step_specialize_layers", + "step_create_dataflow_partition", + # Set the folding configuration to meet the cycles per sequence + # target + set_target_parallelization(seq_len, emb_dim), + # Apply folding configuration, specifying hardware implementation + # details + # Note: This triggers a verification step + step_apply_folding_config, + "step_minimize_bit_width", + # The ScaledDotProductAttention custom op does not define any + # estimates + "step_generate_estimate_reports", + "step_hw_codegen", + "step_hw_ipgen", + # Run additional node-by-node verification in RTL simulation of the + # model before creating the stitched IP + # Note: end-to-end verification of the stitched IP in RTL simulation + # is still not possible due to missing float IPs + # node_by_node_cppsim, #only inserted if live FIFO-sizing is off + # Only for debugging for now, does not 
work if "vivado" style + # StreamingFIFOs are used + # node_by_node_rtlsim, + "step_create_stitched_ip", + # "step_measure_rtlsim_performance", # not possible due to float components + "step_out_of_context_synthesis", # for synthesis results (e.g. utilization) + "step_synthesize_bitfile", + "step_make_driver", + "step_deployment_package", + ], + ) + + # TESTING custom vs live FIFO-sizing + if self.params.get("live_fifo_sizing"): + # insert default FIFO-sizing step (behind step_generate_estimate_reports) + for i in range(len(cfg.steps)): + if cfg.steps[i] == "step_generate_estimate_reports": + cfg.steps.insert(i + 1, "step_set_fifo_depths") + else: + # insert Christoph's custom FIFO-sizing step (behind step_hw_ipgen) + for i in range(len(cfg.steps)): + if cfg.steps[i] == "step_hw_ipgen": + cfg.steps.insert( + i + 1, set_fifo_depths(seq_len, emb_dim, uram_threshold=seq_len) + ) + # also enable cppsim, which doesn't work with virtual FIFOs + cfg.steps.insert(i + 2, node_by_node_cppsim) + cfg.verify_steps.append(build_cfg.VerificationStepType.FOLDED_HLS_CPPSIM) + + return cfg diff --git a/src/finn/benchmarking/dut/vgg10.yml b/src/finn/benchmarking/dut/vgg10.yml new file mode 100644 index 0000000000..99a9ab333d --- /dev/null +++ b/src/finn/benchmarking/dut/vgg10.yml @@ -0,0 +1,31 @@ +model_path: models/vgg10/radioml_w4a4_small_tidy.onnx +folding_config_file: models/vgg10/ZCU104_folding_config.json +specialize_layers_config_file: models/vgg10/ZCU104_specialize_layers.json + +steps: + - step_tidy_up + - finn.builder.custom_step_library.conv1d.step_pre_streamline # Custom step + - step_streamline + - step_convert_to_hw + - finn.builder.custom_step_library.conv1d.step_convert_final_layers # Custom step + - step_create_dataflow_partition + - step_specialize_layers + - step_target_fps_parallelization + - step_apply_folding_config + - step_minimize_bit_width + - step_generate_estimate_reports + - step_set_fifo_depths + - step_hw_codegen + - step_hw_ipgen + - 
step_create_stitched_ip + - step_measure_rtlsim_performance + - step_out_of_context_synthesis + - step_synthesize_bitfile + - step_make_driver + - step_deployment_package + +# folding config doesn't come with FIFO sizes +auto_fifo_depths: True +auto_fifo_strategy: largefifo_rtlsim + +standalone_thresholds: True diff --git a/src/finn/benchmarking/templates.py b/src/finn/benchmarking/templates.py new file mode 100644 index 0000000000..44c2ebced8 --- /dev/null +++ b/src/finn/benchmarking/templates.py @@ -0,0 +1,214 @@ +# Template strings for benchmarking + +# flake8: noqa + +# power report scripting based on Lucas Reuter: +template_open = """ +open_project $PROJ_PATH$ +open_run $RUN$ +""" + +template_single_test = """ +set_switching_activity -toggle_rate $TOGGLE_RATE$ -static_probability $STATIC_PROB$ -hier -type lut [get_cells -r finn_design_i/.*] +set_switching_activity -toggle_rate $TOGGLE_RATE$ -static_probability $STATIC_PROB$ -hier -type register [get_cells -r finn_design_i/.*] +set_switching_activity -deassert_resets +report_power -file $REPORT_PATH$/$REPORT_NAME$.xml -format xml +reset_switching_activity -hier -type lut [get_cells -r finn_design_i/.*] +reset_switching_activity -hier -type register [get_cells -r finn_design_i/.*] +""" + +# template_single_test_type = """ +# set_switching_activity -toggle_rate $TOGGLE_RATE$ -static_probability $STATIC_PROB$ -hier -type $SWITCH_TARGET$ [get_cells -r finn_design_i/.*] +# set_switching_activity -deassert_resets +# report_power -file $REPORT_PATH$/$REPORT_NAME$.xml -format xml +# reset_switching_activity -hier -type $SWITCH_TARGET$ [get_cells -r finn_design_i/.*] +# """ + +template_sim_power = """ +set_property SOURCE_SET sources_1 [get_filesets sim_1] +import_files -fileset sim_1 -norecurse $TB_FILE_PATH$ +set_property top switching_simulation_tb [get_filesets sim_1] +update_compile_order -fileset sim_1 + +launch_simulation -mode post-implementation -type functional +restart +open_saif $SAIF_FILE_PATH$ +log_saif 
[get_objects -r /switching_simulation_tb/dut/*] +run $SIM_DURATION_NS$ ns +close_saif + +read_saif $SAIF_FILE_PATH$ +report_power -file $REPORT_PATH$/$REPORT_NAME$.xml -format xml +""" + +# TODO: configurable clock frequency +template_switching_simulation_tb = """ +`timescale 1 ns/10 ps + +module switching_simulation_tb; +reg clk; +reg rst; + +//dut inputs +reg tready; +reg [$INSTREAM_WIDTH$-1:0] tdata; +reg tvalid; + +//dut outputs +wire [$OUTSTREAM_WIDTH$-1:0] accel_tdata; +wire accel_tready; +wire accel_tvalid; + +finn_design_wrapper dut( + .ap_clk(clk), + .ap_rst_n(rst), + .m_axis_0_tdata(accel_tdata), + .m_axis_0_tready(tready), + .m_axis_0_tvalid(accel_tvalid), + .s_axis_0_tdata(tdata), + .s_axis_0_tready(accel_tready), + .s_axis_0_tvalid(tvalid) + ); + +always + begin + clk = 0; + #2.5; + clk = 1; + #2.5; + end + +integer i; +initial + begin + tready = 0; + tdata = 0; + tvalid = 0; + rst = 0; + #50; + rst = 1; + tvalid = 1; + tready = 1; + while(1) + begin + for (i = 0; i < $INSTREAM_WIDTH$/$DTYPE_WIDTH$; i = i+1) begin + tdata[i*$DTYPE_WIDTH$ +: $DTYPE_WIDTH$] = $RANDOM_FUNCTION$; + end + #5; + end + end +endmodule +""" + +zynq_harness_template = """ +set FREQ_MHZ %s +set NUM_AXILITE %d +if {$NUM_AXILITE > 9} { + error "Maximum 10 AXI-Lite interfaces supported" +} +set NUM_AXIMM %d +set BOARD %s +set FPGA_PART %s +create_project finn_zynq_link ./ -part $FPGA_PART + +# set board part repo paths to find boards installed by FINN +set paths_prop [get_property BOARD_PART_REPO_PATHS [current_project]] +set paths_param [get_param board.repoPaths] +lappend paths_prop $::env(FINN_ROOT)/deps/board_files +lappend paths_param $::env(FINN_ROOT)/deps/board_files +set_property BOARD_PART_REPO_PATHS $paths_prop [current_project] +set_param board.repoPaths $paths_param + +if {$BOARD == "RFSoC2x2"} { + set_property board_part xilinx.com:rfsoc2x2:part0:1.1 [current_project] + set ZYNQ_TYPE "zynq_us+" +} else { + puts "Unrecognized board" +} + +create_bd_design "top" +if 
{$ZYNQ_TYPE == "zynq_us+"} { + set zynq_ps_vlnv [get_property VLNV [get_ipdefs "xilinx.com:ip:zynq_ultra_ps_e:*"]] + create_bd_cell -type ip -vlnv $zynq_ps_vlnv zynq_ps + apply_bd_automation -rule xilinx.com:bd_rule:zynq_ultra_ps_e -config {apply_board_preset "1" } [get_bd_cells zynq_ps] + set_property CONFIG.PSU__DISPLAYPORT__PERIPHERAL__ENABLE {0} [get_bd_cells zynq_ps] + #activate one slave port, deactivate the second master port + set_property -dict [list CONFIG.PSU__USE__S_AXI_GP2 {0}] [get_bd_cells zynq_ps] + set_property -dict [list CONFIG.PSU__USE__M_AXI_GP1 {0}] [get_bd_cells zynq_ps] + #set frequency of PS clock (this can't always be exactly met) + set_property -dict [list CONFIG.PSU__OVERRIDE__BASIC_CLOCK {0}] [get_bd_cells zynq_ps] + set_property -dict [list CONFIG.PSU__CRL_APB__PL0_REF_CTRL__FREQMHZ [expr int($FREQ_MHZ)]] [get_bd_cells zynq_ps] +} else { + puts "Unrecognized Zynq type" +} + +#instantiate axi interconnect, axi smartconnect +set interconnect_vlnv [get_property VLNV [get_ipdefs -all "xilinx.com:ip:axi_interconnect:*" -filter design_tool_contexts=~*IPI*]] +#set smartconnect_vlnv [get_property VLNV [get_ipdefs "xilinx.com:ip:smartconnect:*"]] +create_bd_cell -type ip -vlnv $interconnect_vlnv axi_interconnect_0 +#create_bd_cell -type ip -vlnv $smartconnect_vlnv smartconnect_0 +#set number of axilite interfaces, and number of axi master interfaces +#set_property -dict [list CONFIG.NUM_SI $NUM_AXIMM] [get_bd_cells smartconnect_0] +set_property -dict [list CONFIG.NUM_MI $NUM_AXILITE] [get_bd_cells axi_interconnect_0] + +#create reset controller and connect interconnects to PS +if {$ZYNQ_TYPE == "zynq_us+"} { + set axi_peripheral_base 0xA0000000 + #connect_bd_intf_net [get_bd_intf_pins smartconnect_0/M00_AXI] [get_bd_intf_pins zynq_ps/S_AXI_HP0_FPD] + connect_bd_intf_net [get_bd_intf_pins zynq_ps/M_AXI_HPM0_FPD] -boundary_type upper [get_bd_intf_pins axi_interconnect_0/S00_AXI] + #connect interconnect clocks and resets + apply_bd_automation 
-rule xilinx.com:bd_rule:clkrst -config { Clk {/zynq_ps/pl_clk0} Freq {} Ref_Clk0 {} Ref_Clk1 {} Ref_Clk2 {}} [get_bd_pins axi_interconnect_0/ACLK] + apply_bd_automation -rule xilinx.com:bd_rule:clkrst -config { Clk {/zynq_ps/pl_clk0} Freq {} Ref_Clk0 {} Ref_Clk1 {} Ref_Clk2 {}} [get_bd_pins axi_interconnect_0/S00_ACLK] + #apply_bd_automation -rule xilinx.com:bd_rule:clkrst -config { Clk {/zynq_ps/pl_clk0} Freq {} Ref_Clk0 {} Ref_Clk1 {} Ref_Clk2 {}} [get_bd_pins zynq_ps/saxihp0_fpd_aclk] +} +#connect_bd_net [get_bd_pins axi_interconnect_0/ARESETN] [get_bd_pins smartconnect_0/aresetn] + +#procedure used by below IP instantiations to map BD address segments based on the axi interface aperture +proc assign_axi_addr_proc {axi_intf_path} { + #global variable holds current base address + global axi_peripheral_base + #infer range + set range [expr 2**[get_property CONFIG.ADDR_WIDTH [get_bd_intf_pins $axi_intf_path]]] + set range [expr $range < 4096 ? 4096 : $range] + #align base address to range + set offset [expr ($axi_peripheral_base + ($range-1)) & ~($range-1)] + #perform assignment + assign_bd_address [get_bd_addr_segs $axi_intf_path/Reg*] -offset $offset -range $range + #advance base address + set axi_peripheral_base [expr $offset + $range] +} + +#custom IP instantiations/connections start here +%s + +#finalize clock and reset connections for interconnects +if {$ZYNQ_TYPE == "zynq_us+"} { + apply_bd_automation -rule xilinx.com:bd_rule:clkrst -config { Clk {/zynq_ps/pl_clk0} } [get_bd_pins axi_interconnect_0/M*_ACLK] +} + +save_bd_design +assign_bd_address +validate_bd_design + +set_property SYNTH_CHECKPOINT_MODE "Hierarchical" [ get_files top.bd ] +make_wrapper -files [get_files top.bd] -import -fileset sources_1 -top + +#set_property strategy Flow_PerfOptimized_high [get_runs synth_1] +#set_property STEPS.SYNTH_DESIGN.ARGS.DIRECTIVE AlternateRoutability [get_runs synth_1] +#set_property STEPS.SYNTH_DESIGN.ARGS.RETIMING true [get_runs synth_1] +#set_property 
strategy Performance_ExtraTimingOpt [get_runs impl_1] +#set_property STEPS.OPT_DESIGN.ARGS.DIRECTIVE Explore [get_runs impl_1] +#set_property STEPS.POST_ROUTE_PHYS_OPT_DESIGN.ARGS.DIRECTIVE AggressiveExplore [get_runs impl_1] +#set_property STEPS.PHYS_OPT_DESIGN.ARGS.DIRECTIVE AggressiveExplore [get_runs impl_1] +#set_property STEPS.POST_ROUTE_PHYS_OPT_DESIGN.IS_ENABLED true [get_runs impl_1] + +# out-of-context synth can't be used for bitstream generation +# set_property -name {STEPS.SYNTH_DESIGN.ARGS.MORE OPTIONS} -value {-mode out_of_context} -objects [get_runs synth_1] +launch_runs -to_step write_bitstream impl_1 +wait_on_run [get_runs impl_1] + +# generate synthesis report +open_run impl_1 +report_utilization -hierarchical -hierarchical_depth 4 -file synth_report.xml -format xml +close_project +""" diff --git a/src/finn/benchmarking/util.py b/src/finn/benchmarking/util.py new file mode 100644 index 0000000000..1e08bd2501 --- /dev/null +++ b/src/finn/benchmarking/util.py @@ -0,0 +1,118 @@ +# Utility functions for benchmarking +import json +import os +import shutil +import xml.etree.ElementTree as ET + + +def _find_rows_and_headers(table): + rows = table.findall("tablerow") + headers = [] + + for row in rows: + headers = row.findall("tableheader") + if len(headers) > 0: + break + return (rows, headers) + + +def summarize_table(table): + table_summary = {} + table_summary["headers"] = [] + rows, headers = _find_rows_and_headers(table) + + if len(headers) > 0: + string = "Header: " + for header in headers: + table_summary["headers"].append(header.attrib["contents"]) + string = string + header.attrib["contents"] + " " + # print(string.rstrip()) + + for row in rows: + cells = row.findall("tablecell") + if len(cells) > 0: + cell_name = cells[0].attrib["contents"] + string = cell_name + table_summary[cell_name] = [] + for cell in cells[1:]: + table_summary[cell_name].append(cell.attrib["contents"]) + string = string + cell.attrib["contents"] + " " + # 
print(string.rstrip()) + + return table_summary + + +def summarize_section(section): + section_summary = {} + section_summary["tables"] = [] + section_summary["subsections"] = {} + + # print("Section:", section.attrib["title"]) + tables = section.findall("table") + sub_sections = section.findall("section") + for table in tables: + section_summary["tables"].append(summarize_table(table)) + # print("") + for sub_section in sub_sections: + section_summary["subsections"][sub_section.attrib["title"]] = summarize_section(sub_section) + + return section_summary + + +def power_xml_to_dict(xml_path): + tree = ET.parse(xml_path) + root = tree.getroot() + sections = root.findall("section") + result = {} + + for section in sections: + result[section.attrib["title"]] = summarize_section(section) + + return result + + +def delete_dir_contents(dir): + for filename in os.listdir(dir): + file_path = os.path.join(dir, filename) + try: + if os.path.isfile(file_path) or os.path.islink(file_path): + os.unlink(file_path) + elif os.path.isdir(file_path): + shutil.rmtree(file_path) + except Exception as e: + print("Failed to delete %s. 
Reason: %s" % (file_path, e)) + + +def merge_dicts(a: dict, b: dict): + for key in b: + if key in a: + if isinstance(a[key], dict) and isinstance(b[key], dict): + merge_dicts(a[key], b[key]) + elif a[key] != b[key]: + raise Exception("ERROR: Dict merge conflict") + else: + a[key] = b[key] + return a + + +def merge_logs(log_a, log_b, log_out): + # merges json log (list of nested dicts) b into a, not vice versa (TODO) + + with open(log_a, "r") as f: + a = json.load(f) + with open(log_b, "r") as f: + b = json.load(f) + + for idx, run_a in enumerate(a): + for run_b in b: + if run_a["run_id"] == run_b["run_id"]: + # a[idx] |= run_b # requires Python >= 3.9 + # a[idx] = {**run_a, **run_b} + a[idx] = merge_dicts(run_a, run_b) + break + + # also sort by run id + out = sorted(a, key=lambda x: x["run_id"]) + + with open(log_out, "w") as f: + json.dump(out, f, indent=2) diff --git a/src/finn/builder/build_dataflow.py b/src/finn/builder/build_dataflow.py index 8bb8a850f7..2184531443 100644 --- a/src/finn/builder/build_dataflow.py +++ b/src/finn/builder/build_dataflow.py @@ -39,8 +39,10 @@ import sys import time from qonnx.core.modelwrapper import ModelWrapper +from rich import print as rprint from rich.console import Console from rich.logging import RichHandler +from rich.traceback import Traceback from finn.builder.build_dataflow_config import DataflowBuildConfig, default_build_dataflow_steps from finn.builder.build_dataflow_steps import build_dataflow_step_lookup @@ -159,21 +161,12 @@ def resolve_step_filename(step_name: str, cfg: DataflowBuildConfig, step_delta: return filename -def build_dataflow_cfg(model_filename, cfg: DataflowBuildConfig): - """Best-effort build a dataflow accelerator using the given configuration. 
- - :param model_filename: ONNX model filename to build - :param cfg: Build configuration - """ - finn_build_dir = os.environ["FINN_BUILD_DIR"] - - print(f"Intermediate outputs will be generated in {finn_build_dir}") - print(f"Final outputs will be generated in {cfg.output_dir}") - print(f"Build log is at {cfg.output_dir}/build_dataflow.log") - # create the output dir if it doesn't exist - os.makedirs(cfg.output_dir, exist_ok=True) - - # set up logger +def setup_logging(cfg: DataflowBuildConfig): + # Set up global logger, the force=True has the following effects: + # - If multiple build are run in a row, the log file will be re-created for each, + # which is needed if the file was deleted/moved or the output dir changed + # - In a PyTest session, this logger will replace the PyTest log handlers, so logs + # (+ captured warnings!) will end up in the log file instead of being collected by PyTest logpath = os.path.join(cfg.output_dir, "build_dataflow.log") if cfg.verbose: logging.basicConfig( @@ -181,6 +174,7 @@ def build_dataflow_cfg(model_filename, cfg: DataflowBuildConfig): format="[%(asctime)s]%(levelname)s: %(pathname)s:%(lineno)d: %(message)s", filename=logpath, filemode="w", + force=True, ) else: logging.basicConfig( @@ -188,22 +182,25 @@ def build_dataflow_cfg(model_filename, cfg: DataflowBuildConfig): format="[%(asctime)s]%(levelname)s: %(message)s", filename=logpath, filemode="w", + force=True, ) - # Capture all warnings.warn calls of qonnx,... + # Capture all warnings.warn calls of qonnx, ... 
logging.captureWarnings(True) + # Mirror stdout and stderr to log log = logging.getLogger("build_dataflow") - - # mirror stdout and stderr to log - sys.stdout = PrintLogger(log, logging.INFO, sys.stdout) - sys.stderr = PrintLogger(log, logging.ERROR, sys.stderr) + if not isinstance(sys.stdout, PrintLogger): + # Prevent rediricting stdout/sterr multiple times + sys.stdout = PrintLogger(log, logging.INFO, sys.stdout) + sys.stderr = PrintLogger(log, logging.ERROR, sys.stderr) console = Console(file=sys.stdout.console) + # Mirror a configurable log level to console (default = ERROR) if cfg.console_log_level != "NONE": - # set up console logger - consoleHandler = RichHandler(show_time=True, show_path=False, console=console) - + consoleHandler = RichHandler( + show_time=True, log_time_format="[%Y-%m-%d %H:%M:%S]", show_path=False, console=console + ) if cfg.console_log_level == "DEBUG": consoleHandler.setLevel(logging.DEBUG) elif cfg.console_log_level == "INFO": @@ -216,9 +213,52 @@ def build_dataflow_cfg(model_filename, cfg: DataflowBuildConfig): consoleHandler.setLevel(logging.CRITICAL) logging.getLogger().addHandler(consoleHandler) - # Setup done, start processing + return log + + +def exit_buildflow(cfg: DataflowBuildConfig, time_per_step: dict = None, exit_code: int = 0): + if exit_code: + print("Build failed") + status = "failed" + else: + print("Build completed successfully") + status = "ok" + + # Generate metadata_builder.json + metadata = { + "status": status, + "tool_version": os.path.basename(os.environ.get("XILINX_VIVADO")), + } + with open(os.path.join(cfg.output_dir, "report/metadata_builder.json"), "w") as f: + json.dump(metadata, f, indent=2) + + # Generate time_per_step.json + if time_per_step is not None: + time_per_step["total_build_time"] = sum(time_per_step.values()) + with open(os.path.join(cfg.output_dir, "report/time_per_step.json"), "w") as f: + json.dump(time_per_step, f, indent=2) + + return exit_code + + +def build_dataflow_cfg(model_filename, 
cfg: DataflowBuildConfig): + """Best-effort build a dataflow accelerator using the given configuration. + + :param model_filename: ONNX model filename to build + :param cfg: Build configuration + """ + # Create the output (report) dir if it doesn't exist + os.makedirs(os.path.join(cfg.output_dir, "report"), exist_ok=True) + + log = setup_logging(cfg) + + print(f"Intermediate outputs will be generated in {os.environ['FINN_BUILD_DIR']}") + print(f"Final outputs will be generated in {cfg.output_dir}") + print(f"Build log is at {cfg.output_dir}/build_dataflow.log") + + # Setup done, start build flow try: - # if start_step is specified, override the input model + # If start_step is specified, override the input model if cfg.start_step is None: print(f"Building dataflow accelerator from {model_filename}") model = ModelWrapper(model_filename) @@ -240,7 +280,7 @@ def build_dataflow_cfg(model_filename, cfg: DataflowBuildConfig): model = ModelWrapper(intermediate_model_filename) assert type(model) is ModelWrapper - # start processing + # Start processing step_num = 1 time_per_step = dict() build_dataflow_steps = resolve_build_steps(cfg) @@ -249,11 +289,11 @@ def build_dataflow_cfg(model_filename, cfg: DataflowBuildConfig): step_name = transform_step.__name__ print(f"Running step: {step_name} [{step_num}/{len(build_dataflow_steps)}]") - # run the step + # Run the step step_start = time.time() model = transform_step(model, cfg) step_end = time.time() - time_per_step[step_name] = step_end - step_start + time_per_step[step_name] = round(step_end - step_start) chkpt_name = f"{step_name}.onnx" if cfg.save_intermediate_models: intermediate_model_dir = os.path.join(cfg.output_dir, "intermediate_models") @@ -263,36 +303,28 @@ def build_dataflow_cfg(model_filename, cfg: DataflowBuildConfig): step_num += 1 except KeyboardInterrupt: print("KeyboardInterrupt detected. 
Aborting...") - print("Build failed") - return -1 + return exit_buildflow(cfg, time_per_step, -1) except (Exception, FINNError) as e: - # Print full traceback if we are on debug log level - # or encountered a non-user error - print_full_traceback = True - if issubclass(type(e), FINNUserError) and log.level != logging.DEBUG: - print_full_traceback = False - - extype, value, tb = sys.exc_info() - if print_full_traceback: - # print exception info and traceback - log.error("FINN Internal compiler error:") - console.print_exception(show_locals=False) - else: - console.print(f"[bold red]FINN Error: [/bold red]{e}") - log.error(f"{e}") - print("Build failed") - return -1 # A user error shouldn't be need to be fixed using PDB - - # start postmortem debug if configured - if cfg.enable_build_pdb_debug: - pdb.post_mortem(tb) - print("Build failed") - return -1 + # Re-raise exception if we are in a PyTest session so we don't miss it + if "PYTEST_CURRENT_TEST" in os.environ: + raise - with open(os.path.join(cfg.output_dir, "time_per_step.json"), "w") as f: - json.dump(time_per_step, f, indent=2) - print("Completed successfully") - return 0 + if issubclass(type(e), FINNUserError): + # Handle FINN USER ERROR + log.error(f"FINN ERROR: {e}") + else: + # Handle remaining errors (= FINN INTERNAL COMPILER ERROR) + log.error(f"FINN INTERNAL COMPILER ERROR: {e}") + + # Print traceback for interal errors or if in debug mode + if not issubclass(type(e), FINNUserError) or log.level == logging.DEBUG: + rprint(Traceback(show_locals=False)) + # Start postmortem debug if configured + if cfg.enable_build_pdb_debug: + pdb.post_mortem(e.__traceback__) + + return exit_buildflow(cfg, time_per_step, -1) + return exit_buildflow(cfg, time_per_step, 0) def build_dataflow_directory(path_to_cfg_dir: str): diff --git a/src/finn/builder/build_dataflow_config.py b/src/finn/builder/build_dataflow_config.py index 02e1d66d54..57204c5745 100644 --- a/src/finn/builder/build_dataflow_config.py +++ 
b/src/finn/builder/build_dataflow_config.py @@ -173,16 +173,16 @@ class DataflowBuildConfig(DataClassJSONMixin, DataClassYAMLMixin): """ #: Directory where the final build outputs will be written into - output_dir: str + output_dir: Optional[str] = None #: Target clock frequency (in nanoseconds) for Vivado synthesis. #: e.g. synth_clk_period_ns=5.0 will target a 200 MHz clock. #: If hls_clk_period_ns is not specified it will default to this value. - synth_clk_period_ns: float + synth_clk_period_ns: Optional[float] = None #: Which output(s) to generate from the build flow. See documentation of #: DataflowOutputType for available options. - generate_outputs: List[DataflowOutputType] + generate_outputs: Optional[List[DataflowOutputType]] = None #: (Optional) Path to configuration JSON file in which user can specify #: a preferred implementation style (HLS or RTL) for each node. @@ -350,14 +350,14 @@ class DataflowBuildConfig(DataClassJSONMixin, DataClassYAMLMixin): #: Whether pdb postmortem debuggig will be launched when the build fails enable_build_pdb_debug: Optional[bool] = False - #: When True, additional verbose information will be written to the log file. - #: Otherwise, these additional information will be suppressed. + #: When True, additional information (level = DEBUG) will be written to the log file. + #: Otherwise, this additional information will be suppressed (level = INFO). verbose: Optional[bool] = False #: Log level to be used on the command line for finn-plus internal logging. - #: This is different from the log level used for the build process, + #: This is different from the log level used for build_dataflow.log, #: which is controlled using the verbose flag. - console_log_level: Optional[LogLevel] = LogLevel.NONE + console_log_level: Optional[LogLevel] = LogLevel.ERROR #: If given, only run the steps in the list. If not, run default steps. #: See `default_build_dataflow_steps` for the default list of steps. 
@@ -395,6 +395,9 @@ class DataflowBuildConfig(DataClassJSONMixin, DataClassYAMLMixin): #: If set to commit hash specified version will be used cpp_driver_version: Optional[str] = "latest" + #: Specify validation dataset to be used for deployment of the PYNQ driver + validation_dataset: Optional[str] = None + def _resolve_hls_clk_period(self): if self.hls_clk_period_ns is None: # use same clk for synth and hls if not explicitly specified diff --git a/src/finn/builder/build_dataflow_steps.py b/src/finn/builder/build_dataflow_steps.py index 6de7f1dc0f..aab45b9972 100644 --- a/src/finn/builder/build_dataflow_steps.py +++ b/src/finn/builder/build_dataflow_steps.py @@ -418,7 +418,9 @@ def step_target_fps_parallelization(model: ModelWrapper, cfg: DataflowBuildConfi "depth_trigger_uram", "depth_trigger_bram", ] - extract_model_config_to_json(model, cfg.output_dir + "/auto_folding_config.json", hw_attrs) + extract_model_config_to_json( + model, cfg.output_dir + "/report/auto_folding_config.json", hw_attrs + ) return model @@ -507,6 +509,7 @@ def step_hw_ipgen(model: ModelWrapper, cfg: DataflowBuildConfig): report_dir = cfg.output_dir + "/report" os.makedirs(report_dir, exist_ok=True) estimate_layer_resources_hls = model.analysis(hls_synth_res_estimation) + estimate_layer_resources_hls["total"] = aggregate_dict_keys(estimate_layer_resources_hls) with open(report_dir + "/estimate_layer_resources_hls.json", "w") as f: json.dump(estimate_layer_resources_hls, f, indent=2) @@ -663,7 +666,7 @@ def step_set_fifo_depths(model: ModelWrapper, cfg: DataflowBuildConfig): model = model.transform(ApplyConfig(cfg.folding_config_file)) # extract the final configuration and save it as json - extract_model_config_to_json(model, cfg.output_dir + "/final_hw_config.json", hw_attrs) + extract_model_config_to_json(model, cfg.output_dir + "/report/final_hw_config.json", hw_attrs) # perform FIFO splitting and shallow FIFO removal only after the final config # json file has been written. 
otherwise, since these transforms may add/remove @@ -827,7 +830,9 @@ def step_make_driver(model: ModelWrapper, cfg: DataflowBuildConfig): ) ) else: - model = model.transform(MakePYNQDriverIODMA(cfg._resolve_driver_platform())) + model = model.transform( + MakePYNQDriverIODMA(cfg._resolve_driver_platform(), cfg.validation_dataset) + ) shutil.copytree(model.get_metadata_prop("pynq_driver_dir"), driver_dir, dirs_exist_ok=True) log.info("PYNQ Python driver written into " + driver_dir) elif DataflowOutputType.CPP_DRIVER in cfg.generate_outputs: diff --git a/src/finn/builder/custom_step_library/__init__.py b/src/finn/builder/custom_step_library/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/src/finn/builder/custom_step_library/conv1d.py b/src/finn/builder/custom_step_library/conv1d.py new file mode 100644 index 0000000000..f6de8edaae --- /dev/null +++ b/src/finn/builder/custom_step_library/conv1d.py @@ -0,0 +1,20 @@ +from qonnx.core.modelwrapper import ModelWrapper +from qonnx.transformation.change_3d_tensors_to_4d import Change3DTo4DTensors +from qonnx.transformation.general import GiveUniqueNodeNames + +import finn.transformation.fpgadataflow.convert_to_hw_layers as to_hw +import finn.transformation.streamline.absorb as absorb +from finn.builder.build_dataflow_config import DataflowBuildConfig + + +def step_pre_streamline(model: ModelWrapper, cfg: DataflowBuildConfig): + model = model.transform(Change3DTo4DTensors()) + model = model.transform(absorb.AbsorbScalarMulAddIntoTopK()) + return model + + +def step_convert_final_layers(model: ModelWrapper, cfg: DataflowBuildConfig): + model = model.transform(to_hw.InferChannelwiseLinearLayer()) + model = model.transform(to_hw.InferLabelSelectLayer()) + model = model.transform(GiveUniqueNodeNames()) + return model diff --git a/src/finn/builder/custom_step_library/mobilenet.py b/src/finn/builder/custom_step_library/mobilenet.py new file mode 100644 index 0000000000..0c251ad299 --- /dev/null +++ 
b/src/finn/builder/custom_step_library/mobilenet.py @@ -0,0 +1,114 @@ +from qonnx.core.modelwrapper import ModelWrapper +from qonnx.transformation.change_datalayout import ChangeDataLayoutQuantAvgPool2d +from qonnx.transformation.double_to_single_float import DoubleToSingleFloat +from qonnx.transformation.general import ApplyConfig, GiveReadableTensorNames, GiveUniqueNodeNames +from qonnx.transformation.infer_data_layouts import InferDataLayouts +from qonnx.transformation.infer_datatypes import InferDataTypes +from qonnx.transformation.infer_shapes import InferShapes +from qonnx.transformation.lower_convs_to_matmul import LowerConvsToMatMul +from qonnx.transformation.remove import RemoveIdentityOps + +import finn.transformation.fpgadataflow.convert_to_hw_layers as to_hw +import finn.transformation.streamline.absorb as absorb +import finn.transformation.streamline.reorder as reorder +from finn.builder.build_dataflow_config import ( + DataflowBuildConfig, + ShellFlowType, + VerificationStepType, +) +from finn.builder.build_dataflow_steps import verify_step +from finn.transformation.streamline import Streamline +from finn.transformation.streamline.collapse_repeated import CollapseRepeatedMul +from finn.transformation.streamline.round_thresholds import RoundAndClipThresholds + + +def step_mobilenet_streamline(model: ModelWrapper, cfg: DataflowBuildConfig): + model = model.transform(Streamline()) + additional_streamline_transformations = [ + DoubleToSingleFloat(), + reorder.MoveMulPastDWConv(), + absorb.AbsorbMulIntoMultiThreshold(), + ChangeDataLayoutQuantAvgPool2d(), + InferDataLayouts(), + reorder.MoveTransposePastScalarMul(), + absorb.AbsorbTransposeIntoFlatten(), + reorder.MoveFlattenPastAffine(), + reorder.MoveFlattenPastTopK(), + reorder.MoveScalarMulPastMatMul(), + CollapseRepeatedMul(), + RemoveIdentityOps(), + RoundAndClipThresholds(), + ] + for trn in additional_streamline_transformations: + model = model.transform(trn) + model = 
model.transform(GiveUniqueNodeNames()) + model = model.transform(GiveReadableTensorNames()) + model = model.transform(InferDataTypes()) + + if VerificationStepType.STREAMLINED_PYTHON in cfg._resolve_verification_steps(): + verify_step(model, cfg, "streamlined_python", need_parent=False) + + return model + + +def step_mobilenet_lower_convs(model: ModelWrapper, cfg: DataflowBuildConfig): + model = model.transform(LowerConvsToMatMul()) + model = model.transform(absorb.AbsorbTransposeIntoMultiThreshold()) + model = model.transform(absorb.AbsorbConsecutiveTransposes()) + model = model.transform(GiveUniqueNodeNames()) + model = model.transform(GiveReadableTensorNames()) + model = model.transform(InferDataTypes()) + model = model.transform(RoundAndClipThresholds()) + model = model.transform(InferDataLayouts()) + return model + + +def step_mobilenet_convert_to_hw_layers(model: ModelWrapper, cfg: DataflowBuildConfig): + model = model.transform(to_hw.InferPool()) + model = model.transform(to_hw.InferConvInpGen()) + model = model.transform(to_hw.InferVectorVectorActivation()) + model = model.transform(to_hw.InferQuantizedMatrixVectorActivation()) + model = model.transform(to_hw.InferChannelwiseLinearLayer()) + model = model.transform(to_hw.InferLabelSelectLayer()) + model = model.transform(InferShapes()) + model = model.transform(GiveUniqueNodeNames()) + model = model.transform(GiveReadableTensorNames()) + return model + + +def step_mobilenet_slr_floorplan(model: ModelWrapper, cfg: DataflowBuildConfig): + if cfg.shell_flow_type == ShellFlowType.VITIS_ALVEO: + try: + from finnexperimental.analysis.partitioning import partition + + # apply partitioning of the model, restricting the first and last layers + # to SLR0 + default_slr = 0 + abs_anchors = [(0, [default_slr]), (-1, [default_slr])] + floorplan = partition( + model, + cfg.synth_clk_period_ns, + cfg.board, + abs_anchors=abs_anchors, + multivariant=False, + )[0] + # apply floorplan to model + model = 
model.transform(ApplyConfig(floorplan)) + print("SLR floorplanning applied") + except Exception: + print("No SLR floorplanning applied") + return model + + +def step_mobilenet_convert_to_hw_layers_separate_th(model: ModelWrapper, cfg: DataflowBuildConfig): + model = model.transform(to_hw.InferPool()) + model = model.transform(to_hw.InferConvInpGen()) + model = model.transform(to_hw.InferThresholdingLayer()) + model = model.transform(to_hw.InferVectorVectorActivation()) + model = model.transform(to_hw.InferQuantizedMatrixVectorActivation()) + model = model.transform(to_hw.InferChannelwiseLinearLayer()) + model = model.transform(to_hw.InferLabelSelectLayer()) + model = model.transform(InferShapes()) + model = model.transform(GiveUniqueNodeNames()) + model = model.transform(GiveReadableTensorNames()) + return model diff --git a/src/finn/builder/custom_step_library/resnet.py b/src/finn/builder/custom_step_library/resnet.py new file mode 100644 index 0000000000..3e1c61063b --- /dev/null +++ b/src/finn/builder/custom_step_library/resnet.py @@ -0,0 +1,208 @@ +# Copyright (C) 2020-2022, Xilinx, Inc. +# Copyright (C) 2022-2024, Advanced Micro Devices, Inc. +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# * Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# * Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# * Neither the name of FINN nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. 
+# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +from qonnx.core.datatype import DataType +from qonnx.core.modelwrapper import ModelWrapper +from qonnx.transformation.batchnorm_to_affine import BatchNormToAffine +from qonnx.transformation.composed import ComposedTransformation +from qonnx.transformation.double_to_single_float import DoubleToSingleFloat +from qonnx.transformation.fold_constants import FoldConstants +from qonnx.transformation.general import ( + ConvertDivToMul, + ConvertSubToAdd, + GiveReadableTensorNames, + GiveUniqueNodeNames, + GiveUniqueParameterTensors, + RemoveStaticGraphInputs, + RemoveUnusedTensors, + SortGraph, +) +from qonnx.transformation.infer_data_layouts import InferDataLayouts +from qonnx.transformation.infer_datatypes import InferDataTypes +from qonnx.transformation.infer_shapes import InferShapes +from qonnx.transformation.insert_topk import InsertTopK +from qonnx.transformation.lower_convs_to_matmul import LowerConvsToMatMul +from qonnx.transformation.remove import RemoveIdentityOps + +import finn.transformation.fpgadataflow.convert_to_hw_layers as to_hw +from finn.builder.build_dataflow_config import DataflowBuildConfig +from finn.transformation.move_reshape import RemoveCNVtoFCFlatten +from 
finn.transformation.streamline.absorb import ( + Absorb1BitMulIntoConv, + Absorb1BitMulIntoMatMul, + AbsorbAddIntoMultiThreshold, + AbsorbConsecutiveTransposes, + AbsorbMulIntoMultiThreshold, + AbsorbScalarMulAddIntoTopK, + AbsorbTransposeIntoMultiThreshold, + FactorOutMulSignMagnitude, +) +from finn.transformation.streamline.collapse_repeated import ( + CollapseRepeatedAdd, + CollapseRepeatedMul, +) + +# just for not linear +from finn.transformation.streamline.reorder import ( + MoveAddPastConv, + MoveAddPastMul, + MoveLinearPastEltwiseAdd, + MoveLinearPastFork, + MoveMaxPoolPastMultiThreshold, + MoveScalarAddPastMatMul, + MoveScalarLinearPastInvariants, + MoveScalarMulPastConv, + MoveScalarMulPastMatMul, + MoveTransposePastEltwise, + MoveTransposePastFork, + MoveTransposePastJoinAdd, +) +from finn.transformation.streamline.round_thresholds import RoundAndClipThresholds +from finn.transformation.streamline.sign_to_thres import ConvertSignToThres + + +def step_resnet50_tidy(model: ModelWrapper, cfg: DataflowBuildConfig): + model = model.transform(GiveUniqueParameterTensors()) + model = model.transform(InferShapes()) + model = model.transform(FoldConstants()) + model = model.transform(RemoveStaticGraphInputs()) + model = model.transform(GiveUniqueNodeNames()) + model = model.transform(GiveReadableTensorNames()) + model = model.transform(InferDataTypes()) + model = model.transform(InsertTopK()) + model = model.transform(InferShapes()) + model = model.transform(GiveUniqueNodeNames()) + model = model.transform(GiveReadableTensorNames()) + model = model.transform(InferDataTypes()) + return model + + +def step_resnet50_streamline_linear(model: ModelWrapper, cfg: DataflowBuildConfig): + streamline_transformations = [ + AbsorbScalarMulAddIntoTopK(), # before MoveAddPastMul to avoid int->float + ConvertSubToAdd(), + ConvertDivToMul(), + RemoveIdentityOps(), + CollapseRepeatedMul(), + BatchNormToAffine(), + ConvertSignToThres(), + MoveAddPastMul(), + 
MoveScalarAddPastMatMul(), + MoveAddPastConv(), + MoveScalarMulPastMatMul(), + MoveScalarMulPastConv(), + MoveScalarLinearPastInvariants(), + MoveAddPastMul(), + CollapseRepeatedAdd(), + CollapseRepeatedMul(), + AbsorbAddIntoMultiThreshold(), + FactorOutMulSignMagnitude(), + MoveMaxPoolPastMultiThreshold(), + AbsorbMulIntoMultiThreshold(), + Absorb1BitMulIntoMatMul(), + Absorb1BitMulIntoConv(), + RoundAndClipThresholds(), + ] + for trn in streamline_transformations: + model = model.transform(trn) + model = model.transform(GiveUniqueNodeNames()) + return model + + +def step_resnet50_streamline_nonlinear(model: ModelWrapper, cfg: DataflowBuildConfig): + streamline_transformations = [ + MoveLinearPastEltwiseAdd(), + MoveLinearPastFork(), + ] + for trn in streamline_transformations: + model = model.transform(trn) + model = model.transform(GiveUniqueNodeNames()) + return model + + +def step_resnet50_streamline(model: ModelWrapper, cfg: DataflowBuildConfig): + for iter_id in range(4): + model = step_resnet50_streamline_linear(model, cfg) + model = step_resnet50_streamline_nonlinear(model, cfg) + + # big loop tidy up + model = model.transform(RemoveUnusedTensors()) + model = model.transform(GiveReadableTensorNames()) + model = model.transform(InferDataTypes()) + model = model.transform(SortGraph()) + + model = model.transform(DoubleToSingleFloat()) + + # Lower convolutions and streamline resulting transposes + model = model.transform(LowerConvsToMatMul()) + model = model.transform( + ComposedTransformation( + [ + MoveTransposePastJoinAdd(), + MoveTransposePastFork(), + MoveTransposePastEltwise(), + AbsorbConsecutiveTransposes(), + AbsorbTransposeIntoMultiThreshold(), + ] + ) + ) + return model + + +def step_resnet50_convert_to_hw(model: ModelWrapper, cfg: DataflowBuildConfig): + model.set_tensor_datatype(model.graph.input[0].name, DataType["UINT8"]) + model = model.transform(InferDataLayouts()) + model = model.transform(DoubleToSingleFloat()) + model = 
model.transform(InferDataTypes()) + model = model.transform(SortGraph()) + + to_hw_transformations = [ + to_hw.InferChannelwiseLinearLayer, + to_hw.InferPool, + AbsorbConsecutiveTransposes, + RoundAndClipThresholds, + to_hw.InferQuantizedMatrixVectorActivation, + to_hw.InferThresholdingLayer, + to_hw.InferConvInpGen, + to_hw.InferDuplicateStreamsLayer, + to_hw.InferAddStreamsLayer, + to_hw.InferLabelSelectLayer, + ] + for trn in to_hw_transformations: + model = model.transform(trn()) + model = model.transform(InferDataLayouts()) + model = model.transform(GiveUniqueNodeNames()) + model = model.transform(InferDataTypes()) + + model = model.transform(RemoveCNVtoFCFlatten()) + model = model.transform(GiveReadableTensorNames()) + model = model.transform(RemoveUnusedTensors()) + model = model.transform(SortGraph()) + + return model diff --git a/src/finn/builder/custom_step_library/transformer.py b/src/finn/builder/custom_step_library/transformer.py new file mode 100644 index 0000000000..79cfa29353 --- /dev/null +++ b/src/finn/builder/custom_step_library/transformer.py @@ -0,0 +1,772 @@ +# ADAPTED FROM Christoph's radioml-transformer repository, specifically these files: +# build_steps.py +# custom/apply_config.py + +# Copies (deep-copies) python objects +import copy +import json + +# Numpy for loading and comparing the verification input/output +import numpy as np + +# YAML for loading experiment configurations +import yaml + +# QONNX quantization data types +from qonnx.core.datatype import DataType + +# QONNX wrapper of ONNX model graphs +from qonnx.core.modelwrapper import ModelWrapper + +# Converts ONNX graph nodes to QONNX custom-ops if possible +from qonnx.custom_op.registry import getCustomOp + +# Converts BatchNorm operation to affine transformation +from qonnx.transformation.batchnorm_to_affine import BatchNormToAffine + +# Transformation for exhaustively composing transformations +from qonnx.transformation.composed import ComposedTransformation + +# If we have a 
convolution with a bias tensors input, QONNX and later FINN +# expect the bias to be expressed as a standalone Add node following the Conv +# node. +from qonnx.transformation.extract_conv_bias import ExtractBiasFromConv + +# Collapses chains of constants into a single constant operation or even +# initializer tensors. +from qonnx.transformation.fold_constants import FoldConstants + +# Converts Gemm operation to MatMul with extracted standalone bias op +from qonnx.transformation.gemm_to_matmul import GemmToMatMul + +# QONNX graph transformations for renaming and cleaning up +from qonnx.transformation.general import ( + GiveReadableTensorNames, + GiveUniqueNodeNames, + GiveUniqueParameterTensors, + RemoveStaticGraphInputs, + RemoveUnusedTensors, + Transformation, +) + +# QONNX graph transformations for annotating the graph with datatype and shape +# information +from qonnx.transformation.infer_datatypes import InferDataTypes +from qonnx.transformation.infer_shapes import InferShapes + +# Converts Conv to Im2Col and MatMul with extracted standalone bias op +from qonnx.transformation.lower_convs_to_matmul import LowerConvsToMatMul + +# Transposes the initializer tensors of a Quant node instead of having a +# standalone Transpose following +from qonnx.transformation.quant_constant_folding import FoldTransposeIntoQuantInit + +# Range information structure for seeding the range analysis for converting +# quantized activations to MultiThreshold +from qonnx.util.range_analysis import RangeInfo + +# FINN dataflow builder configuration +from finn.builder.build_dataflow_config import DataflowBuildConfig, VerificationStepType + +# FINN verification after build/graph transformation steps +from finn.builder.build_dataflow_steps import verify_step + +# Detects the attention pattern and converts to hardware custom op +from finn.transformation.fpgadataflow.attention import ( + AbsorbMultiThresholdIntoScaledDotProductAttention, + InferScaledDotProductAttention, +) + +# Mult-Head 
Attention support +from finn.transformation.fpgadataflow.attention_heads import ( + InferMultiHeads, + MoveMergeMultiHeadsPastMultiThreshold, + MoveSplitMultiHeadsPastMultiThreshold, + UnrollMultiHeadAttention, +) +from finn.transformation.fpgadataflow.compile_cppsim import CompileCppSim + +# Converts (infers) ONNX and QONNX nodes to FINN hardware CustomOps +from finn.transformation.fpgadataflow.convert_to_hw_layers import ( + InferConcatLayer, + InferElementwiseBinaryOperation, + InferLookupLayer, + InferSplitLayer, + InferSqueeze, + InferUnsqueeze, + InferVectorVectorActivation, +) +from finn.transformation.fpgadataflow.hlssynth_ip import HLSSynthIP + +# Inserts data-width converter and FIFO nodes into the model graph +from finn.transformation.fpgadataflow.insert_dwc import InsertDWC +from finn.transformation.fpgadataflow.insert_fifo import InsertFIFO + +# Transformations preparing the operators for synthesis and simulation +from finn.transformation.fpgadataflow.prepare_cppsim import PrepareCppSim +from finn.transformation.fpgadataflow.prepare_ip import PrepareIP +from finn.transformation.fpgadataflow.prepare_rtlsim import PrepareRTLSim + +# Converts fork-nodes to ReplicateStream hardware operator +from finn.transformation.fpgadataflow.replicate_stream import InferReplicateStream +from finn.transformation.fpgadataflow.set_exec_mode import SetExecMode + +# Splitting and removing of FIFOs from the model graph +from finn.transformation.fpgadataflow.set_fifo_depths import RemoveShallowFIFOs, SplitLargeFIFOs + +# Graph transformation setting the folding, i.e., parallelization configuration +from finn.transformation.fpgadataflow.set_folding import SetFolding + +# Specializes each layer's implementation style: HLS or RTL implementation +from finn.transformation.fpgadataflow.specialize_layers import SpecializeLayers + +# Standard QONNX to FINN conversion function +from finn.transformation.qonnx.convert_qonnx_to_finn import ConvertQONNXtoFINN + +# Folds quantizers into 
weight tensor initializers, needed for lowering +# convolutions to MatMuls +from finn.transformation.qonnx.fold_quant_weights import FoldQuantWeights +from finn.transformation.qonnx.quant_act_to_multithreshold import default_filter_function_generator + +# Cleanup transformation getting rid of 3d data layout +from finn.transformation.squeeze import Squeeze +from finn.transformation.streamline.absorb import ( + AbsorbAddIntoMultiThreshold, + AbsorbSignBiasIntoMultiThreshold, +) + +# FINN streamlining transformations fusing/collapsing operations of the same +# kind +from finn.transformation.streamline.collapse_repeated import CollapseRepeatedTranspose + +# FINN streamlining transformations removing nodes without real effect from the +# graph +from finn.transformation.streamline.remove import RemoveIdentityReshape, RemoveIdentityTranspose + +# FINN streamlining transformations reordering the graph +from finn.transformation.streamline.reorder import ( + MoveMulPastAdd, + MoveSqueezePastMatMul, + MoveSqueezePastMultiThreshold, + MoveTransposePastEltwise, + MoveTransposePastFork, + MoveTransposePastJoinAdd, + MoveTransposePastJoinConcat, + MoveTransposePastJoinMul, + MoveTransposePastSplit, +) +from finn.transformation.streamline.streamline_plus import StreamlinePlus as Streamline + +# Execute onnx model graphs from the dataflow parent for verification +from finn.util.test import execute_parent + + +# Prepares the graph to be consumed by FINN: +# 1. Some graph cleanup removing unused tensors, nodes without effect and +# folding constants, i.e., collapsing chains of operations on constant tensors +# 2. Lowers some "more complex" operations: converts Conv and Gemm to MatMul and +# BatchNorm to Mul and Add operations followed by some necessary cleanup +# 3. 
Converts all QONNX Quant nodes to MultiThreshold operations which can +# absorb scales and biases during streamlining +def prepare_graph(range_info: RangeInfo): + # Wrap the actual transformation/build step function + def step_prepare_graph(model: ModelWrapper, cfg: DataflowBuildConfig): + # Exhaustively apply the set of cleanup transformations + model = model.transform( + ComposedTransformation( + [ + # Adds shape and datatype annotations to all tensors in this graph + InferDataTypes(), + InferShapes(), + # Cleanup the graph by removing redundant, unnecessary and constant + # nodes and tensors and give unique names to everything remaining + GiveUniqueNodeNames(), + GiveReadableTensorNames(), + RemoveStaticGraphInputs(), + RemoveUnusedTensors(), + GiveUniqueParameterTensors(), + FoldConstants(), + # Remove unnecessary shape and layout transformations + RemoveIdentityReshape(), + RemoveIdentityTranspose(), + # Redo shape and datatype annotations after removing nodes and + # tensors + InferShapes(), + InferDataTypes(), + ] + ) + ) + # If configured, run a verification of the transformed model on some + # sample inputs + if VerificationStepType.TIDY_UP_PYTHON in cfg._resolve_verification_steps(): # noqa + verify_step(model, cfg, "tidied_up_python", need_parent=False) + # Exhaustively apply the lowering transformations + model = model.transform( + ComposedTransformation( + [ + # Moves the bias input to the Conv operator as a separate Add node + # behind the Conv node + ExtractBiasFromConv(), + # Converts Gemm nodes to MatMul (+ bias) + GemmToMatMul(), + # Need to do some constant and weight folding first + FoldConstants(), + FoldTransposeIntoQuantInit(), + FoldQuantWeights(), + # Annotate the graph with shape and data type information + InferShapes(), + InferDataTypes(), + # Converts Conv layers to MatMul + LowerConvsToMatMul(), + # Converts BatchNorm to affine scale and bias + BatchNormToAffine(), + # Annotate the graph with shape and data type information + 
InferShapes(), + InferDataTypes(), + ] + ) + ) + # If configured, run a verification of the transformed model on some + # sample inputs + if VerificationStepType.QONNX_TO_FINN_PYTHON in cfg._resolve_verification_steps(): # noqa + verify_step(model, cfg, "lowered_python", need_parent=False) + + # Apply the standard QONNX to FINN conversion step to convert the + # remaining quantizers not yet covered by the new range analysis based + # method + model = model.transform( + ConvertQONNXtoFINN( + filter_function=default_filter_function_generator( + max_multithreshold_bit_width=cfg.max_multithreshold_bit_width + ) + ) + ) + # If configured, run a verification of the transformed model on some + # sample inputs + if VerificationStepType.QONNX_TO_FINN_PYTHON in cfg._resolve_verification_steps(): # noqa + verify_step(model, cfg, "prepared_graph_python", need_parent=False) + # Return the transformed model + return model + + # Return the wrapped transformation step function + return step_prepare_graph + + +# Applies the custom set of exhaustive streamlining transformations, also taking +# special topology like attention, residuals, splits and transposes into account +def step_streamline(model: ModelWrapper, cfg: DataflowBuildConfig): + # These should not be applied exhaustively with the other streamlining + # transformations to not end up in cycles. + # Note: This is essential to allow some Add operations to be + # absorbed by the next round's AbsorbSignBiasIntoMultiThreshold + model = model.transform(MoveMulPastAdd()) + model = model.transform(AbsorbSignBiasIntoMultiThreshold()) + # Exhaustively apply the following set of transformations to streamline the + # graph with the overall goal of collecting scales and biases in front of + # MultiThreshold operations or, alternatively, at the end of the graph. + # Note: Contains some sets of nested exhaustive transformations meant for + # particular architectural patterns, e.g., residual topologies. 
+ model = model.transform(Streamline()) + # If configured, run a verification of the transformed model on some + # sample inputs + if VerificationStepType.STREAMLINED_PYTHON in cfg._resolve_verification_steps(): # noqa + verify_step(model, cfg, "streamlined_python", need_parent=False) + # Return the transformed model + return model + + +# Converts scaled dot-product attention operations to FINN hardware operations +# Note: This includes some necessary cleanup after converting the pattern, in +# particular squeezing the data layouts throughout the graph +def step_convert_attention_to_hw(model: ModelWrapper, _: DataflowBuildConfig): + # Try to infer reshaping of attention heads + model = model.transform(InferMultiHeads()) # noqa: Duplicate + # Try to mode the mult-head splitting past the multi thresholds + model = model.transform(MoveSplitMultiHeadsPastMultiThreshold()) + # Moving multi-head splitting past multi thresholds might enable absorbing + # adds into thresholds once again + model = model.transform(AbsorbAddIntoMultiThreshold()) + # Try to infer a ScaledDotProductAttention custom op + model = model.transform(InferScaledDotProductAttention()) + # Parallelize attention head in the onnx graph + model = model.transform(UnrollMultiHeadAttention()) + # Swap the order of merging the multi heads and applying thresholds + model = model.transform(MoveMergeMultiHeadsPastMultiThreshold()) + # If applicable, absorb the final thresholds into the attention operator + model = model.transform(AbsorbMultiThresholdIntoScaledDotProductAttention()) + # Squeeze (i.e., remove dimensions of size 1) the data layouts throughout + # the graph to treat the time dimension as the batch dimension for all MVU + # and Threshold operators + model = model.transform(Squeeze()) + # Squeezing might have turned further transpose and reshape operations into + # identities (those which just swapped around the dimensions of size 1) + model = model.transform( + ComposedTransformation( + [ + # Move 
transposes around to some place where they could be removed + # later, i.e., where they collapse into identities + MoveTransposePastFork(), + MoveTransposePastSplit(), + MoveTransposePastJoinConcat(), + MoveTransposePastEltwise(), + MoveTransposePastJoinMul(), + MoveTransposePastJoinAdd(), + CollapseRepeatedTranspose(), + # Remove identity shape/layout transformations + RemoveIdentityTranspose(), + RemoveIdentityReshape(), + # Squeeze operators can be moved past MatMuls and thresholding + MoveSqueezePastMatMul(), + MoveSqueezePastMultiThreshold(), + ] + ) + ) + # Squeezing might enable absorbing adds into thresholds once again + model = model.transform(AbsorbAddIntoMultiThreshold()) + # If applicable, absorb the final thresholds into the attention operator + # Note: Might be applicable again after squeezing a transpose away + model = model.transform(AbsorbMultiThresholdIntoScaledDotProductAttention()) + # We should do another round of streamlining to be sure and support more + # general architectural patterns, we are not aware of yet... 
+ model = model.transform(Streamline()) + # Convert Squeeze and Unsqueeze operators to hardware operations + model = model.transform(InferSqueeze()) + model = model.transform(InferUnsqueeze()) + # Return the model with attention and multi-heads mapped to hardware + # operators + return model + + +# Function running the transformations to convert elementwise binary operations +# to their hardware implementations +def step_convert_elementwise_binary_to_hw(model: ModelWrapper, _): + # Convert elementwise operations to hardware operators + # Note: Do not convert the final Mul operator at the output + return model.transform( + InferElementwiseBinaryOperation(InferElementwiseBinaryOperation.reject_output_dequant) + ) + + +# Converts Split and Concat operations to hardware custom operators +def step_convert_split_concat_to_hw(model: ModelWrapper, _): + return model.transform(InferSplitLayer()).transform(InferConcatLayer()) + + +# Function running the transformations to convert Gather, i.e., index lookup, +# nodes to their hardware implementations +def step_convert_lookup_to_hw(model: ModelWrapper, _): + # Iterate all nodes in the graph keeping track of the index + for index, node in enumerate(model.graph.node): + # If this is a Gather node, force the input (index) type annotation + if node.op_type == "Gather": + # Force to unsigned 64-bit integer for now + model.set_tensor_datatype(node.input[1], DataType["UINT64"]) + # Get the value info for the input tensor to have access to the ONNX + # datatype of the tensor + value_info = model.get_tensor_valueinfo(node.input[1]) + # Force the container datatype of the input to be a float + value_info.type.tensor_type.elem_type = 1 + # Convert Gather to Lookup layers + return model.transform(InferLookupLayer()) + + +# Converts depth-wise convolution to hardware operator calling the +# InferVectorVectorActivation transformation +def step_convert_depth_wise_to_hw(model: ModelWrapper, _: DataflowBuildConfig): + return 
model.transform(InferVectorVectorActivation()) + + +# Function running the InferReplicateStream transformation +def step_replicate_streams(model: ModelWrapper, _): + # Properly replicate the stream feeding the query, key and value projections + return model.transform(InferReplicateStream()) + + +# Custom step for setting the parallelism to meet the target of T^2 cycles per +# sequence +def set_target_parallelization(seq_len: int, emb_dim: int): # noqa: emb_dim + # The wrapping function is a generator and this is the actual build step + # function taking the model and build configuration + def step_set_target_parallelization(model: ModelWrapper, cfg: DataflowBuildConfig): + # Run over all nodes in the model graph to look for attention operators, + # which are currently not handled by the SetFolding transformation + for index, node in enumerate(model.graph.node): + # Only handle attention operations here + if node.op_type == "ScaledDotProductAttention_hls": + # Convert this to the custom-op instance for easy access to node + # attributes + inst = getCustomOp(node) + # Set the sequence and embedding dimension folding to meet the + # T^2 cycles target, i.e., fully parallel along the embedding + # dimension and fully sequential along the sequence dimension + inst.set_nodeattr("EmbFold", 1) + inst.set_nodeattr("SeqFold", seq_len) + # Apply the built-in folding configuration transformation with the + # T^2 target cycles + model = model.transform( + SetFolding(seq_len**2, cfg.mvau_wwidth_max, cfg.folding_two_pass_relaxation) + ) + # TODO: Extract the folding configuration + # Return the model with configured parallelization + return model + + # Return the wrapped build step function + return step_set_target_parallelization + + +# Applies configuration dictionary to the model graph +class ApplyConfig(Transformation): + # Initializes the transformation with the configuration dictionary + def __init__(self, config): + # Initialize the transformation base class + 
super().__init__() + # Register the configuration dictionary to be used in apply() + self.config = config + + # Applies the transform to a whole model graph + def apply(self, model: ModelWrapper): # noqa + # Get the model graph out of the model wrapper object + graph = model.graph + # Iterate all nodes in the graph keeping track of the index + for index, node in enumerate(graph.node): + # A node should not be named "defaults"... + assert node.name != "defaults", "Node has reserved name 'defaults'" + # Convert this to the custom-op instance for easy access to node + # attributes + inst = getCustomOp(node) + # Apply the per operator type default configurations to the node + if node.op_type in self.config["defaults"]: + # Run over all default options to be applied to this node + for key, value in self.config["defaults"][node.op_type].items(): + # Set the nodes attribute to the default option value + inst.set_nodeattr(key, value) + # If there is an individual, node-specific configuration apply + # this next, potentially overriding the defaults set above + if node.name in self.config: + # Run over all node-specific options to be applied to this + # node + for key, value in self.config[node.name].items(): + # Set the nodes attribute to the option value + inst.set_nodeattr(key, value) + # Return model with configuration applied + # Note: Do not consider this as modifying the graph. This does not have + # to be reapplied multiple times. 
+ return model, False + + +# Custom build step trying to set appropriate FIFO sizes for the transformer +def set_fifo_depths(seq_len: int, emb_dim: int, uram_threshold: int = 32): # noqa: emb_dim + # The wrapping function is a generator and this is the actual build step + # function taking the model and build configuration + def step_set_fifo_depths(model: ModelWrapper, cfg: DataflowBuildConfig): + # Run over all nodes in the model graph + for index, node in enumerate(model.graph.node): + # Convert this to the custom-op instance for easy access to node + # attributes + inst = getCustomOp(node) + # Extract the FIFO depths configuration of the node + in_depths = inst.get_nodeattr("inFIFODepths") + out_depths = inst.get_nodeattr("outFIFODepths") + + # Number of inputs and outputs to/from the node + num_inputs = len(node.input) + num_outputs = len(node.output) + + # If the input/output has only default configurations, fill with as + # many shallow FIFOs as there are inputs, to avoid later problems + # with to few FIFO depths specified + if in_depths == [2] and num_inputs > 1: + in_depths = num_inputs * [2] + if out_depths == [2] and num_outputs > 1: + out_depths = num_outputs * [2] + + # Special case: Attention needs properly sized input FIFOs + if node.op_type == "ScaledDotProductAttention_hls": + # Each folded input stream needs to be buffered completely + # TODO: Not exactly sure whether this is always correct or just + # the worst-case + in_depths = [inst.get_number_input_values(i) for i in range(num_inputs)] + # Note: No special treatment of the output FIFO + # out_depths = ... 
+ + # Special case: Adding residual branches needs to buffer the inputs + # to avoid deadlocks if one branch is running faster/slower + if node.op_type == "ElementwiseAdd_hls": + # Only relevant if for join-node operations, i.e., node actually + # consumes two branches, potentially operating at a different + # rate + if model.is_join_node(node): + # Set both inputs to buffer as many cycles as we target for + # the attention operations, i.e., the T^2 cycles per + # sequence target + # TODO: Not exactly sure whether this is always correct or + # just the worst-case + # TODO: Currently we do not really have a reliable way of + # figuring out which of the two is the longer/deeper branch + # in terms of cycles to set a corresponding buffer only to + # the shorter branch. + in_depths = [seq_len**2, seq_len**2] + # Note: No special treatment of the output FIFO + # out_depths = ... + + # Set the updated FIFO depths attributes + inst.set_nodeattr("inFIFODepths", in_depths) + inst.set_nodeattr("outFIFODepths", out_depths) + + # The following partially mirrors (or even copies from) the build-in + # step_set_fifo_depths using only manual FIFO depths and our YAML-based + # folding configuration. 
+ + # Insert data-width converters + model = model.transform(InsertDWC()) + # Insert FIFOs between all operators (inserts shallow, depths 2 FIFOs if + # no other depth is specified) + model = model.transform(InsertFIFO(create_shallow_fifos=True)) + # Specialize the implementation variant of the (newly added FIFO) layers + model = model.transform(SpecializeLayers(cfg._resolve_fpga_part())) # noqa: Access _ method + model = model.transform(GiveUniqueNodeNames()) + model = model.transform(GiveReadableTensorNames()) + + # Only applies if a configuration file is given + if cfg.folding_config_file is not None: + # Load the configuration dictionary form YAML file + with open(cfg.folding_config_file, "r") as file: + # Load YAML string + config = yaml.safe_load(file) + # Assign unique names to the nodes which can be matched by + # individual per-node configuration options + model = model.transform(GiveUniqueNodeNames()) + # Apply the configuration dictionary to the model graph + model = model.transform(ApplyConfig(config)) + + # Run over all nodes in the model graph once again to modify the + # inserted FIFOs + # Note: This overwrites the folding configuration... 
+ # TODO: Find a better way to handle this + for index, node in enumerate(model.graph.node): + # Modify all RTL FIFO operators + if node.op_type == "StreamingFIFO_rtl": + # Convert this to the custom-op instance for easy access to node + # attributes + inst = getCustomOp(node) + # Check the depth of the FIFO: If this is not a shallow FIFO, + # implement this via the vivado strategy in URAM + if inst.get_nodeattr("depth") >= uram_threshold: + # Change the implementation style to vivado + inst.set_nodeattr("impl_style", "vivado") + # Set the resource type for the memory to URAM + inst.set_nodeattr("ram_style", "ultra") + + # Hardware attributes to be extracted from each node + hw_attrs = { + "PE", + "SIMD", + "parallel_window", + "ram_style", + "ram_style_thresholds", + "ram_style_mask", + "depth", + "impl_style", + "resType", + "mac_resource", + "mem_mode", + "runtime_writeable_weights", + "inFIFODepths", + "outFIFODepths", + "depth_trigger_uram", + "depth_trigger_bram", + } + + # Start collecting the configuration from the model graph as a + # dictionary + config = {"defaults": {}} + # Iterate all nodes in the graph keeping track of the index + for index, node in enumerate(model.graph.node): + # Convert this to the custom-op instance for easy access to node + # attributes + inst = getCustomOp(node) + # Prepare the node-specific configuration entry for this node + config[node.name] = {} + # Collect attribute values for all specified hardware attributes + for key in hw_attrs: + # Some hardware attributes may not be present for all nodes or + # op-types, this will be signaled via exception + try: + # Try extracting the configuration value from the node + # custom-op instance + config[node.name][key] = inst.get_nodeattr(key) + # Missing attributes are signaled va AttributeError + except AttributeError: + # Can be safely ignored here + pass + # Cleanup: If no attribute is present for this node, there is no + # need to keep this in the configuration dictionary as there 
is + # nothing to be restored later + if not config[node.name]: + # Remove the entry form the configuration dictionary + del config[node.name] + + # Create/Open a YAML file to store the configuration for later reuse + # TODO: make consistent with .json report in default step + with open(cfg.output_dir + "/report/final_hw_config.yaml", "w") as file: + # Store the configuration dictionary as YAML code + yaml.safe_dump(config, file) + + # Perform FIFO splitting and shallow FIFO removal only after the final + # config file has been written. Otherwise, since these transforms may + # add/remove FIFOs, we get name mismatch problems when trying to reuse + # the final config. + if cfg.split_large_fifos: + model = model.transform(SplitLargeFIFOs()) + model = model.transform(RemoveShallowFIFOs()) + + # generate a dedicated report about final FIFO sizes + fifo_info = {} + fifo_info["fifo_depths"] = {} + fifo_info["fifo_sizes"] = {} + total_fifo_size = 0 + for node in model.get_nodes_by_op_type("StreamingFIFO_rtl"): + node_inst = getCustomOp(node) + fifo_info["fifo_depths"][node.name] = node_inst.get_nodeattr("depth") + fifo_info["fifo_sizes"][ + node.name + ] = node_inst.get_instream_width() * node_inst.get_nodeattr("depth") + total_fifo_size += fifo_info["fifo_sizes"][node.name] + fifo_info["total_fifo_size_kB"] = int(total_fifo_size / 8.0 / 1000.0) + + with open(cfg.output_dir + "/report/fifo_sizing.json", "w") as f: + json.dump(fifo_info, f, indent=2) + + # After FIFOs are ready to go, call PrepareIP and HLSSynthIP again + # this will only run for the new nodes (e.g. 
FIFOs and DWCs) + model = model.transform( + PrepareIP(cfg._resolve_fpga_part(), cfg._resolve_hls_clk_period()) # noqa + ) + model = model.transform(HLSSynthIP()) + + # Return the model with configured parallelization + return model + + # Return the wrapped build step function + return step_set_fifo_depths + + +# Custom step applying our custom format of folding configuration to the graph +def step_apply_folding_config(model: ModelWrapper, cfg: DataflowBuildConfig): + # Only applies if a configuration file is given + if cfg.folding_config_file is not None: + # Load the configuration dictionary form YAML file + with open(cfg.folding_config_file, "r") as file: + # Load YAML string + config = yaml.safe_load(file) + # Assign unique names to the nodes which can be matched by + # individual per-node configuration options + model = model.transform(GiveUniqueNodeNames()) + # Apply the configuration dictionary to the model graph + model = model.transform(ApplyConfig(config)) + # If configured, run a verification of the transformed model on some sample + # inputs + if VerificationStepType.FOLDED_HLS_CPPSIM in cfg._resolve_verification_steps(): # noqa + # Prepare C++ Simulation for verification + model = model.transform(PrepareCppSim()) + model = model.transform(CompileCppSim()) + model = model.transform(SetExecMode("cppsim")) + # Execute a verification step of the model with inputs specified in + # build configuration + verify_step(model, cfg, "folded_hls_cppsim", need_parent=True) + + # Return model with configuration applied + return model + + +# Runs a node-by-node C++ simulation of the model saving the fill execution +# context +def node_by_node_cppsim(model: ModelWrapper, cfg: DataflowBuildConfig): + # Save the original model + original = model + # Copy the model + model = copy.deepcopy(model) + # Set model execution mode to C++ simulation + model = model.transform(SetExecMode("cppsim")) + # Generates the C++ source and compiles the C++ simulation + model = 
model.transform(GiveUniqueNodeNames()) + model = model.transform(PrepareCppSim()) + model = model.transform(CompileCppSim()) + + # Load the verification input/output pair + inp = np.load(cfg.verify_input_npy) # noqa + out = np.load(cfg.verify_expected_output_npy) + + # Path to the parent model wrapping the streaming dataflow partition and the + # wrapped child model, i.e., the inside of the streaming dataflow partition + parent = f"{cfg.output_dir}/intermediate_models/dataflow_parent.onnx" + child = f"{cfg.output_dir}/intermediate_models/verify_cppsim.onnx" + # Save the child model prepared for C++ simulation + model.save(child) + # Load the parent model to pass to verification execution + parent_model = ModelWrapper(parent) + + # Reshape the input/output to match the model + inp = inp.reshape(parent_model.get_tensor_shape(model.graph.input[0].name)) + out = out.reshape(parent_model.get_tensor_shape(model.graph.output[0].name)) + + # Execute the onnx model to collect the result + # context = execute_onnx(model, context, return_full_exec_context=True) + context = execute_parent(parent, child, inp, return_full_ctx=True) + # Extract the output tensor from the execution context + model_out = context[parent_model.graph.output[0].name] + # Compare input to output + result = {True: "SUCCESS", False: "FAIL"}[np.allclose(out, model_out)] + # Save the verification outputs into the configured build directory + verification_output = f"{cfg.output_dir}/verification_output/" + # Save the verification execution context + np.savez(f"{verification_output}/verify_cppsim_{result}.npz", **context) + # Return the original, unmodified model + return original + + +# Runs a node-by-node RTL simulation of the model saving the fill execution +# context +def node_by_node_rtlsim(model: ModelWrapper, cfg: DataflowBuildConfig): + # Save the original model + original = model + # Copy the model + model = copy.deepcopy(model) + # Set model execution mode to RTL simulation + model = 
model.transform(SetExecMode("rtlsim")) + # Generates the C++ source and compiles the RTL simulation + model = model.transform(GiveUniqueNodeNames()) + model = model.transform(PrepareIP(cfg._resolve_fpga_part(), cfg.synth_clk_period_ns)) # noqa + model = model.transform(HLSSynthIP()) + model = model.transform(PrepareRTLSim()) + + # Load the verification input/output pair + inp = np.load(cfg.verify_input_npy) # noqa + out = np.load(cfg.verify_expected_output_npy) + + # Path to the parent model wrapping the streaming dataflow partition and the + # wrapped child model, i.e., the inside of the streaming dataflow partition + parent = f"{cfg.output_dir}/intermediate_models/dataflow_parent.onnx" + child = f"{cfg.output_dir}/intermediate_models/verify_rtlsim.onnx" + # Save the child model prepared for RTL simulation + model.save(child) + # Load the parent model to pass to verification execution + parent_model = ModelWrapper(parent) + + # Reshape the input/output to match the model + inp = inp.reshape(parent_model.get_tensor_shape(model.graph.input[0].name)) + out = out.reshape(parent_model.get_tensor_shape(model.graph.output[0].name)) + + # Execute the onnx model to collect the result + # context = execute_onnx(model, context, return_full_exec_context=True) + context = execute_parent(parent, child, inp, return_full_ctx=True) + # Extract the output tensor from the execution context + model_out = context[parent_model.graph.output[0].name] + # Compare input to output + result = {True: "SUCCESS", False: "FAIL"}[np.allclose(out, model_out)] + # Save the verification outputs into the configured build directory + verification_output = f"{cfg.output_dir}/verification_output/" + # Save the verification execution context + np.savez(f"{verification_output}/verify_rtlsim_{result}.npz", **context) + # Return the original, unmodified model + return original diff --git a/src/finn/core/rtlsim_exec.py b/src/finn/core/rtlsim_exec.py index 61f2762039..46616599cb 100644 --- 
a/src/finn/core/rtlsim_exec.py +++ b/src/finn/core/rtlsim_exec.py @@ -28,8 +28,8 @@ import numpy as np import os -import sys from qonnx.custom_op.registry import getCustomOp +from subprocess import CalledProcessError from finn.util.basic import ( get_liveness_threshold_cycles, @@ -39,6 +39,7 @@ ) from finn.util.data_packing import npy_to_rtlsim_input, rtlsim_output_to_npy from finn.util.deps import get_deps_path +from finn.util.exception import FINNError from finn.util.logging import log try: @@ -294,11 +295,12 @@ def rtlsim_exec_cppxsi( # write compilation command to a file for easy re-running/debugging with open(sim_base + "/compile_rtlsim.sh", "w") as f: f.write(" ".join(build_cmd)) - stdout, stderr = launch_process_helper(build_cmd, cwd=sim_base) + try: + launch_process_helper(build_cmd, cwd=sim_base, print_stdout=False) + except CalledProcessError: + raise FINNError("Failed to compile rtlsim executable") if not os.path.isfile(sim_base + "/rtlsim_xsi"): - print(stdout) - print(stderr, file=sys.stderr) - raise RuntimeError("Failed to compile rtlsim executable") + raise FINNError("Failed to compile rtlsim executable") # launch the rtlsim executable # important to specify LD_LIBRARY_PATH here for XSI to work correctly diff --git a/src/finn/custom_op/fpgadataflow/hls/streamingfifo_hls.py b/src/finn/custom_op/fpgadataflow/hls/streamingfifo_hls.py index f17bc48fc6..e7d02a4915 100644 --- a/src/finn/custom_op/fpgadataflow/hls/streamingfifo_hls.py +++ b/src/finn/custom_op/fpgadataflow/hls/streamingfifo_hls.py @@ -66,7 +66,7 @@ def strm_decl(self): ) ) self.code_gen_dict["$STREAMDECLARATIONS$"].append( - 'hls::stream> out_{} ("out_{}");'.format( + 'hls::stream> out0_{} ("out0_{}");'.format( self.get_outstream_width(), self.hls_sname(), self.hls_sname() ) ) @@ -88,7 +88,7 @@ def docompute(self): VirtualFIFO(in_fifo, out_fifo, mode, depth, occupancy, max_occupancy); // FIFO -> AXI-Stream - move(out_fifo, out_%s); + move(out_fifo, out0_%s); """ % (self.hls_sname(), 
self.hls_sname()) ] @@ -99,7 +99,7 @@ def blackboxfunction(self): out_packed_bits = self.get_outstream_width() out_packed_hls_type = "ap_uint<%d>" % out_packed_bits self.code_gen_dict["$BLACKBOXFUNCTION$"] = [ - """void %s(hls::stream<%s > &in0_%s, hls::stream<%s > &out_%s, ap_uint<32> mode, + """void %s(hls::stream<%s > &in0_%s, hls::stream<%s > &out0_%s, ap_uint<32> mode, ap_uint<32> depth, ap_uint<32> &occupancy, ap_uint<32> &max_occupancy)""" % ( self.onnx_node.name, @@ -115,7 +115,7 @@ def pragmas(self): "#pragma HLS INTERFACE axis port=in0_" + self.hls_sname() ] self.code_gen_dict["$PRAGMAS$"].append( - "#pragma HLS INTERFACE axis port=out_" + self.hls_sname() + "#pragma HLS INTERFACE axis port=out0_" + self.hls_sname() ) self.code_gen_dict["$PRAGMAS$"].append("#pragma HLS INTERFACE s_axilite port=mode") self.code_gen_dict["$PRAGMAS$"].append("#pragma HLS INTERFACE s_axilite port=depth") diff --git a/src/finn/interface/run_finn.py b/src/finn/interface/run_finn.py index ca5faef96d..a01b70bfb4 100644 --- a/src/finn/interface/run_finn.py +++ b/src/finn/interface/run_finn.py @@ -260,6 +260,30 @@ def run(dependency_path: str, build_path: str, num_workers: int, script: str) -> ) +@click.command(help="Run a given benchmark configuration.") +@click.option("--bench_config", help="Name or path of experiment configuration file", required=True) +@click.option("--dependency-path", "-d", default="") +@click.option("--num-workers", "-n", default=-1, show_default=True) +@click.option( + "--build-path", + "-b", + help="Specify a build temp path of your choice", + default="", +) +def bench(bench_config: str, dependency_path: str, num_workers: int, build_path: str) -> None: + console = Console() + build_dir = Path(build_path).expanduser() if build_path != "" else None + dep_path = Path(dependency_path).expanduser() if dependency_path != "" else None + prepare_finn(dep_path, Path(), build_dir, num_workers) + console.rule("RUNNING BENCHMARK") + + # Late import because we need 
prepare_finn to setup remaining dependencies first + from finn.benchmarking.bench import start_bench_run + + exit_code = start_bench_run(bench_config) + sys.exit(exit_code) + + @click.command(help="Run a given test. Uses /tmp/FINN_TMP as the temporary file location") @click.option( "--variant", @@ -385,6 +409,7 @@ def main() -> None: main_group.add_command(config) main_group.add_command(deps) main_group.add_command(build) + main_group.add_command(bench) main_group.add_command(test) main_group.add_command(run) main_group() diff --git a/src/finn/qnn-data/templates/driver/validate.py b/src/finn/qnn-data/templates/driver/validate.py index c8bc1c009d..0e2bc27114 100644 --- a/src/finn/qnn-data/templates/driver/validate.py +++ b/src/finn/qnn-data/templates/driver/validate.py @@ -27,9 +27,69 @@ # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. import argparse +import json import numpy as np +import os +from dataset_loading import FileQueue, ImgQueue from driver import io_shape_dict from driver_base import FINNExampleOverlay +from PIL import Image +from pynq import PL + + +def img_resize(img, size): + w, h = img.size + if (w <= h and w == size) or (h <= w and h == size): + return img + if w < h: + ow = size + oh = int(size * h / w) + return img.resize((ow, oh), Image.BILINEAR) + else: + oh = size + ow = int(size * w / h) + return img.resize((ow, oh), Image.BILINEAR) + + +def img_center_crop(img, size): + crop_height, crop_width = (size, size) + image_width, image_height = img.size + crop_top = int(round((image_height - crop_height) / 2.0)) + crop_left = int(round((image_width - crop_width) / 2.0)) + return img.crop((crop_left, crop_top, crop_left + crop_width, crop_top + crop_height)) + + +def pre_process(img_np): + img = Image.fromarray(img_np.astype(np.uint8)) + img = img_resize(img, 256) + img = img_center_crop(img, 224) + img = np.array(img, dtype=np.uint8) + return img + + +def setup_dataloader(val_path, label_file_path=None, batch_size=100, 
n_images=50000): + if label_file_path is None: + val_folders = [f.name for f in os.scandir(val_path) if f.is_dir()] + val_folders = sorted(val_folders) + assert len(val_folders) == 1000, "Expected 1000 subfolders in ILSVRC2012 val" + files = [] + labels = [] + for idx, folder in enumerate(val_folders): + current_files = sorted(os.listdir(os.path.join(val_path, folder))) + current_files = [os.path.join(folder, file) for file in current_files] + files.extend(current_files) + labels.extend([idx] * len(current_files)) + files = files[:n_images] + else: + files = ["ILSVRC2012_val_{:08d}.JPEG".format(i) for i in range(1, n_images + 1)] + labels = np.loadtxt(label_file_path, dtype=int, usecols=1) + + file_queue = FileQueue() + file_queue.load_epochs(list(zip(files, labels)), shuffle=False) + img_queue = ImgQueue(maxsize=batch_size) + img_queue.start_loaders(file_queue, num_threads=1, img_dir=val_path, transform=pre_process) + return img_queue + if __name__ == "__main__": parser = argparse.ArgumentParser( @@ -38,7 +98,9 @@ parser.add_argument( "--batchsize", help="number of samples for inference", type=int, default=100 ) - parser.add_argument("--dataset", help="dataset to use (mnist of cifar10)", required=True) + parser.add_argument( + "--dataset", help="dataset to use (mnist, cifar10, cifar100, imagenet)", default="" + ) parser.add_argument( "--platform", help="Target platform: zynq-iodma alveo", default="zynq-iodma" ) @@ -48,14 +110,43 @@ parser.add_argument( "--dataset_root", help="dataset root dir for download/reuse", default="/tmp" ) + parser.add_argument( + "--reportfile", + help="Name of output .json report file", + type=str, + default="validation.json", + ) + parser.add_argument( + "--settingsfile", help="Name of optional input .json settings file", type=str, default="" + ) # parse arguments args = parser.parse_args() bsize = args.batchsize dataset = args.dataset bitfile = args.bitfile platform = args.platform + reportfile = args.reportfile + settingsfile = 
args.settingsfile dataset_root = args.dataset_root + # overwrite settings if specified in settings file + if settingsfile != "": + with open(settingsfile, "r") as f: + settings = json.load(f) + if "validation_dataset" in settings: + dataset = settings["validation_dataset"] + + # program FPGA and load driver + PL.reset() # reset PYNQ cache + driver = FINNExampleOverlay( + bitfile_name=bitfile, + platform=platform, + io_shape_dict=io_shape_dict, + batch_size=bsize, + runtime_weight_dir="runtime_weights/", + ) + + # prepare dataset if dataset == "mnist": from dataset_loading import mnist @@ -68,40 +159,73 @@ trainx, trainy, testx, testy, valx, valy = cifar.load_cifar_data( dataset_root, download=True, one_hot=False ) + elif dataset == "cifar100": + from dataset_loading import cifar + + trainx, trainy, testx, testy, valx, valy = cifar.load_cifar_data( + dataset_root, download=True, one_hot=False, cifar10=False + ) + elif dataset == "imagenet": + val_dir = dataset_root + "/ImageNet/2012/val" + label_file = dataset_root + "/ImageNet/2012/val.txt" + img_queue = setup_dataloader(val_dir, label_file, bsize) + total = 50000 else: raise Exception("Unrecognized dataset") - test_imgs = testx - test_labels = testy - - ok = 0 - nok = 0 - total = test_imgs.shape[0] + # run accelerator on dataset + if dataset in ["mnist", "cifar10", "cifar100"]: + test_imgs = testx + test_labels = testy - driver = FINNExampleOverlay( - bitfile_name=bitfile, - platform=platform, - io_shape_dict=io_shape_dict, - batch_size=bsize, - runtime_weight_dir="runtime_weights/", - ) + ok = 0 + nok = 0 + total = test_imgs.shape[0] - n_batches = int(total / bsize) + n_batches = int(total / bsize) - test_imgs = test_imgs.reshape(n_batches, bsize, -1) - test_labels = test_labels.reshape(n_batches, bsize) + test_imgs = test_imgs.reshape(n_batches, bsize, -1) + test_labels = test_labels.reshape(n_batches, bsize) - for i in range(n_batches): - ibuf_normal = test_imgs[i].reshape(driver.ibuf_packed_device[0].shape) - 
exp = test_labels[i] - driver.copy_input_data_to_device(ibuf_normal) - driver.execute_on_buffers() - obuf_normal = np.empty_like(driver.obuf_packed_device[0]) - driver.copy_output_data_from_device(obuf_normal) - ret = np.bincount(obuf_normal.flatten() == exp.flatten()) - nok += ret[0] - ok += ret[1] - print("batch %d / %d : total OK %d NOK %d" % (i + 1, n_batches, ok, nok)) + print("Starting validation..") + for i in range(n_batches): + ibuf_normal = test_imgs[i].reshape(driver.ishape_normal()) + exp = test_labels[i] + obuf_normal = driver.execute(ibuf_normal) + # obuf_normal = obuf_normal.reshape(bsize, -1)[:,0] + if obuf_normal.shape[1] > 1: + obuf_normal = np.argmax(obuf_normal, axis=1) + ret = np.bincount(obuf_normal.flatten() == exp.flatten(), minlength=2) + nok += ret[0] + ok += ret[1] + print("batch %d / %d : total OK %d NOK %d" % (i + 1, n_batches, ok, nok)) + elif dataset in ["imagenet"]: + ok = 0 + nok = 0 + i = 0 + print("Starting validation..") + while not img_queue.last_batch: + imgs, lbls = img_queue.get_batch(bsize, timeout=None) + imgs = np.array(imgs) + exp = np.array(lbls) + ibuf_normal = imgs.reshape(driver.ishape_normal()) + obuf_normal = driver.execute(ibuf_normal) + # obuf_normal = obuf_normal.reshape(bsize, -1)[:,0] + if obuf_normal.shape[1] > 1: + obuf_normal = np.argmax(obuf_normal, axis=1) + ret = np.bincount(obuf_normal.flatten() == exp.flatten(), minlength=2) + nok += ret[0] + ok += ret[1] + i += 1 + print("batch %d : total OK %d NOK %d" % (i, ok, nok)) + # calculate top-1 accuracy acc = 100.0 * ok / (total) print("Final accuracy: %f" % acc) + + # write report to file + report = { + "top-1_accuracy": acc, + } + with open(reportfile, "w") as f: + json.dump(report, f, indent=2) diff --git a/src/finn/transformation/fpgadataflow/create_stitched_ip.py b/src/finn/transformation/fpgadataflow/create_stitched_ip.py index 7a8d38182d..39bed71c82 100644 --- a/src/finn/transformation/fpgadataflow/create_stitched_ip.py +++ 
b/src/finn/transformation/fpgadataflow/create_stitched_ip.py @@ -30,14 +30,15 @@ import json import multiprocessing as mp import os -import subprocess from qonnx.custom_op.registry import getCustomOp from qonnx.transformation.base import Transformation from qonnx.util.basic import get_num_default_workers from shutil import copytree +from subprocess import CalledProcessError from finn.transformation.fpgadataflow.replace_verilog_relpaths import ReplaceVerilogRelPaths -from finn.util.basic import make_build_dir +from finn.util.basic import launch_process_helper, make_build_dir +from finn.util.exception import FINNError from finn.util.fpgadataflow import is_hls_node, is_rtl_node from finn.util.logging import log @@ -633,14 +634,12 @@ def apply(self, model): f.write("vivado -mode batch -source make_project.tcl\n") f.write("cd {}\n".format(working_dir)) bash_command = ["bash", make_project_sh] - process_compile = subprocess.Popen( - bash_command, stdout=subprocess.PIPE, stderr=subprocess.PIPE - ) - (_, stderr_data) = process_compile.communicate() - stderr_stripped = stderr_data.decode().strip() - if stderr_stripped != "" and stderr_stripped is not None: - log.critical(stderr_stripped) # Decode bytes and log as critical + try: + launch_process_helper(bash_command, print_stdout=False) + except CalledProcessError: + # Check success manually by looking for wrapper HDL + pass # wrapper may be created in different location depending on Vivado version if not os.path.isfile(wrapper_filename): @@ -649,7 +648,7 @@ def apply(self, model): if os.path.isfile(wrapper_filename_alt): model.set_metadata_prop("wrapper_filename", wrapper_filename_alt) else: - raise Exception( + raise FINNError( """CreateStitchedIP failed, no wrapper HDL found under %s or %s. 
Please check logs under the parent directory.""" % (wrapper_filename, wrapper_filename_alt) diff --git a/src/finn/transformation/fpgadataflow/make_driver.py b/src/finn/transformation/fpgadataflow/make_driver.py index e58c33906c..4b1e70369b 100644 --- a/src/finn/transformation/fpgadataflow/make_driver.py +++ b/src/finn/transformation/fpgadataflow/make_driver.py @@ -312,9 +312,10 @@ class MakePYNQDriverIODMA(Transformation): under the runtime_weights/ subfolder of the pynq_driver_dir. """ - def __init__(self, platform): + def __init__(self, platform, validation_datset=None): super().__init__() self.platform = platform + self.validation_datset = validation_datset def apply(self, model): # create a temporary folder for the generated driver @@ -428,8 +429,16 @@ def apply(self, model): ) shutil.copy(validate_template, validate_py) - # generate weight files for runtime-writable layers + # generate settings.json for generated driver + if self.validation_datset is not None: + settings = { + "validation_dataset": self.validation_datset, + } + settingsfile = pynq_driver_dir + "/settings.json" + with open(settingsfile, "w") as f: + json.dump(settings, f, indent=2) + # generate weight files for runtime-writable layers for sdp_ind, sdp_node in enumerate(model.graph.node): assert sdp_node.op_type == "StreamingDataflowPartition" # get dataflow model diff --git a/src/finn/transformation/fpgadataflow/make_zynq_proj.py b/src/finn/transformation/fpgadataflow/make_zynq_proj.py index 59d4293323..e280fba016 100644 --- a/src/finn/transformation/fpgadataflow/make_zynq_proj.py +++ b/src/finn/transformation/fpgadataflow/make_zynq_proj.py @@ -29,13 +29,13 @@ import math import os -import subprocess from qonnx.core.modelwrapper import ModelWrapper from qonnx.custom_op.registry import getCustomOp from qonnx.transformation.base import Transformation from qonnx.transformation.general import GiveReadableTensorNames, GiveUniqueNodeNames from qonnx.transformation.infer_data_layouts import 
InferDataLayouts from shutil import copy +from subprocess import CalledProcessError from finn.transformation.fpgadataflow.create_dataflow_partition import CreateDataflowPartition from finn.transformation.fpgadataflow.create_stitched_ip import CreateStitchedIP @@ -47,9 +47,14 @@ from finn.transformation.fpgadataflow.instrumentation import GenerateInstrumentationIP from finn.transformation.fpgadataflow.prepare_ip import PrepareIP from finn.transformation.fpgadataflow.specialize_layers import SpecializeLayers -from finn.util.basic import make_build_dir, pynq_native_port_width, pynq_part_map +from finn.util.basic import ( + launch_process_helper, + make_build_dir, + pynq_native_port_width, + pynq_part_map, +) from finn.util.deps import get_deps_path -from finn.util.logging import log +from finn.util.exception import FINNError from . import templates @@ -399,16 +404,15 @@ def apply(self, model): # call the synthesis script bash_command = ["bash", synth_project_sh] - process_compile = subprocess.Popen( - bash_command, stdout=subprocess.PIPE, stderr=subprocess.PIPE - ) - _, stderr_data = process_compile.communicate() - stderr_stripped = stderr_data.decode().strip() - if stderr_stripped != "" and stderr_stripped is not None: - log.critical(stderr_stripped) # Decode bytes and log as critical + try: + launch_process_helper(bash_command, print_stdout=False) + except CalledProcessError: + # Check success manually by looking for bitfile + pass + bitfile_name = vivado_pynq_proj_dir + "/finn_zynq_link.runs/impl_1/top_wrapper.bit" if not os.path.isfile(bitfile_name): - raise Exception( + raise FINNError( "Synthesis failed, no bitfile found. 
Check logs under %s" % vivado_pynq_proj_dir ) deploy_bitfile_name = vivado_pynq_proj_dir + "/resizer.bit" diff --git a/src/finn/transformation/fpgadataflow/vitis_build.py b/src/finn/transformation/fpgadataflow/vitis_build.py index 222c9c2336..1c5a5eff91 100644 --- a/src/finn/transformation/fpgadataflow/vitis_build.py +++ b/src/finn/transformation/fpgadataflow/vitis_build.py @@ -29,7 +29,6 @@ import json import os -import subprocess from qonnx.core.modelwrapper import ModelWrapper from qonnx.custom_op.registry import getCustomOp from qonnx.transformation.base import Transformation @@ -38,6 +37,7 @@ GiveUniqueNodeNames, RemoveUnusedTensors, ) +from subprocess import CalledProcessError from finn.builder.build_dataflow_config import FpgaMemoryType, VitisOptStrategy from finn.transformation.fpgadataflow.create_dataflow_partition import CreateDataflowPartition @@ -49,8 +49,8 @@ from finn.transformation.fpgadataflow.insert_iodma import InsertIODMA from finn.transformation.fpgadataflow.prepare_ip import PrepareIP from finn.transformation.fpgadataflow.specialize_layers import SpecializeLayers -from finn.util.basic import make_build_dir -from finn.util.logging import log +from finn.util.basic import launch_process_helper, make_build_dir +from finn.util.exception import FINNError from . 
import templates @@ -142,16 +142,14 @@ def apply(self, model): f.write("vivado -mode batch -source gen_xo.tcl\n") f.write("cd {}\n".format(working_dir)) bash_command = ["bash", package_xo_sh] - process_compile = subprocess.Popen( - bash_command, stdout=subprocess.PIPE, stderr=subprocess.PIPE - ) - _, stderr_data = process_compile.communicate() - stderr_stripped = stderr_data.decode().strip() - if stderr_stripped != "" and stderr_stripped is not None: - log.critical(stderr_stripped) # Decode bytes and log as critical - assert os.path.isfile(xo_path), ( - "Vitis .xo file not created, check logs under %s" % vivado_proj_dir - ) + try: + launch_process_helper(bash_command, print_stdout=False) + except CalledProcessError: + # Check success manually by looking for .xo file + pass + if not os.path.isfile(xo_path): + raise FINNError("Vitis .xo file not created, check logs under %s" % vivado_proj_dir) + return (model, False) @@ -327,18 +325,17 @@ def apply(self, model): ) f.write("cd {}\n".format(working_dir)) bash_command = ["bash", script] - process_compile = subprocess.Popen( - bash_command, stdout=subprocess.PIPE, stderr=subprocess.PIPE - ) - _, stderr_data = process_compile.communicate() - stderr_stripped = stderr_data.decode().strip() - if stderr_stripped != "" and stderr_stripped is not None: - log.critical(stderr_stripped) # Decode bytes and log as critical - # TODO rename xclbin appropriately here? + + try: + launch_process_helper(bash_command, print_stdout=False) + except CalledProcessError: + # Check success manually by looking for .xo file + pass xclbin = link_dir + "/a.xclbin" - assert os.path.isfile(xclbin), ( - "Vitis .xclbin file not created, check logs under %s" % link_dir - ) + if not os.path.isfile(xclbin): + raise FINNError("Vitis .xclbin file not created, check logs under %s" % link_dir) + + # TODO rename xclbin appropriately here? 
model.set_metadata_prop("bitfile", xclbin) # run Vivado to gen xml report @@ -350,13 +347,7 @@ def apply(self, model): f.write("vivado -mode batch -source %s\n" % (link_dir + "/gen_report_xml.tcl")) f.write("cd {}\n".format(working_dir)) bash_command = ["bash", gen_rep_xml_sh] - process_genxml = subprocess.Popen( - bash_command, stdout=subprocess.PIPE, stderr=subprocess.PIPE - ) - _, stderr_data = process_genxml.communicate() - stderr_stripped = stderr_data.decode().strip() - if stderr_stripped != "" and stderr_stripped is not None: - log.critical(stderr_stripped) # Decode bytes and log as critical + launch_process_helper(bash_command, print_stdout=False) # filename for the synth utilization report synth_report_filename = link_dir + "/synth_report.xml" model.set_metadata_prop("vivado_synth_rpt", synth_report_filename) diff --git a/src/finn/util/basic.py b/src/finn/util/basic.py index acb8bb1303..7f7e658146 100644 --- a/src/finn/util/basic.py +++ b/src/finn/util/basic.py @@ -151,6 +151,65 @@ def make_build_dir(prefix: str = "", return_as_path: bool = False) -> str | Path return str(tmpdir) +def launch_process_helper(args, proc_env=None, cwd=None, print_stdout=True): + """Helper function to launch a process in a way that facilitates logging + stdout/stderr with Python loggers. 
+ Returns (cmd_out, cmd_err) if successful, raises CalledProcessError otherwise.""" + process = subprocess.run(args, capture_output=True, env=proc_env, cwd=cwd, text=True) + cmd_out = process.stdout.strip() + cmd_err = process.stderr.strip() + + # Handle stdout + if cmd_out: + if print_stdout is True: + log.info(cmd_out) + else: + # Print with DEBUG level regardless + log.debug(cmd_out) + + # Handle stderr, depending on return code + if process.returncode == 0: + # Process completed successfully, log stderr only as WARNING + if cmd_err: + log.warning(cmd_err) + else: + # Process failed, log stderr as ERROR + if cmd_err: + log.error(cmd_err) + + # Log additional ERROR message + if isinstance(args, list): + cmd = " ".join(args) + else: + cmd = args + log.error(f"Launched process returned non-zero exit code ({process.returncode}): {cmd}") + + # Raise CalledProcessError for non-zero return code + process.check_returncode() + return (cmd_out, cmd_err) + + +def which(program): + "Python equivalent of the shell cmd 'which'." 
+ + # source: + # https://stackoverflow.com/questions/377017/test-if-executable-exists-in-python + def is_exe(fpath): + return os.path.isfile(fpath) and os.access(fpath, os.X_OK) + + fpath, fname = os.path.split(program) + if fpath: + if is_exe(program): + return program + else: + for path in os.environ["PATH"].split(os.pathsep): + exe_file = os.path.join(path, program) + if is_exe(exe_file): + return exe_file + + return None + + class CppBuilder: """Builds the g++ compiler command to produces the executable of the c++ code in code_gen_dir which is passed to the function build() of this class.""" @@ -194,50 +253,7 @@ def build(self, code_gen_dir): f.write("#!/bin/bash \n") f.write(bash_compile + "\n") bash_command = ["bash", self.compile_script] - process_compile = subprocess.Popen( - bash_command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True - ) - _, stderr_data = process_compile.communicate() - if stderr_data.strip(): - log.critical(stderr_data.strip()) # Decode bytes and log as critical - - -def launch_process_helper(args, proc_env=None, cwd=None, print_stdout=True): - """Helper function to launch a process in a way that facilitates logging - stdout/stderr with Python loggers. - Returns (cmd_out, cmd_err).""" - if proc_env is None: - proc_env = os.environ.copy() - with subprocess.Popen( - args, stdout=subprocess.PIPE, stderr=subprocess.PIPE, env=proc_env, cwd=cwd, text=True - ) as proc: - (cmd_out, cmd_err) = proc.communicate() - if cmd_out.strip() and print_stdout is True: - log.info(cmd_out.strip()) - if cmd_err.strip(): - log.critical(cmd_err.strip()) - return (cmd_out, cmd_err) - - -def which(program): - "Python equivalent of the shell cmd 'which'." 
- - # source: - # https://stackoverflow.com/questions/377017/test-if-executable-exists-in-python - def is_exe(fpath): - return os.path.isfile(fpath) and os.access(fpath, os.X_OK) - - fpath, fname = os.path.split(program) - if fpath: - if is_exe(program): - return program - else: - for path in os.environ["PATH"].split(os.pathsep): - exe_file = os.path.join(path, program) - if is_exe(exe_file): - return exe_file - - return None + launch_process_helper(bash_command, print_stdout=False) mem_primitives_versal = { diff --git a/src/finn/util/hls.py b/src/finn/util/hls.py index b1b88dbafe..dc153c0f52 100644 --- a/src/finn/util/hls.py +++ b/src/finn/util/hls.py @@ -27,10 +27,7 @@ # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. import os -import subprocess - -from finn.util.basic import which -from finn.util.logging import log +from finn.util.basic import launch_process_helper, which class CallHLS: @@ -65,10 +62,4 @@ def build(self, code_gen_dir): f.write("cd {}\n".format(working_dir)) f.close() bash_command = ["bash", self.ipgen_script] - process_compile = subprocess.Popen( - bash_command, stdout=subprocess.PIPE, stderr=subprocess.PIPE - ) - _, stderr_data = process_compile.communicate() - stderr_stripped = stderr_data.decode().strip() - if stderr_stripped != "" and stderr_stripped is not None: - log.critical(stderr_stripped) # Decode bytes and log as critical + launch_process_helper(bash_command, print_stdout=False) diff --git a/tests/end2end/test_end2end_cybsec_mlp.py b/tests/end2end/test_end2end_cybsec_mlp.py index 4770066117..cf75fd273b 100644 --- a/tests/end2end/test_end2end_cybsec_mlp.py +++ b/tests/end2end/test_end2end_cybsec_mlp.py @@ -165,8 +165,8 @@ def test_end2end_cybsec_mlp_build(self): ) build.build_dataflow_cfg(model_file, cfg) # check the generated files - assert os.path.isfile(output_dir + "/time_per_step.json") - assert os.path.isfile(output_dir + "/final_hw_config.json") + assert os.path.isfile(output_dir + 
"/report/time_per_step.json") + assert os.path.isfile(output_dir + "/report/final_hw_config.json") assert os.path.isfile(output_dir + "/template_specialize_layers_config.json") assert os.path.isfile(output_dir + "/driver/driver.py") est_cycles_report = output_dir + "/report/estimate_layer_cycles.json" diff --git a/tests/fpgadataflow/test_fifosizing.py b/tests/fpgadataflow/test_fifosizing.py index bb89e8ab84..97686235d0 100644 --- a/tests/fpgadataflow/test_fifosizing.py +++ b/tests/fpgadataflow/test_fifosizing.py @@ -95,7 +95,7 @@ def test_fifosizing_linear(method, topology): cfg_cmp.auto_fifo_depths = False cfg_cmp.target_fps = None cfg_cmp.generate_outputs = [build_cfg.DataflowOutputType.STITCHED_IP] - cfg_cmp.folding_config_file = tmp_output_dir + "/final_hw_config.json" + cfg_cmp.folding_config_file = tmp_output_dir + "/report/final_hw_config.json" build.build_dataflow_cfg(tmp_output_dir_cmp + "/model.onnx", cfg_cmp) model0 = ModelWrapper(tmp_output_dir + "/intermediate_models/step_create_stitched_ip.onnx") diff --git a/tests/util/test_build_dataflow.py b/tests/util/test_build_dataflow.py index 18f574bc8f..65d1942bed 100644 --- a/tests/util/test_build_dataflow.py +++ b/tests/util/test_build_dataflow.py @@ -48,9 +48,9 @@ def test_end2end_build_dataflow_directory(): build_dataflow_directory(target_dir) # check the generated files output_dir = target_dir + "/output_tfc_w1a1_Pynq-Z1" - assert os.path.isfile(output_dir + "/time_per_step.json") - assert os.path.isfile(output_dir + "/auto_folding_config.json") - assert os.path.isfile(output_dir + "/final_hw_config.json") + assert os.path.isfile(output_dir + "/report/time_per_step.json") + assert os.path.isfile(output_dir + "/report/auto_folding_config.json") + assert os.path.isfile(output_dir + "/report/final_hw_config.json") assert os.path.isfile(output_dir + "/template_specialize_layers_config.json") assert os.path.isfile(output_dir + "/stitched_ip/ip/component.xml") assert os.path.isfile(output_dir + 
"/driver/driver.py")