From 0aec7f047493a693f287d2ac09cca74f32e95d86 Mon Sep 17 00:00:00 2001 From: Felix Jentzsch Date: Wed, 29 Jan 2025 12:34:54 +0000 Subject: [PATCH 001/125] Check-in core benchmarking code --- .gitlab-ci.yml | 84 ++ benchmarking/bench-ci.yml | 53 + benchmarking/bench.py | 180 +++ benchmarking/bench_base.py | 933 +++++++++++++++ benchmarking/bench_rtl_swg.py | 403 +++++++ benchmarking/cfg/fifosizing_test.json | 21 + benchmarking/cfg/metafi_fifosizing_test.json | 17 + benchmarking/cfg/mvau_test.json | 29 + .../cfg/resnet50_fifosizing_test.json | 19 + benchmarking/cfg/transformer_gpt_all.json | 22 + benchmarking/cfg/transformer_radioml_all.json | 7 + benchmarking/cfg/transformer_sweep.json | 92 ++ benchmarking/cfg/transformer_test.json | 20 + benchmarking/collect.py | 90 ++ benchmarking/dut/fifosizing.py | 576 +++++++++ benchmarking/dut/mvau.py | 295 +++++ benchmarking/dut/resnet50_custom_steps.py | 252 ++++ benchmarking/dut/transformer.py | 1046 +++++++++++++++++ benchmarking/dut/transformer_custom_steps.py | 878 ++++++++++++++ benchmarking/dut/transformer_gpt.py | 348 ++++++ benchmarking/dut/transformer_radioml.py | 336 ++++++ benchmarking/harness/sink/ip/component.xml | 256 ++++ .../harness/sink/ip/src/harness_sink.v | 39 + .../sink/ip/xgui/harness_sink_v1_0.tcl | 25 + benchmarking/harness/vector_xor.v | 32 + benchmarking/templates.py | 213 ++++ benchmarking/util.py | 87 ++ 27 files changed, 6353 insertions(+) create mode 100644 .gitlab-ci.yml create mode 100644 benchmarking/bench-ci.yml create mode 100644 benchmarking/bench.py create mode 100644 benchmarking/bench_base.py create mode 100644 benchmarking/bench_rtl_swg.py create mode 100644 benchmarking/cfg/fifosizing_test.json create mode 100644 benchmarking/cfg/metafi_fifosizing_test.json create mode 100644 benchmarking/cfg/mvau_test.json create mode 100644 benchmarking/cfg/resnet50_fifosizing_test.json create mode 100644 benchmarking/cfg/transformer_gpt_all.json create mode 100644 
benchmarking/cfg/transformer_radioml_all.json create mode 100644 benchmarking/cfg/transformer_sweep.json create mode 100644 benchmarking/cfg/transformer_test.json create mode 100644 benchmarking/collect.py create mode 100644 benchmarking/dut/fifosizing.py create mode 100644 benchmarking/dut/mvau.py create mode 100644 benchmarking/dut/resnet50_custom_steps.py create mode 100644 benchmarking/dut/transformer.py create mode 100644 benchmarking/dut/transformer_custom_steps.py create mode 100644 benchmarking/dut/transformer_gpt.py create mode 100644 benchmarking/dut/transformer_radioml.py create mode 100644 benchmarking/harness/sink/ip/component.xml create mode 100644 benchmarking/harness/sink/ip/src/harness_sink.v create mode 100644 benchmarking/harness/sink/ip/xgui/harness_sink_v1_0.tcl create mode 100644 benchmarking/harness/vector_xor.v create mode 100644 benchmarking/templates.py create mode 100644 benchmarking/util.py diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml new file mode 100644 index 0000000000..ebfa2f6f88 --- /dev/null +++ b/.gitlab-ci.yml @@ -0,0 +1,84 @@ +stages: + - update + - build + - load_deps + - test + - trigger_benchmarks + +variables: + PIPELINE_NAME: + description: "Optional name to better identify this pipeline" + value: "" + CPU_CORES: + description: "Select number of CPU cores and test workers" + value: "8" + PARALLEL_JOBS: + description: "Number of parallel Slurm array jobs per CI job" + value: "2" + SLURM_TIMEOUT: + description: "Timeout" + value: "2-0" # [days-hours] + MANUAL_CFG_PATH: + description: "Use this config file instead of configs stored in the repo. 
Path must be accessible to runner" + value: "" + SLURM_PARTITION: + description: "Slurm partition (e.g., normal, largemem, fpga, gpu)" + value: "normal" + SLURM_QOS: + description: "Optional QoS option (include --qos, e.g., --qos express)" + value: "" + FINN_XILINX_VERSION: + value: "2022.2" + +workflow: + name: '$PIPELINE_NAME' + +Fetch Repos: + id_tokens: + CI_JOB_JWT: + aud: https://git.uni-paderborn.de + stage: load_deps + tags: + - login + cache: + key: $CI_COMMIT_SHA + paths: + - deps + script: + - ./fetch-repos.sh + +Bench (Manual): + stage: trigger_benchmarks + rules: + - if: $MANUAL_CFG_PATH != "" + trigger: + include: benchmarking/bench-ci.yml + strategy: depend + forward: + pipeline_variables: true + variables: + BENCH_CFG: "manual" + +Bench: + stage: trigger_benchmarks + rules: + - if: $MANUAL_CFG_PATH == "" + trigger: + include: benchmarking/bench-ci.yml + strategy: depend + forward: + pipeline_variables: true + parallel: + matrix: + - BENCH_CFG: [mvau_test] + +#dev: mvau_test +#fifo: fifosizing_test, metafi_fifosizing_test, resnet50_fifosizing_test +#transformer: transformer_test, transformer_radioml_all + +#TODO: introduce result collect job on parent level for easier visualization/excel interfacing +#TODO: more control via (optional) variables +#TODO: move power measurement from polling-based script to its own job/runner +#TODO: ensure a freshly initialized workdir on job/runner level (e.g. created directories seem to stay there) +#TODO: (optionally) save ALL build artifacts/logs/temporary files to artifacts or PFS for debugging (maybe via Jacamar feature of setting individual persistent workdirs?) 
+#TODO: fix clock frequency discrepancies between setting, synth, and driver \ No newline at end of file diff --git a/benchmarking/bench-ci.yml b/benchmarking/bench-ci.yml new file mode 100644 index 0000000000..f50bd1d3f8 --- /dev/null +++ b/benchmarking/bench-ci.yml @@ -0,0 +1,53 @@ +stages: + - synth + - measure + - collect + +variables: + BENCH_CFG: + description: "Select config, usually provided by parent pipeline" + value: "" + +workflow: + name: "bench_$BENCH_CFG" + +FINN Build: + id_tokens: + CI_JOB_JWT: + aud: https://git.uni-paderborn.de + stage: synth + variables: + SCHEDULER_PARAMETERS: "-A $PROJECT_ACCOUNT -p $SLURM_PARTITION -t $SLURM_TIMEOUT $SLURM_QOS --nodes 1 --ntasks 1 --cpus-per-task $CPU_CORES --mem 128G --array 0-$( expr $PARALLEL_JOBS - 1 )" + PYTEST_PARALLEL: "$CPU_CORES" + FINN_SINGULARITY: "$PATH_SINGULARITY_IMG/xilinx/finn_dev.sif" + before_script: + - cp -dfR .. $PATH_WORKDIR # Copy to working directory (e.g. RAMdisk) + - cd $PATH_WORKDIR/finn-plus + - module load system singularity + script: + - ./run-docker.sh python benchmarking/bench.py $BENCH_CFG + cache: + key: $CI_COMMIT_SHA + policy: pull + paths: + - deps + artifacts: + name: "bench_artifacts" + when: always + paths: + - bench_artifacts/ + +Result Collection: + id_tokens: + CI_JOB_JWT: + aud: https://git.uni-paderborn.de + stage: collect + tags: + - image_build + script: + - python benchmarking/collect.py bench_artifacts/tasks_output bench_results.json + artifacts: + name: "bench_results" + when: always + paths: + - bench_results.json diff --git a/benchmarking/bench.py b/benchmarking/bench.py new file mode 100644 index 0000000000..77f62bd775 --- /dev/null +++ b/benchmarking/bench.py @@ -0,0 +1,180 @@ +import itertools +import sys +import os +import json +import time +import traceback +import onnxruntime as ort + +from dut.mvau import bench_mvau +from dut.transformer import bench_transformer +from dut.transformer_radioml import bench_transformer_radioml +from dut.transformer_gpt 
import bench_transformer_gpt +from dut.fifosizing import bench_fifosizing, bench_metafi_fifosizing, bench_resnet50_fifosizing + + +def main(config_name): + exit_code = 0 + # Attempt to work around onnxruntime issue on Slurm-managed clusters: + # See https://github.com/microsoft/onnxruntime/issues/8313 + # This seems to happen only when assigned CPU cores are not contiguous + _default_session_options = ort.capi._pybind_state.get_default_session_options() + def get_default_session_options_new(): + _default_session_options.inter_op_num_threads = 1 + _default_session_options.intra_op_num_threads = 1 + return _default_session_options + ort.capi._pybind_state.get_default_session_options = get_default_session_options_new + + # Gather job array info + job_id = int(os.environ["SLURM_JOB_ID"]) + #TODO: allow portable execution on any platform by making as many env vars as possible optional + print("Job launched with ID: %d" % (job_id)) + try: + array_id = int(os.environ["SLURM_ARRAY_JOB_ID"]) + task_id = int(os.environ["SLURM_ARRAY_TASK_ID"]) + task_count = int(os.environ["SLURM_ARRAY_TASK_COUNT"]) + print( + "Launched as job array (Array ID: %d, Task ID: %d, Task count: %d)" + % (array_id, task_id, task_count) + ) + except KeyError: + array_id = job_id + task_id = 0 + task_count = 1 + print("Launched as single job") + + # Prepare result directory + # experiment_dir = os.environ.get("EXPERIMENT_DIR") # original experiment dir (before potential copy to ramdisk) + experiment_dir = os.environ.get("CI_PROJECT_DIR") + + artifacts_dir = os.path.join(experiment_dir, "bench_artifacts") + print("Collecting results in path: %s" % artifacts_dir) + os.makedirs(os.path.join(artifacts_dir, "tasks_output"), exist_ok=True) + log_path = os.path.join(artifacts_dir, "tasks_output", "task_%d.json" % (task_id)) + + # save dir for saving bitstreams (and optionally full build artifacts for debugging (TODO)) + # TODO: make this more configurable or switch to job/artifact based power measurement + 
if job_id == 0: + #DEBUG mode + save_dir = experiment_dir + "_save" + else: + save_dir = os.path.join("/scratch/hpc-prf-radioml/felix/jobs/", + "CI_" + os.environ.get("CI_PIPELINE_IID") + "_" + os.environ.get("CI_PIPELINE_NAME")) + print("Saving additional artifacts in path: %s" % save_dir) + os.makedirs(save_dir, exist_ok=True) + + # Gather benchmarking configs + if config_name == "manual": + configs_path, config_select = os.path.split(os.environ.get("MANUAL_CFG_PATH")) + else: + configs_path = os.path.join(os.path.dirname(__file__), "cfg") + config_select = config_name + ".json" + + # Load config + config_path = os.path.join(configs_path, config_select) + print("Loading config %s" % (config_path)) + if os.path.exists(config_path): + with open(config_path, "r") as f: + config = json.load(f) + else: + print("ERROR: config file not found") + return + + # Expand all specified config combinations (gridsearch) + config_expanded = [] + for param_set in config: + param_set_expanded = list( + dict(zip(param_set.keys(), x)) for x in itertools.product(*param_set.values()) + ) + config_expanded.extend(param_set_expanded) + + # Save config (only first job of array) for logging purposes + if task_id == 0: + with open(os.path.join(artifacts_dir, "bench_config.json"), "w") as f: + json.dump(config, f, indent=2) + with open(os.path.join(artifacts_dir, "bench_config_exp.json"), "w") as f: + json.dump(config_expanded, f, indent=2) + + # Determine which runs this job will work on + total_runs = len(config_expanded) + if total_runs <= task_count: + if task_id < total_runs: + selected_runs = [task_id] + else: + return + else: + selected_runs = [] + idx = task_id + while idx < total_runs: + selected_runs.append(idx) + idx = idx + task_count + print("This job will perform %d out of %d total runs" % (len(selected_runs), total_runs)) + + # Run benchmark + # TODO: integrate this loop (especially status logging) into the bench class + # TODO: log additional info as artifact or directly into 
info section of json (e.g. dut, versions, date) + # TODO: log stdout of individual tasks of the job array into seperate files as artifacts (GitLab web interface is not readable) + log = [] + for run, run_id in enumerate(selected_runs): + print( + "Starting run %d/%d (id %d of %d total runs)" + % (run + 1, len(selected_runs), run_id, total_runs) + ) + + params = config_expanded[run_id] + print("Run parameters: %s" % (str(params))) + + log_dict = {"run_id": run_id, "task_id": task_id, "params": params} + + # Determine which DUT to run TODO: do this lookup more generically? + # give bench subclass name directly in config? + if config_select.startswith("mvau"): + bench_object = bench_mvau(params, task_id, run_id, artifacts_dir, save_dir) + elif config_select.startswith("transformer_radioml"): + bench_object = bench_transformer_radioml(params, task_id, run_id, artifacts_dir, save_dir) + elif config_select.startswith("transformer_gpt"): + bench_object = bench_transformer_gpt(params, task_id, run_id, artifacts_dir, save_dir) + elif config_select.startswith("transformer"): + bench_object = bench_transformer(params, task_id, run_id, artifacts_dir, save_dir) + elif config_select.startswith("fifosizing"): + bench_object = bench_fifosizing(params, task_id, run_id, artifacts_dir, save_dir) + elif config_select.startswith("metafi_fifosizing"): + bench_object = bench_metafi_fifosizing(params, task_id, run_id, artifacts_dir, save_dir) + elif config_select.startswith("resnet50_fifosizing"): + bench_object = bench_resnet50_fifosizing(params, task_id, run_id, artifacts_dir, save_dir) + else: + print("ERROR: unknown DUT specified") + + start_time = time.time() + try: + bench_object.run() + output_dict = bench_object.output_dict + if output_dict is None: + output_dict = {} + log_dict["status"] = "skipped" + print("Run skipped") + else: + log_dict["status"] = "ok" + print("Run completed") + except Exception: + output_dict = {} + log_dict["status"] = "failed" + print("Run failed: " + 
traceback.format_exc()) + exit_code = 1 + + log_dict["total_time"] = int(time.time() - start_time) + log_dict["output"] = output_dict + log.append(log_dict) + # overwrite output log file every time to allow early abort + with open(log_path, "w") as f: + json.dump(log, f, indent=2) + + # save local artifacts of this run (e.g., detailed debug info) + bench_object.save_local_artifacts_collection() + print("Stopping job") + return exit_code + #TODO: add additional exit codes (e.g. when some verification within the run failed)? + +if __name__ == "__main__": + exit_code = main(sys.argv[1]) + sys.exit(exit_code) diff --git a/benchmarking/bench_base.py b/benchmarking/bench_base.py new file mode 100644 index 0000000000..5c191d911f --- /dev/null +++ b/benchmarking/bench_base.py @@ -0,0 +1,933 @@ +import itertools +import os +import subprocess +import copy +import json +import time +import traceback +import glob +from shutil import copy as shcopy +from shutil import copytree +import finn.core.onnx_exec as oxe +from qonnx.custom_op.registry import getCustomOp +from qonnx.transformation.base import Transformation +from finn.transformation.fpgadataflow.compile_cppsim import CompileCppSim +from finn.transformation.fpgadataflow.create_stitched_ip import CreateStitchedIP +from finn.transformation.fpgadataflow.hlssynth_ip import HLSSynthIP +from finn.transformation.fpgadataflow.prepare_cppsim import PrepareCppSim +from finn.transformation.fpgadataflow.prepare_ip import PrepareIP +from finn.transformation.fpgadataflow.prepare_rtlsim import PrepareRTLSim +from finn.transformation.fpgadataflow.set_exec_mode import SetExecMode +from finn.transformation.fpgadataflow.synth_ooc import SynthOutOfContext +from finn.analysis.fpgadataflow.exp_cycles_per_layer import exp_cycles_per_layer +from finn.analysis.fpgadataflow.hls_synth_res_estimation import hls_synth_res_estimation +from finn.analysis.fpgadataflow.res_estimation import res_estimation +from 
finn.transformation.fpgadataflow.make_zynq_proj import collect_ip_dirs +from finn.util.basic import make_build_dir, pynq_native_port_width, part_map +from templates import template_open, template_single_test, template_sim_power, template_switching_simulation_tb, zynq_harness_template +from util import summarize_table, summarize_section, power_xml_to_dict, prepare_inputs, delete_dir_contents +from finn.transformation.fpgadataflow.replace_verilog_relpaths import ( + ReplaceVerilogRelPaths, +) +from qonnx.util.basic import ( + gen_finn_dt_tensor, + roundup_to_integer_multiple, +) +from finn.analysis.fpgadataflow.post_synth_res import post_synth_res +from qonnx.core.modelwrapper import ModelWrapper +from finn.builder.build_dataflow_config import DataflowBuildConfig +import pandas as pd +import onnxruntime as ort + +class MakeZYNQHarnessProject(Transformation): + """Based on MakeZYNQProject transformation, but integrates IP into test harness instead of DMA shell.""" + + def __init__(self, platform, output_dir, dut_duplication=1, clock_period_ns=10): + super().__init__() + self.platform = platform + self.output_dir = output_dir + self.dut_duplication = dut_duplication + self.clock_period_ns = clock_period_ns + + def apply(self, model): + # create a config file and empty list of xo files + config = [] + idma_idx = 0 + odma_idx = 0 + aximm_idx = 0 + axilite_idx = 0 + global_clk_ns = 0 + + # assume single stitched-ip (previously dataflowpartition) as DUT + # assume single primary input/output + input_tensor = model.graph.input[0] + output_tensor = model.graph.output[0] + input_node_inst = getCustomOp(model.find_consumer(input_tensor.name)) + output_node_inst = getCustomOp(model.find_producer(output_tensor.name)) + instream_width = input_node_inst.get_instream_width_padded() + outstream_width = output_node_inst.get_outstream_width_padded() + + # assert node.op_type == "StreamingDataflowPartition", "Invalid link graph" + # sdp_node = getCustomOp(node) + # 
dataflow_model_filename = sdp_node.get_nodeattr("model") + # kernel_model = ModelWrapper(dataflow_model_filename) + kernel_model = model + + ipstitch_path = kernel_model.get_metadata_prop("vivado_stitch_proj") + if ipstitch_path is None or (not os.path.isdir(ipstitch_path)): + raise Exception("No stitched IPI design found, apply CreateStitchedIP first.") + + vivado_stitch_vlnv = kernel_model.get_metadata_prop("vivado_stitch_vlnv") + if vivado_stitch_vlnv is None: + raise Exception("No vlnv found, apply CreateStitchedIP first.") + + ip_dirs = ["list"] + ip_dirs += collect_ip_dirs(kernel_model, ipstitch_path) + ip_dirs.append("$::env(FINN_ROOT)/benchmarking/harness/sink/ip") + ip_dirs_str = "[%s]" % (" ".join(ip_dirs)) + config.append( + "set_property ip_repo_paths " + "[concat [get_property ip_repo_paths [current_project]] %s] " + "[current_project]" % ip_dirs_str + ) + config.append("update_ip_catalog -rebuild -scan_changes") + config.append( + "import_files -fileset sources_1 -norecurse $::env(FINN_ROOT)/benchmarking/harness/vector_xor.v" + ) + + # get metadata property clk_ns to calculate clock frequency + clk_ns = float(kernel_model.get_metadata_prop("clk_ns")) + if clk_ns > global_clk_ns: + global_clk_ns = clk_ns + + ifnames = eval(kernel_model.get_metadata_prop("vivado_stitch_ifnames")) + + # instantiate DUT, TODO: switch to wrapper verilog file for (multiple-) DUT instantiation + for id in range(self.dut_duplication): + dut_instance_name = "finn_design_%d" % id + config.append( + "create_bd_cell -type ip -vlnv %s %s" % (vivado_stitch_vlnv, dut_instance_name) + ) + # sdp_node.set_nodeattr("instance_name", instance_names[node.name]) + config.append( + "connect_bd_net [get_bd_pins %s/ap_clk] [get_bd_pins axi_interconnect_0/aclk]" + % dut_instance_name + ) + config.append( + "connect_bd_net [get_bd_pins %s/ap_rst_n] [get_bd_pins axi_interconnect_0/aresetn]" + % dut_instance_name + ) + + # instantiate input harness + if instream_width > 8192: + print("ERROR: DUT 
input stream width > 8192") + raise Exception("ERROR: DUT input stream width > 8192") + elif instream_width > 4096: + num_sources = 8 + source_width = roundup_to_integer_multiple(instream_width / 8, 8) + elif instream_width > 2048: + num_sources = 4 + source_width = roundup_to_integer_multiple(instream_width / 4, 8) + elif instream_width > 1024: + num_sources = 2 + source_width = roundup_to_integer_multiple(instream_width / 2, 8) + else: + num_sources = 1 + source_width = instream_width + + if self.dut_duplication > 1: + if num_sources > 1: + print("ERROR: DUT duplication with >1024 stream width not supported!") + raise Exception("ERROR: DUT duplication with >1024 stream width not supported!") + + num_sources = self.dut_duplication # one source per DUT instance + seed = 0xABCD + for id in range(num_sources): + config.append( + "create_bd_cell -type ip -vlnv xilinx.com:ip:axi_traffic_gen:3.0 axi_traffic_gen_%d" + % id + ) + config.append( + "set_property -dict [list \ + CONFIG.C_ATG_MODE {AXI4-Stream} \ + CONFIG.C_ATG_STREAMING_MAX_LEN_BITS {1} \ + CONFIG.C_AXIS_SPARSE_EN {false} \ + CONFIG.C_AXIS_TDATA_WIDTH {%d} \ + CONFIG.C_AXIS_TDEST_WIDTH {0} \ + CONFIG.C_AXIS_TID_WIDTH {0} \ + CONFIG.C_AXIS_TUSER_WIDTH {0} \ + CONFIG.STRM_DATA_SEED {%s} \ + ] [get_bd_cells axi_traffic_gen_%d]" + % (source_width, "0x{:04X}".format(seed), id) + ) + config.append( + "connect_bd_net [get_bd_pins axi_traffic_gen_%d/s_axi_aclk] [get_bd_pins axi_interconnect_0/aclk]" + % id + ) + config.append( + "connect_bd_net [get_bd_pins axi_traffic_gen_%d/s_axi_aresetn] [get_bd_pins axi_interconnect_0/aresetn]" + % id + ) + seed = seed + 99 + + config.append( + "connect_bd_intf_net [get_bd_intf_pins axi_traffic_gen_%d/M_AXIS_MASTER] [get_bd_intf_pins finn_design_%d/s_axis_0]" + % (id, id) + ) + + else: + seed = 0xABCD + for id in range(num_sources): + config.append( + "create_bd_cell -type ip -vlnv xilinx.com:ip:axi_traffic_gen:3.0 axi_traffic_gen_%d" + % id + ) + config.append( + "set_property 
-dict [list \ + CONFIG.C_ATG_MODE {AXI4-Stream} \ + CONFIG.C_ATG_STREAMING_MAX_LEN_BITS {1} \ + CONFIG.C_AXIS_SPARSE_EN {false} \ + CONFIG.C_AXIS_TDATA_WIDTH {%d} \ + CONFIG.C_AXIS_TDEST_WIDTH {0} \ + CONFIG.C_AXIS_TID_WIDTH {0} \ + CONFIG.C_AXIS_TUSER_WIDTH {0} \ + CONFIG.STRM_DATA_SEED {%s} \ + ] [get_bd_cells axi_traffic_gen_%d]" + % (source_width, "0x{:04X}".format(seed), id) + ) + config.append( + "connect_bd_net [get_bd_pins axi_traffic_gen_%d/s_axi_aclk] [get_bd_pins axi_interconnect_0/aclk]" + % id + ) + config.append( + "connect_bd_net [get_bd_pins axi_traffic_gen_%d/s_axi_aresetn] [get_bd_pins axi_interconnect_0/aresetn]" + % id + ) + config.append( + "connect_bd_net [get_bd_pins finn_design_0/s_axis_0_tready] [get_bd_pins axi_traffic_gen_%d/m_axis_1_tready]" + % id + ) + seed = seed + 99 + + if num_sources > 1: + config.append( + "create_bd_cell -type ip -vlnv xilinx.com:ip:xlconcat:2.1 xlconcat_tdata" + ) + config.append( + "set_property CONFIG.NUM_PORTS {%d} [get_bd_cells xlconcat_tdata]" % num_sources + ) + + for id in range(num_sources): + config.append( + "connect_bd_net [get_bd_pins xlconcat_tdata/In%d] [get_bd_pins axi_traffic_gen_%d/m_axis_1_tdata]" + % (id, id) + ) + + config.append( + "connect_bd_net [get_bd_pins finn_design_0/s_axis_0_tdata] [get_bd_pins xlconcat_tdata/dout]" + ) + else: + config.append( + "connect_bd_net [get_bd_pins finn_design_0/s_axis_0_tdata] [get_bd_pins axi_traffic_gen_0/m_axis_1_tdata]" + ) + + # only connect valid from source 0 to DUT + config.append( + "connect_bd_net [get_bd_pins finn_design_0/s_axis_0_tvalid] [get_bd_pins axi_traffic_gen_0/m_axis_1_tvalid]" + ) + + # instantiate output harness + for id in range(self.dut_duplication): + config.append( + "create_bd_cell -type ip -vlnv xilinx.com:user:harness_sink:1.0 sink_%d" % id + ) + config.append( + "set_property -dict [list CONFIG.STREAM_WIDTH {%d}] [get_bd_cells sink_%d]" + % (outstream_width, id) + ) + config.append( + "connect_bd_intf_net [get_bd_intf_pins 
sink_%d/s_axis_0] [get_bd_intf_pins finn_design_%d/m_axis_0]" + % (id, id) + ) + + # GPIO control (TODO: connect interrupt) + config.append("create_bd_cell -type ip -vlnv xilinx.com:ip:axi_gpio:2.0 axi_gpio_0") + config.append( + "set_property -dict [list \ + CONFIG.C_ALL_INPUTS {0} \ + CONFIG.C_GPIO_WIDTH {5} \ + CONFIG.C_INTERRUPT_PRESENT {1} \ + ] [get_bd_cells axi_gpio_0]" + ) + config.append( + "connect_bd_intf_net [get_bd_intf_pins axi_gpio_0/S_AXI] " + "[get_bd_intf_pins axi_interconnect_0/M%02d_AXI]" % (axilite_idx) + ) + config.append("assign_axi_addr_proc axi_gpio_0/S_AXI") + axilite_idx += 1 + config.append("create_bd_cell -type ip -vlnv xilinx.com:ip:xlslice:1.0 xlslice_0") + config.append("create_bd_cell -type ip -vlnv xilinx.com:ip:xlslice:1.0 xlslice_1") + config.append("create_bd_cell -type ip -vlnv xilinx.com:ip:xlslice:1.0 xlslice_2") + config.append( + "set_property -dict [list \ + CONFIG.DIN_FROM {0} \ + CONFIG.DIN_TO {0} \ + CONFIG.DIN_WIDTH {5} \ + ] [get_bd_cells xlslice_0]" + ) + config.append( + "set_property -dict [list \ + CONFIG.DIN_FROM {1} \ + CONFIG.DIN_TO {1} \ + CONFIG.DIN_WIDTH {5} \ + ] [get_bd_cells xlslice_1]" + ) + config.append( + "set_property -dict [list \ + CONFIG.DIN_FROM {2} \ + CONFIG.DIN_TO {2} \ + CONFIG.DIN_WIDTH {5} \ + ] [get_bd_cells xlslice_2]" + ) + config.append("create_bd_cell -type ip -vlnv xilinx.com:ip:xlconcat:2.1 xlconcat_0") + config.append( + "set_property -dict [list CONFIG.IN1_WIDTH.VALUE_SRC USER CONFIG.IN2_WIDTH.VALUE_SRC USER CONFIG.IN0_WIDTH.VALUE_SRC USER] [get_bd_cells xlconcat_0]" + ) + config.append( + "set_property -dict [list \ + CONFIG.IN0_WIDTH {3} \ + CONFIG.NUM_PORTS {3} \ + ] [get_bd_cells xlconcat_0]" + ) + config.append("create_bd_cell -type ip -vlnv xilinx.com:ip:xlconstant:1.1 xlconstant_0") + config.append( + "set_property -dict [list \ + CONFIG.CONST_VAL {0} \ + CONFIG.CONST_WIDTH {3} \ + ] [get_bd_cells xlconstant_0]" + ) + config.append( + """ + connect_bd_net [get_bd_pins 
xlslice_0/Din] [get_bd_pins axi_gpio_0/gpio_io_o] + connect_bd_net [get_bd_pins xlslice_1/Din] [get_bd_pins axi_gpio_0/gpio_io_o] + connect_bd_net [get_bd_pins xlslice_2/Din] [get_bd_pins axi_gpio_0/gpio_io_o] + connect_bd_net [get_bd_pins xlconstant_0/dout] [get_bd_pins xlconcat_0/In0] + connect_bd_net [get_bd_pins axi_gpio_0/gpio_io_i] [get_bd_pins xlconcat_0/dout] + """ + ) + if self.dut_duplication > 1: + config.append("create_bd_cell -type ip -vlnv xilinx.com:ip:xlconcat:2.1 xlconcat_valid") + config.append( + "set_property CONFIG.NUM_PORTS {%d} [get_bd_cells xlconcat_valid]" + % self.dut_duplication + ) + config.append( + "create_bd_cell -type ip -vlnv xilinx.com:ip:xlconcat:2.1 xlconcat_checksum" + ) + config.append( + "set_property CONFIG.NUM_PORTS {%d} [get_bd_cells xlconcat_checksum]" + % self.dut_duplication + ) + + config.append("create_bd_cell -type module -reference vector_xor vector_xor_valid") + config.append( + "set_property CONFIG.WIDTH {%d} [get_bd_cells vector_xor_valid]" + % self.dut_duplication + ) + config.append("create_bd_cell -type module -reference vector_xor vector_xor_checksum") + config.append( + "set_property CONFIG.WIDTH {%d} [get_bd_cells vector_xor_checksum]" + % self.dut_duplication + ) + + config.append( + "connect_bd_net [get_bd_pins vector_xor_valid/in_data] [get_bd_pins xlconcat_valid/dout]" + ) + config.append( + "connect_bd_net [get_bd_pins vector_xor_checksum/in_data] [get_bd_pins xlconcat_checksum/dout]" + ) + config.append( + "connect_bd_net [get_bd_pins vector_xor_valid/out_data] [get_bd_pins xlconcat_0/In1]" + ) + config.append( + "connect_bd_net [get_bd_pins vector_xor_checksum/out_data] [get_bd_pins xlconcat_0/In2]" + ) + for id in range(self.dut_duplication): + config.append( + "connect_bd_net [get_bd_pins sink_%d/valid] [get_bd_pins xlconcat_valid/In%d]" + % (id, id) + ) + config.append( + "connect_bd_net [get_bd_pins sink_%d/checksum] [get_bd_pins xlconcat_checksum/In%d]" + % (id, id) + ) + else: + 
config.append("connect_bd_net [get_bd_pins sink_0/valid] [get_bd_pins xlconcat_0/In1]") + config.append( + "connect_bd_net [get_bd_pins sink_0/checksum] [get_bd_pins xlconcat_0/In2]" + ) + for id in range(self.dut_duplication): + config.append( + "connect_bd_net [get_bd_pins xlslice_2/Dout] [get_bd_pins sink_%d/enable]" % id + ) + for id in range(num_sources): + config.append( + "connect_bd_net [get_bd_pins xlslice_0/Dout] [get_bd_pins axi_traffic_gen_%d/core_ext_start]" + % id + ) + config.append( + "connect_bd_net [get_bd_pins xlslice_1/Dout] [get_bd_pins axi_traffic_gen_%d/core_ext_stop]" + % id + ) + + # create a temporary folder for the project + vivado_pynq_proj_dir = make_build_dir(prefix="vivado_zynq_proj_") + model.set_metadata_prop("vivado_pynq_proj", vivado_pynq_proj_dir) + + fclk_mhz = int(1 / (global_clk_ns * 0.001)) + + # create a TCL recipe for the project + ipcfg = vivado_pynq_proj_dir + "/ip_config.tcl" + config = "\n".join(config) + "\n" + with open(ipcfg, "w") as f: + f.write( + zynq_harness_template + % ( + fclk_mhz, + axilite_idx, + aximm_idx, + self.platform, + part_map[self.platform], + config, + ) + ) + + # create a TCL recipe for the project + synth_project_sh = vivado_pynq_proj_dir + "/synth_project.sh" + working_dir = os.environ["PWD"] + with open(synth_project_sh, "w") as f: + f.write("#!/bin/bash \n") + f.write("cd {}\n".format(vivado_pynq_proj_dir)) + f.write("vivado -mode batch -source %s\n" % ipcfg) + f.write("cd {}\n".format(working_dir)) + + # call the synthesis script + bash_command = ["bash", synth_project_sh] + process_compile = subprocess.Popen(bash_command, stdout=subprocess.PIPE) + process_compile.communicate() + + # collect results + os.makedirs(self.output_dir, exist_ok=True) + + bitfile_name = vivado_pynq_proj_dir + "/finn_zynq_link.runs/impl_1/top_wrapper.bit" + if not os.path.isfile(bitfile_name): + raise Exception( + "Synthesis failed, no bitfile found. 
Check logs under %s" % vivado_pynq_proj_dir + ) + hwh_name = vivado_pynq_proj_dir + "/finn_zynq_link.gen/sources_1/bd/top/hw_handoff/top.hwh" + if not os.path.isfile(hwh_name): + raise Exception( + "Synthesis failed, no hwh file found. Check logs under %s" % vivado_pynq_proj_dir + ) + synth_report_name = vivado_pynq_proj_dir + "/synth_report.xml" + model.set_metadata_prop("vivado_synth_rpt", synth_report_name) + model.set_metadata_prop("bitfile", bitfile_name) + model.set_metadata_prop("hw_handoff", hwh_name) + + shcopy(bitfile_name, self.output_dir) + shcopy(hwh_name, self.output_dir) + shcopy(synth_report_name, self.output_dir) + + post_synth_resources = model.analysis(post_synth_res) + with open(self.output_dir + "/post_synth_resources.json", "w") as f: + json.dump(post_synth_resources, f, indent=2) + + timing_rpt = ("%s/finn_zynq_link.runs/impl_1/top_wrapper_timing_summary_routed.rpt"% vivado_pynq_proj_dir) + shcopy(timing_rpt, self.output_dir + "/post_route_timing.rpt") + return (model, False) + +def step_synth_harness(model: ModelWrapper, cfg: DataflowBuildConfig): + # Build step version of above transformation (used for full builds) + model = model.transform(MakeZYNQHarnessProject( + platform=cfg.board, + output_dir=os.path.join(cfg.output_dir, "harness"), + #dut_duplication=dut_duplication, #TODO: enable for full builds + clock_period_ns=cfg.synth_clk_period_ns + )) + return model + +def start_test_batch_fast(results_path, project_path, run_target, pairs): + # Prepare tcl script + script = template_open.replace("$PROJ_PATH$", project_path) + # script = script.replace("$PERIOD$", period) + script = script.replace("$RUN$", run_target) + for toggle_rate, static_prob in pairs: + script = script + template_single_test + script = script.replace("$TOGGLE_RATE$", str(toggle_rate)) + script = script.replace("$STATIC_PROB$", str(static_prob)) + # script = script.replace("$SWITCH_TARGET$", switch_target) + script = script.replace("$REPORT_PATH$", results_path) + 
script = script.replace("$REPORT_NAME$", f"{toggle_rate}_{static_prob}") + with open(os.getcwd() + "/power_report.tcl", "w") as tcl_file: + tcl_file.write(script) + + # Prepare bash script + bash_script = os.getcwd() + "/report_power.sh" + with open(bash_script, "w") as script: + script.write("#!/bin/bash \n") + script.write(f"vivado -mode batch -source {os.getcwd()}/power_report.tcl\n") + + # Run script + sub_proc = subprocess.Popen(["bash", bash_script]) + sub_proc.communicate() + + # Parse results + for toggle_rate, static_prob in pairs: + power_report_dict = power_xml_to_dict(f"{results_path}/{toggle_rate}_{static_prob}.xml") + power_report_json = f"{results_path}/{toggle_rate}_{static_prob}.json" + with open(power_report_json, "w") as json_file: + json_file.write(json.dumps(power_report_dict, indent=2)) + + +def sim_power_report(results_path, project_path, in_width, out_width, dtype_width, sim_duration_ns): + # Prepare tcl script + script = template_open.replace("$PROJ_PATH$", project_path) + script = script.replace("$RUN$", "impl_1") + script = script + template_sim_power + script = script.replace("$TB_FILE_PATH$", os.getcwd() + "/switching_simulation_tb.v") + script = script.replace("$SAIF_FILE_PATH$", os.getcwd() + "/switching.saif") + script = script.replace("$SIM_DURATION_NS$", str(int(sim_duration_ns))) + script = script.replace("$REPORT_PATH$", results_path) + script = script.replace("$REPORT_NAME$", f"sim") + with open(os.getcwd() + "/power_report.tcl", "w") as tcl_file: + tcl_file.write(script) + + # Prepare testbench + testbench = template_switching_simulation_tb.replace("$INSTREAM_WIDTH$", str(in_width)) + testbench = testbench.replace("$OUTSTREAM_WIDTH$", str(out_width)) + testbench = testbench.replace("$DTYPE_WIDTH$", str(dtype_width)) + testbench = testbench.replace( + "$RANDOM_FUNCTION$", "$urandom_range(0, {max})".format(max=2**dtype_width - 1) + ) + with open(os.getcwd() + "/switching_simulation_tb.v", "w") as tb_file: + 
class bench():
    """Base class for a single benchmarking job.

    Subclasses provide the DUT-specific pieces (step_make_model,
    step_export_onnx, step_build, run); this base class provides the common
    estimate/HLS/rtlsim/synthesis/power steps and artifact bookkeeping.

    Fixes vs. the original check-in:
    * step_sim_power now logs ``power_dyn_<name>`` (with underscore),
      consistent with the keys written by step_synthesis.
    * the step_export_onnx stub now accepts the export path it is called
      with from steps_full_build_flow.
    """

    def __init__(self, params, task_id, run_id, artifacts_dir, save_dir, debug=True):
        """Set up directories and defaults for one benchmark run.

        params: dict of benchmark parameters (one point of the sweep).
        task_id/run_id: identifiers of the CI task and run within it.
        artifacts_dir: directory for CI-collected artifacts.
        save_dir: directory for locally saved artifacts (bitstreams, logs).
        debug: if True, also collect the FINN build and working dirs.
        """
        super().__init__()
        self.params = params
        self.task_id = task_id
        self.run_id = run_id
        self.artifacts_dir = artifacts_dir
        self.save_dir = save_dir
        self.debug = debug

        # TODO: setup a logger so output can go to console (with task id prefix)
        # and log simultaneously

        # General configuration
        # TODO: do not allow multiple targets in a single bench job due to measurement?
        self.board = params.get("board", "RFSoC2x2")

        if "part" in params:
            self.part = params["part"]
        elif self.board in part_map:
            self.part = part_map[self.board]
        else:
            raise Exception("No part specified for board %s" % self.board)

        self.clock_period_ns = params.get("clock_period_ns", 10)

        # Clear FINN tmp build dir before every run (to avoid excessive ramdisk
        # usage and duplicate debug artifacts)
        print("Clearing FINN BUILD DIR ahead of run")
        delete_dir_contents(os.environ["FINN_BUILD_DIR"])

        # Initialize output directories (might exist from other runs of the same job)
        self.artifacts_dir_models = os.path.join(self.artifacts_dir, "models")
        os.makedirs(self.artifacts_dir_models, exist_ok=True)
        self.artifacts_dir_power = os.path.join(
            self.artifacts_dir, "power_vivado", "run_%d" % (self.run_id)
        )
        os.makedirs(self.artifacts_dir_power, exist_ok=True)

        self.save_dir_bitstreams = os.path.join(self.save_dir, "bitstreams")
        os.makedirs(self.save_dir_bitstreams, exist_ok=True)

        # Intermediate models saved between steps
        # TODO: create setter functions for intermediate models or other artifacts
        # that log them to gitlab artifacts or local dir automatically
        self.model_initial = None
        self.model_step_hls = None
        self.model_step_synthesis = None

        # Dictionary collecting all benchmark results
        self.output_dict = {}

        # Inputs (e.g., ONNX model, golden I/O pair, folding config, etc.)
        # for custom FINN build flow
        self.build_inputs = {}

        # Tuples of (name, source path) to save as local artifacts upon run
        # completion or fail by exception
        self.local_artifacts_collection = []
        if self.debug:
            # Save entire FINN build dir and working dir
            # TODO: add option to only save upon exception (in FINN builder or
            # benchmarking infrastructure)
            self.local_artifacts_collection.append(("finn_tmp", os.environ["FINN_BUILD_DIR"]))
            self.local_artifacts_collection.append(("finn_cwd", os.environ["FINN_ROOT"]))

    def save_artifact(self, name, source_path):
        """Copy a file or directory into the CI artifacts dir under name/run_<id>."""
        target_path = os.path.join(self.artifacts_dir, name, "run_%d" % (self.run_id))
        os.makedirs(target_path, exist_ok=True)
        if os.path.isdir(source_path):
            copytree(source_path, target_path, dirs_exist_ok=True)
        else:
            shcopy(source_path, target_path)

    def save_local_artifact(self, name, source_path):
        """Copy a file or directory into the local save dir under name/run_<id>."""
        target_path = os.path.join(self.save_dir, name, "run_%d" % (self.run_id))
        os.makedirs(target_path, exist_ok=True)
        if os.path.isdir(source_path):
            copytree(source_path, target_path, dirs_exist_ok=True)
        else:
            shcopy(source_path, target_path)

    def save_local_artifacts_collection(self):
        """Flush the queued (name, path) artifacts; call on success OR failure."""
        for (name, source_path) in self.local_artifacts_collection:
            self.save_local_artifact(name, source_path)

    def step_make_model(self):
        # may be implemented in subclass
        pass

    def step_export_onnx(self, onnx_path=None):
        # may be implemented in subclass
        # NOTE: steps_full_build_flow calls this with the export path; the
        # original stub took no argument and would have raised TypeError.
        pass

    def step_build(self):
        # may be implemented in subclass
        pass

    def run(self):
        # must be implemented in subclass
        pass

    def step_finn_estimate(self):
        """Gather FINN analytical resource/cycle estimates into output_dict."""
        print("Gathering FINN estimates")

        model = self.model_initial
        finn_resources_model = res_estimation(model, fpgapart=self.part)
        finn_cycles_model = model.analysis(exp_cycles_per_layer)
        # NOTE(review): self.target_node is expected to be set by the subclass
        # before this step runs — confirm.
        if self.target_node:
            node = model.get_nodes_by_op_type(self.target_node)[0]
            finn_resources = finn_resources_model[node.name]
            finn_cycles = finn_cycles_model[node.name]
        else:
            finn_resources = finn_resources_model  # TODO: aggregate?
            finn_cycles = 0  # TODO: aggregate or drop
        finn_estimates = finn_resources
        finn_estimates["CYCLES"] = finn_cycles
        self.output_dict["finn_estimates"] = finn_estimates

    def step_hls(self):
        """Run Vitis HLS synthesis and record HLS resource estimates."""
        start_time = time.time()
        print("Performing Vitis HLS synthesis")
        model = self.model_initial
        model = model.transform(PrepareIP(self.part, self.clock_period_ns))
        model = model.transform(HLSSynthIP())

        hls_resources_model = model.analysis(hls_synth_res_estimation)
        if self.target_node:
            node = model.get_nodes_by_op_type(self.target_node)[0]
            hls_resources = hls_resources_model[node.name]
        else:
            hls_resources = hls_resources_model  # TODO: aggregate?
        self.output_dict["hls_estimates"] = hls_resources
        self.output_dict["hls_time"] = int(time.time() - start_time)

        self.model_step_hls = copy.deepcopy(model)

    def step_rtlsim(self):
        """Run one-sample Verilator RTL simulation and record the cycle count."""
        start_time = time.time()
        print("Performing Verilator RTL simulation (n=1)")
        # Prepare
        model = self.model_step_hls
        model = model.transform(SetExecMode("rtlsim"))
        model = model.transform(PrepareRTLSim())
        # Generate input data
        input_tensor = model.graph.input[0]
        input_shape = model.get_tensor_shape(input_tensor.name)
        input_dtype = model.get_tensor_datatype(input_tensor.name)
        x = gen_finn_dt_tensor(input_dtype, input_shape)
        input_dict = prepare_inputs(x, input_dtype, None)  # TODO: fix Bipolar conversion case
        # Run (output correctness is not checked here)
        # TODO: add functional verification throughout benchmarking steps
        oxe.execute_onnx(model, input_dict)["outp"]
        # Log result
        node = model.get_nodes_by_op_type("MVAU_hls")[0]
        inst = getCustomOp(node)
        rtlsim_cycles = inst.get_nodeattr("cycles_rtlsim")
        self.output_dict["rtlsim_cycles"] = rtlsim_cycles
        self.output_dict["rtlsim_time"] = int(time.time() - start_time)

    def step_synthesis(self):
        """Run stitched-IP out-of-context Vivado synthesis plus vectorless
        power estimation at several toggle rates."""
        # TODO: avoid duplicate synthesis by using shell build also for
        # post_synth_resources and power sim?
        # TODO: check OMX synth strategy again!
        start_time = time.time()
        print("Performing Vivado (stitched-ip, out-of-context) synthesis")
        model = self.model_step_hls
        model = model.transform(ReplaceVerilogRelPaths())
        model = model.transform(CreateStitchedIP(self.part, self.clock_period_ns))
        model = model.transform(
            SynthOutOfContext(part=self.part, clk_period_ns=self.clock_period_ns)
        )
        # NOTE: metadata prop is a dict repr produced by FINN itself
        ooc_synth_results = eval(model.get_metadata_prop("res_total_ooc_synth"))

        start_test_batch_fast(
            results_path=self.artifacts_dir_power,
            project_path=os.path.join(
                ooc_synth_results["vivado_proj_folder"], "vivadocompile", "vivadocompile.xpr"
            ),
            run_target="impl_1",
            pairs=[(25, 0.5), (50, 0.5), (75, 0.5)],
        )

        # Log most important power results directly (refer to detailed logs for more)
        for reportname in ["25_0.5", "50_0.5", "75_0.5"]:
            with open(os.path.join(self.artifacts_dir_power, "%s.json" % reportname), "r") as f:
                report = json.load(f)
                power = float(report["Summary"]["tables"][0]["Total On-Chip Power (W)"][0])
                power_dyn = float(report["Summary"]["tables"][0]["Dynamic (W)"][0])
                ooc_synth_results["power_%s" % reportname] = power
                ooc_synth_results["power_dyn_%s" % reportname] = power_dyn

        self.output_dict["ooc_synth"] = ooc_synth_results
        self.output_dict["ooc_synth_time"] = int(time.time() - start_time)

        # Save model for logging purposes
        model.save(
            os.path.join(self.artifacts_dir_models, "model_%d_synthesis.onnx" % (self.run_id))
        )
        self.model_step_synthesis = copy.deepcopy(model)

    def step_sim_power(self):
        """Run a switching simulation for an accurate power report.

        Requires step_synthesis to have populated output_dict["ooc_synth"].
        """
        start_time = time.time()
        if "ooc_synth" not in self.output_dict:
            # best-effort warning; the KeyError below will surface the problem
            print("ERROR: step_sim_power requires step_synthesis")
        print("Performing Vivado simulation for power report")
        if "rtlsim_cycles" in self.output_dict:
            sim_duration_ns = self.output_dict["rtlsim_cycles"] * 3 * self.clock_period_ns
        else:
            sim_duration_ns = (
                self.output_dict["finn_estimates"]["CYCLES"] * 3 * self.clock_period_ns
            )

        model = self.model_step_synthesis
        input_tensor = model.graph.input[0]
        output_tensor = model.graph.output[0]
        input_node_inst = getCustomOp(model.find_consumer(input_tensor.name))
        output_node_inst = getCustomOp(model.find_producer(output_tensor.name))
        sim_power_report(
            results_path=self.artifacts_dir_power,
            project_path=os.path.join(
                self.output_dict["ooc_synth"]["vivado_proj_folder"],
                "vivadocompile",
                "vivadocompile.xpr",
            ),
            in_width=input_node_inst.get_instream_width(),
            out_width=output_node_inst.get_outstream_width(),
            dtype_width=model.get_tensor_datatype(input_tensor.name).bitwidth(),
            sim_duration_ns=sim_duration_ns,
        )

        # Log most important power results directly (refer to detailed logs for more)
        for reportname in ["sim"]:
            with open(os.path.join(self.artifacts_dir_power, "%s.json" % reportname), "r") as f:
                report = json.load(f)
                power = float(report["Summary"]["tables"][0]["Total On-Chip Power (W)"][0])
                power_dyn = float(report["Summary"]["tables"][0]["Dynamic (W)"][0])
                self.output_dict["power_%s" % reportname] = power
                # FIX: key was "power_dyn%s" (missing underscore), inconsistent
                # with the "power_dyn_<name>" keys written by step_synthesis
                self.output_dict["power_dyn_%s" % reportname] = power_dyn

        self.output_dict["sim_power_time"] = int(time.time() - start_time)

    def step_synth_power(self):
        """Synthesize with the test harness for on-hardware power measurement."""
        start_time = time.time()
        if self.model_step_hls is None:
            # best-effort warning; the transform below will fail on None
            print("ERROR: step_synth_power requires step_hls")
        print("Performing Vivado synthesis with test harness integration for power measurement")

        dut_duplication = self.params.get("dut_duplication", 1)

        model = self.model_step_hls.transform(ReplaceVerilogRelPaths())
        model = model.transform(CreateStitchedIP(self.part, self.clock_period_ns))

        build_dir = "temp_output_harness_build"
        # TODO: replace hold harness with new instr wrapper implementation
        # TODO: if synth fails this could contain stale bitstreams which will be power tested
        model = model.transform(
            MakeZYNQHarnessProject(
                platform=self.board,
                output_dir=build_dir,
                dut_duplication=dut_duplication,
                clock_period_ns=self.clock_period_ns,
            )
        )

        # COPY bitstreams and other outputs
        # TODO: integrate better (e.g. as artifact) and remove redundant copy
        # TODO: make this more configurable or switch to job/artifact based power measurement
        shcopy(
            os.path.join(build_dir, "top_wrapper.bit"),
            os.path.join(self.save_dir_bitstreams, "run_%d.bit" % self.run_id),
        )
        shcopy(
            os.path.join(build_dir, "top.hwh"),
            os.path.join(self.save_dir_bitstreams, "run_%d.hwh" % self.run_id),
        )
        shcopy(
            os.path.join(build_dir, "synth_report.xml"),
            os.path.join(self.save_dir_bitstreams, "run_%d.xml" % self.run_id),
        )
        # frequency in MHz derived from the clock period in ns
        clock_freq_mhz = int(1.0 / self.clock_period_ns * 1000.0)
        measurement_settings = {"freq_mhz": clock_freq_mhz}
        with open(
            os.path.join(self.save_dir_bitstreams, "run_%d_settings.json" % self.run_id), "w"
        ) as f:
            json.dump(measurement_settings, f, indent=2)

        self.output_dict["synth_power_time"] = int(time.time() - start_time)

        # Save model for logging purposes
        model.save(
            os.path.join(self.artifacts_dir_models, "model_%d_synth_power.onnx" % (self.run_id))
        )

    def step_parse_builder_output(self, build_dir):
        """Parse selected reports/logs of a full FINN builder flow into output_dict."""
        # COPY bitstreams and other outputs
        # TODO: integrate better (e.g. as artifact) and remove redundant copy
        # TODO: make this more configurable or switch to job/artifact based power measurement
        # TODO: make compatible to new instr wrapper (or however we generate these outputs)
        shcopy(
            os.path.join(build_dir, "harness/top_wrapper.bit"),
            os.path.join(self.save_dir_bitstreams, "run_%d.bit" % self.run_id),
        )
        shcopy(
            os.path.join(build_dir, "harness/top.hwh"),
            os.path.join(self.save_dir_bitstreams, "run_%d.hwh" % self.run_id),
        )
        shcopy(
            os.path.join(build_dir, "harness/synth_report.xml"),
            os.path.join(self.save_dir_bitstreams, "run_%d.xml" % self.run_id),
        )
        clock_freq_mhz = int(1.0 / self.clock_period_ns * 1000.0)
        measurement_settings = {"freq_mhz": clock_freq_mhz}
        with open(
            os.path.join(self.save_dir_bitstreams, "run_%d_settings.json" % self.run_id), "w"
        ) as f:
            json.dump(measurement_settings, f, indent=2)

        # CHECK FOR VERIFICATION STEP SUCCESS
        # Collect all verification output filenames
        outputs = glob.glob(os.path.join(build_dir, "verification_output/*.npy"))
        # Extract the verification status for each verification output by matching
        # to the SUCCESS string contained in the filename
        # NOTE(review): an empty outputs list yields all([]) == True, i.e.
        # "success" with zero verification outputs — confirm this is intended.
        status = all([out.split("_")[-1].split(".")[0] == "SUCCESS" for out in outputs])

        # Construct a dictionary reporting the verification status as string
        self.output_dict["builder_verification"] = {
            "verification": {True: "success", False: "fail"}[status]
        }
        # TODO: mark job as failed if verification fails

        # PARSE LOGS
        report_path = os.path.join(build_dir, "harness/post_synth_resources.json")
        # TODO: check multiple possible sources for this log (e.g. if OOC synth
        # or Zynbuild was run)
        report_filter = "(top)"
        with open(report_path) as fh:
            # Load the JSON formatted report
            report = pd.read_json(fh, orient="index")
            # Filter the reported rows according to some regex filter rule
            report = report.filter(regex=report_filter, axis="rows")
            # Generate a summary of the total resources
            summary = report.sum()

        # TODO: parse finn estimates, hls estimates, step times, (rtlsim n=1, n=100)
        # TODO: add vivado latency simulation for special transformer case

        self.output_dict["builder"] = summary.to_dict()

    def steps_simple_model_flow(self):
        """Default step sequence for a simple model (mostly single custom_ops)."""
        do_hls = self.params.get("do_hls", False)
        do_rtlsim = self.params.get("do_rtlsim", False)
        do_synthesis = self.params.get("do_synthesis", False)
        do_sim_power = self.params.get("do_sim_power", False)
        do_synth_power = self.params.get("do_synth_power", False)

        # Perform steps
        model, dut_info = self.step_make_model()

        # Save model for logging purposes
        # TODO: benchmarking infrastructure could be integrated deeper into ONNX IR
        # and FINN custom_op/transformation infrastructure. E.g. parameters and paths
        # could be stored as onnx attributes and benchmarking steps as generic or
        # specialized custom_op transformations
        model.save(os.path.join(self.artifacts_dir_models, "model_%d_initial.onnx" % (self.run_id)))

        # Save model for use in other steps
        self.model_initial = model

        # Log dict reported by DUT-specific scripts to overall result dict.
        # E.g. this could contain SIMD/PE derived from folding factors or weight
        # distribution information
        self.output_dict["info"] = dut_info

        self.step_finn_estimate()

        if do_hls:
            self.step_hls()
        if do_rtlsim:
            self.step_rtlsim()
        if do_synthesis:
            self.step_synthesis()
        if do_sim_power:
            self.step_sim_power()
        if do_synth_power:
            self.step_synth_power()

    def steps_full_build_flow(self):
        """Default step sequence for benchmarking a full FINN builder flow."""
        # Use a temporary dir for buildflow-related files (next to FINN_BUILD_DIR).
        # Ensure it exists but is empty (clear potential artifacts from previous runs)
        tmp_buildflow_dir = os.path.join(os.environ["PATH_WORKDIR"], "buildflow")
        os.makedirs(tmp_buildflow_dir, exist_ok=True)
        delete_dir_contents(tmp_buildflow_dir)
        self.build_inputs["build_dir"] = os.path.join(tmp_buildflow_dir, "build_output")
        os.makedirs(self.build_inputs["build_dir"], exist_ok=True)
        self.local_artifacts_collection.append(("build_output", self.build_inputs["build_dir"]))

        if "model_dir" in self.params:
            # input ONNX model and verification input/output pairs are provided
            model_dir = self.params["model_dir"]
            self.build_inputs["onnx_path"] = os.path.join(model_dir, "model.onnx")
            self.build_inputs["input_npy_path"] = os.path.join(model_dir, "inp.npy")
            self.build_inputs["output_npy_path"] = os.path.join(model_dir, "out.npy")
        elif "model_path" in self.params:
            self.build_inputs["onnx_path"] = self.params["model_path"]
        else:
            # input ONNX model (+ optional I/O pair for verification) will be generated
            self.build_inputs["onnx_path"] = os.path.join(tmp_buildflow_dir, "model_export.onnx")
            self.step_export_onnx(self.build_inputs["onnx_path"])
            self.save_local_artifact("model_step_export", self.build_inputs["onnx_path"])

        # Optional extra build inputs, forwarded verbatim when present
        for key in ("folding_path", "specialize_path", "floorplan_path"):
            if key in self.params:
                self.build_inputs[key] = self.params[key]

        self.step_build()

        self.step_parse_builder_output(self.build_inputs["build_dir"])
def make_single_im2col_modelwrapper(k, ifm_ch, ifm_dim, ofm_dim, stride, dilation, idt):
    """Build a one-node QONNX model containing a generic Im2Col op.

    All geometry arguments are (height, width) pairs; idt is the FINN
    DataType of the input (the output uses the same type).
    """
    k_h, k_w = k
    ifm_dim_h, ifm_dim_w = ifm_dim
    stride_h, stride_w = stride
    dilation_h, dilation_w = dilation
    ofm_dim_h, ofm_dim_w = ofm_dim

    odt = idt
    inp = helper.make_tensor_value_info(
        "inp", TensorProto.FLOAT, [1, ifm_dim_h, ifm_dim_w, ifm_ch]
    )
    outp = helper.make_tensor_value_info(
        "outp", TensorProto.FLOAT, [1, ofm_dim_h, ofm_dim_w, k_h * k_w * ifm_ch]
    )

    im2col_node = helper.make_node(
        "Im2Col",
        ["inp"],
        ["outp"],
        domain="finn.custom_op.general",
        stride=[stride_h, stride_w],
        kernel_size=[k_h, k_w],
        input_shape=str((1, ifm_dim_h, ifm_dim_w, ifm_ch)),
        dilations=[dilation_h, dilation_w],
        pad_amount=[0, 0, 0, 0],
        pad_value=0,
    )
    graph = helper.make_graph(
        nodes=[im2col_node], name="im2col_graph", inputs=[inp], outputs=[outp]
    )

    model = helper.make_model(graph, producer_name="im2col-model")
    model = ModelWrapper(model)

    model.set_tensor_datatype("inp", idt)
    model.set_tensor_datatype("outp", odt)

    return model


def make_single_slidingwindow_modelwrapper(
    type,
    k,
    ifm_ch,
    ifm_dim,
    ofm_dim,
    simd,
    m,
    parallel_window,
    stride,
    dilation,
    idt,
    dw=0,
    ram_style="auto",
):
    """Build a one-node QONNX model containing a ConvolutionInputGenerator
    variant; ``type`` selects the op (e.g. ConvolutionInputGenerator_rtl).

    NOTE: the parameter name ``type`` shadows the builtin but is kept for
    interface compatibility with existing callers.
    """
    k_h, k_w = k
    ifm_dim_h, ifm_dim_w = ifm_dim
    stride_h, stride_w = stride
    dilation_h, dilation_w = dilation
    ofm_dim_h, ofm_dim_w = ofm_dim

    odt = idt
    inp = helper.make_tensor_value_info(
        "inp", TensorProto.FLOAT, [1, ifm_dim_h, ifm_dim_w, ifm_ch]
    )
    outp = helper.make_tensor_value_info(
        "outp", TensorProto.FLOAT, [1, ofm_dim_h, ofm_dim_w, k_h * k_w * ifm_ch]
    )

    SlidingWindow_node = helper.make_node(
        type,
        ["inp"],
        ["outp"],
        domain="finn.custom_op.fpgadataflow",
        backend="fpgadataflow",
        ConvKernelDim=[k_h, k_w],
        IFMChannels=ifm_ch,
        IFMDim=[ifm_dim_h, ifm_dim_w],
        OFMDim=[ofm_dim_h, ofm_dim_w],
        SIMD=simd,
        M=m,
        parallel_window=parallel_window,
        Stride=[stride_h, stride_w],
        Dilation=[dilation_h, dilation_w],
        inputDataType=idt.name,
        outputDataType=odt.name,
        depthwise=dw,
        ram_style=ram_style,
    )
    graph = helper.make_graph(
        nodes=[SlidingWindow_node],
        name="slidingwindow_graph",
        inputs=[inp],
        outputs=[outp],
    )

    model = helper.make_model(graph, producer_name="slidingwindow-model")
    model = ModelWrapper(model)

    model.set_tensor_datatype("inp", idt)
    model.set_tensor_datatype("outp", odt)

    # DEBUG
    # swg_node = model.get_nodes_by_op_type("ConvolutionInputGenerator_rtl")[0]
    # swg_inst = getCustomOp(swg_node)
    # swg_inst.set_nodeattr("rtlsim_trace", "/workspace/finn/finn-rtllib/swg/swg_test_trace.vcd")

    return model


def prepare_inputs(input_tensor):
    """Wrap the input tensor in the execution input dict expected by oxe."""
    return {"inp": input_tensor}


def bench_rtl_swg(params, task_id, run_id, results_dir):
    """Benchmark the RTL sliding-window generator for one parameter point.

    Returns a dict of estimates/measurements, or None when the parameter
    combination is skipped as invalid/redundant.

    FIX vs. original: the HLS-comparison squareness check overwrote
    ``is_square`` on every loop iteration, so effectively only the dilation
    pair was compared; now all geometry pairs are checked via all().
    """
    # Read params
    idt = params["idt"]
    k = params["k"]
    ifm_dim = params["ifm_dim"]
    ifm_ch = params["ifm_ch"]
    stride = params["stride"]
    dilation = params["dilation"]
    dw = params["dw"]
    simd = params["simd"]
    m = params["m"]
    parallel_window = params["parallel_window"]
    flip = params["flip"]
    ram_style = params["ram_style"]

    only_estimates = params["only_estimates"]
    skip_rtlsim = params["skip_rtlsim"]
    skip_synth = params["skip_synth"]
    synthesize_hls_comparison = params["synthesize_hls_comparison"]

    output_dict = {}

    # convert string to FINN DataType
    idt = DataType[idt]

    if flip:
        # flipped run is redundant when the geometry is fully symmetric
        if (
            ifm_dim[0] == ifm_dim[1]
            and k[0] == k[1]
            and stride[0] == stride[1]
            and dilation[0] == dilation[1]
        ):
            return
        k = k[::-1]
        ifm_dim = ifm_dim[::-1]
        stride = stride[::-1]
        dilation = dilation[::-1]

    k_h, k_w = k
    ifm_dim_h, ifm_dim_w = ifm_dim
    stride_h, stride_w = stride
    dilation_h, dilation_w = dilation

    kernel_width = (k_w - 1) * dilation_w + 1  # incl. dilation
    kernel_height = (k_h - 1) * dilation_h + 1  # incl. dilation

    # inter-dependent test parameters
    if simd == "ifm_ch":
        simd = ifm_ch

    # skip conditions (invalid or unsupported parameter combinations)
    if simd > ifm_ch:
        return
    if ifm_ch % simd != 0:
        return
    if kernel_width > ifm_dim_w or stride_w > ifm_dim_w:
        return
    if kernel_height > ifm_dim_h or stride_h > ifm_dim_h:
        return
    if (k_h == 1 and (stride_h != 1 or dilation_h != 1)) or (
        k_w == 1 and (stride_w != 1 or dilation_w != 1)
    ):
        return
    if k_h == 1 and k_w == 1 and simd != ifm_ch:
        return
    if parallel_window and simd != ifm_ch:
        return
    if not parallel_window and m > 1:
        return

    ofm_dim_h = compute_conv_output_dim(ifm_dim_h, k_h, stride_h, 0, dilation_h)
    ofm_dim_w = compute_conv_output_dim(ifm_dim_w, k_w, stride_w, 0, dilation_w)
    ofm_dim = [ofm_dim_h, ofm_dim_w]

    x = gen_finn_dt_tensor(idt, (1, ifm_dim_h, ifm_dim_w, ifm_ch))
    model = make_single_slidingwindow_modelwrapper(
        type="ConvolutionInputGenerator_rtl",
        k=k,
        ifm_ch=ifm_ch,
        ifm_dim=ifm_dim,
        ofm_dim=ofm_dim,
        simd=simd,
        m=m,
        parallel_window=parallel_window,
        stride=stride,
        dilation=dilation,
        idt=idt,
        dw=dw,
        ram_style=ram_style,
    )

    model = model.transform(SetExecMode("rtlsim"))
    model = model.transform(GiveUniqueNodeNames())
    if not only_estimates:
        model = model.transform(PrepareIP("xczu7ev-ffvc1156-2-e", 5))
        model = model.transform(PrepareRTLSim())

    node = model.get_nodes_by_op_type("ConvolutionInputGenerator_rtl")[0]
    inst = getCustomOp(node)

    # FINN analytical estimates
    exp_cycles_dict = model.analysis(exp_cycles_per_layer)
    exp_cycles = exp_cycles_dict[node.name]
    exp_res_dict = model.analysis(res_estimation)
    exp_res = exp_res_dict[node.name]

    output_dict["est_Cycles"] = exp_cycles
    output_dict["est_LUT"] = exp_res["LUT"]
    output_dict["est_BRAM"] = exp_res["BRAM_18K"] * 0.5
    output_dict["est_URAM"] = exp_res["URAM"]

    if only_estimates:
        return output_dict

    if not skip_rtlsim:
        # prepare input data
        input_dict = prepare_inputs(x)
        # execute model
        oxe.execute_onnx(model, input_dict)["outp"]

        cycles_rtlsim = inst.get_nodeattr("cycles_rtlsim")
        output_dict["Cycles"] = cycles_rtlsim
        print("RTLSIM cycles: %d" % cycles_rtlsim)

    if not skip_synth:
        model = model.transform(ReplaceVerilogRelPaths())
        model = model.transform(CreateStitchedIP("xczu7ev-ffvc1156-2-e", 5))
        model = model.transform(SynthOutOfContext(part="xczu7ev-ffvc1156-2-e", clk_period_ns=5))
        # NOTE: metadata prop is a dict repr produced by FINN itself
        ooc_res_dict = eval(model.get_metadata_prop("res_total_ooc_synth"))
        output_dict["LUT"] = ooc_res_dict["LUT"]
        output_dict["BRAM"] = ooc_res_dict["BRAM_18K"] * 0.5 + ooc_res_dict["BRAM_36K"]
        output_dict["URAM"] = ooc_res_dict["URAM"]
        output_dict["WNS"] = ooc_res_dict["WNS"]
        output_dict["Fmax"] = ooc_res_dict["fmax_mhz"]

    ###############################################################
    # HLS COMPARISON:
    if synthesize_hls_comparison:
        output_dict["HLS_compatible"] = "yes"

        # FIX: check all geometry pairs; the original loop overwrote the flag
        # each iteration so only dilation was effectively compared.
        props_to_check = [k, ifm_dim, ofm_dim, stride, dilation]
        is_square = all(prop[0] == prop[1] for prop in props_to_check)

        if not is_square or dilation[0] != 1 or dilation[1] != 1:
            # try 1D HLS ConvInpGen

            # rectangular case not supported
            if ifm_dim[0] == 1:
                if ofm_dim[0] != 1 or k[0] != 1 or stride[0] != 1 or dilation[0] != 1:
                    output_dict["HLS_compatible"] = "no"
            elif ifm_dim[1] == 1:
                if ofm_dim[1] != 1 or k[1] != 1 or stride[1] != 1 or dilation[1] != 1:
                    output_dict["HLS_compatible"] = "no"
            else:
                output_dict["HLS_compatible"] = "no"

            # unsupported parallelization
            if m > 1:
                output_dict["HLS_compatible"] = "no"
            if parallel_window > 0:
                fully_unfolded = simd == ifm_ch
                non_dws = dw == 0
                no_stride = stride_h == 1 and stride_w == 1
                no_dilation = dilation_h == 1 and dilation_w == 1
                supported_ram_style = ram_style in ["auto", "distributed"]
                if not (
                    fully_unfolded and non_dws and no_stride and no_dilation and supported_ram_style
                ):
                    output_dict["HLS_compatible"] = "no"

            # unsupported hyperparams
            if (dilation_h > 1 or dilation_w > 1) and (stride_h > 1 or stride_w > 1):
                output_dict["HLS_compatible"] = "no"
            if (dilation_h > 1 or dilation_w > 1) and dw == 0:
                output_dict["HLS_compatible"] = "no"

            model = make_single_slidingwindow_modelwrapper(
                type="ConvolutionInputGenerator1D",
                k=k,
                ifm_ch=ifm_ch,
                ifm_dim=ifm_dim,
                ofm_dim=ofm_dim,
                simd=simd,
                m=m,
                parallel_window=parallel_window,
                stride=stride,
                dilation=dilation,
                idt=idt,
                dw=dw,
                ram_style=ram_style,
            )
        else:
            # try 2D HLS ConvInpGen

            # unsupported parallelization
            if m > 1 or parallel_window > 0:
                output_dict["HLS_compatible"] = "no"

            model = make_single_slidingwindow_modelwrapper(
                type="ConvolutionInputGenerator",
                k=k,
                ifm_ch=ifm_ch,
                ifm_dim=ifm_dim,
                ofm_dim=ofm_dim,
                simd=simd,
                m=m,
                parallel_window=parallel_window,
                stride=stride,
                dilation=dilation,
                idt=idt,
                dw=dw,
                ram_style=ram_style,
            )

        if output_dict["HLS_compatible"] == "no":
            return output_dict

        # perform usual RTLSIM steps
        model = model.transform(SetExecMode("rtlsim"))
        model = model.transform(GiveUniqueNodeNames())
        model = model.transform(PrepareIP("xczu7ev-ffvc1156-2-e", 5))
        model = model.transform(HLSSynthIP())
        model = model.transform(PrepareRTLSim())

        # extract first results (estimates)
        node_ = model.get_nodes_by_op_type("ConvolutionInputGenerator")
        if len(node_) == 0:
            node_ = model.get_nodes_by_op_type("ConvolutionInputGenerator1D")
        node = node_[0]
        inst = getCustomOp(node)

        exp_cycles_dict = model.analysis(exp_cycles_per_layer)
        exp_cycles = exp_cycles_dict[node.name]
        output_dict["HLS_FINN_est_Cycles"] = exp_cycles

        exp_res_dict = model.analysis(res_estimation)
        exp_res = exp_res_dict[node.name]
        output_dict["HLS_FINN_est_LUT"] = exp_res["LUT"]
        output_dict["HLS_FINN_est_BRAM"] = exp_res["BRAM_18K"] * 0.5
        output_dict["HLS_FINN_est_URAM"] = exp_res["URAM"]

        exp_res_dict_hls = model.analysis(hls_synth_res_estimation)
        exp_res_hls = exp_res_dict_hls[node.name]
        output_dict["HLS_HLS_est_LUT"] = int(exp_res_hls["LUT"])
        output_dict["HLS_HLS_est_BRAM"] = int(exp_res_hls["BRAM_18K"]) * 0.5
        output_dict["HLS_HLS_est_URAM"] = int(exp_res_hls["URAM"])

        # perform rtlsim (for cycle measurement)
        if not skip_rtlsim:
            input_dict = prepare_inputs(x)
            oxe.execute_onnx(model, input_dict)["outp"]
            cycles_rtlsim = inst.get_nodeattr("cycles_rtlsim")
            output_dict["HLS_Cycles"] = cycles_rtlsim

        # perform ooc synthesis (for resource/slack measurement)
        model = model.transform(ReplaceVerilogRelPaths())
        model = model.transform(CreateStitchedIP("xczu7ev-ffvc1156-2-e", 5))
        model = model.transform(SynthOutOfContext(part="xczu7ev-ffvc1156-2-e", clk_period_ns=5))
        ooc_res_dict = eval(model.get_metadata_prop("res_total_ooc_synth"))
        output_dict["HLS_LUT"] = ooc_res_dict["LUT"]
        output_dict["HLS_BRAM"] = ooc_res_dict["BRAM_18K"] * 0.5 + ooc_res_dict["BRAM_36K"]
        output_dict["HLS_URAM"] = ooc_res_dict["URAM"]
        output_dict["HLS_WNS"] = ooc_res_dict["WNS"]
        output_dict["HLS_Fmax"] = ooc_res_dict["fmax_mhz"]

    return output_dict
+1,17 @@ +[ + { + "model_path": ["/pc2/groups/hpc-prf-ekiapp/felix/ci_models/metafi/model.onnx"], + "folding_path": ["/pc2/groups/hpc-prf-ekiapp/felix/ci_models/metafi/auto_folding_config _metaFi_f25.json"], + + "board": ["RFSoC2x2"], + "clock_period_ns": [10], + + "strategy": ["analytical"], + + "rtlsim_n": [10], + "throughput_factor_threshold": [0.9], + "fifo_reduction_skip_threshold": [1024], + "fifo_reduction_factor": [0.5], + "fifo_reduction_throughput_drop_threshold": [0.01] + } + ] \ No newline at end of file diff --git a/benchmarking/cfg/mvau_test.json b/benchmarking/cfg/mvau_test.json new file mode 100644 index 0000000000..0c3abdb574 --- /dev/null +++ b/benchmarking/cfg/mvau_test.json @@ -0,0 +1,29 @@ +[ + { + "idt": ["INT4","INT2"], + "wdt": ["INT4"], + "act": ["INT4"], + + "sparsity_type": ["none"], + "sparsity_amount": [0], + + "nhw": [[1,32,32]], + "mw": [64], + "mh": [64], + "sf": [-1], + "nf": [-1], + "m": [1], + + "mem_mode": ["internal_embedded"], + "ram_style": ["distributed"], + "ram_style_thr": ["distributed"], + + "do_hls": [true], + "do_rtlsim": [true], + "do_synthesis": [true], + "do_sim_power": [true], + "do_synth_power": [true], + + "dut_duplication": [1] + } + ] diff --git a/benchmarking/cfg/resnet50_fifosizing_test.json b/benchmarking/cfg/resnet50_fifosizing_test.json new file mode 100644 index 0000000000..1e85b972da --- /dev/null +++ b/benchmarking/cfg/resnet50_fifosizing_test.json @@ -0,0 +1,19 @@ +[ + { + "model_path": ["/pc2/groups/hpc-prf-ekiapp/felix/ci_models/ResNet-50/resnet50_w1a2_exported.onnx"], + "folding_path": ["/pc2/groups/hpc-prf-ekiapp/felix/ci_models/ResNet-50/U250_folding_config.json"], + "specialize_path": ["/pc2/groups/hpc-prf-ekiapp/felix/ci_models/ResNet-50/U250_specialize_layers.json"], + "floorplan_path": ["/pc2/groups/hpc-prf-ekiapp/felix/ci_models/ResNet-50/floorplan_resnet50.json"], + + "board": ["U250"], + "clock_period_ns": [4], + + "strategy": ["analytical"], + + "rtlsim_n": [2], + 
"throughput_factor_threshold": [0.9], + "fifo_reduction_skip_threshold": [1024], + "fifo_reduction_factor": [0.5], + "fifo_reduction_throughput_drop_threshold": [0.01] + } + ] \ No newline at end of file diff --git a/benchmarking/cfg/transformer_gpt_all.json b/benchmarking/cfg/transformer_gpt_all.json new file mode 100644 index 0000000000..27c426606e --- /dev/null +++ b/benchmarking/cfg/transformer_gpt_all.json @@ -0,0 +1,22 @@ +[ + { + "seed": [12], + "model_dir": ["/pc2/groups/hpc-prf-ekiapp/felix/ci_models/gpt_a"], + "dut_duplication": [1] + }, + { + "seed": [12], + "model_dir": ["/pc2/groups/hpc-prf-ekiapp/felix/ci_models/gpt_b"], + "dut_duplication": [1] + }, + { + "seed": [12], + "model_dir": ["/pc2/groups/hpc-prf-ekiapp/felix/ci_models/gpt_c"], + "dut_duplication": [1] + }, + { + "seed": [12], + "model_dir": ["/pc2/groups/hpc-prf-ekiapp/felix/ci_models/gpt_d"], + "dut_duplication": [1] + } +] diff --git a/benchmarking/cfg/transformer_radioml_all.json b/benchmarking/cfg/transformer_radioml_all.json new file mode 100644 index 0000000000..7dbdc217d7 --- /dev/null +++ b/benchmarking/cfg/transformer_radioml_all.json @@ -0,0 +1,7 @@ +[ + { + "seed": [12], + "model_dir": ["/pc2/groups/hpc-prf-ekiapp/felix/ci_models/radioml_0"], + "dut_duplication": [1] + } +] diff --git a/benchmarking/cfg/transformer_sweep.json b/benchmarking/cfg/transformer_sweep.json new file mode 100644 index 0000000000..d10c4d94ca --- /dev/null +++ b/benchmarking/cfg/transformer_sweep.json @@ -0,0 +1,92 @@ +[ + { + "seed": [12], + + "calibration_passes": [32], + + "model_num_heads": [1], + "model_num_layers": [1], + "model_bias":[true], + "model_emb_dim": [32], + "model_mlp_dim": [1536], + "model_seq_len": [512], + "model_bits": [2], + "model_norm": ["none"], + "model_mask": ["none"], + "model_positional_encoding": ["binary"], + + "dut_duplication": [1] + }, + { + "seed": [12], + + "calibration_passes": [32], + + "model_num_heads": [8], + "model_num_layers": [1], + "model_bias":[true], + 
"model_emb_dim": [256], + "model_mlp_dim": [1536], + "model_seq_len": [512], + "model_bits": [2], + "model_norm": ["none"], + "model_mask": ["none"], + "model_positional_encoding": ["binary"], + + "dut_duplication": [1] + }, + { + "seed": [12], + + "calibration_passes": [32], + + "model_num_heads": [12], + "model_num_layers": [1], + "model_bias":[true], + "model_emb_dim": [384], + "model_mlp_dim": [1536], + "model_seq_len": [512], + "model_bits": [2], + "model_norm": ["none"], + "model_mask": ["none"], + "model_positional_encoding": ["binary"], + + "dut_duplication": [1] + }, + { + "seed": [12], + + "calibration_passes": [32], + + "model_num_heads": [12], + "model_num_layers": [1], + "model_bias":[true], + "model_emb_dim": [96], + "model_mlp_dim": [1536], + "model_seq_len": [512], + "model_bits": [2], + "model_norm": ["none"], + "model_mask": ["none"], + "model_positional_encoding": ["binary"], + + "dut_duplication": [1] + }, + { + "seed": [12], + + "calibration_passes": [32], + + "model_num_heads": [1], + "model_num_layers": [1], + "model_bias":[true], + "model_emb_dim": [32], + "model_mlp_dim": [1536], + "model_seq_len": [512], + "model_bits": [2, 4, 6, 8], + "model_norm": ["none"], + "model_mask": ["none"], + "model_positional_encoding": ["binary"], + + "dut_duplication": [1] + } +] diff --git a/benchmarking/cfg/transformer_test.json b/benchmarking/cfg/transformer_test.json new file mode 100644 index 0000000000..784d96f93d --- /dev/null +++ b/benchmarking/cfg/transformer_test.json @@ -0,0 +1,20 @@ +[ + { + "seed": [12], + + "calibration_passes": [32], + + "model_num_heads": [1], + "model_num_layers": [1], + "model_bias":[true], + "model_emb_dim": [32], + "model_mlp_dim": [192], + "model_seq_len": [64], + "model_bits": [2], + "model_norm": ["none"], + "model_mask": ["none"], + "model_positional_encoding": ["binary"], + + "dut_duplication": [1] + } +] diff --git a/benchmarking/collect.py b/benchmarking/collect.py new file mode 100644 index 0000000000..3bc9aaf04b 
--- /dev/null +++ b/benchmarking/collect.py @@ -0,0 +1,90 @@ +import itertools +import json +import os +import sys +import time + +def merge_dicts(a: dict, b: dict): + for key in b: + if key in a: + if isinstance(a[key], dict) and isinstance(b[key], dict): + merge_dicts(a[key], b[key]) + elif a[key] != b[key]: + raise Exception("ERROR: Dict merge conflict") + else: + a[key] = b[key] + return a + +def consolidate_logs(path, output_filepath): + log = [] + i = 0 + while (i < 1024): + if (os.path.isfile(os.path.join(path,"task_%d.json"%(i)))): + with open(os.path.join(path,"task_%d.json"%(i)), "r") as f: + log_task = json.load(f) + log.extend(log_task) + i = i + 1 + + with open(output_filepath, "w") as f: + json.dump(log, f, indent=2) + +def merge_logs(log_a, log_b, log_out): + # merges json log (list of nested dicts) b into a, not vice versa (TODO) + + with open(log_a, "r") as f: + a = json.load(f) + with open(log_b, "r") as f: + b = json.load(f) + + for idx, run_a in enumerate(a): + for run_b in b: + if run_a["run_id"] == run_b["run_id"]: + #a[idx] |= run_b # requires Python >= 3.9 + #a[idx] = {**run_a, **run_b} + a[idx] = merge_dicts(run_a, run_b) + break + + # also sort by run id + out = sorted(a, key=lambda x: x["run_id"]) + + with open(log_out, "w") as f: + json.dump(out, f, indent=2) + +def wait_for_power_measurements(): + # TODO: detect when no bitstreams are to be measured (e.g. 
for fifosizing) and skip + # TODO: make configurable, relative to some env variable due to different mountint points + bitstreams_path = os.path.join("/mnt/pfs/hpc-prf-radioml/felix/jobs/", + "CI_" + os.environ.get("CI_PIPELINE_IID") + "_" + os.environ.get("CI_PIPELINE_NAME"), + "bitstreams") + + power_log_path = os.path.join("/mnt/pfs/hpc-prf-radioml/felix/jobs/", + "CI_" + os.environ.get("CI_PIPELINE_IID") + "_" + os.environ.get("CI_PIPELINE_NAME"), + "power_measure.json") + + # count bitstreams to measure (can't rely on total number of runs since some of them could've failed) + files = os.listdir(bitstreams_path) + bitstream_count = len(list(filter(lambda x : ".bit" in x, files))) + + log = [] + print("Checking if all bitstreams of pipeline have been measured..") + while(len(log) < bitstream_count): + if os.path.isfile(power_log_path): + with open(power_log_path, "r") as f: + log = json.load(f) + print("Found measurements for %d/%d bitstreams"%(len(log),bitstream_count)) + time.sleep(60) + print("Power measurement complete") + +if __name__ == "__main__": + print("Consolidating synthesis results from all sub-jobs of the array") + consolidate_logs(sys.argv[1], sys.argv[2]) + + # TODO: disabled for now, update accordingly to new runner-based measurement setup + # wait_for_power_measurements() + # power_log_path = os.path.join("/mnt/pfs/hpc-prf-radioml/felix/jobs/", + # "CI_" + os.environ.get("CI_PIPELINE_IID") + "_" + os.environ.get("CI_PIPELINE_NAME"), + # "power_measure.json") + # if os.path.isfile(power_log_path): + # print("Merging power measurement logs with remaining logs") + # merge_logs(sys.argv[2], power_log_path, sys.argv[2]) + print("Done") diff --git a/benchmarking/dut/fifosizing.py b/benchmarking/dut/fifosizing.py new file mode 100644 index 0000000000..46b972deb0 --- /dev/null +++ b/benchmarking/dut/fifosizing.py @@ -0,0 +1,576 @@ +import json +import numpy as np +import os +import shutil +import torch +import copy +from brevitas.export import 
export_qonnx +from onnx import TensorProto, helper +from qonnx.core.datatype import DataType +from qonnx.core.modelwrapper import ModelWrapper +from qonnx.custom_op.general.im2col import compute_conv_output_dim +from qonnx.custom_op.registry import getCustomOp +from qonnx.transformation.general import ( + GiveRandomTensorNames, + GiveReadableTensorNames, + GiveUniqueNodeNames, + GiveUniqueParameterTensors, +) +from qonnx.transformation.infer_data_layouts import InferDataLayouts +from qonnx.transformation.infer_datatypes import InferDataTypes +from qonnx.transformation.infer_shapes import InferShapes +from qonnx.transformation.merge_onnx_models import MergeONNXModels +from qonnx.util.basic import gen_finn_dt_tensor, qonnx_make_model +import finn.builder.build_dataflow as build +import finn.builder.build_dataflow_config as build_cfg +from finn.util.basic import make_build_dir +from util import summarize_table, summarize_section, power_xml_to_dict, prepare_inputs, delete_dir_contents +from finn.util.test import get_trained_network_and_ishape +from finn.util.basic import alveo_default_platform + +from dut.resnet50_custom_steps import ( + step_resnet50_tidy, + step_resnet50_streamline, + step_resnet50_convert_to_hw, + step_resnet50_slr_floorplan, + ) + +from bench_base import bench + +def generate_random_threshold_values( + data_type, num_input_channels, num_steps, narrow=False, per_tensor=False +): + if per_tensor: + num_input_channels = 1 + if narrow: + num_steps -= 1 + + return np.random.randint( + data_type.min(), + data_type.max() + 1, + (num_input_channels, num_steps), + ).astype(np.float32) + + +def sort_thresholds_increasing(thresholds): + return np.sort(thresholds, axis=1) + +def make_conv_building_block(ifm_dim, ch, kernel_size, simd, pe, parallel_window=0): + # hardcoded parameters + idt = DataType["UINT4"] + wdt = DataType["UINT4"] + odt = DataType["UINT4"] + tdt = DataType["UINT32"] + stride = 1 + in_ch = out_ch = ch # input channel = output channel for 
stacking + # pad so that input dim = output dim for stacking (only supports odd kernel_size for now) + pad = int(np.floor(kernel_size / 2)) + + total_pad = 2 * pad + out_feature_dim = compute_conv_output_dim(ifm_dim, kernel_size, stride, total_pad) + weights_shape = [in_ch * kernel_size * kernel_size, out_ch] + thresholds_shape = [1, odt.get_num_possible_values() - 1] + input_shape = [1, ifm_dim, ifm_dim, in_ch] + padding_out_shape = [1, ifm_dim + total_pad, ifm_dim + total_pad, in_ch] + inpgen_out_shape = [1, out_feature_dim, out_feature_dim, in_ch * kernel_size * kernel_size] + output_shape = [1, out_feature_dim, out_feature_dim, out_ch] + + assert input_shape == output_shape, "ERROR: Conv layer dimensions not stackable" + + padding_config = {} + padding_config["domain"] = "finn.custom_op.fpgadataflow.rtl" + padding_config["backend"] = "fpgadataflow" + padding_config["ImgDim"] = [ifm_dim, ifm_dim] + padding_config["NumChannels"] = in_ch + padding_config["SIMD"] = simd + padding_config["Padding"] = [pad, pad, pad, pad] + padding_config["inputDataType"] = idt.name + + inpgen_config = {} + inpgen_config["domain"] = "finn.custom_op.fpgadataflow.rtl" + inpgen_config["backend"] = "fpgadataflow" + inpgen_config["ConvKernelDim"] = [kernel_size, kernel_size] + inpgen_config["IFMChannels"] = in_ch + inpgen_config["IFMDim"] = [ifm_dim + total_pad, ifm_dim + total_pad] + inpgen_config["OFMDim"] = [ifm_dim, ifm_dim] + inpgen_config["inputDataType"] = idt.name + inpgen_config["outputDataType"] = idt.name + inpgen_config["SIMD"] = simd + inpgen_config["parallel_window"] = parallel_window + inpgen_config["Stride"] = [stride, stride] + inpgen_config["Dilation"] = [1, 1] + + mvau_config = {} + mvau_config["domain"] = "finn.custom_op.fpgadataflow.hls" + mvau_config["backend"] = "fpgadataflow" + mvau_config["numInputVectors"] = [1, ifm_dim, ifm_dim] + mvau_config["MW"] = in_ch * kernel_size * kernel_size + mvau_config["MH"] = in_ch + mvau_config["SIMD"] = simd if parallel_window == 
0 else simd * kernel_size * kernel_size + mvau_config["PE"] = pe + mvau_config["resType"] = "lut" + mvau_config["mem_mode"] = "internal_embedded" # internal_decoupled + mvau_config["inputDataType"] = idt.name + mvau_config["weightDataType"] = wdt.name + mvau_config["outputDataType"] = odt.name + + top_in = helper.make_tensor_value_info("top_in", TensorProto.FLOAT, input_shape) + top_out = helper.make_tensor_value_info("top_out", TensorProto.FLOAT, output_shape) + value_info = [ + helper.make_tensor_value_info("weights", TensorProto.FLOAT, weights_shape), + helper.make_tensor_value_info("thresholds", TensorProto.FLOAT, thresholds_shape), + helper.make_tensor_value_info("padding_out", TensorProto.FLOAT, padding_out_shape), + helper.make_tensor_value_info("inpgen_out", TensorProto.FLOAT, inpgen_out_shape), + ] + + modelproto = qonnx_make_model( + helper.make_graph( + name="building_block", + inputs=[top_in], + outputs=[top_out], + value_info=value_info, + nodes=[ + helper.make_node("FMPadding_rtl", ["top_in"], ["padding_out"], **padding_config), + helper.make_node( + "ConvolutionInputGenerator_rtl", + ["padding_out"], + ["inpgen_out"], + **inpgen_config, + ), + helper.make_node( + "MVAU_hls", ["inpgen_out", "weights", "thresholds"], ["top_out"], **mvau_config + ), + ], + ) + ) + + model = ModelWrapper(modelproto) + model.set_tensor_datatype("top_in", idt) + model.set_tensor_layout("top_in", ["N", "H", "W", "C"]) + model.set_tensor_datatype("top_out", odt) + model.set_tensor_datatype("weights", wdt) + model.set_tensor_datatype("thresholds", tdt) + + weights = gen_finn_dt_tensor(wdt, weights_shape) + # TODO: thresholds are all the same + thresholds = generate_random_threshold_values( + tdt, out_ch, odt.get_num_possible_values() - 1, False, True + ) + thresholds = sort_thresholds_increasing(thresholds) + + model.set_initializer("weights", weights) + model.set_initializer("thresholds", thresholds) + + model = model.transform(GiveUniqueNodeNames()) + model = 
model.transform(InferShapes()) + model = model.transform(InferDataTypes()) + + return model + + +def combine_blocks(lb, rb, ifm_dim, ch, pe): + # assumes left branch (lb) and right branch (rb) each have a single (dynamic) input/output with the same shape + # to avoid mix-ups, start by giving all tensors random names + lb = lb.transform(GiveRandomTensorNames()) + rb = rb.transform(GiveRandomTensorNames()) + # erase all node names to avoid conflict + for n in lb.graph.node: + n.name = "" + for n in rb.graph.node: + n.name = "" + + lb_input = lb.graph.input[0] + lb_output = lb.graph.output[0] + rb_input = rb.graph.input[0] + rb_output = rb.graph.output[0] + + top_in = helper.make_tensor_value_info("top_in", TensorProto.FLOAT, [1, ifm_dim, ifm_dim, ch]) + top_out = helper.make_tensor_value_info("top_out", TensorProto.FLOAT, [1, ifm_dim, ifm_dim, ch]) + + dup_config = {} + dup_config["domain"] = "finn.custom_op.fpgadataflow.hls" + dup_config["backend"] = "fpgadataflow" + dup_config["numInputVectors"] = [1, ifm_dim, ifm_dim] + dup_config["NumChannels"] = ch + dup_config["PE"] = pe + dup_config["NumOutputStreams"] = 2 + dup_config["inputDataType"] = lb.get_tensor_datatype(lb_input.name).name + + add_config = {} + add_config["domain"] = "finn.custom_op.fpgadataflow.hls" + add_config["backend"] = "fpgadataflow" + add_config["numInputVectors"] = [1, ifm_dim, ifm_dim] + add_config["NumChannels"] = ch + add_config["PE"] = pe + add_config["inputDataType"] = lb.get_tensor_datatype(lb_output.name).name + + nodes_lb = [node for node in lb.graph.node] + nodes_rb = [node for node in rb.graph.node] + nodes_new = ( + nodes_lb + + nodes_rb + + [ + helper.make_node( + "DuplicateStreams_hls", ["top_in"], [lb_input.name, rb_input.name], **dup_config + ), + helper.make_node( + "AddStreams_hls", [lb_output.name, rb_output.name], ["top_out"], **add_config + ), + ] + ) + + value_info_lb = [x for x in lb.graph.value_info] + value_info_rb = [x for x in rb.graph.value_info] + value_info_new = 
value_info_lb + value_info_rb + [lb_input, lb_output, rb_input, rb_output] + + initializer_lb = [x for x in lb.graph.initializer] + initializer_rb = [x for x in rb.graph.initializer] + initializer_new = initializer_lb + initializer_rb + modelproto = qonnx_make_model( + helper.make_graph( + name="branching_model", + inputs=[top_in], + outputs=[top_out], + value_info=value_info_new, + nodes=nodes_new, + ) + ) + + model = ModelWrapper(modelproto) + model.set_tensor_datatype("top_in", lb.get_tensor_datatype(lb_input.name)) + model.set_tensor_layout("top_in", lb.get_tensor_layout(lb_input.name)) + for i in initializer_new: + model.graph.initializer.append(i) + + # tidy-up + model = model.transform(InferShapes()) + model = model.transform(InferDataTypes()) + model = model.transform(InferDataLayouts()) + model = model.transform(GiveUniqueNodeNames()) + model = model.transform(GiveUniqueParameterTensors()) + model = model.transform(GiveReadableTensorNames()) + return model + +class bench_fifosizing(bench): + def step_export_onnx(self, onnx_export_path): + np.random.seed(0) + tmp_output_dir = make_build_dir("test_fifosizing") + + #TODO: generalize FIFO test so it can be used by other FIFO-related unit tests + # or make into a build flow output product "fifo_report" + #TODO: allow manual folding/fifo config as input + + #TODO: is a scenario possible where reducing depth of a single FIFO at a time is not sufficient for testing tightness? + # e.g. reducing > 1 FIFOs simultaneously does not cause a throughput drop while reducing a single FIFO does? + + #TODO: how to determine rtlsim_n automatically? 
+ + # conv parameters + dim = self.params["dim"] + kernel_size = self.params["kernel_size"] + ch = self.params["ch"] + simd = self.params["simd"] + pe = self.params["pe"] + parallel_window = self.params["parallel_window"] + + lb = None + for i in range(self.params["lb_num_layers"]): + new_block = make_conv_building_block( + dim, ch, kernel_size=kernel_size, simd=simd, pe=pe, parallel_window=parallel_window + ) + lb = new_block if lb is None else lb.transform(MergeONNXModels(new_block)) + lb.save(tmp_output_dir + "/lb.onnx") + + rb = None + for i in range(self.params["rb_num_layers"]): + new_block = make_conv_building_block( + dim, ch, kernel_size=kernel_size, simd=simd, pe=pe, parallel_window=parallel_window + ) + rb = new_block if rb is None else rb.transform(MergeONNXModels(new_block)) + rb.save(tmp_output_dir + "/rb.onnx") + + model = combine_blocks(lb, rb, dim, ch, pe=4) + model.save(onnx_export_path) + + def step_build_setup(self): + # create build config for synthetic test models + + cfg = build_cfg.DataflowBuildConfig( + output_dir = self.build_inputs["build_dir"], + synth_clk_period_ns = self.clock_period_ns, + verbose=False, + # only works with characterization-based FIFO-sizing + auto_fifo_depths=True, + auto_fifo_strategy="characterize", + characteristic_function_strategy=self.params["strategy"], + split_large_fifos=False, + # manual folding + target_fps=None, + # general rtlsim settings + force_python_rtlsim=False, + rtlsim_batch_size=self.params["rtlsim_n"], + shell_flow_type=build_cfg.ShellFlowType.VIVADO_ZYNQ, # TODO: generalize/adapt to new back-end + generate_outputs=[ + build_cfg.DataflowOutputType.ESTIMATE_REPORTS, + build_cfg.DataflowOutputType.STITCHED_IP, + build_cfg.DataflowOutputType.RTLSIM_PERFORMANCE, + ], + ) + + return cfg + + def step_fifotest(self, onnx_path, cfg, build_dir): + log = {} + build.build_dataflow_cfg(onnx_path, cfg) + + # load performance reports + with open(build_dir + "/report/estimate_network_performance.json") as f: + 
est_data = json.load(f) + with open(build_dir + "/report/rtlsim_performance.json") as f: + sim_data = json.load(f) + + # check for deadlock + model_final = ModelWrapper(build_dir + "/intermediate_models/step_create_stitched_ip.onnx") + first_node = getCustomOp(model_final.find_consumer(model_final.graph.input[0].name)) + last_node = getCustomOp(model_final.find_producer(model_final.graph.output[0].name)) + input_txns_expected = np.prod(first_node.get_folded_input_shape()[:-1]) * self.params["rtlsim_n"] + output_txns_expected = np.prod(last_node.get_folded_output_shape()[:-1]) * self.params["rtlsim_n"] + deadlock = sim_data["N_IN_TXNS"] != input_txns_expected or sim_data["N_OUT_TXNS"] != output_txns_expected + log["deadlock"] = deadlock.tolist() + + # check rtlsim throughput + throughput = sim_data["throughput[images/s]"] + stable_throughput = sim_data["stable_throughput[images/s]"] + estimated_throughput = est_data["estimated_throughput_fps"] + throughput_factor = throughput / estimated_throughput + stable_throughput_factor = stable_throughput / estimated_throughput + + # TODO: Take throughput or stable_throughput? 
+ throughput_pass = throughput_factor > self.params["throughput_factor_threshold"] + + log["throughput_pass"] = throughput_pass + log["throughput"] = throughput + log["stable_throughput"] = stable_throughput + log["estimated_throughput"] = estimated_throughput + + # log FIFO sizes for easier inspection + log["fifo_depths"] = {} + log["fifo_sizes"] = {} + total_fifo_size = 0 + for node in model_final.get_nodes_by_op_type("StreamingFIFO_rtl"): + node_inst = getCustomOp(node) + log["fifo_depths"][node.name] = node_inst.get_nodeattr("depth") + log["fifo_sizes"][node.name] = node_inst.get_instream_width() * node_inst.get_nodeattr("depth") + total_fifo_size += log["fifo_sizes"][node.name] + log["total_fifo_size_kB"] = int(total_fifo_size / 8.0 / 1000.0) + + # reduce individual FIFO sizes by some amount and observe throughput drop or deadlock appear + fifo_reduction_pass = [] + log["fifo_reduction_results"] = {} + model_orig = ModelWrapper(build_dir + "/intermediate_models/step_hw_ipgen.onnx") + for node_orig in model_orig.get_nodes_by_op_type("StreamingFIFO_rtl"): + model = copy.deepcopy(model_orig) + node = model.get_node_from_name(node_orig.name) + node_inst = getCustomOp(node) + + # skip shallow FIFOs + # TODO: do we need to consider rounding-up of FIFO depths for impl_style=vivado? 
+ if node_inst.get_nodeattr("depth") <= self.params["fifo_reduction_skip_threshold"]: + log["fifo_reduction_results"][node.name] = "skip" + continue + + # reduce depth of current FIFO and reset generated code + node_inst.set_nodeattr("depth", int(node_inst.get_nodeattr("depth") * self.params["fifo_reduction_factor"])) + node_inst.set_nodeattr("code_gen_dir_ipgen", "") + node_inst.set_nodeattr("ip_path", "") + node_inst.set_nodeattr("ipgen_path", "") + + # save model variation + tmp_output_dir_var = build_dir + "/variations/" + node.name + os.makedirs(tmp_output_dir_var) + model.save(tmp_output_dir_var + "/model.onnx") + + # build again, only re-run necessary steps to save time + cfg.output_dir = tmp_output_dir_var + cfg.steps = ["step_hw_codegen", "step_create_stitched_ip", "step_measure_rtlsim_performance"] + build.build_dataflow_cfg(tmp_output_dir_var + "/model.onnx", cfg) + + # load performance report + with open(tmp_output_dir_var + "/report/rtlsim_performance.json") as f: + sim_data = json.load(f) + + # check for deadlock + model_final = ModelWrapper(tmp_output_dir_var + "/intermediate_models/step_create_stitched_ip.onnx") + first_node = getCustomOp(model_final.find_consumer(model_final.graph.input[0].name)) + last_node = getCustomOp(model_final.find_producer(model_final.graph.output[0].name)) + input_txns_expected = np.prod(first_node.get_folded_input_shape()[:-1]) * self.params["rtlsim_n"] + output_txns_expected = np.prod(last_node.get_folded_output_shape()[:-1]) * self.params["rtlsim_n"] + var_deadlock = sim_data["N_IN_TXNS"] != input_txns_expected or sim_data["N_OUT_TXNS"] != output_txns_expected + + # check rtlsim throughput + var_throughput = sim_data["throughput[images/s]"] + var_stable_throughput = sim_data["stable_throughput[images/s]"] + # TODO: take throughput or stable_throughput? 
+ throughput_drop = (throughput - var_throughput) / throughput + + if var_deadlock: + fifo_reduction_pass.append(True) + log["fifo_reduction_results"][node.name] = 1.0 + elif throughput_drop > self.params["fifo_reduction_throughput_drop_threshold"]: + fifo_reduction_pass.append(True) + log["fifo_reduction_results"][node.name] = throughput_drop + else: + fifo_reduction_pass.append(False) + log["fifo_reduction_results"][node.name] = "fail (no drop)" + + self.output_dict["fifosizing_testresults"] = log + + def step_build(self): + # TODO: rename steps to model three phases: model creation/import, dataflow build, analysis + # dataflow build should be easily swappable and adpaptable to finn-examples + cfg = self.step_build_setup() + cfg.board = self.board + if "folding_path" in self.build_inputs: + cfg.folding_config_file = self.build_inputs["folding_path"] + if "specialize_path" in self.build_inputs: + cfg.specialize_layers_config_file = self.build_inputs["specialize_path"] + self.step_fifotest(self.build_inputs["onnx_path"], cfg, self.build_inputs["build_dir"]) + + def step_parse_builder_output(self, build_dir): + # build output itself is not relevant here (yet) + pass + + def run(self): + self.steps_full_build_flow() + + +# # custom steps +# from custom_steps import ( +# step_extract_absorb_bias, +# step_pre_streamline, +# step_residual_convert_to_hw, +# step_residual_streamline, +# step_residual_tidy, +# step_residual_topo, +# step_set_preferred_impl_style, +# step_convert_final_layers +# ) + +# TODO: put these definitions into separate files/classes so we can use them for other types of benchmaks as well +class bench_metafi_fifosizing(bench_fifosizing): + def step_build_setup(self): + # create build config for MetaFi models + + steps = [ + # step_residual_tidy, + # step_extract_absorb_bias, + # step_residual_topo, + # step_pre_streamline, + # step_residual_streamline, + # step_residual_convert_to_hw, + "step_create_dataflow_partition", + # 
step_set_preferred_impl_style, + "step_specialize_layers", + "step_target_fps_parallelization", + "step_apply_folding_config", + "step_minimize_bit_width", + "step_generate_estimate_reports", + "step_set_fifo_depths", + "step_hw_codegen", + "step_hw_ipgen", + "step_create_stitched_ip", + "step_measure_rtlsim_performance", + "step_out_of_context_synthesis", + "step_synthesize_bitfile", + "step_make_pynq_driver", + "step_deployment_package", + ] + + cfg = build_cfg.DataflowBuildConfig( + output_dir = self.build_inputs["build_dir"], + synth_clk_period_ns = self.clock_period_ns, + steps=steps, + verbose=False, + target_fps=None, #23 + shell_flow_type=build_cfg.ShellFlowType.VIVADO_ZYNQ, # TODO: generalize/adapt to new back-end + #vitis_platform=vitis_platform, + + auto_fifo_depths=False, + split_large_fifos=False, # probably needed #TODO: account for this in FIFO reduction test + + # general rtlsim settings + force_python_rtlsim=False, + rtlsim_batch_size=self.params["rtlsim_n"], + + # folding_config_file=folding_config_file, + # folding_config_file="/home/rz/project/finn-examples/build/vgg10-radioml/folding_config/auto_folding_config.json", + # specialize_layers_config_file = "output_%s_%s" % (model_name, release_platform_name) + "/template_specialize_layers_config.json", + # specialize_layers_config_file = "/home/rz/project/finn-examples/build/vgg10-radioml/specialize_layers_config/template_specialize_layers_config.json", + auto_fifo_strategy="characterize", + characteristic_function_strategy=self.params["strategy"], + #large_fifo_mem_style=build_cfg.LargeFIFOMemStyle.AUTO, + # standalone_thresholds=True, + # enable extra performance optimizations (physopt) + vitis_opt_strategy=build_cfg.VitisOptStrategyCfg.PERFORMANCE_BEST, + generate_outputs=[ + build_cfg.DataflowOutputType.ESTIMATE_REPORTS, + build_cfg.DataflowOutputType.STITCHED_IP, + build_cfg.DataflowOutputType.RTLSIM_PERFORMANCE, + build_cfg.DataflowOutputType.OOC_SYNTH, # not required for FIFO test, include 
for general testing + ], + ) + + # where is this used and why? + cfg.use_conv_rtl = True, # use rtl for conv layers (MVAU cannot use rtl in our model) + + return cfg + + +class bench_resnet50_fifosizing(bench_fifosizing): + def step_build_setup(self): + # create build config for ResNet-50 (based on finn-examples) + + resnet50_build_steps = [ + step_resnet50_tidy, + step_resnet50_streamline, + step_resnet50_convert_to_hw, + "step_create_dataflow_partition", + "step_specialize_layers", + "step_apply_folding_config", + "step_minimize_bit_width", + "step_generate_estimate_reports", + "step_set_fifo_depths", + "step_hw_codegen", + "step_hw_ipgen", + step_resnet50_slr_floorplan, + "step_create_stitched_ip", # was not in finn-examples + "step_measure_rtlsim_performance", # was not in finn-examples + "step_out_of_context_synthesis", # was not in finn-examples + "step_synthesize_bitfile", + "step_make_pynq_driver", + "step_deployment_package", + ] + + cfg = build_cfg.DataflowBuildConfig( + output_dir = self.build_inputs["build_dir"], + synth_clk_period_ns = self.clock_period_ns, + steps=resnet50_build_steps, + shell_flow_type=build_cfg.ShellFlowType.VITIS_ALVEO, # TODO: generalize/adapt to new back-end + auto_fifo_depths=False, + split_large_fifos=True, + vitis_platform=alveo_default_platform[self.board], # TODO: generalize/adapt to new back-end + + # enable extra performance optimizations (physopt) + vitis_opt_strategy=build_cfg.VitisOptStrategyCfg.PERFORMANCE_BEST, + generate_outputs=[ + build_cfg.DataflowOutputType.ESTIMATE_REPORTS, + build_cfg.DataflowOutputType.STITCHED_IP, + build_cfg.DataflowOutputType.RTLSIM_PERFORMANCE, + build_cfg.DataflowOutputType.OOC_SYNTH, # not required for FIFO test, include for general testing + ], + ) + + # non-standard build parameter for custom step + cfg.floorplan_path = self.build_inputs["floorplan_path"] + + return cfg \ No newline at end of file diff --git a/benchmarking/dut/mvau.py b/benchmarking/dut/mvau.py new file mode 100644 
index 0000000000..a41eec694b --- /dev/null +++ b/benchmarking/dut/mvau.py @@ -0,0 +1,295 @@ + +import math +import numpy as np +from onnx import TensorProto, helper +from qonnx.core.datatype import DataType +from qonnx.core.modelwrapper import ModelWrapper +from qonnx.custom_op.registry import getCustomOp +from qonnx.transformation.general import GiveUniqueNodeNames +from qonnx.transformation.infer_datatypes import InferDataTypes +from qonnx.transformation.infer_shapes import InferShapes +from qonnx.util.basic import ( + calculate_matvec_accumulator_range, + gen_finn_dt_tensor, + qonnx_make_model +) +from finn.transformation.fpgadataflow.minimize_accumulator_width import ( + MinimizeAccumulatorWidth, +) +from finn.transformation.fpgadataflow.minimize_weight_bit_width import ( + MinimizeWeightBitWidth, +) +from bench_base import bench + +class bench_mvau(bench): + + def _make_single_mvau_model( + self, + W, + numInputVectors, + pe, + simd, + m, + wdt, + idt, + odt, + T=None, + tdt=None, + mem_mode="const", + ram_style="auto", + ram_style_thresholds="auto", + ): + mw = W.shape[0] + mh = W.shape[1] + + # there are two ways to implement bipolar weights and inputs for + # MatrixVectorActivation: + # - specify their datatypes as such + # - specify their datatypes as BINARY as use binaryXnorMode + if wdt == DataType["BIPOLAR"] and idt == DataType["BIPOLAR"]: + # we'll internally convert weights/inputs to binary and specify the + # datatypes as such, and also set the binaryXnorMode attribute to 1 + export_wdt = DataType["BINARY"] + export_idt = DataType["BINARY"] + binary_xnor_mode = 1 + else: + export_wdt = wdt + export_idt = idt + binary_xnor_mode = 0 + + # numInputVectors for dense = [N] + # numInputVectors for conv = [N, H, W] + inp = helper.make_tensor_value_info("inp", TensorProto.FLOAT, numInputVectors + [mw]) + outp = helper.make_tensor_value_info("outp", TensorProto.FLOAT, numInputVectors + [mh]) + if T is not None: + no_act = 0 + node_inp_list = ["inp", 
"weights", "thresh"] + if odt == DataType["BIPOLAR"]: + actval = 0 + else: + actval = odt.min() + else: + # no thresholds + node_inp_list = ["inp", "weights"] + actval = 0 + no_act = 1 + mvau_node = helper.make_node( + "MVAU_hls", #TODO: add rtl support (configurable as param) + node_inp_list, + ["outp"], + domain="finn.custom_op.fpgadataflow.hls", + backend="fpgadataflow", + MW=mw, + MH=mh, + SIMD=simd, + PE=pe, + M=m, + numInputVectors=numInputVectors, + inputDataType=export_idt.name, + weightDataType=export_wdt.name, + outputDataType=odt.name, + ActVal=actval, + binaryXnorMode=binary_xnor_mode, + noActivation=no_act, + resType="lut", + mem_mode=mem_mode, + ram_style=ram_style, + ram_style_thresholds=ram_style_thresholds, + runtime_writeable_weights=0, + ) + + graph = helper.make_graph(nodes=[mvau_node], name="mvau_graph", inputs=[inp], outputs=[outp]) + model = qonnx_make_model(graph, producer_name="mvau-model") + model = ModelWrapper(model) + + model.set_tensor_datatype("inp", idt) + model.set_tensor_datatype("outp", odt) + model.set_tensor_datatype("weights", wdt) + # model.set_tensor_shape("weights", (channels, 1, k_h, k_w)) from VVAU + if binary_xnor_mode: + # convert bipolar to binary + model.set_initializer("weights", (W + 1) / 2) + else: + model.set_initializer("weights", W) + if T is not None: + model.set_tensor_datatype("thresh", tdt) + model.set_initializer("thresh", T) + + # Minimize weight & accumulator width to obtain realistic resource consumption + # model = model.transform(InferShapes()) + model = model.transform(MinimizeWeightBitWidth()) + model = model.transform(MinimizeAccumulatorWidth()) + model = model.transform(InferDataTypes()) + + return model + + def step_make_model(self): + # Read params + idt = self.params["idt"] + wdt = self.params["wdt"] + act = self.params["act"] + + numInputVectors = self.params["nhw"] + mw = self.params["mw"] + mh = self.params["mh"] + sf = self.params["sf"] + nf = self.params["nf"] + m = self.params["m"] + + 
mem_mode = self.params["mem_mode"] + ram_style = self.params["ram_style"] + ram_style_thr = self.params["ram_style_thr"] + + output_dict = {} + + # convert string to FINN DataType + idt = DataType[idt] + wdt = DataType[wdt] + if act is not None: + act = DataType[act] + + # Determine and log folding + if sf == -1: + sf = mw + simd = mw // sf + if nf == -1: + nf = mh + pe = mh // nf + if mw % simd != 0 or mh % pe != 0: + print("Invalid simd/pe configuration, skipping") + return + if m > 1 and (simd != mw or pe != mh): + print("M > 1 not possible for non-max simd/pe, skipping") + return + output_dict["simd"] = simd + output_dict["pe"] = pe + + # Generate weights + np.random.seed(123456) # TODO: verify or switch to modern numpy random generation + + W = gen_finn_dt_tensor(wdt, (mw, mh)) + + if "sparsity_type" in self.params: + sparsity_type = self.params["sparsity_type"] + else: + sparsity_type = "none" + + if sparsity_type == "none": + if "sparsity_amount" in self.params: + if self.params["sparsity_amount"] > 0: + print("sparsity amount > 0 not applicable for none sparsity, skipping") + return + else: + if self.params["sparsity_amount"] == 0: + print("sparsity amount = 0 not applicable for selected sparsity, skipping") + return + if sparsity_type == "unstructured": + idx = np.random.choice( + mw * mh, size=int(self.params["sparsity_amount"] * mw * mh), replace=False + ) + W = np.reshape(W, -1) + W[idx] = 0.0 + W = np.reshape(W, (mw, mh)) + elif sparsity_type == "rows_random": + idx_mw = np.random.choice(mw, size=int(self.params["sparsity_amount"] * mw), replace=False) + W[idx_mw, :] = 0.0 + elif sparsity_type == "cols_random": + idx_mh = np.random.choice(mh, size=int(self.params["sparsity_amount"] * mh), replace=False) + W[:, idx_mh] = 0.0 + elif sparsity_type == "rows_regular": + if self.params["sparsity_amount"] == 0.25: + idx_mw = np.arange(0, mw, step=4) + elif self.params["sparsity_amount"] == 0.5: + idx_mw = np.arange(0, mw, step=2) + elif 
self.params["sparsity_amount"] == 0.75: + idx_mw = np.concatenate( + (np.arange(0, mw, step=4), np.arange(1, mw, step=4), np.arange(2, mw, step=4)) + ) + else: + print("regular sparsity only applicable for amount 0.25/0.5/0.75, skipping") + return + W[idx_mw, :] = 0.0 + elif sparsity_type == "cols_regular": + if self.params["sparsity_amount"] == 0.25: + idx_mh = np.arange(0, mh, step=4) + elif self.params["sparsity_amount"] == 0.5: + idx_mh = np.arange(0, mh, step=2) + elif self.params["sparsity_amount"] == 0.75: + idx_mh = np.concatenate( + (np.arange(0, mh, step=4), np.arange(1, mh, step=4), np.arange(2, mh, step=4)) + ) + else: + print("regular sparsity only applicable for amount 0.25/0.5/0.75, skipping") + return + W[:, idx_mh] = 0.0 + + else: + print("ERROR: unknown sparsity type") + raise Exception("ERROR: unknown sparsity type") + + # TODO: implement enforce option which prevents naturally occurring sparsity + # params["sparsity_enforce"] + # TODO: implement distribution option which selects between uniform/normal/?? 
+ # params["sparsity_distribution"] + + # log resulting sparsity statistics + # could be higher than selected due to naturally occurring sparsity + num_zeros = (W == 0).sum() + num_ones = (W == 1).sum() + (W == -1).sum() + num_p2 = 0 + for w in np.nditer(W): + if w != 0 and w != 1 and w != -1: + if w > 0: + if math.log2(w).is_integer(): + num_p2 = num_p2 + 1 + else: + if math.log2(-w).is_integer(): + num_p2 = num_p2 + 1 + output_dict["zero_weights"] = round(num_zeros / W.size, 2) + output_dict["easy_weights"] = round((num_zeros + num_ones + num_p2) / W.size, 2) + + # Generate thresholds + if act is None: + # no activation, produce accumulators + T = None + tdt = None + if wdt == DataType["BIPOLAR"] and idt == DataType["BIPOLAR"]: + odt = DataType["UINT32"] + else: + odt = DataType["INT32"] + else: + odt = act + # set range for threshold values according to worst-case accumulator range (not weight value specific) + # this could result in some thresholds being clipped by MinimizeAccumulatorWidth + # lower_range = calculate_matvec_accumulator_range(wdt.min() * np.ones_like(W), idt) + # upper_range = calculate_matvec_accumulator_range(wdt.max() * np.ones_like(W), idt) + # acc_min = min(min(lower_range), min(upper_range)) + # acc_max = max(max(lower_range), max(upper_range)) + # set range for threshold values according to actual accumulator range for the generated weights + (acc_min, acc_max) = calculate_matvec_accumulator_range(W, idt) + n_steps = act.get_num_possible_values() - 1 + T = np.random.randint(acc_min, acc_max - 1, (mh, n_steps)).astype(np.float32) + # provide non-decreasing thresholds + T = np.sort(T, axis=1) + # generate thresholds for activation + if wdt == DataType["BIPOLAR"] and idt == DataType["BIPOLAR"]: + tdt = DataType["UINT32"] + # bias thresholds to be positive + T = np.ceil((T + mw) / 2) + assert (T >= 0).all() + else: + tdt = DataType["INT32"] + + # Create model + model = self._make_single_mvau_model( + W, numInputVectors, pe, simd, m, wdt, idt, 
odt, T, tdt, mem_mode, ram_style, ram_style_thr + ) + model = model.transform(GiveUniqueNodeNames()) + node = model.get_nodes_by_op_type("MVAU_hls")[0] + inst = getCustomOp(node) + + self.target_node = "MVAU_hls" # display results of analysis passes only for the first occurrence of this op type + return model, output_dict + + def run(self): + self.steps_simple_model_flow() diff --git a/benchmarking/dut/resnet50_custom_steps.py b/benchmarking/dut/resnet50_custom_steps.py new file mode 100644 index 0000000000..ddf8b0d0de --- /dev/null +++ b/benchmarking/dut/resnet50_custom_steps.py @@ -0,0 +1,252 @@ +# Copyright (C) 2020-2022, Xilinx, Inc. +# Copyright (C) 2022-2024, Advanced Micro Devices, Inc. +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# * Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# * Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# * Neither the name of FINN nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +from qonnx.core.modelwrapper import ModelWrapper + +from qonnx.transformation.fold_constants import FoldConstants + +from qonnx.transformation.general import ( + ConvertSubToAdd, + ConvertDivToMul, + GiveReadableTensorNames, + GiveUniqueNodeNames, + SortGraph, + RemoveUnusedTensors, + GiveUniqueParameterTensors, + RemoveStaticGraphInputs, + ApplyConfig, +) + +from finn.transformation.streamline.absorb import ( + AbsorbScalarMulAddIntoTopK, + AbsorbAddIntoMultiThreshold, + AbsorbMulIntoMultiThreshold, + FactorOutMulSignMagnitude, + Absorb1BitMulIntoMatMul, + Absorb1BitMulIntoConv, + AbsorbConsecutiveTransposes, + AbsorbTransposeIntoMultiThreshold, +) + +from finn.transformation.streamline.collapse_repeated import ( + CollapseRepeatedAdd, + CollapseRepeatedMul, +) + +from finn.transformation.streamline.reorder import ( + MoveAddPastMul, + MoveScalarMulPastMatMul, + MoveScalarAddPastMatMul, + MoveAddPastConv, + MoveScalarMulPastConv, + MoveScalarLinearPastInvariants, + MoveMaxPoolPastMultiThreshold, +) + +from finn.transformation.streamline.round_thresholds import RoundAndClipThresholds +from finn.transformation.streamline.sign_to_thres import ConvertSignToThres +from qonnx.transformation.batchnorm_to_affine import BatchNormToAffine + +# just for not linear +from finn.transformation.streamline.reorder import ( + MoveLinearPastEltwiseAdd, + MoveLinearPastFork, +) + +from qonnx.transformation.double_to_single_float import DoubleToSingleFloat 
+from qonnx.transformation.remove import RemoveIdentityOps +from qonnx.core.datatype import DataType + +from qonnx.transformation.infer_shapes import InferShapes +from qonnx.transformation.infer_datatypes import InferDataTypes +from qonnx.transformation.infer_data_layouts import InferDataLayouts +from qonnx.transformation.insert_topk import InsertTopK +import finn.transformation.fpgadataflow.convert_to_hw_layers as to_hw +from qonnx.transformation.lower_convs_to_matmul import LowerConvsToMatMul + +from finn.builder.build_dataflow_config import ( + DataflowBuildConfig, + ShellFlowType, +) + +from finn.transformation.move_reshape import RemoveCNVtoFCFlatten + + +def step_resnet50_tidy(model: ModelWrapper, cfg: DataflowBuildConfig): + model = model.transform(GiveUniqueParameterTensors()) + model = model.transform(InferShapes()) + model = model.transform(FoldConstants()) + model = model.transform(RemoveStaticGraphInputs()) + model = model.transform(GiveUniqueNodeNames()) + model = model.transform(GiveReadableTensorNames()) + model = model.transform(InferDataTypes()) + model = model.transform(InsertTopK()) + model = model.transform(InferShapes()) + model = model.transform(GiveUniqueNodeNames()) + model = model.transform(GiveReadableTensorNames()) + model = model.transform(InferDataTypes()) + return model + + +def step_resnet50_streamline_linear(model: ModelWrapper, cfg: DataflowBuildConfig): + streamline_transformations = [ + AbsorbScalarMulAddIntoTopK(), # before MoveAddPastMul to avoid int->float + ConvertSubToAdd(), + ConvertDivToMul(), + RemoveIdentityOps(), + CollapseRepeatedMul(), + BatchNormToAffine(), + ConvertSignToThres(), + MoveAddPastMul(), + MoveScalarAddPastMatMul(), + MoveAddPastConv(), + MoveScalarMulPastMatMul(), + MoveScalarMulPastConv(), + MoveScalarLinearPastInvariants(), + MoveAddPastMul(), + CollapseRepeatedAdd(), + CollapseRepeatedMul(), + AbsorbAddIntoMultiThreshold(), + FactorOutMulSignMagnitude(), + MoveMaxPoolPastMultiThreshold(), + 
AbsorbMulIntoMultiThreshold(), + Absorb1BitMulIntoMatMul(), + Absorb1BitMulIntoConv(), + RoundAndClipThresholds(), + ] + for trn in streamline_transformations: + model = model.transform(trn) + model = model.transform(GiveUniqueNodeNames()) + return model + + +def step_resnet50_streamline_nonlinear(model: ModelWrapper, cfg: DataflowBuildConfig): + streamline_transformations = [ + MoveLinearPastEltwiseAdd(), + MoveLinearPastFork(), + ] + for trn in streamline_transformations: + model = model.transform(trn) + model = model.transform(GiveUniqueNodeNames()) + return model + + +def step_resnet50_streamline(model: ModelWrapper, cfg: DataflowBuildConfig): + for iter_id in range(4): + model = step_resnet50_streamline_linear(model, cfg) + model = step_resnet50_streamline_nonlinear(model, cfg) + + # big loop tidy up + model = model.transform(RemoveUnusedTensors()) + model = model.transform(GiveReadableTensorNames()) + model = model.transform(InferDataTypes()) + model = model.transform(SortGraph()) + + model = model.transform(DoubleToSingleFloat()) + + return model + + +def step_resnet50_convert_to_hw(model: ModelWrapper, cfg: DataflowBuildConfig): + model.set_tensor_datatype(model.graph.input[0].name, DataType["UINT8"]) + model = model.transform(InferDataLayouts()) + model = model.transform(DoubleToSingleFloat()) + model = model.transform(InferDataTypes()) + model = model.transform(SortGraph()) + + to_hw_transformations = [ + to_hw.InferAddStreamsLayer, + LowerConvsToMatMul, + to_hw.InferChannelwiseLinearLayer, + to_hw.InferPool, + AbsorbTransposeIntoMultiThreshold, + RoundAndClipThresholds, + to_hw.InferQuantizedMatrixVectorActivation, + to_hw.InferThresholdingLayer, + AbsorbConsecutiveTransposes, + to_hw.InferConvInpGen, + to_hw.InferDuplicateStreamsLayer, + to_hw.InferLabelSelectLayer, + ] + for trn in to_hw_transformations: + model = model.transform(trn()) + model = model.transform(InferDataLayouts()) + model = model.transform(GiveUniqueNodeNames()) + model = 
model.transform(InferDataTypes()) + + model = model.transform(RemoveCNVtoFCFlatten()) + model = model.transform(GiveReadableTensorNames()) + model = model.transform(RemoveUnusedTensors()) + model = model.transform(SortGraph()) + + return model + + +def step_resnet50_slr_floorplan(model: ModelWrapper, cfg: DataflowBuildConfig): + if cfg.shell_flow_type == ShellFlowType.VITIS_ALVEO: + # previously, we would always ran the finn experimental partitioner on ResNet-50 + # this is now changed and a fixed floorplan is applied + model = model.transform(GiveUniqueNodeNames()) + model = model.transform(ApplyConfig(cfg.floorplan_path)) + print("Fixed SLR floorplanning applied") + + # if you would like to try out the experimental partitioner + # please uncomment the lines (that are not marked as comment) below. + + # import numpy as np + # from finnexperimental.analysis.partitioning import partition + + # comment: apply partitioning of the model, restricting the first and last layer to SLR0 + # default_slr = 0 + # abs_anchors = [(0, [default_slr]), (-1, [default_slr])] + + # comment: increase resource limits to make partitioning feasible, except for SLR0 + # comment: which also has DDR subsystem + # limits = np.array( + # [ + # [0.75, 0.5, 0.7, 0.6, 0.6], + # [1, 0.7, 0.9, 0.8, 0.8], + # [1, 0.7, 0.9, 0.8, 0.8], + # [1, 0.7, 0.9, 0.8, 0.8], + # ] + # ) + # floorplan = partition( + # model, + # cfg.synth_clk_period_ns, + # cfg.board, + # abs_anchors=abs_anchors, + # multivariant=False, + # linear_cuts=True, + # limits=limits, + # )[0] + + # comment: apply floorplan to model + # model = model.transform(ApplyConfig(floorplan)) + # print("SLR floorplanning applied from partitioner") + return model \ No newline at end of file diff --git a/benchmarking/dut/transformer.py b/benchmarking/dut/transformer.py new file mode 100644 index 0000000000..0dc6444a55 --- /dev/null +++ b/benchmarking/dut/transformer.py @@ -0,0 +1,1046 @@ +# Adapted from Christoph's attention-dummy repository + +# 
PyTorch base package: Math and Tensor Stuff +import torch +# Brevitas wrapper around PyTorch tensors adding quantization information +from brevitas.quant_tensor import QuantTensor +# Brevitas: Quantized versions of PyTorch layers +from brevitas.nn import ( + QuantMultiheadAttention, + QuantEltwiseAdd, + QuantIdentity, + QuantLinear, + QuantReLU +) +import os +# Progressbar +from tqdm import trange +import numpy as np +from brevitas.export import export_qonnx +import random +import json +import subprocess +from util import summarize_table, summarize_section, power_xml_to_dict, prepare_inputs, delete_dir_contents +# FINN dataflow builder +import finn.builder.build_dataflow as build +import finn.builder.build_dataflow_config as build_cfg +from finn.builder.build_dataflow_config import AutoFIFOSizingMethod +from bench_base import bench, step_synth_harness + +# Custom build steps required to streamline and convert the attention operator +from dut.transformer_custom_steps import ( + step_tidy_up_pre_attention, + step_tidy_up_post_attention, + step_streamline_attention, + step_streamline_residual, + step_streamline_norms, + step_streamline_positional, + step_convert_attention_to_hw, + step_convert_elementwise_binary_to_hw, + step_convert_lookup_to_hw, + step_replicate_streams, + set_target_parallelization, + set_fifo_depths, + step_apply_folding_config, + node_by_node_rtlsim, + node_by_node_cppsim +) +from performance.platform_build_steps import( + test_step_gen_vitis_xo, + test_step_gen_instrumentation_wrapper, + test_step_gen_instrwrap_sim, + test_step_insert_tlastmarker, + test_step_export_xo, + test_step_build_platform, + test_step_run_instrwrap_sim +) + +### ADAPTED FROM utils.py +# Seeds all relevant random number generators to the same seed for +# reproducibility +def seed(s): + random.seed(s) + np.random.seed(s) + torch.manual_seed(s) + +### ADAPTED FROM model.py +# Derives a weight quantizer from the brevitas bases leaving bit-width and +# signedness configurable 
+def weight_quantizer(bits, _signed=True): + # Brevitas quantizer base classes + from brevitas.quant.base import NarrowIntQuant, MaxStatsScaling + from brevitas.quant.solver import WeightQuantSolver + from brevitas.inject.enum import RestrictValueType + + # Derive a Quantizer from the brevitas bases + class Quantizer(NarrowIntQuant, MaxStatsScaling, WeightQuantSolver): + # Configure the quantization bit-width + bit_width = bits + # Signedness of the quantization output + signed = _signed + # Per tensor quantization, not per channel + scaling_per_output_channel = False + # What is this? Copied from PerTensorFloatScaling* + # Probably restricts the scale to be floating-point? + restrict_scaling_type = RestrictValueType.FP + + # Return the derived quantizer configuration + return Quantizer + + +# Derives a bias quantizer from the brevitas bases leaving bit-width and +# signedness configurable +def bias_quantizer(bits, _signed=True): + # Brevitas quantizer base classes + from brevitas.quant import IntBias + + # Derive a Quantizer from the brevitas bases + class Quantizer(IntBias): + # Configure the quantization bit-width + bit_width = bits + # Signedness of the quantization output + signed = _signed + # Do not require the bit-width to be adjusted to fit the accumulator to + # which the bias is added + requires_input_bit_width = False + + # Return the derived quantizer configuration + return Quantizer + + +# Derives an activation quantizer from the brevitas bases leaving bit-width and +# signedness configurable +def act_quantizer(bits, _signed=True): + # Brevitas quantizer base classes + from brevitas.quant.base import IntQuant, ParamFromRuntimePercentileScaling + from brevitas.quant.solver import ActQuantSolver + from brevitas.inject.enum import RestrictValueType + + # Derive a Quantizer from the brevitas bases + class Quantizer( + IntQuant, ParamFromRuntimePercentileScaling, ActQuantSolver + ): + # Configure the quantization bit-width + bit_width = bits + # Signedness 
of the quantization output + signed = _signed + # Per tensor quantization, not per channel + scaling_per_output_channel = False + # What is this? Copied from PerTensorFloatScaling* + # Probably restricts the scale to be floating-point? + restrict_scaling_type = RestrictValueType.FP + + # Return the derived quantizer configuration + return Quantizer + + +# Gets the normalization layer from configuration key +def get_norm(key, normalized_shape): + # Transposes Sequence and Embedding dimensions + class Transpose(torch.nn.Module): + # Forward pass transposing the feature map + def forward(self, x): # noqa: May be static + # Transpose the last two dimensions of batch x seq x emb layout + return torch.transpose(x, dim0=-1, dim1=-2) + + # Dictionary mapping keys to supported normalization layer implementations + norms = { + # PyTorch default layer normalization. Needs to know the shape of the + # feature map to be normalized + "layer-norm": torch.nn.LayerNorm( + # Note: Disable affine parameters as potential negative scale causes + # streamlining issues later + normalized_shape=normalized_shape, elementwise_affine=False + ), + # PyTorch default 1-dimensional batch normalization. Needs to transpose + # embedding and sequence dimension to normalized over the embedding + # dimension, which is expected to be second. + "batch-norm": torch.nn.Sequential( + # Note: Disable affine parameters as potential negative scale causes + # streamlining issues later + Transpose(), torch.nn.LazyBatchNorm1d(affine=False), Transpose() + ), + # No normalization by a PyTorch built-in identity layer. Should not + # appear in the graph. 
+ "none": torch.nn.Identity() + } + + # Select the normalization layer by key + return norms[key] + + +# Gets the attention mask from configuration key and shape +def get_mask(key, length): + # Dictionary mapping keys to supported attention mask implementations + masks = { + # No attention mask + "none": None, + # Generate the upper triangular mask for causal attention + "causal": torch.nn.Transformer.generate_square_subsequent_mask(length), + # Square matrix with entries randomly set to -inf or 0.0 with 50% + # probability each + "random": torch.where( # noqa: Confused by types? + torch.rand(length, length) > 0.5, -torch.inf, 0.0 + ) + } + # Select the mask type by key + return masks[key] + + +# Single-layer scaled dot-product attention block with MLP and normalization +class TransformerBlock(torch.nn.Module): + # Initializes the model and registers the module parameters + def __init__( + self, num_heads, emb_dim, mlp_dim, seq_len, bias, norm, mask, bits + ): + # Initialize the PyTorch Module superclass + super().__init__() + + # Input quantizer to the scaled dot-product attention operations, shared + # by queries, keys and values inputs. It is important to have this + # quantizer separate and not preceding the fork node of the residual + # branches to avoid consecutive quantizers in the skip branch. + # Note: For some reason it seems not to be possible to use the + # in_proj_input_quant of the attention operator + self.sdp_input_quant = QuantIdentity( + # Quantize at the output + act_quant=act_quantizer(bits, _signed=True), + # Pass quantization information on to the next layer. 
+ return_quant_tensor=True + ) + # Quantized scaled dot-product attention operator + self.sdp = QuantMultiheadAttention( + # Size of the embedding dimension (input and output) + embed_dim=emb_dim, + # Number of attention heads + num_heads=num_heads, + # Enable a bias added to the input and output projections + bias=bias, + # Layout of the inputs: + # Batch x Sequence x Embedding (batch-first, True) + # Sequence x Batch x Embedding (batch-second, False) + batch_first=True, + # If query, key and value input are the same, packed input + # projections use a single, large linear projection to produce + # the actual query, key and value inputs. Otherwise, use + # separate linear projections on each individual input. + packed_in_proj=False, + # Brevitas has this as an unsigned quantizer by default, but + # finn can only handle signed quantizer + attn_output_weights_quant=act_quantizer(bits, _signed=True), + # Insert an additional quantizer in front of the softmax. In our + # finn custom-op, this will be matched to the quantizer + # following the query and key matmul. 
+ # Note: Disable to prevent the quantizer from tripping over -inf + # from the attention mask + softmax_input_quant=None, + # Quantize the input projections weights as configured + in_proj_weight_quant=weight_quantizer(bits, _signed=True), + # Quantize the bias of the input projections as configured + in_proj_bias_quant=bias_quantizer(bits, _signed=True), + # No quantization in front of the input projections as this is + # either done by a standalone quantizer preceding the whole block + in_proj_input_quant=None, + + # Quantize the output projections weights as configured + out_proj_weight_quant=weight_quantizer(bits, _signed=True), + # Quantize the bias of the output projections as configured + out_proj_bias_quant=bias_quantizer(bits, _signed=True), + # Quantize the input to the output projection as configured + out_proj_input_quant=act_quantizer(bits, _signed=True), + + # Quantize the key after projections as configured + k_transposed_quant=act_quantizer(bits, _signed=True), + # Quantize the queries after projections as configured + q_scaled_quant=act_quantizer(bits, _signed=True), + # Quantize the values after projection as configured + v_quant=act_quantizer(bits, _signed=True), + + # No output quantization for now, as stacking multiple layers + # results in multiple multi-thresholds in succession + out_proj_output_quant=None, + + # Return the quantization parameters so the next layer can + # quantize the bias + return_quant_tensor=True + ) + # Residual branch addition skipping over the attention layer + self.residual_sdp = QuantEltwiseAdd( + # Shared input activation quantizer such that the scales at both + # input branches are identical. This allows floating point scale + # factor to be streamlined past the add-node. + input_quant=act_quantizer(bits, _signed=True), + # Disable the output quantizer after the add operation. Output of + # the add will have one more bit than the inputs, which is probably + # fine and does not require re-quantization. 
+ output_quant=None, + # Pass quantization information on to the next layer. + return_quant_tensor=True + ) + # Normalization following the attention layer + self.norm_sdp = torch.nn.Sequential( + # Select the normalization layer implementation + get_norm(key=norm, normalized_shape=emb_dim), + # No quantizer to avoid consecutive quantizer in the MLP residual + # branch. See input quantizer in front of the first MLP layer. + ) + + # Quantized MLP following the scaled dot-product attention + self.mlp = torch.nn.Sequential( + # Quantize the inputs to the MLP block. Placed here to not have this + # at the input of the residual branch. + QuantIdentity( + # Quantize at the output + act_quant=act_quantizer(bits, _signed=True), + # Pass quantization information on to the next layer. + return_quant_tensor=True + ), + # First mlp layer projecting to the mlp dimension + QuantLinear( + # Inputs have the size of the attention embedding dimension + emb_dim, + # Project to the configured mlp dimension, which is typically + # larger than the embedding dimension + mlp_dim, + # Enable the learned bias vector + bias=bias, + # Quantize weights to the same representation as all other + # layers + weight_quant=weight_quantizer(bits, _signed=True), + # Quantize the bias to the same representation as all other + # layers + bias_quant=bias_quantizer(bits, _signed=True), + # No input quantizer as this is directly preceded by a + # standalone quantizer + input_quant=None, + # Not output quantizer as this is directly followed by a + # quantized ReLU activation taking care of quantization + output_quant=None, + # Return the quantization parameters so the next layer can + # quantize the bias + return_quant_tensor=True + ), + # Use the ReLU activation function instead of the more commonly used + # GELU, as the latter is not mapped easily to hardware with FINN + QuantReLU( + # Note: ReLU must be quantized to unsigned representation + act_quant=act_quantizer(bits, _signed=False), + # Return the 
quantization parameters so the next layer can + # quantize the bias + return_quant_tensor=True + ), + # Second mlp layer projecting back to the embedding dimension + QuantLinear( + # Inputs have the configured mlp dimension, which is typically + # larger than the embedding dimension + mlp_dim, + # Project back to the size of the attention embedding dimension + emb_dim, + # Enable the learned bias vector + bias=bias, + # Quantize weights to the same representation as all other + # layers + weight_quant=weight_quantizer(bits, _signed=True), + # Quantize the bias to the same representation as all other + # layers + bias_quant=bias_quantizer(bits, _signed=True), + # No input quantizer as the inputs are already quantized by the + # preceding ReLU layer + input_quant=None, + # Not output quantizer as this is directly followed by a + # quantized element-wise addition taking care of quantization + output_quant=None, + # Pass quantization information on to the next layer. + return_quant_tensor=True + ), + ) + # Residual branch addition skipping over the MLP layer + self.residual_mlp = QuantEltwiseAdd( + # Shared input activation quantizer such that the scales at both + # input branches are identical. This allows floating point scale + # factor to be streamlined past the add-node. + input_quant=act_quantizer(bits, _signed=True), + # Disable the output quantizer after the add operation. Output of + # the add will have one more bit than the inputs, which is probably + # fine and does not require re-quantization. + output_quant=None, + # Pass quantization information on to the next layer. + # Note: Not for the last layer to allow this to be combined with + # standard pytorch calls like .detach() or .numpy(), which are + # not directly available on QuantTensor. 
# Quantized sinusoidal positional encoding layer
class QuantSinusoidalPositionalEncoding(torch.nn.Module):
    """Fixed sinusoidal positional encoding (Vaswani et al.) added to the
    input through a quantized elementwise addition."""

    # Initializes the model and registers the module parameters
    def __init__(self, input_quant, output_quant, return_quant_tensor):
        super().__init__()
        # Quantized addition fusing the input and the positional encoding
        self.add = QuantEltwiseAdd(
            # Quantization applied to the input as well as to the encodings
            input_quant=input_quant,
            # Quantization applied to the sum of input and encoding
            output_quant=output_quant,
            # Pass quantization information on to the next layer
            return_quant_tensor=return_quant_tensor,
        )

    # Forward pass adding positional encoding to the input tensor
    def forward(self, x):
        # Derive the encoding size dynamically from the input shape
        _, seq, emb = x.shape
        # Column vector of sequence positions, shape (seq, 1)
        steps = torch.as_tensor([[n] for n in range(seq)])
        # Frequency/wavelength scale per even embedding index, shape (emb//2,)
        freqs = torch.as_tensor(
            [1e4 ** -(k / emb) for k in range(0, emb, 2)]
        )
        # Interleave sine (even dims) and cosine (odd dims) waves
        encoding = torch.empty(seq, emb)
        encoding[:, 0::2] = torch.sin(freqs * steps)
        encoding[:, 1::2] = torch.cos(freqs * steps)
        # Match device and dtype of the input before the quantized addition
        encoding = encoding.to(x.device, dtype=x.dtype)
        return self.add(x, encoding)


# Quantized learned positional encoding layer
class QuantLearnedPositionalEncoding(torch.nn.Module):
    """Trainable positional encoding table added to the input through a
    quantized elementwise addition."""

    # Initializes the model and registers the module parameters
    def __init__(
        self, seq_len, emb_dim, input_quant, output_quant, return_quant_tensor
    ):
        super().__init__()
        # Quantized addition fusing the input and the positional encoding
        self.add = QuantEltwiseAdd(
            input_quant=input_quant,
            output_quant=output_quant,
            return_quant_tensor=return_quant_tensor,
        )
        # Not-quantized, trainable positional encoding table
        self.pos = torch.nn.Parameter(torch.empty(seq_len, emb_dim))
        self.reset_parameters()

    # Resets/Initializes the positional encoding parameter tensor
    def reset_parameters(self):
        # Standard normal initialization (zero mean, unit std)
        torch.nn.init.normal_(self.pos, mean=0, std=1)

    # Forward pass adding positional encoding to the input tensor
    def forward(self, x):
        return self.add(x, self.pos)


# Lazy version of the learned encoding not requiring input dimensions at
# initialization, inferring these at the first forward pass
class LazyQuantLearnedPositionalEncoding(
    torch.nn.modules.lazy.LazyModuleMixin, QuantLearnedPositionalEncoding  # noqa
):
    # Once materialized this module turns into the eager implementation
    cls_to_become = QuantLearnedPositionalEncoding
    # Encoding table stays uninitialized until the first forward pass
    pos: torch.nn.UninitializedParameter

    # Initializes the model and registers the module parameters
    def __init__(self, input_quant, output_quant, return_quant_tensor):
        # Dimensions are unknown here: pass zeros and immediately replace the
        # parameter by an uninitialized placeholder
        super().__init__(0, 0, input_quant, output_quant, return_quant_tensor)
        self.pos = torch.nn.UninitializedParameter()

    # Resets/Initializes the positional encoding parameter tensor
    def reset_parameters(self):
        # Only delegate once the parameter has actually been materialized
        if not self.has_uninitialized_params():
            super().reset_parameters()

    # Materializes the parameter tensor from a sample input's dimensions
    def initialize_parameters(self, x):
        if self.has_uninitialized_params():
            # No gradient information must accumulate from initialization
            with torch.no_grad():
                _, seq, emb = x.shape
                self.pos.materialize((seq, emb))
                self.reset_parameters()


# Quantized binary positional encoding layer
class QuantBinaryPositionalEncoding(torch.nn.Module):
    """Positional encoding filling the embedding dimension with the bipolar
    bit pattern of the sequence position, added via quantized addition."""

    # Initializes the model and registers the module parameters
    def __init__(self, input_quant, output_quant, return_quant_tensor):
        super().__init__()
        # Quantized addition fusing the input and the positional encoding
        self.add = QuantEltwiseAdd(
            input_quant=input_quant,
            output_quant=output_quant,
            return_quant_tensor=return_quant_tensor,
        )

    # Forward pass adding positional encoding to the input tensor
    def forward(self, x):
        # Derive the encoding size dynamically from the input shape
        _, seq, emb = x.shape
        # Embedding dimension e holds bit e of the sequence position n
        bits = torch.as_tensor([
            [(n >> bit) & 1 for bit in range(emb)] for n in range(seq)
        ])
        # Match device and dtype of the input before the quantized addition
        bits = bits.to(x.device, dtype=x.dtype)
        # Add the quantized encoding to the quantized input
        # Note: Convert encoding to bipolar {-1, +1} representation
        return self.add(x, 2 * bits - 1)


# Gets the positional encoding layer from configuration key, quantizers and
# shape
def get_positional_encoding(
    key, input_quant, output_quant, return_quant_tensor
):
    """Constructs and returns the positional encoding selected by key.

    Note: All variants are instantiated eagerly and only the selected one is
    returned — this intentionally mirrors the original behavior (including
    RNG state consumption by the learned variant's initialization).
    """
    encodings = {
        # No positional encoding: just the input quantizer
        "none": QuantIdentity(
            act_quant=input_quant, return_quant_tensor=return_quant_tensor
        ),
        # Fixed, sinusoidal encoding according to Vaswani et al. with added
        # quantizers
        "sinusoidal": QuantSinusoidalPositionalEncoding(
            input_quant, output_quant, return_quant_tensor
        ),
        # Fixed, binary encoding with quantizers
        "binary": QuantBinaryPositionalEncoding(
            input_quant, output_quant, return_quant_tensor
        ),
        # Learned encoding with quantizers, dimensions inferred lazily
        "learned": LazyQuantLearnedPositionalEncoding(
            input_quant, output_quant, return_quant_tensor
        ),
    }
    return encodings[key]


# Unpacks the standard PyTorch tensor from a brevitas QuantTensor
def unpack_from_quant(tensor: "torch.Tensor | QuantTensor"):
    """Returns the plain tensor wrapped by a QuantTensor, or the input
    unchanged if it already is a plain PyTorch tensor."""
    return tensor.value if isinstance(tensor, QuantTensor) else tensor


# Dummy transformer encoder model
class DummyTransformer(torch.nn.Module):
    """Stack of quantized transformer encoder blocks preceded by a
    configurable positional encoding."""

    # Initializes the model and registers the module parameters
    def __init__(
        self,
        # Number of layers of attention blocks
        num_layers,
        # Number of attention heads per block
        num_heads,
        # Size of embedding dimension going into/out of the attention block
        emb_dim,
        # Size of MLP dimension in each attention block
        mlp_dim,
        # Length of the input sequence, i.e., context size
        seq_len,
        # Enables bias term added to Linear layers
        bias,
        # Quantization bit-width applied uniformly to all layers
        bits,
        # Normalization layer type: layer-norm, batch-norm or none
        norm="none",
        # Attention mask type: none, causal or const
        mask="none",
        # Positional encoding type: none, sinusoidal, binary, learned
        positional_encoding="none"
    ):
        super().__init__()
        # Positional encoding layer at the input
        self.pos = get_positional_encoding(
            key=positional_encoding,
            # Quantize the encoding inputs to the model bit-width
            # NOTE(review): act_quantizer is defined earlier in this file
            input_quant=act_quantizer(bits, _signed=True),
            # No extra output quantizer on the sum
            output_quant=None,
            # Pass quantization information on to the next layer
            return_quant_tensor=True,
        )
        # num_layers identical transformer encoder blocks applied in sequence
        self.encoder = torch.nn.Sequential(*(
            TransformerBlock(
                num_heads, emb_dim, mlp_dim, seq_len, bias, norm, mask, bits
            ) for _ in range(num_layers)
        ))

    # Model forward pass taking an input sequence and returning the encoded
    # sequence as a plain tensor
    def forward(self, x):
        # Add positional encoding, feed through the encoder stack and strip
        # the QuantTensor wrapper so the model yields a single plain output
        return unpack_from_quant(self.encoder(self.pos(x)))

### ADAPTED FROM export.py

# Check whether a layer is a normalization layer of some supported type
def is_norm_layer(module):
    """True for all BatchNorm/InstanceNorm variants (via _NormBase) and for
    LayerNorm, which has a separate implementation."""
    return isinstance(
        module,
        (torch.nn.modules.batchnorm._NormBase, torch.nn.LayerNorm),  # noqa
    )
# Fixes export issues of normalization layers with disabled affine parameters.
# Somehow the export to ONNX trips when it encounters the weight and bias
# tensor to be 'None'.
def patch_non_affine_norms(model: torch.nn.Module):  # noqa: Shadows model
    """Patches missing affine parameters of normalization layers in place.

    For every supported normalization layer (see is_norm_layer) that tracks
    running statistics but has affine parameters disabled, installs an
    all-ones weight and an all-zeros bias so the ONNX export does not trip
    over 'None' parameter tensors.

    Returns the (mutated) model container.
    """
    # Iterate all modules in the model container (module names are not needed)
    for module in model.modules():
        # Only normalization layers might require patching
        if is_norm_layer(module):
            # Check whether affine scale parameters are missing
            if hasattr(module, "weight") and module.weight is None:
                # There need to be running statistics to patch the scales
                if hasattr(module, "running_var"):
                    # Patch the affine scale by an all-1 tensor of the same
                    # shape, type and device as the running variance
                    module.weight = torch.nn.Parameter(
                        torch.ones_like(module.running_var)
                    )
            # Check whether affine bias parameters are missing
            if hasattr(module, "bias") and module.bias is None:
                # There need to be running statistics to patch the bias
                if hasattr(module, "running_mean"):
                    # Patch the affine bias by an all-0 tensor of the same
                    # shape, type and device as the running mean
                    # Fix: previously derived from running_var, which only
                    # worked by coincidence of identical shapes
                    module.bias = torch.nn.Parameter(
                        torch.zeros_like(module.running_mean)
                    )
    # Return the patched model container
    return model

# Folding (parallelization and resource style) template consumed by the FINN
# dataflow builder; written verbatim to folding.yaml by step_build
template_folding_yaml = """
# Per operator type default configurations
defaults:
  # Scaled dot-product attention head implemented via HLS
  ScaledDotProductAttention_hls:
    # Type of memory to be used for internal buffer storage
    # Options: auto, block, distributed, ultra
    ram_style: block
    # Type of memory to be used for threshold storage
    # Options: auto, block, distributed
    ram_style_thresholds: block
    # Type of memory to be used for the attention mask (if present)
    # Options: auto, block, distributed
    ram_style_mask: block
    # Resource type to be used for implementing multiplications/MACs
    # Options: auto, lut or dsp
    mac_resource: lut
  # Addition of two inputs (constants or streamed) implemented via HLS
  ElementwiseAdd_hls:
    # Type of memory to be used for internal buffer storage and/or constant
    # parameter tensors
    # Options: auto, block, distributed, ultra
    ram_style: distributed
  # Matrix vector activation unit implemented via HLS
  MVAU_hls:
    # Resource type to be used for implementing multiplications/MACs
    # Options: auto, lut or dsp
    resType: dsp
    # Memory mode for weight storage
    # Options: internal_embedded, internal_decoupled, external
    mem_mode: internal_decoupled
    # Type of memory to be used for weight storage if "internal_decoupled"
    # Options: auto, block, distributed, ultra
    ram_style: block
    # Type of memory to be used for threshold storage
    # Options: auto, block, distributed
    ram_style_thresholds: block
    # Makes weights writeable through AXI-lite interface at runtime
    runtime_writeable_weights: 0
  # Matrix vector activation unit implemented via RTL
  MVAU_rtl:
    # Resource type to be used for implementing multiplications/MACs
    # Options: auto, lut or dsp
    # Note: RTL MVAU currently does not support LUT-based implementation
    resType: dsp
    # Memory mode for weight storage
    # Options: internal_embedded, internal_decoupled, external
    mem_mode: internal_decoupled
    # Type of memory to be used for weight storage if "internal_decoupled"
    # Options: auto, block, distributed, ultra
    ram_style: block
    # Makes weights writeable through AXI-lite interface at runtime
    runtime_writeable_weights: 0
  # Multi-thresholds implemented via HLS (applies to standalone thresholds)
  Thresholding_hls:
    # Memory mode for threshold storage
    # Options: internal_embedded, internal_decoupled
    mem_mode: internal_decoupled
    # Type of memory to be used for threshold storage if "internal_decoupled"
    # Options: distributed, block
    ram_style: distributed
    # Makes thresholds writeable through AXI-lite interface at runtime
    runtime_writeable_weights: 0
  # Multi-thresholds implemented via RTL (applies to standalone thresholds)
  Thresholding_rtl:
    # Decides to use BRAM, URAM or LUTs for threshold memory, depending on the
    # depth of the thresholds
    # Note: This combination forces "distributed" LUT implementation
    depth_trigger_uram: 2147483647  # "infinity"
    depth_trigger_bram: 2147483647  # "infinity"
    # # Note: This combination forces "block" RAM implementation
    # depth_trigger_uram: 0
    # depth_trigger_bram: 1
    # # Note: This combination forces "ultra" RAM implementation
    # depth_trigger_uram: 1
    # depth_trigger_bram: 0
    # # Note: This combination is equivalent to "auto"
    # depth_trigger_uram: 0
    # depth_trigger_bram: 0
    # Makes thresholds writeable through AXI-lite interface at runtime
    runtime_writeable_weights: 0
  # FIFO implemented via RTL (there is no HLS FIFO implementation in FINN)
  StreamingFIFO_rtl:
    # RTL vs. IPI implementation of FIFOs
    # Options: rtl, vivado
    impl_style: rtl
    # Resource type for FIFOs when impl_style is vivado
    # Options: auto, block, distributed, ultra
    ram_style: distributed
# Individual, named node-specific configurations here
# ...
"""
forces "distributed" LUT implementation + depth_trigger_uram: 2147483647 # "infinity" + depth_trigger_bram: 2147483647 # "infinity" + # # Note: This combination forces "block" RAM implementation + # depth_trigger_uram: 0 + # depth_trigger_bram: 1 + # # Note: This combination forces "ultra" RAM implementation + # depth_trigger_uram: 1 + # depth_trigger_bram: 0 + # # Note: This combination is equivalent to "auto" + # depth_trigger_uram: 0 + # depth_trigger_bram: 0 + # Makes thresholds writeable through AXI-lite interface at runtime + runtime_writeable_weights: 0 + # FIFO implemented via RTL (there is no HLS FIFO implementation in FINN) + StreamingFIFO_rtl: + # RTL vs. IPI implementation of FIFOs + # Options: rtl, vivado + impl_style: rtl + # Resource type for FIFOs when impl_style is vivado + # Options: auto, block, distributed, ultra + ram_style: distributed + # Individual, named node-specific configurations here + # ... +""" + +class bench_transformer(bench): + def step_export_onnx(self, output_onnx_path): + # Load the parameters file + #params = dvc.api.params_show("params.yaml") + # Seed all RNGs + seed(self.params["seed"]) + # Make PyTorch behave deterministically if possible + torch.use_deterministic_algorithms(mode=True, warn_only=True) + # Create a model instance from the configuration parameters + #model = DummyTransformer(**params["model"]) + model = DummyTransformer( + num_layers = self.params["model_num_layers"], + num_heads = self.params["model_num_heads"], + emb_dim = self.params["model_emb_dim"], + mlp_dim = self.params["model_mlp_dim"], + seq_len = self.params["model_seq_len"], + bias = self.params["model_bias"], + bits = self.params["model_bits"], + norm = self.params["model_norm"], + mask = self.params["model_mask"], + positional_encoding = self.params["model_positional_encoding"], + ) + + # Get the configured sequence length and embedding dimension to generate + # test inputs + seq, dim = self.params["model_seq_len"], self.params["model_emb_dim"] + 
# No gradient accumulation for calibration passes required + with torch.no_grad(): + # Check whether GPU training is available and select the appropriate + # device + device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') + # Move the model to the training device + model = model.to(device) + # Multiple passes of calibration might be necessary for larger/deep + # models + for _ in trange(0, self.params["calibration_passes"], desc="calibrating"): + # Pass random data through the model to "calibrate" dummy quantizer. + # Large batch to have more calibration samples. Otherwise, there is + # too much deviation between this calibration and the verification + # samples. + model(torch.rand(128, seq, dim, device=device)) + # Move the model back to the CPU + model = model.cpu() + # Prevent export issue for missing affine normalization parameters + model = patch_non_affine_norms(model) + # Switch model to evaluation mode to have it fixed for export + model = model.eval() + # Sample random input tensor in batch-first layout + x = torch.rand(1, seq, dim) + # Compute attention output + o = model(x) + # Save the input and output data for verification purposes later + # TODO: go via self.build_inputs["input_npy_path"] + np.save("inp.npy", x.detach().numpy()) + np.save("out.npy", o.detach().numpy()) + # Export the model graph to QONNX + #export_qonnx(model, (x,), "attention.onnx", **self.params["export"]) + export_qonnx(model, (x,), output_onnx_path, + opset_version = 14, + do_constant_folding = True) + + def step_build(self): + #with open("params.yaml") as file: + # params = yaml.safe_load(file) + # Seed all RNGs + seed(self.params["seed"]) + # Extract sequence length and embedding dimension from parameters + seq_len, emb_dim = self.params["model_seq_len"], self.params["model_emb_dim"] + + # Prepare config files + # TODO: make configurable + # TODO: log intermediate files such as inp.npy, folding.yaml, or specialize_layers.jon as artifacts, maybe create in unique 
temp dirs + specialize_layers_dict = { + "Defaults": { + "preferred_impl_style": ["rtl", ["MVAU", "Thresholding"]] + }, + "": { + "preferred_impl_style": "" + } + } + with open("specialize_layers.json", "w") as f: + json.dump(specialize_layers_dict, f, indent=2) + with open("folding.yaml", "w") as f: + f.write(template_folding_yaml) + + # Create a configuration for building the scaled dot-product attention + # operator to a hardware accelerator + cfg = build_cfg.DataflowBuildConfig( + # Unpack the build configuration parameters + #**params["build"], + output_dir = self.build_inputs["build_dir"], + stitched_ip_gen_dcp = True, + synth_clk_period_ns = self.clock_period_ns, + board = self.board, + shell_flow_type = "vivado_zynq", #TODO: Alveo support + folding_config_file = "folding.yaml", + specialize_layers_config_file = "specialize_layers.json", + standalone_thresholds = True, + max_multithreshold_bit_width = 16, + mvau_wwidth_max = 2048, + split_large_fifos = True, + + verbose = False, # if True prints stdout and stderr to console instead of build_dataflow.log + enable_build_pdb_debug = False, + + generate_outputs=[ + build_cfg.DataflowOutputType.ESTIMATE_REPORTS, + build_cfg.DataflowOutputType.STITCHED_IP, # required for HarnessBuild, OOC_SYNTH, and RTLSIM + #build_cfg.DataflowOutputType.PYNQ_DRIVER, #TODO: currently broken (assert i_consumer.op_type == "StreamingDataflowPartition"), might be useful for functional verification on hw later + #build_cfg.DataflowOutputType.OOC_SYNTH, # requires stitched-ip, not needed because ZynqBuild/HarnessBuild is performed + #build_cfg.DataflowOutputType.BITFILE, # does not require stitched-ip, not needed because HarnessBuild is performed + #build_cfg.DataflowOutputType.RTLSIM_PERFORMANCE, # not possible due to float components + #build_cfg.DataflowOutputType.DEPLOYMENT_PACKAGE # not needed, just a copy operation + ], + + verify_steps=[ + # Verify the model after converting to the FINN onnx dialect + 
build_cfg.VerificationStepType.QONNX_TO_FINN_PYTHON, + # Verify the model again using python mode after the default + # streamlining step + build_cfg.VerificationStepType.STREAMLINED_PYTHON, + # Verify the model again after tidy up transformations, right before + # converting to HLS + build_cfg.VerificationStepType.TIDY_UP_PYTHON, + # Verify the model after generating C++ HLS and applying folding + build_cfg.VerificationStepType.FOLDED_HLS_CPPSIM, + ], + # File with test inputs for verification + verify_input_npy="inp.npy", + # File with expected test outputs for verification + verify_expected_output_npy="out.npy", + # Save the intermediate model graphs + save_intermediate_models=True, + # Avoid RTL simulation for setting the FIFO sizes + auto_fifo_strategy=AutoFIFOSizingMethod.CHARACTERIZE, + # Do not automatically set FIFO sizes as this requires RTL simulation + # not implemented for the attention operator + auto_fifo_depths=False, + # Build steps to execute + steps=[ + # Need to apply some tidy-up transformations before converting to + # the finn dialect of onnx + step_tidy_up_pre_attention, + # Convert all QONNX Quant nodes to Multithreshold nodes + "step_qonnx_to_finn", + # Tidy up the graph after converting from QONNX to FINN format + # Note: Triggers a verification step + "step_tidy_up", + # Positional encoding needs to be streamlined first with slightly + # different order of certain streamlining transformations to avoid + # weird rounding issue of intermediate results + step_streamline_positional, + # Custom streamlining for models containing attention operators + step_streamline_attention, + # Streamlining of the residual branches + step_streamline_residual, + # Streamline the normalization layers, i.e., transposed batch norm + step_streamline_norms, + # Another round using the default streamlining steps + # Note: Triggers a verification step + "step_streamline", + # New conversion of the scaled dot-product attention pattern + 
step_convert_attention_to_hw, + # Another tidy-up step to remove unnecessary dimensions and + # operations after converting the attention operators to HLS + step_tidy_up_post_attention, + # Convert the elementwise binary operations to hardware operators. + # These include for example adding residual branches and positional + # encoding + step_convert_elementwise_binary_to_hw, + # Convert the Gather layer realizing the input token embedding to + # the FINN hardware implementation, i.e., the Lookup layer + step_convert_lookup_to_hw, + # Properly replicate the stream feeding the query, key and value + # projections + step_replicate_streams, + # Convert most other layers supported by FINN to HW operators + "step_convert_to_hw", + # Specialize HW layer implementations as either HLS or RTL + "step_specialize_layers", + "step_create_dataflow_partition", + # Set the folding configuration to meet the cycles per sequence + # target + set_target_parallelization(seq_len, emb_dim), + # Apply folding configuration, specifying hardware implementation + # details + # Note: This triggers a verification step + step_apply_folding_config, + "step_minimize_bit_width", + # The ScaledDotProductAttention custom op does not define any + # estimates + "step_generate_estimate_reports", + "step_hw_codegen", + "step_hw_ipgen", + # Set the attention- and residual-related FIFO depths insert FIFOs + # and apply folding configuration once again + # Note: Implement all FIFOs with a depth at least as deep as the + # sequence length in URAM. 
+ set_fifo_depths(seq_len, emb_dim, uram_threshold=seq_len), + # Run additional node-by-node verification in RTL simulation of the + # model before creating the stitched IP + # Note: end-to-end verification of the stitched IP in RTL simulation + # is still not possible due to missing float IPs + node_by_node_cppsim, + # Only for debugging for now, does not work if "vivado" style + # StreamingFIFOs are used + # node_by_node_rtlsim, + + test_step_insert_tlastmarker, # required for instrumentation_wrapper + + "step_create_stitched_ip", + + # "step_measure_rtlsim_performance", # not possible due to float components + + step_synth_harness, #TODO: replace with instr wrapper (or port it into this step) + + #"step_out_of_context_synthesis", # for synthesis results (e.g. utilization) + + # normal deployment TODO: replace with instr wrapper (or port it into this step as an option) + #"step_synthesize_bitfile", + #"step_make_pynq_driver", + #"step_deployment_package", + + #test_step_gen_vitis_xo, # preparation step for original instr wrapper integration + #test_step_gen_instrumentation_wrapper, # preparation step for original instr wrapper integration + + #test_step_gen_instrwrap_sim, # preparation step for simulation of original instr wrapper integration + #test_step_run_instrwrap_sim, # simulation with instr wrapper, disabled for now due to extreme runtime + + #test_step_export_xo, # preparation step for original instr wrapper integration + #test_step_build_platform # synthesis with instr wrapper + ] + ) + # Run the build process on the dummy attention operator graph + # TODO: maybe let this function return the cfg only, so it can be modified by bench context + build.build_dataflow_cfg(self.build_inputs["onnx_path"], cfg) + + def run(self): + self.steps_full_build_flow() + + # DEBUG code for live logging of long instr wrapper simulation: + # live_log_dir_path = os.path.join(self.save_dir, "vivado_sim_log", "run_%d" % (self.run_id), "vivado.log") + # 
os.makedirs(os.path.join(self.save_dir, "vivado_sim_log", "run_%d" % (self.run_id)), exist_ok=True) + # sim_output_dir = build_dir + "/instrwrap_sim" + # # Prepare bash script + # bash_script = os.getcwd() + "/run_vivado_sim.sh" + # with open(bash_script, "w") as script: + # script.write("#!/bin/bash\n") + # script.write("cd %s\n"%(sim_output_dir)) + # script.write("vivado -mode batch -source make_instrwrap_sim_proj.tcl &> %s\n"%(live_log_dir_path)) + # # Run script + # print("Running Vivado simulation of instrumentation wrapper") + # sub_proc = subprocess.Popen(["bash", bash_script]) + # sub_proc.communicate() + ####### diff --git a/benchmarking/dut/transformer_custom_steps.py b/benchmarking/dut/transformer_custom_steps.py new file mode 100644 index 0000000000..d28a4c501a --- /dev/null +++ b/benchmarking/dut/transformer_custom_steps.py @@ -0,0 +1,878 @@ +# ADAPTED FROM Christoph's attention-dummy build_steps.py + +# Copies (deep-copies) python objects +import copy +# Numpy for loading and comparing the verification input/output +import numpy as np +# YAML for loading experiment configurations +import yaml +# QONNX wrapper of ONNX model graphs +from qonnx.core.modelwrapper import ModelWrapper +# QONNX quantization data types +from qonnx.core.datatype import DataType +# Converts ONNX graph nodes to QONNX custom-ops if possible +from qonnx.custom_op.registry import getCustomOp +# QONNX graph transformations for renaming and cleaning up +from qonnx.transformation.general import ( + Transformation, + GiveUniqueNodeNames, + GiveReadableTensorNames, + RemoveUnusedTensors, + RemoveStaticGraphInputs, + GiveUniqueParameterTensors, + ConvertDivToMul, + ConvertSubToAdd +) +# Converts BatchNorm operation to affine transformation +from qonnx.transformation.batchnorm_to_affine import BatchNormToAffine +# QONNX graph transformations for inferring datatypes and shapes +from qonnx.transformation.infer_datatypes import InferDataTypes +from qonnx.transformation.infer_shapes import 
InferShapes +from qonnx.transformation.infer_data_layouts import InferDataLayouts +# QONNX cleanup transformations +from qonnx.transformation.remove import RemoveIdentityOps +# Precompute constant output nodes +from qonnx.transformation.fold_constants import FoldConstants +# Streamlining transformation: This is a collection of various transformations +from finn.transformation.streamline import ( + ConvertSignToThres, RoundAndClipThresholds +) +# Fuse/Absorb operations +from finn.transformation.streamline.absorb import ( + AbsorbAddIntoMultiThreshold, + AbsorbSignBiasIntoMultiThreshold, + FactorOutMulSignMagnitude, + AbsorbMulIntoMultiThreshold, + Absorb1BitMulIntoMatMul, + Absorb1BitMulIntoConv +) +# Reorder operations +from finn.transformation.streamline.reorder import ( + MoveMulPastFork, + MoveLinearPastFork, + MoveTransposePastFork, + MoveLinearPastEltwiseAdd, + MoveScalarLinearPastInvariants, + MoveTransposePastEltwise, + MoveMulPastMaxPool, + MoveAddPastMul, + MoveScalarAddPastMatMul, + MoveAddPastConv, + MoveScalarMulPastMatMul, + MoveScalarMulPastConv, +) +# Collapse consecutive operations of the same type +from finn.transformation.streamline.collapse_repeated import ( + CollapseRepeatedMul, + CollapseRepeatedTranspose, + CollapseRepeatedAdd +) +# FINN transformation converting ONNX nodes to hardware custom operators +from finn.transformation.fpgadataflow.convert_to_hw_layers import ( + InferElementwiseBinaryOperation, + InferLookupLayer +) +# Remove some operations without real effect +from finn.transformation.streamline.remove import ( + RemoveIdentityTranspose, + RemoveIdentityReshape +) +# Cleanup transformation getting rid of 3d data layout +from finn.transformation.squeeze import Squeeze +# Detects the attention pattern and converts to hardware custom op +from finn.transformation.fpgadataflow.attention import ( + InferScaledDotProductAttention, + AbsorbMultiThresholdIntoScaledDotProductAttention +) +# Mult-Head Attention support +from 
finn.transformation.fpgadataflow.attention_heads import ( + InferMultiHeads, + MoveSplitMultiHeadsPastMultiThreshold, + UnrollMultiHeadAttention, + MoveMergeMultiHeadsPastMultiThreshold +) +# Stream replication for outputs with multiple consumers +from finn.transformation.fpgadataflow.replicate_stream import ( + InferReplicateStream +) +# Inserts data-width converter and FIFO nodes into the model graph +from finn.transformation.fpgadataflow.insert_dwc import InsertDWC +from finn.transformation.fpgadataflow.insert_fifo import InsertFIFO +# Splitting and removing of FIFOs from the model graph +from finn.transformation.fpgadataflow.set_fifo_depths import ( + RemoveShallowFIFOs, + SplitLargeFIFOs, +) +# Specializes each layer's implementation style: HLS or RTL implementation +from finn.transformation.fpgadataflow.specialize_layers import SpecializeLayers +# FINN dataflow builder configuration +from finn.builder.build_dataflow_config import ( + VerificationStepType, DataflowBuildConfig +) +# Graph transformation setting the folding, i.e., parallelization configuration +from finn.transformation.fpgadataflow.set_folding import SetFolding +# FINN verification after build/graph transformation steps +from finn.builder.build_dataflow_steps import verify_step + +# Transformations preparing the operators for synthesis and simulation +from finn.transformation.fpgadataflow.prepare_cppsim import PrepareCppSim +from finn.transformation.fpgadataflow.compile_cppsim import CompileCppSim +from finn.transformation.fpgadataflow.hlssynth_ip import HLSSynthIP +from finn.transformation.fpgadataflow.prepare_ip import PrepareIP +from finn.transformation.fpgadataflow.set_exec_mode import SetExecMode +from finn.transformation.fpgadataflow.prepare_rtlsim import PrepareRTLSim + +# Execute onnx model graphs from the dataflow parent for verification +from finn.util.test import execute_parent + + +# Composes graph transformations such that each individual transformation as +# well as the whole 
# Composes graph transformations such that each individual transformation as
# well as the whole sequence is applied exhaustively
class ComposedTransformation(Transformation):
    """Runs each transformation until it reaches a fixed point, cleaning up
    the graph in between, and reports whether anything changed so the whole
    sequence can itself be reapplied exhaustively."""

    def __init__(self, transformations: list[Transformation]):
        super().__init__()
        # Sequence of transformations executed in order by apply()
        self.transformations = transformations

    def apply(self, model: ModelWrapper):  # noqa
        # Tracks whether any transformation modified the graph at all
        graph_modified = False
        for transformation in self.transformations:
            # Work on a deep copy to mimic ModelWrapper.transform() behavior
            model = copy.deepcopy(model)
            # Exhaustively reapply this one transformation until it reports
            # no further modification (it runs at least once)
            changed = True
            while changed:
                model, changed = transformation.apply(model)
                graph_modified = graph_modified or changed
            # Built-in cleanup of the ModelWrapper
            model.cleanup()
            # Extra cleanup keeping the graph tidy, names readable/ordered
            # and datatype annotations current after every transformation
            model = model.transform(RemoveIdentityOps())
            model = model.transform(GiveUniqueNodeNames())
            model = model.transform(GiveReadableTensorNames())
            model = model.transform(InferDataTypes())
        # Reporting graph_modified=True makes the caller reapply the whole
        # sequence until a global fixed point is reached
        return model, graph_modified
# Custom Streamlining transformation: Similar to the built-in transformations
# but exhaustively reapplied until none of the transformations can be applied
# anymore.
def Streamline():  # noqa: Uppercase
    """Builds the exhaustive streamlining pipeline; order is significant."""
    return ComposedTransformation([
        ConvertSubToAdd(),
        ConvertDivToMul(),
        BatchNormToAffine(),
        ConvertSignToThres(),
        MoveMulPastMaxPool(),
        AbsorbSignBiasIntoMultiThreshold(),
        MoveScalarLinearPastInvariants(),
        MoveAddPastMul(),
        MoveScalarAddPastMatMul(),
        MoveAddPastConv(),
        MoveScalarMulPastMatMul(),
        MoveScalarMulPastConv(),
        MoveAddPastMul(),
        CollapseRepeatedAdd(),
        CollapseRepeatedMul(),
        MoveMulPastMaxPool(),
        AbsorbAddIntoMultiThreshold(),
        FactorOutMulSignMagnitude(),
        AbsorbMulIntoMultiThreshold(),
        Absorb1BitMulIntoMatMul(),
        Absorb1BitMulIntoConv(),
        RoundAndClipThresholds(),
    ])


# Function running transformations necessary to clean up models containing
# attention operators
def step_tidy_up_pre_attention(model: ModelWrapper, _):
    """Annotates shapes/datatypes and removes graph clutter before any
    attention-specific transformation runs."""
    # Shape/datatype annotations, cleanup and constant folding, followed by
    # removal of redundant shape/layout operations; layout annotations are
    # required by the Quant-to-MultiThreshold conversion to find the correct
    # output channel dimension
    for transformation in (
        InferDataTypes(),  # noqa: Duplicate
        InferShapes(),
        GiveUniqueNodeNames(),
        GiveReadableTensorNames(),
        RemoveStaticGraphInputs(),
        RemoveUnusedTensors(),
        GiveUniqueParameterTensors(),
        FoldConstants(),
        RemoveIdentityReshape(),
        RemoveIdentityTranspose(),
        InferDataLayouts(),
    ):
        model = model.transform(transformation)
    return model
# Variant of streamlining transformations adapted to attention operators
def step_streamline_attention(model: ModelWrapper, cfg: DataflowBuildConfig):
    """Streamlines models containing attention operators."""
    # Standard streamlining enclosed by MoveLinearPastFork: moving linear ops
    # past fork nodes (not part of FINN's standard streamlining) unlocks more
    # streamlining, so the whole pattern is applied exhaustively.
    model = model.transform(ComposedTransformation([
        Streamline(),
        MoveLinearPastFork(),
        Streamline(),
    ]))
    # Optionally verify the streamlined model on sample inputs
    if (VerificationStepType.STREAMLINED_PYTHON in
            cfg._resolve_verification_steps()):  # noqa
        verify_step(
            model, cfg, "streamlined_attention_python", need_parent=False
        )
    # Return the streamlined model
    return model
# Streamlining transformations to be applied to residual branches
def step_streamline_residual(model: ModelWrapper, cfg: DataflowBuildConfig):
    """Streamlines the residual branches of the model."""
    # One round of this pattern is needed per residual block; wrapping it in
    # ComposedTransformation applies it exhaustively, so arbitrarily many
    # consecutive residual blocks are handled.
    model = model.transform(ComposedTransformation([
        # Move scale factors past the elementwise add joining the branches
        MoveLinearPastEltwiseAdd(),
        MoveLinearPastFork(),
        MoveScalarLinearPastInvariants(),
        # Follow up with the normal streamlining flow once again
        Streamline(),
    ]))
    # Optionally verify the streamlined model on sample inputs
    if (VerificationStepType.STREAMLINED_PYTHON in
            cfg._resolve_verification_steps()):  # noqa
        verify_step(
            model, cfg, "streamlined_residual_python", need_parent=False
        )
    # Return the streamlined model
    return model
# Streamlining transformation to be applied to the normalization layers
def step_streamline_norms(model: ModelWrapper, cfg: DataflowBuildConfig):
    """Streamlines the (transposed) normalization layers of the model."""
    # One round of this pattern is needed per normalization block; the
    # ComposedTransformation reapplies it exhaustively for arbitrarily many
    # consecutive blocks.
    model = model.transform(ComposedTransformation([
        # Move transposes past the scale-bias operator of transposed batch
        # normalization so adjacent transposes end up next to each other ...
        MoveTransposePastEltwise(),
        # ... where they can be collapsed ...
        CollapseRepeatedTranspose(),
        # ... and removed once they cancel each other out
        RemoveIdentityTranspose(),
        # Nested exhaustive pattern: transposes may now accumulate in front
        # of fork nodes
        ComposedTransformation([
            MoveTransposePastFork(),
            MoveTransposePastEltwise(),
            CollapseRepeatedTranspose(),
            RemoveIdentityTranspose(),
        ]),
        # Normalization scale and bias may have accumulated in front of
        # transpose or fork nodes by now
        MoveLinearPastEltwiseAdd(),
        MoveLinearPastFork(),
        MoveScalarLinearPastInvariants(),
        # Streamline enclosed by MoveLinearPastFork: moving linear ops past
        # certain fork nodes enables more streamlining transformations
        Streamline(),
        MoveLinearPastFork(),
        Streamline(),
    ]))
    # Optionally verify the streamlined model on sample inputs
    if (VerificationStepType.STREAMLINED_PYTHON in
            cfg._resolve_verification_steps()):  # noqa
        verify_step(model, cfg, "streamlined_norms_python", need_parent=False)
    # Return the streamlined model
    return model
# Streamlining transformation to be applied to the positional encoding layer
def step_streamline_positional(model: ModelWrapper, cfg: DataflowBuildConfig):
    """Streamlines the quantized positional encoding of the model."""
    # The division in front of the quantized positional encoding is exactly
    # the inverse of the multiplication in front of it (these are the matching
    # scale factors of the shared input quantizer): convert it to a
    # multiplication ...
    model = model.transform(ConvertDivToMul())
    # ... so the two quantization scales can be merged here
    model = model.transform(CollapseRepeatedMul())
    # Push scalar multiplications (probably quantizer scale factors) into the
    # branches of a fork
    model = model.transform(MoveMulPastFork())
    # Optionally verify the streamlined model on sample inputs
    if (VerificationStepType.STREAMLINED_PYTHON in
            cfg._resolve_verification_steps()):  # noqa
        verify_step(
            model, cfg, "streamlined_positional_python", need_parent=False
        )
    # Return the streamlined model
    return model


# Function running the InferScaledDotProductAttention transformation
def step_convert_attention_to_hw(model: ModelWrapper, _):
    """Maps attention and multi-head patterns to hardware operators."""
    conversion_steps = [
        # Infer the reshaping of attention heads
        InferMultiHeads(),  # noqa: Duplicate
        # Move the multi-head splitting past the multi thresholds ...
        MoveSplitMultiHeadsPastMultiThreshold(),
        # ... which might enable absorbing adds into thresholds once again
        AbsorbAddIntoMultiThreshold(),
        # Infer the ScaledDotProductAttention custom op
        InferScaledDotProductAttention(),
        # Parallelize attention heads in the onnx graph
        UnrollMultiHeadAttention(),
        # Swap the order of merging the multi heads and applying thresholds
        MoveMergeMultiHeadsPastMultiThreshold(),
        # If applicable, absorb the final thresholds into attention
        AbsorbMultiThresholdIntoScaledDotProductAttention(),
    ]
    for transformation in conversion_steps:
        model = model.transform(transformation)
    # Return the model with attention and multi-heads mapped to hardware
    return model
# Function running the transformations to convert elementwise binary
# operations to their hardware implementations
def step_convert_elementwise_binary_to_hw(model: ModelWrapper, _):
    """Converts elementwise binary operations to hardware operators."""
    # The final (de-quantizing) Mul at the output is deliberately rejected
    return model.transform(InferElementwiseBinaryOperation(
        InferElementwiseBinaryOperation.reject_output_dequant
    ))


# Function running the transformations to convert Gather, i.e., index lookup,
# nodes to their hardware implementations
def step_convert_lookup_to_hw(model: ModelWrapper, _):
    """Converts Gather (index lookup) nodes to hardware Lookup layers."""
    for node in model.graph.node:
        # Only Gather nodes need their index input annotations forced
        if node.op_type != "Gather":
            continue
        # Force the index input to unsigned 64-bit integers for now
        model.set_tensor_datatype(node.input[1], DataType["UINT64"])
        # Force the ONNX container datatype of the index input to float
        value_info = model.get_tensor_valueinfo(node.input[1])
        value_info.type.tensor_type.elem_type = 1
    # Convert the annotated Gather nodes to Lookup layers
    return model.transform(InferLookupLayer())


# Function running the InferReplicateStream transformation
def step_replicate_streams(model: ModelWrapper, _):
    """Properly replicates forked streams, e.g. the stream feeding the query,
    key and value projections."""
    return model.transform(InferReplicateStream())
# Post-processing tidy-up squeezing dimensions and identity operators left
# over from mapping the attention operators
def step_tidy_up_post_attention(model: ModelWrapper, _):
    """Tidies up the model after attention operators were mapped to hardware."""
    # Remove size-1 (single batch) dimensions and identity transposes
    model = model.transform(Squeeze())
    model = model.transform(RemoveIdentityTranspose())
    # Squeezing might enable absorbing adds into thresholds once again ...
    model = model.transform(AbsorbAddIntoMultiThreshold())
    # ... and thresholds into attention (e.g. after a transpose was squeezed)
    model = model.transform(AbsorbMultiThresholdIntoScaledDotProductAttention())
    # Squeezing might enable some more streamlining once again
    model = model.transform(ComposedTransformation([
        # Move scale factors past elementwise add nodes on the residuals
        MoveLinearPastEltwiseAdd(),
        MoveLinearPastFork(),
        MoveScalarLinearPastInvariants(),
        # Do the normal streamlining flow once again
        Streamline(),
    ]))
    # Clean up the names for debugging
    model = model.transform(GiveUniqueNodeNames())
    model = model.transform(GiveReadableTensorNames())
    # Return the tidied up model
    return model


# Custom step for setting the parallelism to meet the target of T^2 cycles per
# sequence
def set_target_parallelization(seq_len: int, emb_dim: int):  # noqa: emb_dim
    """Generates a build step configuring parallelization for a T^2 (i.e.
    seq_len^2) cycles per sequence target."""
    # The wrapping function is a generator; this is the actual build step
    def step_set_target_parallelization(
        model: ModelWrapper, cfg: DataflowBuildConfig
    ):
        # Attention operators are currently not handled by SetFolding, so
        # configure them explicitly here
        for node in model.graph.node:
            if node.op_type == "ScaledDotProductAttention_hls":
                inst = getCustomOp(node)
                # Fully parallel along the embedding dimension, fully
                # sequential along the sequence dimension: T^2 cycles
                inst.set_nodeattr("EmbFold", 1)
                inst.set_nodeattr("SeqFold", seq_len)
        # Built-in folding for all other layers with the T^2 cycles target
        model = model.transform(SetFolding(
            seq_len ** 2, cfg.mvau_wwidth_max, cfg.folding_two_pass_relaxation
        ))
        # TODO: Extract the folding configuration
        # Return the model with configured parallelization
        return model

    # Return the wrapped build step function
    return step_set_target_parallelization
# Applies a configuration dictionary to the model graph: per-op-type options
# under the reserved "defaults" key first, then per-node options keyed by the
# node name (overriding the defaults).
class ApplyConfig(Transformation):
    # Initializes the transformation with the configuration dictionary
    def __init__(self, config):
        # Initialize the transformation base class
        super().__init__()
        # Register the configuration dictionary to be used in apply()
        self.config = config

    # Applies the transform to a whole model graph
    def apply(self, model: ModelWrapper):  # noqa
        # Get the model graph out of the model wrapper object
        graph = model.graph
        # Per-op-type defaults; tolerate configurations without a "defaults"
        # section instead of raising KeyError
        defaults = self.config.get("defaults", {})
        # Iterate all nodes in the graph
        for node in graph.node:
            # "defaults" is reserved as the defaults-section key and must not
            # collide with a node name. Raise instead of assert so the check
            # survives running python with -O.
            if node.name == "defaults":
                raise ValueError("Node has reserved name 'defaults'")
            # Convert this to the custom-op instance for easy access to node
            # attributes
            inst = getCustomOp(node)
            # Apply the per operator type default configurations first ...
            for key, value in defaults.get(node.op_type, {}).items():
                inst.set_nodeattr(key, value)
            # ... then the node-specific configuration, potentially
            # overriding the defaults set above
            for key, value in self.config.get(node.name, {}).items():
                inst.set_nodeattr(key, value)
        # Note: Not considered a graph modification; this does not have to be
        # reapplied multiple times
        return model, False
# Custom build step trying to set appropriate FIFO sizes for the transformer
def set_fifo_depths(
    seq_len: int, emb_dim: int, uram_threshold: int = 32  # noqa: emb_dim
):
    """Generates a build step setting manual FIFO depths for the transformer.

    FIFOs at least uram_threshold deep are implemented via the vivado
    strategy in URAM.
    """
    # The wrapping function is a generator; this is the actual build step
    def step_set_fifo_depths(model: ModelWrapper, cfg: DataflowBuildConfig):
        # First pass: annotate the desired FIFO depths on every node
        for node in model.graph.node:
            inst = getCustomOp(node)
            # Current FIFO depth configuration of this node
            in_depths = inst.get_nodeattr("inFIFODepths")
            out_depths = inst.get_nodeattr("outFIFODepths")
            num_inputs, num_outputs = len(node.input), len(node.output)
            # Expand a single default depth to one entry per input/output to
            # avoid later problems with too few FIFO depths specified
            if in_depths == [2] and num_inputs > 1:
                in_depths = num_inputs * [2]
            if out_depths == [2] and num_outputs > 1:
                out_depths = num_outputs * [2]
            # Attention needs each folded input stream buffered completely
            # TODO: Not exactly sure whether this is always correct or just
            # the worst-case
            if node.op_type == "ScaledDotProductAttention_hls":
                in_depths = [
                    inst.get_number_input_values(i) for i in range(num_inputs)
                ]
                # Note: No special treatment of the output FIFO
            # Residual adds joining two branches must buffer their inputs to
            # avoid deadlocks if one branch runs faster/slower
            if node.op_type == "ElementwiseAdd_hls":
                # Only relevant for join-node operations actually consuming
                # two branches, potentially operating at different rates
                if model.is_join_node(node):
                    # Buffer as many cycles as the T^2 per-sequence target of
                    # the attention operations
                    # TODO: Not exactly sure whether this is always correct
                    # or just the worst-case
                    # TODO: No reliable way yet to tell which branch is the
                    # longer/deeper one, so a buffer cannot be placed only on
                    # the shorter branch
                    in_depths = [seq_len ** 2, seq_len ** 2]
                    # Note: No special treatment of the output FIFO
            # Write back the updated FIFO depths attributes
            inst.set_nodeattr("inFIFODepths", in_depths)
            inst.set_nodeattr("outFIFODepths", out_depths)

        # The following partially mirrors (or even copies from) the built-in
        # step_set_fifo_depths using only manual FIFO depths and our
        # YAML-based folding configuration.

        # Insert data-width converters, then FIFOs between all operators
        # (shallow, depth-2 FIFOs wherever no other depth is specified)
        model = model.transform(InsertDWC())
        model = model.transform(InsertFIFO(create_shallow_fifos=True))
        # Specialize the implementation variant of the newly added FIFO
        # layers and restore readable names
        model = model.transform(
            SpecializeLayers(cfg._resolve_fpga_part())  # noqa: Access _ method
        )
        model = model.transform(GiveUniqueNodeNames())
        model = model.transform(GiveReadableTensorNames())

        # Apply the YAML folding configuration, if one is given
        if cfg.folding_config_file is not None:
            # Load the configuration dictionary from the YAML file
            with open(cfg.folding_config_file, "r") as file:
                config = yaml.safe_load(file)
            # Unique node names are required to match per-node options
            model = model.transform(GiveUniqueNodeNames())
            model = model.transform(ApplyConfig(config))

        # Second pass over the graph to modify the inserted FIFOs
        # Note: This overwrites the folding configuration...
        # TODO: Find a better way to handle this
        for node in model.graph.node:
            if node.op_type == "StreamingFIFO_rtl":
                inst = getCustomOp(node)
                # Non-shallow FIFOs are implemented via the vivado strategy
                # in URAM
                if inst.get_nodeattr("depth") >= uram_threshold:
                    inst.set_nodeattr("impl_style", "vivado")
                    inst.set_nodeattr("ram_style", "ultra")

        # Hardware attributes to be extracted from each node
        hw_attrs = {
            "PE",
            "SIMD",
            "parallel_window",
            "ram_style",
            "ram_style_thresholds",
            "ram_style_mask",
            "depth",
            "impl_style",
            "resType",
            "mac_resource",
            "mem_mode",
            "runtime_writeable_weights",
            "inFIFODepths",
            "outFIFODepths",
            "depth_trigger_uram",
            "depth_trigger_bram",
        }

        # Collect the final configuration from the model graph
        config = {"defaults": {}}
        for node in model.graph.node:
            inst = getCustomOp(node)
            # Prepare the node-specific configuration entry for this node
            config[node.name] = {}
            for key in hw_attrs:
                # Attributes may be absent for some nodes/op-types, which is
                # signaled via AttributeError
                try:
                    config[node.name][key] = inst.get_nodeattr(key)
                except AttributeError:
                    # Can be safely ignored: nothing to restore later
                    pass
            # Drop empty per-node entries from the configuration dictionary
            if not config[node.name]:
                del config[node.name]

        # Persist the collected configuration as YAML for later reuse
        with open(cfg.output_dir + "/final_hw_config.yaml", "w") as file:
            yaml.safe_dump(config, file)

        # FIFO splitting and shallow FIFO removal only after the final config
        # file has been written: these transforms may add/remove FIFOs and
        # would otherwise cause name mismatches when reusing the config
        if cfg.split_large_fifos:
            model = model.transform(SplitLargeFIFOs())
        model = model.transform(RemoveShallowFIFOs())

        # Call PrepareIP and HLSSynthIP again; this only runs for the new
        # nodes (e.g. FIFOs and DWCs)
        model = model.transform(
            PrepareIP(
                cfg._resolve_fpga_part(), cfg._resolve_hls_clk_period()  # noqa
            )
        )
        model = model.transform(HLSSynthIP())

        # Return the model with configured FIFO depths
        return model

    # Return the wrapped build step function
    return step_set_fifo_depths
# Custom step applying our custom format of folding configuration to the graph
def step_apply_folding_config(model: ModelWrapper, cfg: DataflowBuildConfig):
    """Applies the YAML folding configuration and optionally verifies the
    folded model via C++ simulation."""
    # Only applies if a configuration file is given
    if cfg.folding_config_file is not None:
        # Load the configuration dictionary from the YAML file
        with open(cfg.folding_config_file, "r") as file:
            config = yaml.safe_load(file)
        # Unique node names are required to match per-node options
        model = model.transform(GiveUniqueNodeNames())
        # Apply the configuration dictionary to the model graph
        model = model.transform(ApplyConfig(config))
    # If configured, run a verification of the transformed model on some
    # sample inputs
    if (VerificationStepType.FOLDED_HLS_CPPSIM in
            cfg._resolve_verification_steps()):  # noqa
        # Prepare C++ simulation for verification
        model = model.transform(PrepareCppSim())
        model = model.transform(CompileCppSim())
        model = model.transform(SetExecMode("cppsim"))
        # Execute a verification step with inputs from the build config
        verify_step(model, cfg, "folded_hls_cppsim", need_parent=True)
    # Return model with configuration applied
    return model


# Runs a node-by-node Python simulation of the model saving the full execution
# context
# Note: Assumes no execution mode to be set
def node_by_node_python(model: ModelWrapper, cfg: DataflowBuildConfig):
    """Verifies the model via node-by-node Python execution, saving the full
    execution context; returns the original model unmodified."""
    # Save the original model; a copy is used for the simulation
    original = model
    model = copy.deepcopy(model)

    # Load the verification input/output pair
    inp = np.load(cfg.verify_input_npy)  # noqa
    out = np.load(cfg.verify_expected_output_npy)

    # Path to the parent model wrapping the streaming dataflow partition and
    # the wrapped child model, i.e., the inside of the partition
    parent = f"{cfg.output_dir}/intermediate_models/dataflow_parent.onnx"
    # Fix: use a python-specific child model file; previously this wrote to
    # "verify_cppsim.onnx", clobbering the cppsim verification artifact
    child = f"{cfg.output_dir}/intermediate_models/verify_python.onnx"
    # Save the child model prepared for python simulation
    model.save(child)
    # Load the parent model to pass to verification execution
    parent_model = ModelWrapper(parent)

    # Reshape the input/output to match the model
    inp = inp.reshape(parent_model.get_tensor_shape(model.graph.input[0].name))
    out = out.reshape(parent_model.get_tensor_shape(model.graph.output[0].name))

    # Execute the onnx model collecting the full execution context
    context = execute_parent(parent, child, inp, return_full_ctx=True)
    # Extract the output tensor from the execution context
    model_out = context[parent_model.graph.output[0].name]
    # Compare expected against simulated output
    result = {True: "SUCCESS", False: "FAIL"}[
        np.allclose(out, model_out, atol=1e-3)
    ]
    # Save the verification outputs into the configured build directory
    verification_output = f"{cfg.output_dir}/verification_output/"
    np.savez(f"{verification_output}/verify_python_{result}.npz", **context)
    # Return the original, unmodified model
    return original
# Runs a node-by-node C++ simulation of the model saving the full execution
# context
def node_by_node_cppsim(model: ModelWrapper, cfg: DataflowBuildConfig):
    """Verifies the model via node-by-node C++ simulation, saving the full
    execution context; returns the original model unmodified."""
    # Keep the original; all simulation preparation happens on a copy
    original = model
    model = copy.deepcopy(model)
    # Prepare the copy for C++ simulation: set the execution mode, then
    # generate and compile the C++ sources
    model = model.transform(SetExecMode("cppsim"))
    model = model.transform(GiveUniqueNodeNames())
    model = model.transform(PrepareCppSim())
    model = model.transform(CompileCppSim())

    # Verification input/output pair from the build configuration
    inp = np.load(cfg.verify_input_npy)  # noqa
    out = np.load(cfg.verify_expected_output_npy)

    # The parent model wraps the streaming dataflow partition; the child
    # model is the inside of the partition
    parent = f"{cfg.output_dir}/intermediate_models/dataflow_parent.onnx"
    child = f"{cfg.output_dir}/intermediate_models/verify_cppsim.onnx"
    # Save the child model prepared for C++ simulation
    model.save(child)
    # Load the parent model to pass to verification execution
    parent_model = ModelWrapper(parent)

    # Reshape the verification pair to match the model interface
    inp = inp.reshape(parent_model.get_tensor_shape(model.graph.input[0].name))
    out = out.reshape(parent_model.get_tensor_shape(model.graph.output[0].name))

    # Execute the onnx model collecting the full execution context
    context = execute_parent(parent, child, inp, return_full_ctx=True)
    # Extract the output tensor and compare it against the expectation
    model_out = context[parent_model.graph.output[0].name]
    result = "SUCCESS" if np.allclose(out, model_out, atol=1e-3) else "FAIL"
    # Save the verification context into the configured build directory
    verification_output = f"{cfg.output_dir}/verification_output/"
    np.savez(f"{verification_output}/verify_cppsim_{result}.npz", **context)
    # Return the original, unmodified model
    return original
# Runs a node-by-node RTL simulation of the model saving the full execution
# context
def node_by_node_rtlsim(model: ModelWrapper, cfg: DataflowBuildConfig):
    """Verifies the model via node-by-node RTL simulation, saving the full
    execution context; returns the original model unmodified."""
    # Keep the original; all simulation preparation happens on a copy
    original = model
    model = copy.deepcopy(model)
    # Prepare the copy for RTL simulation: set the execution mode, generate
    # and synthesize the IP, then build the simulation model
    model = model.transform(SetExecMode("rtlsim"))
    model = model.transform(GiveUniqueNodeNames())
    model = model.transform(PrepareIP(
        cfg._resolve_fpga_part(), cfg.synth_clk_period_ns)  # noqa
    )
    model = model.transform(HLSSynthIP())
    model = model.transform(PrepareRTLSim())

    # Verification input/output pair from the build configuration
    inp = np.load(cfg.verify_input_npy)  # noqa
    out = np.load(cfg.verify_expected_output_npy)

    # The parent model wraps the streaming dataflow partition; the child
    # model is the inside of the partition
    parent = f"{cfg.output_dir}/intermediate_models/dataflow_parent.onnx"
    child = f"{cfg.output_dir}/intermediate_models/verify_rtlsim.onnx"
    # Save the child model prepared for RTL simulation
    model.save(child)
    # Load the parent model to pass to verification execution
    parent_model = ModelWrapper(parent)

    # Reshape the verification pair to match the model interface
    inp = inp.reshape(parent_model.get_tensor_shape(model.graph.input[0].name))
    out = out.reshape(parent_model.get_tensor_shape(model.graph.output[0].name))

    # Execute the onnx model collecting the full execution context
    context = execute_parent(parent, child, inp, return_full_ctx=True)
    # Extract the output tensor and compare it against the expectation
    model_out = context[parent_model.graph.output[0].name]
    result = "SUCCESS" if np.allclose(out, model_out, atol=1e-3) else "FAIL"
    # Save the verification context into the configured build directory
    verification_output = f"{cfg.output_dir}/verification_output/"
    np.savez(f"{verification_output}/verify_rtlsim_{result}.npz", **context)
    # Return the original, unmodified model
    return original
# Seeds all relevant random number generators (python, numpy, torch) with the
# same value for reproducibility
def seed(s):
    for seed_fn in (random.seed, np.random.seed, torch.manual_seed):
        seed_fn(s)
step_tidy_up_post_attention, + step_streamline_attention, + step_streamline_residual, + step_streamline_norms, + step_streamline_positional, + step_convert_attention_to_hw, + step_convert_elementwise_binary_to_hw, + step_convert_lookup_to_hw, + step_replicate_streams, + set_target_parallelization, + set_fifo_depths, + step_apply_folding_config, + node_by_node_rtlsim, # noqa: Maybe unused, only for debugging + node_by_node_python, # noqa: Maybe unused, only for debugging + node_by_node_cppsim +) +from performance.platform_build_steps import( + test_step_gen_vitis_xo, + test_step_gen_instrumentation_wrapper, + test_step_gen_instrwrap_sim, + test_step_insert_tlastmarker, + test_step_export_xo, + test_step_build_platform, + test_step_run_instrwrap_sim +) + +### ADAPTED FROM utils.py +# Seeds all relevant random number generators to the same seed for +# reproducibility +def seed(s): + random.seed(s) + np.random.seed(s) + torch.manual_seed(s) + +template_folding_yaml = """ +# Per operator type default configurations +defaults: + # Scaled dot-product attention head implemented via HLS + ScaledDotProductAttention_hls: + # Type of memory to be used for internal buffer storage + # Options: auto, block, distributed, ultra + ram_style: block + # Type of memory to be used for threshold storage + # Options: auto, block, distributed + ram_style_thresholds: block + # Type of memory to be used fo the attention mask (if present) + # Options: auto, block, distributed + ram_style_mask: block + # Resource type to be used for implementing multiplications/MACs + # Options: auto, lut or dsp + mac_resource: lut + # Addition of two inputs (constants or streamed) implemented via HLS + ElementwiseAdd_hls: + # Type of memory to be used for internal buffer storage and/or constant + # parameter tensors + # Options: auto, block, distributed, ultra + ram_style: distributed + # Matrix vector activation unit implemented via HLS + MVAU_hls: + # Resource type to be used for implementing 
multiplications/MACs + # Options: auto, lut or dsp + resType: dsp + # Memory mode for weight storage + # Options: internal_embedded, internal_decoupled, external + mem_mode: internal_decoupled + # Type of memory to be used for weight storage if "internal_decoupled" + # Options: auto, block, distributed, ultra + ram_style: block + # Type of memory to be used for threshold storage + # Options: auto, block, distributed + ram_style_thresholds: block + # Makes weights writeable through AXI-lite interface at runtime + runtime_writeable_weights: 0 + # Matrix vector activation unit implemented via RTL + MVAU_rtl: + # Resource type to be used for implementing multiplications/MACs + # Options: auto, lut or dsp + # Note: RTL MVAU currently does not support LUT-based implementation + resType: dsp + # Memory mode for weight storage + # Options: internal_embedded, internal_decoupled, external + mem_mode: internal_decoupled + # Type of memory to be used for weight storage if "internal_decoupled" + # Options: auto, block, distributed, ultra + ram_style: block + # Makes weights writeable through AXI-lite interface at runtime + runtime_writeable_weights: 0 + # Multi-thresholds implemented via HLS (applies to standalone thresholds) + Thresholding_hls: + # Memory mode for threshold storage + # Options: internal_embedded, internal_decoupled + mem_mode: internal_decoupled + # Type of memory to be used for threshold storage if "internal_decoupled" + # Options: distributed, block + ram_style: distributed + # Makes thresholds writeable through AXI-lite interface at runtime + runtime_writeable_weights: 0 + # Multi-thresholds implemented via RTL (applies to standalone thresholds) + Thresholding_rtl: + # Decides to use BRAM, URAM or LUTs for threshold memory, depending on the + # depth of the thresholds + # Note: This combination forces "distributed" LUT implementation + depth_trigger_uram: 2147483647 # "infinity" + depth_trigger_bram: 2147483647 # "infinity" + # # Note: This combination 
forces "block" RAM implementation + # depth_trigger_uram: 0 + # depth_trigger_bram: 1 + # # Note: This combination forces "ultra" RAM implementation + # depth_trigger_uram: 1 + # depth_trigger_bram: 0 + # # Note: This combination is equivalent to "auto" + # depth_trigger_uram: 0 + # depth_trigger_bram: 0 + # Makes thresholds writeable through AXI-lite interface at runtime + runtime_writeable_weights: 0 + # FIFO implemented via RTL (there is no HLS FIFO implementation in FINN) + StreamingFIFO_rtl: + # RTL vs. IPI implementation of FIFOs + # Options: rtl, vivado + impl_style: rtl + # Resource type for FIFOs when impl_style is vivado + # Options: auto, block, distributed, ultra + ram_style: distributed + # Individual, named node-specific configurations here + # ... +""" + +class bench_transformer_gpt(bench): + def step_build(self): + #with open("params.yaml") as file: + # params = yaml.safe_load(file) + # Seed all RNGs + seed(self.params["seed"]) + + # Extract sequence length and embedding dimension from the output of the + # first quantizer in the model + # Note: Embedding and Sequence dimension flip later + model = ModelWrapper(self.build_inputs["onnx_path"]) + _, emb_dim, seq_len = model.get_tensor_shape( + "/emb_add/input_quant/export_handler/Quant_output_0" + ) + + # Prepare config files + # TODO: make configurable + # TODO: log intermediate files such as inp.npy, folding.yaml, or specialize_layers.jon as artifacts, maybe create in unique temp dirs + specialize_layers_dict = { + "Defaults": { + "preferred_impl_style": ["rtl", ["MVAU", "Thresholding"]] + }, + "": { + "preferred_impl_style": "" + } + } + with open("specialize_layers.json", "w") as f: + json.dump(specialize_layers_dict, f, indent=2) + with open("folding.yaml", "w") as f: + f.write(template_folding_yaml) + + #TODO: make configurable instead of hardcoding exception + self.board = "U280" + self.part = "xcu280-fsvh2892-2L-e" + + # Create a configuration for building the scaled dot-product attention + # 
operator to a hardware accelerator + cfg = build_cfg.DataflowBuildConfig( + # Unpack the build configuration parameters + #**params["build"], + output_dir = self.build_inputs["build_dir"], + stitched_ip_gen_dcp = True, + synth_clk_period_ns = self.clock_period_ns, + board = self.board, + shell_flow_type = "vitis_alveo", #TODO: proper Alveo support instead of hardcoding + folding_config_file = "folding.yaml", + specialize_layers_config_file = "specialize_layers.json", + standalone_thresholds = True, + max_multithreshold_bit_width = 16, + mvau_wwidth_max = 2048, + split_large_fifos = True, + + verbose=False, # if True prints stdout and stderr to console instead of build_dataflow.log + + generate_outputs=[ + build_cfg.DataflowOutputType.ESTIMATE_REPORTS, + build_cfg.DataflowOutputType.STITCHED_IP, # required for HarnessBuild, OOC_SYNTH, and RTLSIM + #build_cfg.DataflowOutputType.PYNQ_DRIVER, #TODO: currently broken (assert i_consumer.op_type == "StreamingDataflowPartition"), might be useful for functional verification on hw later + #build_cfg.DataflowOutputType.OOC_SYNTH, # requires stitched-ip, not needed because ZynqBuild/HarnessBuild is performed + #build_cfg.DataflowOutputType.BITFILE, # does not require stitched-ip, not needed because HarnessBuild is performed + #build_cfg.DataflowOutputType.RTLSIM_PERFORMANCE, # not possible due to float components + #build_cfg.DataflowOutputType.DEPLOYMENT_PACKAGE # not needed, just a copy operation + ], + + verify_steps=[ + # Verify the model after converting to the FINN onnx dialect + build_cfg.VerificationStepType.QONNX_TO_FINN_PYTHON, + # Verify the model again using python mode after the default + # streamlining step + build_cfg.VerificationStepType.STREAMLINED_PYTHON, + # Verify the model again after tidy up transformations, right before + # converting to HLS + build_cfg.VerificationStepType.TIDY_UP_PYTHON, + # Verify the model after generating C++ HLS and applying folding + 
build_cfg.VerificationStepType.FOLDED_HLS_CPPSIM, + ], + # File with test inputs for verification + verify_input_npy=self.build_inputs["input_npy_path"], + # File with expected test outputs for verification + verify_expected_output_npy=self.build_inputs["output_npy_path"], + # Save the intermediate model graphs + save_intermediate_models=True, + # Avoid RTL simulation for setting the FIFO sizes + auto_fifo_strategy=AutoFIFOSizingMethod.CHARACTERIZE, + # Do not automatically set FIFO sizes as this requires RTL simulation + # not implemented for the attention operator + auto_fifo_depths=False, + # Build steps to execute + steps=[ + # Need to apply some tidy-up transformations before converting to + # the finn dialect of onnx + step_tidy_up_pre_attention, + # Convert all QONNX Quant nodes to Multithreshold nodes + "step_qonnx_to_finn", + # Tidy up the graph after converting from QONNX to FINN format + # Note: Triggers a verification step + "step_tidy_up", + # Positional encoding needs to be streamlined first with slightly + # different order of certain streamlining transformations to avoid + # weird rounding issue of intermediate results + step_streamline_positional, + # Custom streamlining for models containing attention operators + step_streamline_attention, + # Streamlining of the residual branches + step_streamline_residual, + # Streamline the normalization layers, i.e., transposed batch norm + step_streamline_norms, + # Another round using the default streamlining steps + # Note: Triggers a verification step + "step_streamline", + # New conversion of the scaled dot-product attention pattern + step_convert_attention_to_hw, + # Another tidy-up step to remove unnecessary dimensions and + # operations after converting the attention operators to HLS + step_tidy_up_post_attention, + # Convert the elementwise binary operations to hardware operators. 
+ # These include for example adding residual branches and positional + # encoding + step_convert_elementwise_binary_to_hw, + # Convert the Gather layer realizing the input token embedding to + # the FINN hardware implementation, i.e., the Lookup layer + step_convert_lookup_to_hw, + # Properly replicate the stream feeding the query, key and value + # projections + step_replicate_streams, + # Convert most other layers supported by FINN to HW operators + "step_convert_to_hw", + # Specialize HW layer implementations as either HLS or RTL + "step_specialize_layers", + "step_create_dataflow_partition", + # Set the folding configuration to meet the cycles per sequence + # target + set_target_parallelization(seq_len, emb_dim), + # Apply folding configuration, specifying hardware implementation + # details + # Note: This triggers a verification step + step_apply_folding_config, + "step_minimize_bit_width", + # The ScaledDotProductAttention custom op does not define any + # estimates + "step_generate_estimate_reports", + "step_hw_codegen", + "step_hw_ipgen", + # Set the attention- and residual-related FIFO depths insert FIFOs + # and apply folding configuration once again + # Note: Implement all FIFOs with a depth at least as deep as the + # sequence length in URAM. 
+ set_fifo_depths(seq_len, emb_dim, uram_threshold=seq_len), + # Run additional node-by-node verification in RTL simulation of the + # model before creating the stitched IP + # Note: end-to-end verification of the stitched IP in RTL simulation + # is still not possible due to missing float IPs + node_by_node_cppsim, + # Only for debugging for now, does not work if "vivado" style + # StreamingFIFOs are used + # node_by_node_rtlsim, + + test_step_insert_tlastmarker, # required for instrumentation_wrapper + + "step_create_stitched_ip", + + # "step_measure_rtlsim_performance", # not possible due to float components + + step_synth_harness, #TODO: replace with instr wrapper (or port it into this step) + + #"step_out_of_context_synthesis", # for synthesis results (e.g. utilization) + + # normal deployment TODO: replace with instr wrapper (or port it into this step as an option) + #"step_synthesize_bitfile", + #"step_make_pynq_driver", + #"step_deployment_package", + + #test_step_gen_vitis_xo, # preparation step for original instr wrapper integration + #test_step_gen_instrumentation_wrapper, # preparation step for original instr wrapper integration + + #test_step_gen_instrwrap_sim, # preparation step for simulation of original instr wrapper integration + #test_step_run_instrwrap_sim, # simulation with instr wrapper, disabled for now due to extreme runtime + + #test_step_export_xo, # preparation step for original instr wrapper integration + #test_step_build_platform # synthesis with instr wrapper + ] + ) + # Run the build process on the dummy attention operator graph + # TODO: maybe let this function return the cfg only, so it can be modified by bench context + build.build_dataflow_cfg(self.build_inputs["onnx_path"], cfg) + + def run(self): + self.steps_full_build_flow() diff --git a/benchmarking/dut/transformer_radioml.py b/benchmarking/dut/transformer_radioml.py new file mode 100644 index 0000000000..4d77cb4b8d --- /dev/null +++ b/benchmarking/dut/transformer_radioml.py 
@@ -0,0 +1,336 @@ +# Adapted from Christoph's attention-dummy repository + +# PyTorch base package: Math and Tensor Stuff +import torch +# Brevitas wrapper around PyTorch tensors adding quantization information +from brevitas.quant_tensor import QuantTensor +# Brevitas: Quantized versions of PyTorch layers +from brevitas.nn import ( + QuantMultiheadAttention, + QuantEltwiseAdd, + QuantIdentity, + QuantLinear, + QuantReLU +) +# Progressbar +from tqdm import trange +import numpy as np +from brevitas.export import export_qonnx +import random +import json +import subprocess +# FINN dataflow builder +import finn.builder.build_dataflow as build +import finn.builder.build_dataflow_config as build_cfg +from finn.builder.build_dataflow_config import AutoFIFOSizingMethod +from bench_base import bench, step_synth_harness +import os +from util import summarize_table, summarize_section, power_xml_to_dict, prepare_inputs, delete_dir_contents + +# Custom build steps required to streamline and convert the attention operator +from dut.transformer_custom_steps import ( + step_tidy_up_pre_attention, + step_tidy_up_post_attention, + step_streamline_attention, + step_streamline_residual, + step_streamline_norms, + step_streamline_positional, + step_convert_attention_to_hw, + step_convert_elementwise_binary_to_hw, + step_convert_lookup_to_hw, + step_replicate_streams, + set_target_parallelization, + set_fifo_depths, + step_apply_folding_config, + node_by_node_rtlsim, + node_by_node_cppsim +) +from performance.platform_build_steps import( + test_step_gen_vitis_xo, + test_step_gen_instrumentation_wrapper, + test_step_gen_instrwrap_sim, + test_step_insert_tlastmarker, + test_step_export_xo, + test_step_build_platform, + test_step_run_instrwrap_sim +) + +### ADAPTED FROM utils.py +# Seeds all relevant random number generators to the same seed for +# reproducibility +def seed(s): + random.seed(s) + np.random.seed(s) + torch.manual_seed(s) + +template_folding_yaml = """ +# Per operator type 
default configurations +defaults: + # Scaled dot-product attention head implemented via HLS + ScaledDotProductAttention_hls: + # Type of memory to be used for internal buffer storage + # Options: auto, block, distributed, ultra + ram_style: block + # Type of memory to be used for threshold storage + # Options: auto, block, distributed + ram_style_thresholds: block + # Type of memory to be used fo the attention mask (if present) + # Options: auto, block, distributed + ram_style_mask: block + # Resource type to be used for implementing multiplications/MACs + # Options: auto, lut or dsp + mac_resource: lut + # Addition of two inputs (constants or streamed) implemented via HLS + ElementwiseAdd_hls: + # Type of memory to be used for internal buffer storage and/or constant + # parameter tensors + # Options: auto, block, distributed, ultra + ram_style: distributed + # Matrix vector activation unit implemented via HLS + MVAU_hls: + # Resource type to be used for implementing multiplications/MACs + # Options: auto, lut or dsp + resType: dsp + # Memory mode for weight storage + # Options: internal_embedded, internal_decoupled, external + mem_mode: internal_decoupled + # Type of memory to be used for weight storage if "internal_decoupled" + # Options: auto, block, distributed, ultra + ram_style: block + # Type of memory to be used for threshold storage + # Options: auto, block, distributed + ram_style_thresholds: block + # Makes weights writeable through AXI-lite interface at runtime + runtime_writeable_weights: 0 + # Matrix vector activation unit implemented via RTL + MVAU_rtl: + # Resource type to be used for implementing multiplications/MACs + # Options: auto, lut or dsp + # Note: RTL MVAU currently does not support LUT-based implementation + resType: dsp + # Memory mode for weight storage + # Options: internal_embedded, internal_decoupled, external + mem_mode: internal_decoupled + # Type of memory to be used for weight storage if "internal_decoupled" + # Options: auto, 
block, distributed, ultra + ram_style: block + # Makes weights writeable through AXI-lite interface at runtime + runtime_writeable_weights: 0 + # Multi-thresholds implemented via HLS (applies to standalone thresholds) + Thresholding_hls: + # Memory mode for threshold storage + # Options: internal_embedded, internal_decoupled + mem_mode: internal_decoupled + # Type of memory to be used for threshold storage if "internal_decoupled" + # Options: distributed, block + ram_style: distributed + # Makes thresholds writeable through AXI-lite interface at runtime + runtime_writeable_weights: 0 + # Multi-thresholds implemented via RTL (applies to standalone thresholds) + Thresholding_rtl: + # Decides to use BRAM, URAM or LUTs for threshold memory, depending on the + # depth of the thresholds + # Note: This combination forces "distributed" LUT implementation + depth_trigger_uram: 2147483647 # "infinity" + depth_trigger_bram: 2147483647 # "infinity" + # # Note: This combination forces "block" RAM implementation + # depth_trigger_uram: 0 + # depth_trigger_bram: 1 + # # Note: This combination forces "ultra" RAM implementation + # depth_trigger_uram: 1 + # depth_trigger_bram: 0 + # # Note: This combination is equivalent to "auto" + # depth_trigger_uram: 0 + # depth_trigger_bram: 0 + # Makes thresholds writeable through AXI-lite interface at runtime + runtime_writeable_weights: 0 + # FIFO implemented via RTL (there is no HLS FIFO implementation in FINN) + StreamingFIFO_rtl: + # RTL vs. IPI implementation of FIFOs + # Options: rtl, vivado + impl_style: rtl + # Resource type for FIFOs when impl_style is vivado + # Options: auto, block, distributed, ultra + ram_style: distributed + # Individual, named node-specific configurations here + # ... 
+""" + +class bench_transformer_radioml(bench): + def step_build(self): + #with open("params.yaml") as file: + # params = yaml.safe_load(file) + # Seed all RNGs + seed(self.params["seed"]) + # Extract sequence length and embedding dimension from parameters + _, seq_len, emb_dim = np.load(self.build_inputs["input_npy_path"]).shape + + # Prepare config files + # TODO: make configurable + # TODO: log intermediate files such as inp.npy, folding.yaml, or specialize_layers.jon as artifacts, maybe create in unique temp dirs + specialize_layers_dict = { + "Defaults": { + "preferred_impl_style": ["rtl", ["MVAU", "Thresholding"]] + }, + "": { + "preferred_impl_style": "" + } + } + with open("specialize_layers.json", "w") as f: + json.dump(specialize_layers_dict, f, indent=2) + with open("folding.yaml", "w") as f: + f.write(template_folding_yaml) + + # Create a configuration for building the scaled dot-product attention + # operator to a hardware accelerator + cfg = build_cfg.DataflowBuildConfig( + # Unpack the build configuration parameters + #**params["build"], + output_dir = self.build_inputs["build_dir"], + stitched_ip_gen_dcp = True, + synth_clk_period_ns = self.clock_period_ns, + board = self.board, + shell_flow_type = "vivado_zynq", #TODO: Alveo support + folding_config_file = "folding.yaml", + specialize_layers_config_file = "specialize_layers.json", + standalone_thresholds = True, + max_multithreshold_bit_width = 16, + mvau_wwidth_max = 2048, + split_large_fifos = True, + + verbose=False, # if True prints stdout and stderr to console instead of build_dataflow.log + + generate_outputs=[ + build_cfg.DataflowOutputType.ESTIMATE_REPORTS, + build_cfg.DataflowOutputType.STITCHED_IP, # required for HarnessBuild, OOC_SYNTH, and RTLSIM + #build_cfg.DataflowOutputType.PYNQ_DRIVER, #TODO: currently broken (assert i_consumer.op_type == "StreamingDataflowPartition"), might be useful for functional verification on hw later + #build_cfg.DataflowOutputType.OOC_SYNTH, # requires 
stitched-ip, not needed because ZynqBuild/HarnessBuild is performed + #build_cfg.DataflowOutputType.BITFILE, # does not require stitched-ip, not needed because HarnessBuild is performed + #build_cfg.DataflowOutputType.RTLSIM_PERFORMANCE, # not possible due to float components + #build_cfg.DataflowOutputType.DEPLOYMENT_PACKAGE # not needed, just a copy operation + ], + + verify_steps=[ + # Verify the model after converting to the FINN onnx dialect + build_cfg.VerificationStepType.QONNX_TO_FINN_PYTHON, + # Verify the model again using python mode after the default + # streamlining step + build_cfg.VerificationStepType.STREAMLINED_PYTHON, + # Verify the model again after tidy up transformations, right before + # converting to HLS + build_cfg.VerificationStepType.TIDY_UP_PYTHON, + # Verify the model after generating C++ HLS and applying folding + build_cfg.VerificationStepType.FOLDED_HLS_CPPSIM, + ], + # File with test inputs for verification + verify_input_npy=self.build_inputs["input_npy_path"], + # File with expected test outputs for verification + verify_expected_output_npy=self.build_inputs["output_npy_path"], + # Save the intermediate model graphs + save_intermediate_models=True, + # Avoid RTL simulation for setting the FIFO sizes + auto_fifo_strategy=AutoFIFOSizingMethod.CHARACTERIZE, + # Do not automatically set FIFO sizes as this requires RTL simulation + # not implemented for the attention operator + auto_fifo_depths=False, + # Build steps to execute + steps=[ + # Need to apply some tidy-up transformations before converting to + # the finn dialect of onnx + step_tidy_up_pre_attention, + # Convert all QONNX Quant nodes to Multithreshold nodes + "step_qonnx_to_finn", + # Tidy up the graph after converting from QONNX to FINN format + # Note: Triggers a verification step + "step_tidy_up", + # Positional encoding needs to be streamlined first with slightly + # different order of certain streamlining transformations to avoid + # weird rounding issue of intermediate 
results + step_streamline_positional, + # Custom streamlining for models containing attention operators + step_streamline_attention, + # Streamlining of the residual branches + step_streamline_residual, + # Streamline the normalization layers, i.e., transposed batch norm + step_streamline_norms, + # Another round using the default streamlining steps + # Note: Triggers a verification step + "step_streamline", + # New conversion of the scaled dot-product attention pattern + step_convert_attention_to_hw, + # Another tidy-up step to remove unnecessary dimensions and + # operations after converting the attention operators to HLS + step_tidy_up_post_attention, + # Convert the elementwise binary operations to hardware operators. + # These include for example adding residual branches and positional + # encoding + step_convert_elementwise_binary_to_hw, + # Convert the Gather layer realizing the input token embedding to + # the FINN hardware implementation, i.e., the Lookup layer + step_convert_lookup_to_hw, + # Properly replicate the stream feeding the query, key and value + # projections + step_replicate_streams, + # Convert most other layers supported by FINN to HW operators + "step_convert_to_hw", + # Specialize HW layer implementations as either HLS or RTL + "step_specialize_layers", + "step_create_dataflow_partition", + # Set the folding configuration to meet the cycles per sequence + # target + set_target_parallelization(seq_len, emb_dim), + # Apply folding configuration, specifying hardware implementation + # details + # Note: This triggers a verification step + step_apply_folding_config, + "step_minimize_bit_width", + # The ScaledDotProductAttention custom op does not define any + # estimates + "step_generate_estimate_reports", + "step_hw_codegen", + "step_hw_ipgen", + # Set the attention- and residual-related FIFO depths insert FIFOs + # and apply folding configuration once again + # Note: Implement all FIFOs with a depth at least as deep as the + # sequence length 
in URAM. + set_fifo_depths(seq_len, emb_dim, uram_threshold=seq_len), + # Run additional node-by-node verification in RTL simulation of the + # model before creating the stitched IP + # Note: end-to-end verification of the stitched IP in RTL simulation + # is still not possible due to missing float IPs + node_by_node_cppsim, + # Only for debugging for now, does not work if "vivado" style + # StreamingFIFOs are used + # node_by_node_rtlsim, + + test_step_insert_tlastmarker, # required for instrumentation_wrapper + + "step_create_stitched_ip", + + # "step_measure_rtlsim_performance", # not possible due to float components + + step_synth_harness, #TODO: replace with instr wrapper (or port it into this step) + + #"step_out_of_context_synthesis", # for synthesis results (e.g. utilization) + + # normal deployment TODO: replace with instr wrapper (or port it into this step as an option) + #"step_synthesize_bitfile", + #"step_make_pynq_driver", + #"step_deployment_package", + + #test_step_gen_vitis_xo, # preparation step for original instr wrapper integration + #test_step_gen_instrumentation_wrapper, # preparation step for original instr wrapper integration + + #test_step_gen_instrwrap_sim, # preparation step for simulation of original instr wrapper integration + #test_step_run_instrwrap_sim, # simulation with instr wrapper, disabled for now due to extreme runtime + + #test_step_export_xo, # preparation step for original instr wrapper integration + #test_step_build_platform # synthesis with instr wrapper + ] + ) + # Run the build process on the dummy attention operator graph + # TODO: maybe let this function return the cfg only, so it can be modified by bench context + build.build_dataflow_cfg(self.build_inputs["onnx_path"], cfg) + + def run(self): + self.steps_full_build_flow() diff --git a/benchmarking/harness/sink/ip/component.xml b/benchmarking/harness/sink/ip/component.xml new file mode 100644 index 0000000000..cb20a9abad --- /dev/null +++ 
b/benchmarking/harness/sink/ip/component.xml @@ -0,0 +1,256 @@ + + + xilinx.com + user + harness_sink + 1.0 + + + s_axis_0 + + + + + + + TDATA + + + s_axis_0_tdata + + + + + TVALID + + + s_axis_0_tvalid + + + + + TREADY + + + s_axis_0_tready + + + + + + + + + xilinx_anylanguagesynthesis + Synthesis + :vivado.xilinx.com:synthesis + Verilog + harness_sink + + xilinx_anylanguagesynthesis_view_fileset + + + + viewChecksum + 18b9f9a4 + + + + + xilinx_anylanguagebehavioralsimulation + Simulation + :vivado.xilinx.com:simulation + Verilog + harness_sink + + xilinx_anylanguagebehavioralsimulation_view_fileset + + + + viewChecksum + 18b9f9a4 + + + + + xilinx_xpgui + UI Layout + :vivado.xilinx.com:xgui.ui + + xilinx_xpgui_view_fileset + + + + viewChecksum + 6955aee3 + + + + + + + enable + + in + + + std_logic + xilinx_anylanguagesynthesis + xilinx_anylanguagebehavioralsimulation + + + + + + valid + + out + + + std_logic + xilinx_anylanguagesynthesis + xilinx_anylanguagebehavioralsimulation + + + + + + checksum + + out + + + std_logic + xilinx_anylanguagesynthesis + xilinx_anylanguagebehavioralsimulation + + + + + + s_axis_0_tdata + + in + + 7 + 0 + + + + std_logic_vector + xilinx_anylanguagesynthesis + xilinx_anylanguagebehavioralsimulation + + + + 0 + + + + + s_axis_0_tvalid + + in + + + std_logic + xilinx_anylanguagesynthesis + xilinx_anylanguagebehavioralsimulation + + + + + + s_axis_0_tready + + out + + + std_logic + xilinx_anylanguagesynthesis + xilinx_anylanguagebehavioralsimulation + + + + + + + + STREAM_WIDTH + Stream Width + 8 + + + + + + xilinx_anylanguagesynthesis_view_fileset + + src/harness_sink.v + verilogSource + CHECKSUM_18b9f9a4 + IMPORTED_FILE + + + + xilinx_anylanguagebehavioralsimulation_view_fileset + + src/harness_sink.v + verilogSource + IMPORTED_FILE + + + + xilinx_xpgui_view_fileset + + xgui/harness_sink_v1_0.tcl + tclSource + CHECKSUM_6955aee3 + XGUI_VERSION_2 + + + + harness_sink_v1_0 + + + STREAM_WIDTH + Stream Width + 8 + + + Component_Name + 
harness_sink_v1_0 + + + + + + zynq + qzynq + azynq + zynquplus + + + /UserIP + + harness_sink_v1_0 + level_0 + package_project + 2 + 2023-08-22T13:34:35Z + + + 2022.2 + + + + + + + + + + + + + diff --git a/benchmarking/harness/sink/ip/src/harness_sink.v b/benchmarking/harness/sink/ip/src/harness_sink.v new file mode 100644 index 0000000000..e6b95e7797 --- /dev/null +++ b/benchmarking/harness/sink/ip/src/harness_sink.v @@ -0,0 +1,39 @@ +`timescale 1ns / 1ps +////////////////////////////////////////////////////////////////////////////////// +// Company: +// Engineer: +// +// Create Date: 08/22/2023 02:19:08 PM +// Design Name: +// Module Name: harness_sink +// Project Name: +// Target Devices: +// Tool Versions: +// Description: +// +// Dependencies: +// +// Revision: +// Revision 0.01 - File Created +// Additional Comments: +// +////////////////////////////////////////////////////////////////////////////////// + + +module harness_sink #( + parameter STREAM_WIDTH=8 +)( + input enable, + output valid, + output checksum, + input [STREAM_WIDTH-1:0] s_axis_0_tdata, + input s_axis_0_tvalid, + output s_axis_0_tready +); + +assign s_axis_0_tready = enable; + +assign valid = s_axis_0_tvalid; +assign checksum = ^s_axis_0_tdata; + +endmodule diff --git a/benchmarking/harness/sink/ip/xgui/harness_sink_v1_0.tcl b/benchmarking/harness/sink/ip/xgui/harness_sink_v1_0.tcl new file mode 100644 index 0000000000..eb752d53a5 --- /dev/null +++ b/benchmarking/harness/sink/ip/xgui/harness_sink_v1_0.tcl @@ -0,0 +1,25 @@ +# Definitional proc to organize widgets for parameters. 
+proc init_gui { IPINST } { + ipgui::add_param $IPINST -name "Component_Name" + #Adding Page + set Page_0 [ipgui::add_page $IPINST -name "Page 0"] + ipgui::add_param $IPINST -name "STREAM_WIDTH" -parent ${Page_0} + + +} + +proc update_PARAM_VALUE.STREAM_WIDTH { PARAM_VALUE.STREAM_WIDTH } { + # Procedure called to update STREAM_WIDTH when any of the dependent parameters in the arguments change +} + +proc validate_PARAM_VALUE.STREAM_WIDTH { PARAM_VALUE.STREAM_WIDTH } { + # Procedure called to validate STREAM_WIDTH + return true +} + + +proc update_MODELPARAM_VALUE.STREAM_WIDTH { MODELPARAM_VALUE.STREAM_WIDTH PARAM_VALUE.STREAM_WIDTH } { + # Procedure called to set VHDL generic/Verilog parameter value(s) based on TCL parameter value + set_property value [get_property value ${PARAM_VALUE.STREAM_WIDTH}] ${MODELPARAM_VALUE.STREAM_WIDTH} +} + diff --git a/benchmarking/harness/vector_xor.v b/benchmarking/harness/vector_xor.v new file mode 100644 index 0000000000..3361860ab8 --- /dev/null +++ b/benchmarking/harness/vector_xor.v @@ -0,0 +1,32 @@ +`timescale 1ns / 1ps +////////////////////////////////////////////////////////////////////////////////// +// Company: +// Engineer: +// +// Create Date: 08/22/2023 02:19:08 PM +// Design Name: +// Module Name: harness_sink +// Project Name: +// Target Devices: +// Tool Versions: +// Description: +// +// Dependencies: +// +// Revision: +// Revision 0.01 - File Created +// Additional Comments: +// +////////////////////////////////////////////////////////////////////////////////// + + +module vector_xor #( + parameter WIDTH=8 +)( + input [WIDTH-1:0] in_data, + output out_data +); + +assign out_data = ^in_data; + +endmodule diff --git a/benchmarking/templates.py b/benchmarking/templates.py new file mode 100644 index 0000000000..c8bf944380 --- /dev/null +++ b/benchmarking/templates.py @@ -0,0 +1,213 @@ +# Template strings for benchmarking + + +# power report scripting based on Lucas Reuter: +template_open = """ +open_project $PROJ_PATH$ 
+open_run $RUN$ +""" + +template_single_test = """ +set_switching_activity -toggle_rate $TOGGLE_RATE$ -static_probability $STATIC_PROB$ -hier -type lut [get_cells -r finn_design_i/.*] +set_switching_activity -toggle_rate $TOGGLE_RATE$ -static_probability $STATIC_PROB$ -hier -type register [get_cells -r finn_design_i/.*] +set_switching_activity -deassert_resets +report_power -file $REPORT_PATH$/$REPORT_NAME$.xml -format xml +reset_switching_activity -hier -type lut [get_cells -r finn_design_i/.*] +reset_switching_activity -hier -type register [get_cells -r finn_design_i/.*] +""" + +# template_single_test_type = """ +# set_switching_activity -toggle_rate $TOGGLE_RATE$ -static_probability $STATIC_PROB$ -hier -type $SWITCH_TARGET$ [get_cells -r finn_design_i/.*] +# set_switching_activity -deassert_resets +# report_power -file $REPORT_PATH$/$REPORT_NAME$.xml -format xml +# reset_switching_activity -hier -type $SWITCH_TARGET$ [get_cells -r finn_design_i/.*] +# """ + +template_sim_power = """ +set_property SOURCE_SET sources_1 [get_filesets sim_1] +import_files -fileset sim_1 -norecurse $TB_FILE_PATH$ +set_property top switching_simulation_tb [get_filesets sim_1] +update_compile_order -fileset sim_1 + +launch_simulation -mode post-implementation -type functional +restart +open_saif $SAIF_FILE_PATH$ +log_saif [get_objects -r /switching_simulation_tb/dut/*] +run $SIM_DURATION_NS$ ns +close_saif + +read_saif $SAIF_FILE_PATH$ +report_power -file $REPORT_PATH$/$REPORT_NAME$.xml -format xml +""" + +# TODO: configurable clock frequency +template_switching_simulation_tb = """ +`timescale 1 ns/10 ps + +module switching_simulation_tb; +reg clk; +reg rst; + +//dut inputs +reg tready; +reg [$INSTREAM_WIDTH$-1:0] tdata; +reg tvalid; + +//dut outputs +wire [$OUTSTREAM_WIDTH$-1:0] accel_tdata; +wire accel_tready; +wire accel_tvalid; + +finn_design_wrapper dut( + .ap_clk(clk), + .ap_rst_n(rst), + .m_axis_0_tdata(accel_tdata), + .m_axis_0_tready(tready), + .m_axis_0_tvalid(accel_tvalid), 
+ .s_axis_0_tdata(tdata), + .s_axis_0_tready(accel_tready), + .s_axis_0_tvalid(tvalid) + ); + +always + begin + clk = 0; + #2.5; + clk = 1; + #2.5; + end + +integer i; +initial + begin + tready = 0; + tdata = 0; + tvalid = 0; + rst = 0; + #50; + rst = 1; + tvalid = 1; + tready = 1; + while(1) + begin + for (i = 0; i < $INSTREAM_WIDTH$/$DTYPE_WIDTH$; i = i+1) begin + tdata[i*$DTYPE_WIDTH$ +: $DTYPE_WIDTH$] = $RANDOM_FUNCTION$; + end + #5; + end + end +endmodule +""" + +zynq_harness_template = """ +set FREQ_MHZ %s +set NUM_AXILITE %d +if {$NUM_AXILITE > 9} { + error "Maximum 10 AXI-Lite interfaces supported" +} +set NUM_AXIMM %d +set BOARD %s +set FPGA_PART %s +create_project finn_zynq_link ./ -part $FPGA_PART + +# set board part repo paths to find boards installed by FINN +set paths_prop [get_property BOARD_PART_REPO_PATHS [current_project]] +set paths_param [get_param board.repoPaths] +lappend paths_prop $::env(FINN_ROOT)/deps/board_files +lappend paths_param $::env(FINN_ROOT)/deps/board_files +set_property BOARD_PART_REPO_PATHS $paths_prop [current_project] +set_param board.repoPaths $paths_param + +if {$BOARD == "RFSoC2x2"} { + set_property board_part xilinx.com:rfsoc2x2:part0:1.1 [current_project] + set ZYNQ_TYPE "zynq_us+" +} else { + puts "Unrecognized board" +} + +create_bd_design "top" +if {$ZYNQ_TYPE == "zynq_us+"} { + set zynq_ps_vlnv [get_property VLNV [get_ipdefs "xilinx.com:ip:zynq_ultra_ps_e:*"]] + create_bd_cell -type ip -vlnv $zynq_ps_vlnv zynq_ps + apply_bd_automation -rule xilinx.com:bd_rule:zynq_ultra_ps_e -config {apply_board_preset "1" } [get_bd_cells zynq_ps] + set_property CONFIG.PSU__DISPLAYPORT__PERIPHERAL__ENABLE {0} [get_bd_cells zynq_ps] + #activate one slave port, deactivate the second master port + set_property -dict [list CONFIG.PSU__USE__S_AXI_GP2 {0}] [get_bd_cells zynq_ps] + set_property -dict [list CONFIG.PSU__USE__M_AXI_GP1 {0}] [get_bd_cells zynq_ps] + #set frequency of PS clock (this can't always be exactly met) + set_property 
-dict [list CONFIG.PSU__OVERRIDE__BASIC_CLOCK {0}] [get_bd_cells zynq_ps] + set_property -dict [list CONFIG.PSU__CRL_APB__PL0_REF_CTRL__FREQMHZ [expr int($FREQ_MHZ)]] [get_bd_cells zynq_ps] +} else { + puts "Unrecognized Zynq type" +} + +#instantiate axi interconnect, axi smartconnect +set interconnect_vlnv [get_property VLNV [get_ipdefs -all "xilinx.com:ip:axi_interconnect:*" -filter design_tool_contexts=~*IPI*]] +#set smartconnect_vlnv [get_property VLNV [get_ipdefs "xilinx.com:ip:smartconnect:*"]] +create_bd_cell -type ip -vlnv $interconnect_vlnv axi_interconnect_0 +#create_bd_cell -type ip -vlnv $smartconnect_vlnv smartconnect_0 +#set number of axilite interfaces, and number of axi master interfaces +#set_property -dict [list CONFIG.NUM_SI $NUM_AXIMM] [get_bd_cells smartconnect_0] +set_property -dict [list CONFIG.NUM_MI $NUM_AXILITE] [get_bd_cells axi_interconnect_0] + +#create reset controller and connect interconnects to PS +if {$ZYNQ_TYPE == "zynq_us+"} { + set axi_peripheral_base 0xA0000000 + #connect_bd_intf_net [get_bd_intf_pins smartconnect_0/M00_AXI] [get_bd_intf_pins zynq_ps/S_AXI_HP0_FPD] + connect_bd_intf_net [get_bd_intf_pins zynq_ps/M_AXI_HPM0_FPD] -boundary_type upper [get_bd_intf_pins axi_interconnect_0/S00_AXI] + #connect interconnect clocks and resets + apply_bd_automation -rule xilinx.com:bd_rule:clkrst -config { Clk {/zynq_ps/pl_clk0} Freq {} Ref_Clk0 {} Ref_Clk1 {} Ref_Clk2 {}} [get_bd_pins axi_interconnect_0/ACLK] + apply_bd_automation -rule xilinx.com:bd_rule:clkrst -config { Clk {/zynq_ps/pl_clk0} Freq {} Ref_Clk0 {} Ref_Clk1 {} Ref_Clk2 {}} [get_bd_pins axi_interconnect_0/S00_ACLK] + #apply_bd_automation -rule xilinx.com:bd_rule:clkrst -config { Clk {/zynq_ps/pl_clk0} Freq {} Ref_Clk0 {} Ref_Clk1 {} Ref_Clk2 {}} [get_bd_pins zynq_ps/saxihp0_fpd_aclk] +} +#connect_bd_net [get_bd_pins axi_interconnect_0/ARESETN] [get_bd_pins smartconnect_0/aresetn] + +#procedure used by below IP instantiations to map BD address segments based on the axi 
interface aperture +proc assign_axi_addr_proc {axi_intf_path} { + #global variable holds current base address + global axi_peripheral_base + #infer range + set range [expr 2**[get_property CONFIG.ADDR_WIDTH [get_bd_intf_pins $axi_intf_path]]] + set range [expr $range < 4096 ? 4096 : $range] + #align base address to range + set offset [expr ($axi_peripheral_base + ($range-1)) & ~($range-1)] + #perform assignment + assign_bd_address [get_bd_addr_segs $axi_intf_path/Reg*] -offset $offset -range $range + #advance base address + set axi_peripheral_base [expr $offset + $range] +} + +#custom IP instantiations/connections start here +%s + +#finalize clock and reset connections for interconnects +if {$ZYNQ_TYPE == "zynq_us+"} { + apply_bd_automation -rule xilinx.com:bd_rule:clkrst -config { Clk {/zynq_ps/pl_clk0} } [get_bd_pins axi_interconnect_0/M*_ACLK] +} + +save_bd_design +assign_bd_address +validate_bd_design + +set_property SYNTH_CHECKPOINT_MODE "Hierarchical" [ get_files top.bd ] +make_wrapper -files [get_files top.bd] -import -fileset sources_1 -top + +#set_property strategy Flow_PerfOptimized_high [get_runs synth_1] +#set_property STEPS.SYNTH_DESIGN.ARGS.DIRECTIVE AlternateRoutability [get_runs synth_1] +#set_property STEPS.SYNTH_DESIGN.ARGS.RETIMING true [get_runs synth_1] +#set_property strategy Performance_ExtraTimingOpt [get_runs impl_1] +#set_property STEPS.OPT_DESIGN.ARGS.DIRECTIVE Explore [get_runs impl_1] +#set_property STEPS.POST_ROUTE_PHYS_OPT_DESIGN.ARGS.DIRECTIVE AggressiveExplore [get_runs impl_1] +#set_property STEPS.PHYS_OPT_DESIGN.ARGS.DIRECTIVE AggressiveExplore [get_runs impl_1] +#set_property STEPS.POST_ROUTE_PHYS_OPT_DESIGN.IS_ENABLED true [get_runs impl_1] + +# out-of-context synth can't be used for bitstream generation +# set_property -name {STEPS.SYNTH_DESIGN.ARGS.MORE OPTIONS} -value {-mode out_of_context} -objects [get_runs synth_1] +launch_runs -to_step write_bitstream impl_1 +wait_on_run [get_runs impl_1] + +# generate synthesis report 
+open_run impl_1 +report_utilization -hierarchical -hierarchical_depth 4 -file synth_report.xml -format xml +close_project +""" diff --git a/benchmarking/util.py b/benchmarking/util.py new file mode 100644 index 0000000000..17dec02762 --- /dev/null +++ b/benchmarking/util.py @@ -0,0 +1,87 @@ +# Utility functions for benchmarking +import os, shutil +from qonnx.core.datatype import DataType +import xml.etree.ElementTree as ET + +def _find_rows_and_headers(table): + rows = table.findall("tablerow") + headers = [] + + for row in rows: + headers = row.findall("tableheader") + if len(headers) > 0: + break + return (rows, headers) + + +def summarize_table(table): + table_summary = {} + table_summary["headers"] = [] + rows, headers = _find_rows_and_headers(table) + + if len(headers) > 0: + string = "Header: " + for header in headers: + table_summary["headers"].append(header.attrib["contents"]) + string = string + header.attrib["contents"] + " " + # print(string.rstrip()) + + for row in rows: + cells = row.findall("tablecell") + if len(cells) > 0: + cell_name = cells[0].attrib["contents"] + string = cell_name + table_summary[cell_name] = [] + for cell in cells[1:]: + table_summary[cell_name].append(cell.attrib["contents"]) + string = string + cell.attrib["contents"] + " " + # print(string.rstrip()) + + return table_summary + + +def summarize_section(section): + section_summary = {} + section_summary["tables"] = [] + section_summary["subsections"] = {} + + # print("Section:", section.attrib["title"]) + tables = section.findall("table") + sub_sections = section.findall("section") + for table in tables: + section_summary["tables"].append(summarize_table(table)) + # print("") + for sub_section in sub_sections: + section_summary["subsections"][sub_section.attrib["title"]] = summarize_section(sub_section) + + return section_summary + + +def power_xml_to_dict(xml_path): + tree = ET.parse(xml_path) + root = tree.getroot() + sections = root.findall("section") + result = {} + + for 
section in sections: + result[section.attrib["title"]] = summarize_section(section) + + return result + +def prepare_inputs(input_tensor, idt, wdt): + if wdt == DataType["BIPOLAR"] and idt == DataType["BIPOLAR"]: + # convert bipolar to binary + return {"inp": (input_tensor + 1) / 2} + else: + return {"inp": input_tensor} + +def delete_dir_contents(dir): + for filename in os.listdir(dir): + file_path = os.path.join(dir, filename) + try: + if os.path.isfile(file_path) or os.path.islink(file_path): + os.unlink(file_path) + elif os.path.isdir(file_path): + shutil.rmtree(file_path) + except Exception as e: + print('Failed to delete %s. Reason: %s' % (file_path, e)) From cc61f000c16a66c104b94f018c598174be3125a7 Mon Sep 17 00:00:00 2001 From: Felix Jentzsch Date: Wed, 29 Jan 2025 14:19:41 +0000 Subject: [PATCH 002/125] Pull in new Transformer flow --- benchmarking/bench.py | 4 - .../cfg/resnet50_fifosizing_test.json | 2 +- benchmarking/dut/transformer.py | 125 +- benchmarking/dut/transformer_custom_steps.py | 1091 ++++++++++++----- benchmarking/dut/transformer_gpt.py | 348 ------ benchmarking/dut/transformer_radioml.py | 336 ----- 6 files changed, 864 insertions(+), 1042 deletions(-) delete mode 100644 benchmarking/dut/transformer_gpt.py delete mode 100644 benchmarking/dut/transformer_radioml.py diff --git a/benchmarking/bench.py b/benchmarking/bench.py index 77f62bd775..db6f00c159 100644 --- a/benchmarking/bench.py +++ b/benchmarking/bench.py @@ -130,10 +130,6 @@ def get_default_session_options_new(): # give bench subclass name directly in config? 
if config_select.startswith("mvau"): bench_object = bench_mvau(params, task_id, run_id, artifacts_dir, save_dir) - elif config_select.startswith("transformer_radioml"): - bench_object = bench_transformer_radioml(params, task_id, run_id, artifacts_dir, save_dir) - elif config_select.startswith("transformer_gpt"): - bench_object = bench_transformer_gpt(params, task_id, run_id, artifacts_dir, save_dir) elif config_select.startswith("transformer"): bench_object = bench_transformer(params, task_id, run_id, artifacts_dir, save_dir) elif config_select.startswith("fifosizing"): diff --git a/benchmarking/cfg/resnet50_fifosizing_test.json b/benchmarking/cfg/resnet50_fifosizing_test.json index 1e85b972da..fbb0075dae 100644 --- a/benchmarking/cfg/resnet50_fifosizing_test.json +++ b/benchmarking/cfg/resnet50_fifosizing_test.json @@ -5,7 +5,7 @@ "specialize_path": ["/pc2/groups/hpc-prf-ekiapp/felix/ci_models/ResNet-50/U250_specialize_layers.json"], "floorplan_path": ["/pc2/groups/hpc-prf-ekiapp/felix/ci_models/ResNet-50/floorplan_resnet50.json"], - "board": ["U250"], + "board": ["U280"], "clock_period_ns": [4], "strategy": ["analytical"], diff --git a/benchmarking/dut/transformer.py b/benchmarking/dut/transformer.py index 0dc6444a55..ed9991100b 100644 --- a/benchmarking/dut/transformer.py +++ b/benchmarking/dut/transformer.py @@ -13,6 +13,7 @@ QuantReLU ) import os +from qonnx.core.modelwrapper import ModelWrapper # Progressbar from tqdm import trange import numpy as np @@ -26,34 +27,37 @@ import finn.builder.build_dataflow_config as build_cfg from finn.builder.build_dataflow_config import AutoFIFOSizingMethod from bench_base import bench, step_synth_harness +from finn.util.basic import alveo_part_map + +# Range information structure for seeding the range analysis for converting +# quantized activations to MultiThreshold +from qonnx.util.range_analysis import RangeInfo # Custom build steps required to streamline and convert the attention operator from 
dut.transformer_custom_steps import ( - step_tidy_up_pre_attention, - step_tidy_up_post_attention, - step_streamline_attention, - step_streamline_residual, - step_streamline_norms, - step_streamline_positional, + prepare_graph, + step_streamline, step_convert_attention_to_hw, step_convert_elementwise_binary_to_hw, step_convert_lookup_to_hw, + step_convert_split_concat_to_hw, + step_convert_depth_wise_to_hw, step_replicate_streams, set_target_parallelization, set_fifo_depths, step_apply_folding_config, - node_by_node_rtlsim, - node_by_node_cppsim -) -from performance.platform_build_steps import( - test_step_gen_vitis_xo, - test_step_gen_instrumentation_wrapper, - test_step_gen_instrwrap_sim, - test_step_insert_tlastmarker, - test_step_export_xo, - test_step_build_platform, - test_step_run_instrwrap_sim + node_by_node_rtlsim, # noqa: Maybe unused, only for debugging + node_by_node_cppsim, ) +# from performance.platform_build_steps import( +# test_step_gen_vitis_xo, +# test_step_gen_instrumentation_wrapper, +# test_step_gen_instrwrap_sim, +# test_step_insert_tlastmarker, +# test_step_export_xo, +# test_step_build_platform, +# test_step_run_instrwrap_sim +# ) ### ADAPTED FROM utils.py # Seeds all relevant random number generators to the same seed for @@ -791,6 +795,9 @@ def patch_non_affine_norms(model: torch.nn.Module): # noqa: Shadows model class bench_transformer(bench): def step_export_onnx(self, output_onnx_path): + # Generates a dummy transformer block, + # not used for actual models (RadioML, GPT, etc.) 
+ # Load the parameters file #params = dvc.api.params_show("params.yaml") # Seed all RNGs @@ -841,9 +848,10 @@ def step_export_onnx(self, output_onnx_path): # Compute attention output o = model(x) # Save the input and output data for verification purposes later - # TODO: go via self.build_inputs["input_npy_path"] np.save("inp.npy", x.detach().numpy()) np.save("out.npy", o.detach().numpy()) + self.build_inputs["input_npy_path"] = "inp.npy" + self.build_inputs["output_npy_path"] = "out.npy" # Export the model graph to QONNX #export_qonnx(model, (x,), "attention.onnx", **self.params["export"]) export_qonnx(model, (x,), output_onnx_path, @@ -856,8 +864,23 @@ def step_build(self): # Seed all RNGs seed(self.params["seed"]) # Extract sequence length and embedding dimension from parameters - seq_len, emb_dim = self.params["model_seq_len"], self.params["model_emb_dim"] - + if "model_seq_len" in self.params and "model_emb_dim" in self.params: + # for dummy Transformer DUT + seq_len, emb_dim = self.params["model_seq_len"], self.params["model_emb_dim"] + else: + # for real input models + _, seq_len, emb_dim = np.load(self.build_inputs["input_npy_path"]).shape + # TODO: use the following to get dimensions for GPT models? + #model = ModelWrapper(self.build_inputs["onnx_path"]) + #_, emb_dim, seq_len = model.get_tensor_shape("/emb_add/input_quant/export_handler/Quant_output_0") + + # Read the input value range information for the dataset from the parameters + # Note: Consider calibrating this on the fly from the dataset + range = [ -100, +100 ] # params["build"]["range"] # TODO: make configurable? 
+ input_range = tuple(np.array([range]).T) + # Construct the seed range information of the input tensor + range_info = RangeInfo(shape=(1, seq_len, emb_dim), range=input_range) + # Prepare config files # TODO: make configurable # TODO: log intermediate files such as inp.npy, folding.yaml, or specialize_layers.jon as artifacts, maybe create in unique temp dirs @@ -874,16 +897,21 @@ def step_build(self): with open("folding.yaml", "w") as f: f.write(template_folding_yaml) + if self.board in alveo_part_map: + shell_flow = "vitis_alveo" + else: + shell_flow = "vivado_zynq" + # Create a configuration for building the scaled dot-product attention # operator to a hardware accelerator cfg = build_cfg.DataflowBuildConfig( # Unpack the build configuration parameters - #**params["build"], + #**params["build"]["finn"], output_dir = self.build_inputs["build_dir"], - stitched_ip_gen_dcp = True, + stitched_ip_gen_dcp = False, # only needed for further manual integration synth_clk_period_ns = self.clock_period_ns, board = self.board, - shell_flow_type = "vivado_zynq", #TODO: Alveo support + shell_flow_type = shell_flow, folding_config_file = "folding.yaml", specialize_layers_config_file = "specialize_layers.json", standalone_thresholds = True, @@ -915,11 +943,14 @@ def step_build(self): build_cfg.VerificationStepType.TIDY_UP_PYTHON, # Verify the model after generating C++ HLS and applying folding build_cfg.VerificationStepType.FOLDED_HLS_CPPSIM, + # No RTL Simulation support for now ], # File with test inputs for verification - verify_input_npy="inp.npy", + verify_input_npy=self.build_inputs["input_npy_path"], # File with expected test outputs for verification - verify_expected_output_npy="out.npy", + verify_expected_output_npy=self.build_inputs["output_npy_path"], + # Output full context dump for verification steps + verify_save_full_context=True, # Save the intermediate model graphs save_intermediate_models=True, # Avoid RTL simulation for setting the FIFO sizes @@ -929,39 
+960,27 @@ def step_build(self): auto_fifo_depths=False, # Build steps to execute steps=[ - # Need to apply some tidy-up transformations before converting to - # the finn dialect of onnx - step_tidy_up_pre_attention, - # Convert all QONNX Quant nodes to Multithreshold nodes - "step_qonnx_to_finn", - # Tidy up the graph after converting from QONNX to FINN format - # Note: Triggers a verification step - "step_tidy_up", - # Positional encoding needs to be streamlined first with slightly - # different order of certain streamlining transformations to avoid - # weird rounding issue of intermediate results - step_streamline_positional, - # Custom streamlining for models containing attention operators - step_streamline_attention, - # Streamlining of the residual branches - step_streamline_residual, - # Streamline the normalization layers, i.e., transposed batch norm - step_streamline_norms, - # Another round using the default streamlining steps - # Note: Triggers a verification step - "step_streamline", - # New conversion of the scaled dot-product attention pattern + # Prepares the QONNX graph to be consumed by FINN: Cleanup, lowering + # and Quant to MultiThreshold conversion + prepare_graph(range_info=range_info), + # Unified exhaustive streamlining of complex model topologies + # including attention, residuals and splits + step_streamline, + # conversion of the scaled dot-product attention pattern to + # hardware, including cleanup and data layout squeezing step_convert_attention_to_hw, - # Another tidy-up step to remove unnecessary dimensions and - # operations after converting the attention operators to HLS - step_tidy_up_post_attention, # Convert the elementwise binary operations to hardware operators. 
# These include for example adding residual branches and positional # encoding step_convert_elementwise_binary_to_hw, - # Convert the Gather layer realizing the input token embedding to - # the FINN hardware implementation, i.e., the Lookup layer + # Convert Lookup layers, e.g., token embedding, to hardware custom + # operators step_convert_lookup_to_hw, + # Convert Split and Concat operators to hardware, e.g., splits + # contained in the GLU activation + step_convert_split_concat_to_hw, + # Convert depth-wise convolution MatMuls to VVUs + step_convert_depth_wise_to_hw, # Properly replicate the stream feeding the query, key and value # projections step_replicate_streams, @@ -997,7 +1016,7 @@ def step_build(self): # StreamingFIFOs are used # node_by_node_rtlsim, - test_step_insert_tlastmarker, # required for instrumentation_wrapper + #test_step_insert_tlastmarker, # required for instrumentation_wrapper "step_create_stitched_ip", diff --git a/benchmarking/dut/transformer_custom_steps.py b/benchmarking/dut/transformer_custom_steps.py index d28a4c501a..e122f79a0d 100644 --- a/benchmarking/dut/transformer_custom_steps.py +++ b/benchmarking/dut/transformer_custom_steps.py @@ -1,81 +1,88 @@ -# ADAPTED FROM Christoph's attention-dummy build_steps.py - +# ADAPTED FROM Christoph's radioml-transformer repository, specifically these files: +# build_steps.py +# custom/apply_config.py +# custom/composed_transformation.py +# custom/streamline.py + +# Python warning messages +import warnings +# Copies of python objects +from copy import deepcopy # Copies (deep-copies) python objects import copy # Numpy for loading and comparing the verification input/output import numpy as np # YAML for loading experiment configurations import yaml + # QONNX wrapper of ONNX model graphs from qonnx.core.modelwrapper import ModelWrapper -# QONNX quantization data types -from qonnx.core.datatype import DataType -# Converts ONNX graph nodes to QONNX custom-ops if possible -from 
qonnx.custom_op.registry import getCustomOp +# Range information structure for seeding the range analysis for converting +# quantized activations to MultiThreshold +from qonnx.util.range_analysis import RangeInfo + # QONNX graph transformations for renaming and cleaning up from qonnx.transformation.general import ( Transformation, GiveUniqueNodeNames, GiveReadableTensorNames, - RemoveUnusedTensors, - RemoveStaticGraphInputs, GiveUniqueParameterTensors, - ConvertDivToMul, - ConvertSubToAdd + RemoveStaticGraphInputs, + RemoveUnusedTensors, ) -# Converts BatchNorm operation to affine transformation -from qonnx.transformation.batchnorm_to_affine import BatchNormToAffine -# QONNX graph transformations for inferring datatypes and shapes +# QONNX graph transformations for annotating the graph with datatype and shape +# information from qonnx.transformation.infer_datatypes import InferDataTypes from qonnx.transformation.infer_shapes import InferShapes -from qonnx.transformation.infer_data_layouts import InferDataLayouts -# QONNX cleanup transformations -from qonnx.transformation.remove import RemoveIdentityOps -# Precompute constant output nodes + +# If we have a convolution with a bias tensors input, QONNX and later FINN +# expect the bias to be expressed as a standalone Add node following the Conv +# node. 
+from qonnx.transformation.extract_conv_bias import ExtractBiasFromConv +# Converts BatchNorm operation to affine transformation +from qonnx.transformation.batchnorm_to_affine import BatchNormToAffine +# Converts Gemm operation to MatMul with extracted standalone bias op +from qonnx.transformation.gemm_to_matmul import GemmToMatMul +# Converts Conv to Im2Col and MatMul with extracted standalone bias op +from qonnx.transformation.lower_convs_to_matmul import LowerConvsToMatMul +# Transposes the initializer tensors of a Quant node instead of having a +# standalone Transpose following +from qonnx.transformation.quant_constant_folding import ( + FoldTransposeIntoQuantInit +) +# Collapses chains of constants into a single constant operation or even +# initializer tensors. from qonnx.transformation.fold_constants import FoldConstants -# Streamlining transformation: This is a collection of various transformations -from finn.transformation.streamline import ( - ConvertSignToThres, RoundAndClipThresholds +# Folds quantizers into weight tensor initializers, needed for lowering +# convolutions to MatMuls +from finn.transformation.qonnx.fold_quant_weights import FoldQuantWeights +# FINN streamlining transformations reordering the graph +from finn.transformation.streamline.reorder import ( + MoveTransposePastFork, + MoveTransposePastEltwise, + MoveTransposePastJoinMul, + MoveTransposePastJoinAdd, + MoveTransposePastSplit, + MoveTransposePastJoinConcat, + MoveSqueezePastMultiThreshold, + MoveSqueezePastMatMul ) -# Fuse/Absorb operations +# FINN streamlining transformations absorbing tensors/nodes into others from finn.transformation.streamline.absorb import ( AbsorbAddIntoMultiThreshold, AbsorbSignBiasIntoMultiThreshold, - FactorOutMulSignMagnitude, - AbsorbMulIntoMultiThreshold, - Absorb1BitMulIntoMatMul, - Absorb1BitMulIntoConv -) -# Reorder operations -from finn.transformation.streamline.reorder import ( - MoveMulPastFork, - MoveLinearPastFork, - MoveTransposePastFork, - 
MoveLinearPastEltwiseAdd, - MoveScalarLinearPastInvariants, - MoveTransposePastEltwise, - MoveMulPastMaxPool, - MoveAddPastMul, - MoveScalarAddPastMatMul, - MoveAddPastConv, - MoveScalarMulPastMatMul, - MoveScalarMulPastConv, ) -# Collapse consecutive operations of the same type +# FINN streamlining transformations fusing/collapsing operations of the same +# kind from finn.transformation.streamline.collapse_repeated import ( - CollapseRepeatedMul, - CollapseRepeatedTranspose, - CollapseRepeatedAdd + CollapseRepeatedTranspose ) -# FINN transformation converting ONNX nodes to hardware custom operators -from finn.transformation.fpgadataflow.convert_to_hw_layers import ( - InferElementwiseBinaryOperation, - InferLookupLayer -) -# Remove some operations without real effect +# FINN streamlining transformations removing nodes without real effect from the +# graph from finn.transformation.streamline.remove import ( RemoveIdentityTranspose, - RemoveIdentityReshape + RemoveIdentityReshape, + RemoveIdentityOps ) # Cleanup transformation getting rid of 3d data layout from finn.transformation.squeeze import Squeeze @@ -87,14 +94,33 @@ # Mult-Head Attention support from finn.transformation.fpgadataflow.attention_heads import ( InferMultiHeads, - MoveSplitMultiHeadsPastMultiThreshold, UnrollMultiHeadAttention, + MoveSplitMultiHeadsPastMultiThreshold, MoveMergeMultiHeadsPastMultiThreshold ) -# Stream replication for outputs with multiple consumers +# Converts (infers) ONNX and QONNX nodes to FINN hardware CustomOps +from finn.transformation.fpgadataflow.convert_to_hw_layers import ( + InferSqueeze, + InferUnsqueeze, + InferElementwiseBinaryOperation, + InferSplitLayer, + InferConcatLayer, + InferLookupLayer, + InferVectorVectorActivation +) +# Converts fork-nodes to ReplicateStream hardware operator from finn.transformation.fpgadataflow.replicate_stream import ( InferReplicateStream ) +# Standard QONNX to FINN conversion function +from 
finn.transformation.qonnx.convert_qonnx_to_finn import ConvertQONNXtoFINN +from finn.transformation.qonnx.quant_act_to_multithreshold import ( + default_filter_function_generator, +) +# QONNX quantization data types +from qonnx.core.datatype import DataType +# Converts ONNX graph nodes to QONNX custom-ops if possible +from qonnx.custom_op.registry import getCustomOp # Inserts data-width converter and FIFO nodes into the model graph from finn.transformation.fpgadataflow.insert_dwc import InsertDWC from finn.transformation.fpgadataflow.insert_fifo import InsertFIFO @@ -125,6 +151,78 @@ # Execute onnx model graphs from the dataflow parent for verification from finn.util.test import execute_parent +# Base class for all QONNX graph transformations and some basic cleanup +# transformations +from qonnx.transformation.general import ( + Transformation, + ConvertDivToMul, + ConvertSubToAdd, +) + +# QONNX graph transformations for annotating the graph with datatype and shape +# information +from qonnx.transformation.infer_datatypes import InferDataTypes +from qonnx.transformation.infer_shapes import InferShapes +# Converts BatchNorm operation to affine transformation +from qonnx.transformation.batchnorm_to_affine import BatchNormToAffine + +# Groups node inputs by dynamic vs. 
initializer category +from finn.transformation.streamline.absorb import group_inputs_by_category + +# FINN streamlining transformations converting and rounding values +from finn.transformation.streamline import ( + ConvertSignToThres, + RoundAndClipThresholds +) +# FINN streamlining transformations reordering the graph +from finn.transformation.streamline.reorder import ( + MoveMulPastFork, + MoveTransposePastFork, + MoveLinearPastEltwiseAdd, + MoveScalarLinearPastInvariants, + MoveTransposePastEltwise, + MoveMulPastMaxPool, + MoveAddPastMul, + MoveScalarAddPastMatMul, + MoveAddPastConv, + MoveScalarMulPastMatMul, + MoveScalarMulPastConv, + MoveTransposePastJoinMul, + MoveTransposePastJoinAdd, + MoveMulPastJoinAdd, + MoveAddPastJoinAdd, + MoveScalarLinearPastSplit, + MoveAffinePastJoinConcat, + MoveMulPastJoinConcat, + MoveAddPastJoinConcat, + MoveTransposePastSplit, + MoveTransposePastJoinConcat, + MoveSqueezePastMultiThreshold, + is_scalar +) +# FINN streamlining transformations absorbing tensors/nodes into others +from finn.transformation.streamline.absorb import ( + AbsorbAddIntoMultiThreshold, + AbsorbSignBiasIntoMultiThreshold, + FactorOutMulSignMagnitude, + AbsorbMulIntoMultiThreshold, + Absorb1BitMulIntoMatMul, + Absorb1BitMulIntoConv, + AbsorbTransposeIntoMultiThreshold +) +# FINN streamlining transformations fusing/collapsing operations of the same +# kind +from finn.transformation.streamline.collapse_repeated import ( + CollapseRepeatedMul, + CollapseRepeatedTranspose, + CollapseRepeatedAdd +) +# FINN streamlining transformations removing nodes without real effect from the +# graph +from finn.transformation.streamline.remove import ( + RemoveIdentityTranspose, + RemoveIdentityReshape +) # Composes graph transformations such that each individual transformation as # well as the whole sequence is applied exhaustively @@ -164,201 +262,627 @@ def apply(self, model: ModelWrapper): # noqa model = model.transform(RemoveIdentityOps()) model = 
model.transform(GiveUniqueNodeNames()) model = model.transform(GiveReadableTensorNames()) + model = model.transform(InferShapes()) model = model.transform(InferDataTypes()) # Return the transformed model and indicate whether the graph actually # has been transformed by at least one transformation so the whole # sequence of transformations will be reapplied return model, graph_modified +# # Custom conversion from Quant to MultiThreshold +# TODO: Enable once fixed... +# from custom.quant_activation_to_multithreshold import ( +# QuantActivationToMultiThreshold +# ) -# Custom Streamlining transformation: Similar to the built-in transformations -# but exhaustively reapplied until none of the transformations can be applied -# anymore. -def Streamline(): # noqa: Uppercase - return ComposedTransformation([ - ConvertSubToAdd(), - ConvertDivToMul(), - BatchNormToAffine(), - ConvertSignToThres(), - MoveMulPastMaxPool(), - AbsorbSignBiasIntoMultiThreshold(), - MoveScalarLinearPastInvariants(), - MoveAddPastMul(), - MoveScalarAddPastMatMul(), - MoveAddPastConv(), - MoveScalarMulPastMatMul(), - MoveScalarMulPastConv(), - MoveAddPastMul(), - CollapseRepeatedAdd(), - CollapseRepeatedMul(), - MoveMulPastMaxPool(), - AbsorbAddIntoMultiThreshold(), - FactorOutMulSignMagnitude(), - AbsorbMulIntoMultiThreshold(), - Absorb1BitMulIntoMatMul(), - Absorb1BitMulIntoConv(), - RoundAndClipThresholds(), - ]) - - -# Function running transformations necessary to clean up models containing -# attention operators -def step_tidy_up_pre_attention(model: ModelWrapper, _): - # Add shape and datatype annotations throughout all the graph - model = model.transform(InferDataTypes()) # noqa Duplicate - model = model.transform(InferShapes()) - - # Cleanup the graph by removing redundant, unnecessary and constant nodes - # and tensors and give unique names to everything remaining - model = model.transform(GiveUniqueNodeNames()) - model = model.transform(GiveReadableTensorNames()) - model = 
model.transform(RemoveStaticGraphInputs()) - model = model.transform(RemoveUnusedTensors()) - model = model.transform(GiveUniqueParameterTensors()) - model = model.transform(FoldConstants()) - - # Remove unnecessary shape and layout transformations - model = model.transform(RemoveIdentityReshape()) - model = model.transform(RemoveIdentityTranspose()) - # Insert tensor layout annotations for Quant to MultiThreshold transform - # to determine the correct output channel dimension - model = model.transform(InferDataLayouts()) - # Return the tidied up model - return model - - -# Variant of streamlining transformations adapted to attention operators -def step_streamline_attention(model: ModelWrapper, cfg: DataflowBuildConfig): - # Exhaustively apply the pattern of streamlining and moving past fork-nodes - model = model.transform(ComposedTransformation([ - # Apply the set of standard streamlining transformations from finn to - # the model - Streamline(), - # We need a custom streamlining step to enable streamlining through - # certain fork-nodes Note: This transform is part of finn, but not - # included in the standard streamlining transformations - MoveLinearPastFork(), - # Streamline again there should be more transformations enabled after - # moving some nodes past forks - Streamline(), - ])) - - # If configured, run a verification of the transformed model on some sample - # inputs - if (VerificationStepType.STREAMLINED_PYTHON in - cfg._resolve_verification_steps()): # noqa - verify_step( - model, cfg, "streamlined_attention_python", need_parent=False - ) - - # Return the streamlined model - return model - +# Moves scale factor, i.e., scalar Mul and Div, past Im2Col (and Col2Im): These +# cannot be handled by MoveScalarLinearPastInvariants as potential padding makes +# Add-Im2Col not commute to Im2Col-Add +class MoveScalesPastIm2Col(Transformation): + # Applies the transform to a whole model graph + def apply(self, model: ModelWrapper): # noqa + # Get the model graph 
out of the model wrapper object + graph = model.graph + # Keep track of whether the graph has been modified + graph_modified = False + # Iterate all nodes in the graph keeping track of the index + for index, node in enumerate(graph.node): + # Applies to Mul operation types + if node.op_type in {"Mul", "Div"}: + # Cannot handle fork- or join-multiplications + if model.is_fork_node(node) or model.is_join_node(node): + # Softly skip this node + continue + # Only handles one forking output for now + if len(node.output) > 1: + # Softly skip this node + continue + # The first input must be dynamically received from upstream + if model.get_initializer(node.input[0]) is not None: + # Softly skip this node + continue + # Test whether the node initializer is a scalar... + if not is_scalar(model.get_initializer(node.input[1])): + # Softly skip this node + continue + # As this is not a fork-node, there can be at most one successor + successor = model.find_direct_successors(node) + # If this is the final operation in the graph, there might be no + # successor + if successor is None: + # Softly skip this node + continue + # Now there is exactly one successor which needs to be extracted + # from the list + successor = successor[0] + # Handle both, Im2Col and the inverse Col2Im, as well as padding + if successor.op_type in {"Im2Col", "Col2Im", "Pad"}: + # Get names of all tensors involved in connecting the + # nodes + inp = node.input[0] # noqa: Duplicate + mid = node.output[0] + out = successor.output[0] + # Rewire the graph to feed original input into the + # Add node first + successor.input[0] = inp + # Repurpose the middle tensor for the output of the Add + successor.output[0] = mid + # The Mul operator now gets the middle tensor as its + # input + node.input[0] = mid + # Mul now produces the original output tensor + node.output[0] = out + # Delete the shape annotation of the connecting tensors + # to be re-done later + model.set_tensor_shape(mid, None) + 
model.set_tensor_shape(out, None) + # Track whether the graph has been modified, never + # resets to False + graph_modified = True + # Break the loop after deleting shape annotations to + # immediately re-do these before changing the next + # operator + break + # Redo datatype and shape annotations + model = model.transform(InferShapes()) + model = model.transform(InferDataTypes()) + # Return the transformed model and indicate whether the transformation + # needs to be applied again + return model, graph_modified -# Streamlining transformations to be applied to residual branches -def step_streamline_residual(model: ModelWrapper, cfg: DataflowBuildConfig): - # Exhaustively apply the pattern for streamlining residual branches. This - # ensures streamlining to work for arbitrary many consecutive residual - # blocks, where one "round" of these transformations is required per block. - model = model.transform(ComposedTransformation([ - # Streamline the residual connections by moving scale factors past - # elementwise add nodes - MoveLinearPastEltwiseAdd(), - MoveLinearPastFork(), - MoveScalarLinearPastInvariants(), - # Do the normal streamlining flow once again - Streamline(), - ])) +# Moves scalar linear elementwise operations past fork nodes, applies to Add, +# Mul, Sub, Div, etc. 
+class MoveScalarLinearPastFork(Transformation): + # Applies the transform to a whole model graph + def apply(self, model: ModelWrapper): # noqa + # Get the model graph out of the model wrapper object + graph = model.graph + # Keep track of whether the graph has been modified + graph_modified = False + # Iterate all nodes in the graph keeping track of the index + for index, node in enumerate(graph.node): + # Applies to Mul-like and Add-like operation types + if node.op_type in {"Add", "Sub", "Mul", "Div"}: + # Only handles non-joining forks for now + if not model.is_fork_node(node) or model.is_join_node(node): + # Softly skip this node + continue + # Only handles one forking output for now + if len(node.output) > 1: + # Softly skip this node + continue + # Test whether the node initializer is a scalar... + if not is_scalar(model.get_initializer(node.input[1])): + # Softly skip this node + continue + # We need to insert a replica of this operation in front of each + # consumer node + for consumer in model.find_direct_successors(node): + # Create an exact replica of this operator + copy = deepcopy(node) + # Insert a new unique tensor connecting the output of the + # copy to the consumer + copy.output[0] = model.make_new_valueinfo_name() + # The original node might be connecting to multiple inputs + # of the consumer... 
+ for idx, inp in enumerate(consumer.input): + # Find each instance of connection from original node + if inp == node.output[0]: + # Rewire to connect to the replica + consumer.input[idx] = copy.output[0] + # Insert the new replica node into the graph + graph.node.insert(index + 1, copy) + # Remove the original node from the graph + graph.node.remove(node) + # Redo datatype and shape annotations + model = model.transform(InferShapes()) + model = model.transform(InferDataTypes()) + # Return the transformed model and indicate whether the transformation + # needs to be applied again + return model, graph_modified - # If configured, run a verification of the transformed model on some sample - # inputs - if (VerificationStepType.STREAMLINED_PYTHON in - cfg._resolve_verification_steps()): # noqa - verify_step( - model, cfg, "streamlined_residual_python", need_parent=False - ) +# Moves constant elementwise multiplication past another joining multiplication +class MoveConstMulPastJoinMul(Transformation): + # Applies the transform to a whole model graph # noqa: Duplicate + def apply(self, model: ModelWrapper): # noqa + # Get the model graph out of the model wrapper object + graph = model.graph + # Keep track of whether the graph has been modified + graph_modified = False + # Iterate all nodes in the graph keeping track of the index + for index, node in enumerate(graph.node): + # Applies to Mul operation types + if node.op_type == "Mul": + # Currently does not handle fork- or join-nodes + if model.is_fork_node(node) or model.is_join_node(node): + # Softly skip this node + continue + # As this is not a fork-node, there can be at most one successor + successor = model.find_direct_successors(node) + # If Squeeze is the final operation in the graph, there might + # be no successor + if successor is None: + # Softly skip this node + continue + # Now there is exactly one successor which needs to be extracted + # from the list + successor = successor[0] + # Applies to 
Multiplications + if successor.op_type in {"Mul"}: + # Applies only if the second multiplication is a join-node + if model.is_join_node(successor): + # Get names of all tensors involved in connecting the + # nodes + inp = node.input[0] # noqa: Duplicate + mid = node.output[0] + out = successor.output[0] + # Need to match the correct input of the joining second + # multiplication + for i, name in enumerate(successor.input): + # If the successors input currently matches the + # intermediate tensors, this input needs to be + # rewired + if name == mid: + # Rewire the graph to feed original into the + # second Mul node first + successor.input[i] = inp + # Note: Do not break here as it is perfectly + # legal to connect the same tensor multiple + # times to different inputs + # Repurpose the middle tensor for the output of the + # second Mul + successor.output[0] = mid + # The first Mul operator now gets the middle tensor as + # its input + node.input[0] = mid + # The first Mul now produces the original output tensor + node.output[0] = out + # Delete the shape annotation of the connecting tensors + # to be re-done later + model.set_tensor_shape(mid, None) + model.set_tensor_shape(out, None) + # Track whether the graph has been modified, never + # resets to False + graph_modified = True + # Break the loop after deleting shape annotations to + # immediately re-do these before changing the next + # operator + break + # Redo datatype and shape annotations + model = model.transform(InferShapes()) + model = model.transform(InferDataTypes()) + # Return the transformed model and indicate whether the transformation + # needs to be applied again + return model, graph_modified + +# Moves elementwise additions past MatMul operations: Applicable if each +# operation has one initializer input +class MoveAddPastMatMul(Transformation): + # Applies the transform to a whole model graph # noqa: Duplicate + def apply(self, model: ModelWrapper): # noqa + # Get the model graph out of the 
model wrapper object + graph = model.graph + # Keep track of whether the graph has been modified + graph_modified = False + # Iterate all nodes in the graph keeping track of the index + for index, node in enumerate(graph.node): + # Applies to Add operations + if node.op_type == "Add": + # If the add is a join operation, we do not have a constant + # added to the input + if model.is_join_node(node): + # Skip transforming this + continue + # If the Add is a fork operation we should first distribute the + # Add into the branches + if model.is_fork_node(node): + # Issue a warning to make the use aware of this potential + # transformation if the fork is moved first + warnings.warn( + f"{self.__class__.__name__}:" + f" Skipping near match: {node.name} is a fork-node," + f" try MoveLinearPastFork first" + ) + # Skip transforming this node as moving this would lead + # to messed up or detached graph + continue + # Decompose the inputs into the dynamic and the constant + # initializer input + (x_name,), (c_name,) = group_inputs_by_category(node, model) + # Now check the successor node which must be a MatMul + consumer = model.find_direct_successors(node) + # If there is no consumer, this Add seems to be last node of the + # graph + if not consumer: + # Skip transforming this + continue + # There must be exactly one consumer now + consumer = consumer[0] + # This transformation only applies to Add in front of MatMul + if not consumer.op_type == "MatMul": + # Skip this if not MatMul + continue + # MatMul may not be a join operation to apply this + # transformation + if model.is_join_node(consumer): + # Skip transforming without warning (there is nothing we can + # do about this) + continue + # Decompose the inputs to the MatMul to get the weight tensor + # name (the other input is the output of the Add) + _, (w_name,) = group_inputs_by_category(consumer, model) + # Read the weights and the constant addition tensor + w = model.get_initializer(w_name) + c = 
model.get_initializer(c_name) + # Determine whether the weights are the left or right input to + # the MatMul + left = w_name == consumer.input[0] + # Apply the weights to the constant tensor + c = np.matmul(w, c) if left else np.matmul(c, w) + # Insert the transformed tensor back into the mode as an + # initializer + model.set_initializer(c_name, c) + # The connecting tensors of this pattern + inp = x_name + mid = node.output[0] + out = consumer.output[0] + # Rewire the graph pattern connecting the input to the MatMul + # and the MatMul output to the Add node + consumer.input[1 if left else 0] = inp + # The Add now produces the original MatMul output + node.output[0] = out + # The middel tensor connects to the Add input + node.input[0 if node.input[0] == x_name else 1] = mid + # The MatMul feeds the middle tensors + consumer.output[0] = mid + # Delete the shape annotation of the connecting tensors + # to be re-done later + model.set_tensor_shape(mid, None) + model.set_tensor_shape(out, None) + # Delete the type annotations of the connecting tensors + # to be re-done later + # model.set_tensor_datatype(mid, None) + # model.set_tensor_datatype(out, None) + # Track whether the graph has been modified, never + # resets to False + graph_modified = True + # Break the loop after deleting shape annotations to + # immediately re-do these before changing the next + # operator + break + # Redo datatype and shape annotations + model = model.transform(InferShapes()) + model = model.transform(InferDataTypes()) + # Return the transformed model and indicate whether the transformation + # needs to be applied again + return model, graph_modified - # Return the streamlined model - return model +# Moves elementwise multiplication past elementwise addition if one input to +# each of the operators is a known constant +# Note: Reverse of MoveAddPastMul +class MoveMulPastAdd(Transformation): + # Applies the transform to a whole model graph + def apply(self, model: ModelWrapper): # noqa + 
# Get the model graph out of the model wrapper object + graph = model.graph + # Keep track of whether the graph has been modified + graph_modified = False + # Iterate all nodes in the graph keeping track of the index + for index, node in enumerate(graph.node): + # Applies to Mul operation types + if node.op_type == "Mul": + # Currently does not handle fork- or join-nodes + if model.is_fork_node(node) or model.is_join_node(node): + # Softly skip this node + continue + # As this is not a fork-node, there can be at most one successor + successor = model.find_direct_successors(node) + # If Squeeze is the final operation in the graph, there might + # be no successor + if successor is None: + # Softly skip this node + continue + # Now there is exactly one successor which needs to be extracted + # from the list + successor = successor[0] + # Applies to additions + if successor.op_type in {"Add"}: + # The addition may not join as we need to know the second + # input + if not model.is_join_node(successor): + # Get the constant initializer tensors for both + # operations: y = s * x + b + _, s_name = group_inputs_by_category(node, model) + _, b_name = group_inputs_by_category(successor, model) + # Skip if either node has no constant initializer + if not s_name or not b_name: + # Skip without warning ok? 
+ continue + # There must be exactly one constant per operations + assert len(s_name) == 1, \ + f"To many constant inputs for {node}" + assert len(b_name) == 1, \ + f"To many constant inputs for {successor}" + # Now read the initializer tensors + s = model.get_initializer(*s_name) + b = model.get_initializer(*b_name) + # Update the addition initializer according to the + # distributive law + model.set_initializer(*b_name, b / s) + # Get names of all tensors involved in connecting the + # nodes + inp = node.input[0] # noqa: Duplicate + mid = node.output[0] + out = successor.output[0] + # Rewire the graph to feed original input into the + # Add node first + successor.input[0] = inp + # Repurpose the middle tensor for the output of the Add + successor.output[0] = mid + # The Mul operator now gets the middle tensor as its + # input + node.input[0] = mid + # Mul now produces the original output tensor + node.output[0] = out + # Delete the shape annotation of the connecting tensors + # to be re-done later + model.set_tensor_shape(mid, None) + model.set_tensor_shape(out, None) + # Track whether the graph has been modified, never + # resets to False + graph_modified = True + # Break the loop after deleting shape annotations to + # immediately re-do these before changing the next + # operator + break + # Redo datatype and shape annotations + model = model.transform(InferShapes()) + model = model.transform(InferDataTypes()) + # Return the transformed model and indicate whether the transformation + # needs to be applied again + return model, graph_modified -# Streamlining transformation to be applied to the normalization layers -def step_streamline_norms(model: ModelWrapper, cfg: DataflowBuildConfig): - # Exhaustively apply the pattern for streamlining norms. This ensures - # streamlining to work for arbitrary many consecutive blocks, where one - # round of these transformations is required per block. 
- model = model.transform(ComposedTransformation([ - # Streamline transposed batch normalization (move transposes past the - # scale-bias operator, so they can be collapsed afterward) - MoveTransposePastEltwise(), - # There should now be transposes next to each other which can be - # collapsed - CollapseRepeatedTranspose(), - # The transposes around the batch normalization should be collapsed by - # now and cancel each other out - RemoveIdentityTranspose(), - # Nested, exhaustive compositions of transformations +# Define a set of custom streamlining transformations: These are applied once +# during the actual streamlining step and once after converting attention to +# hardware (the associated cleanup afterward might enable some Streamlining +# transformations once again) +def Streamline(): # noqa: Uppercase + # Return a set of exhaustively applies transformations + return ComposedTransformation([ + # On skip-connections: prefer pushing scalar multiplication forward + # before MoveAddPastMul + MoveMulPastFork(), + # The "standard" set of FINN streamlining transformations or at least + # inspired by them but applied exhaustively until none of them changes + # the graph anymore. 
+ # Note: Covers most parts of non-branching linear topologies + ComposedTransformation([ + ConvertSubToAdd(), + ConvertDivToMul(), + BatchNormToAffine(), + ConvertSignToThres(), + MoveMulPastMaxPool(), + AbsorbSignBiasIntoMultiThreshold(), + MoveScalarLinearPastInvariants(), + MoveAddPastMul(), + MoveScalarAddPastMatMul(), + MoveAddPastConv(), + MoveScalarMulPastMatMul(), + MoveScalarMulPastConv(), + MoveAddPastMul(), + CollapseRepeatedAdd(), + CollapseRepeatedMul(), + MoveMulPastMaxPool(), + AbsorbAddIntoMultiThreshold(), + FactorOutMulSignMagnitude(), + AbsorbMulIntoMultiThreshold(), + Absorb1BitMulIntoMatMul(), + Absorb1BitMulIntoConv(), + ]), + # Streamlining scales and biases forward through residual topologies + # Note: This mostly covers forking and joining operations ComposedTransformation([ - # We now might have transpose operations accumulating in front of - # fork nodes + # Note: This is probably the most common way of joining skip + # connections, i.e., this corresponds to the original residual + # addition, i.e., y = f(x) + x + MoveLinearPastEltwiseAdd(), + MoveScalarLinearPastFork(), + MoveScalarLinearPastInvariants(), + MoveMulPastFork(), + MoveMulPastJoinAdd(), + MoveAddPastJoinAdd(), + # Note: This brings constant Muls (i.e., quantizer scales to be + # removed) forward through joining Muls (i.e., those ending up + # as actual hardware operators). 
+ MoveConstMulPastJoinMul() + ]), + # Streamlining scales and biases forward through shape/layout changing + # operations, i.e., mostly transposes + ComposedTransformation([ + # Convolution inputs and padding + MoveScalesPastIm2Col(), + # Streamlining for Split and Concat operations + MoveScalarLinearPastSplit(), + MoveAffinePastJoinConcat(), + MoveMulPastJoinConcat(), + MoveAddPastJoinConcat(), + # Move transposes around to some place where they could be removed + # later, i.e., where they collapse into identities MoveTransposePastFork(), + MoveTransposePastSplit(), + MoveTransposePastJoinConcat(), MoveTransposePastEltwise(), + MoveTransposePastJoinMul(), + MoveTransposePastJoinAdd(), CollapseRepeatedTranspose(), + # Remove identity shape/layout transformations RemoveIdentityTranspose(), + RemoveIdentityReshape(), + # Squeeze operators can be moved past the thresholding + MoveSqueezePastMultiThreshold(), + # A certain type of 4d-layout transpose can be absorbed (actually + # moved past) MultiThreshold operations + AbsorbTransposeIntoMultiThreshold(), ]), - # This might have caused the normalization scale and bias to accumulate - # in front of transpose or fork node - MoveLinearPastEltwiseAdd(), - MoveLinearPastFork(), - MoveScalarLinearPastInvariants(), - # This might have enabled more streamlining transformations - Streamline(), - # We need a custom streamlining step to enable streamlining through - # certain fork-nodes Note: This transform is part of finn, but not - # included in the standard streamlining transformations - MoveLinearPastFork(), - # This might have enabled more streamlining transformations - Streamline(), - ])) - - # If configured, run a verification of the transformed model on some sample - # inputs - if (VerificationStepType.STREAMLINED_PYTHON in - cfg._resolve_verification_steps()): # noqa - verify_step(model, cfg, "streamlined_norms_python", need_parent=False) - - # Return the streamlined model - return model + # Only round and clip after all 
streamlining transformations have + # been applied exhaustively. + # Note: Might still enable another round of streamlining. + RoundAndClipThresholds(), + ]) -# Streamlining transformation to be applied to the positional encoding layer -def step_streamline_positional(model: ModelWrapper, cfg: DataflowBuildConfig): - # There is probably a division in front of the quantized positional - # encoding, which is exactly the inverse of the multiplication in front of - # that: The are the matching scale factors of the shared input quantizer of - # input and positional encoding. Convert the division to multiplication, so - # these two can be merged. - model = model.transform(ConvertDivToMul()) - # Merge the quantization scales of shared input quantizers - model = model.transform(CollapseRepeatedMul()) - # Push scalar multiplications, probably scale factors of quantizers, into - # the branches of a fork - model = model.transform(MoveMulPastFork()) +# Prepares the graph to be consumed by FINN: +# 1. Some graph cleanup removing unused tensors, nodes without effect and +# folding constants, i.e., collapsing chains of operations on constant tensors +# 2. Lowers some "more complex" operations: converts Conv and Gemm to MatMul and +# BatchNorm to Mul and Add operations followed by some necessary cleanup +# 3. 
Converts all QONNX Quant nodes to MultiThreshold operations which can +# absorb scales and biases during streamlining +def prepare_graph(range_info: RangeInfo): + # Wrap the actual transformation/build step function + def step_prepare_graph(model: ModelWrapper, cfg: DataflowBuildConfig): + # Exhaustively apply the set of cleanup transformations + model = model.transform(ComposedTransformation([ + # Adds shape and datatype annotations to all tensors in this graph + InferDataTypes(), + InferShapes(), + # Cleanup the graph by removing redundant, unnecessary and constant + # nodes and tensors and give unique names to everything remaining + GiveUniqueNodeNames(), + GiveReadableTensorNames(), + RemoveStaticGraphInputs(), + RemoveUnusedTensors(), + GiveUniqueParameterTensors(), + FoldConstants(), + # Remove unnecessary shape and layout transformations + RemoveIdentityReshape(), + RemoveIdentityTranspose(), + # Redo shape and datatype annotations after removing nodes and + # tensors + InferShapes(), + InferDataTypes(), + ])) + # If configured, run a verification of the transformed model on some + # sample inputs + if (VerificationStepType.TIDY_UP_PYTHON in + cfg._resolve_verification_steps()): # noqa + verify_step( + model, cfg, "tidied_up_python", need_parent=False + ) + # Exhaustively apply the lowering transformations + model = model.transform(ComposedTransformation([ + # Moves the bias input to the Conv operator as a separate Add node + # behind the Conv node + ExtractBiasFromConv(), + # Converts Gemm nodes to MatMul (+ bias) + GemmToMatMul(), + # Need to do some constant and weight folding first + FoldConstants(), + FoldTransposeIntoQuantInit(), + FoldQuantWeights(), + # Annotate the graph with shape and data type information + InferShapes(), + InferDataTypes(), + # Converts Conv layers to MatMul + LowerConvsToMatMul(), + # Converts BatchNorm to affine scale and bias + BatchNormToAffine(), + # Annotate the graph with shape and data type information + InferShapes(), + 
InferDataTypes(), + ])) + # If configured, run a verification of the transformed model on some + # sample inputs + if (VerificationStepType.QONNX_TO_FINN_PYTHON in + cfg._resolve_verification_steps()): # noqa + verify_step( + model, cfg, "lowered_python", need_parent=False + ) + # Apply the quantizer to MultiThreshold conversion + # Note: This is exhaustive as well as single .transform reapplies as + # long as possible. + # TODO: Enable once fixed... + # model = model.transform(QuantActivationToMultiThreshold(range_info)) + # If configured, run a verification of the transformed model on some + # sample inputs + if (VerificationStepType.QONNX_TO_FINN_PYTHON in + cfg._resolve_verification_steps()): # noqa + verify_step( + model, cfg, "quant_to_thresholds_ra_python", need_parent=False + ) + # Apply the standard QONNX to FINN conversion step to convert the + # remaining quantizers not yet covered by the new range analysis based + # method + model = model.transform(ConvertQONNXtoFINN( + filter_function=default_filter_function_generator( + max_multithreshold_bit_width=cfg.max_multithreshold_bit_width + ) + )) + # If configured, run a verification of the transformed model on some + # sample inputs + if (VerificationStepType.QONNX_TO_FINN_PYTHON in + cfg._resolve_verification_steps()): # noqa + verify_step( + model, cfg, "prepared_graph_python", need_parent=False + ) + # Return the transformed model + return model - # If configured, run a verification of the transformed model on some sample - # inputs + # Return the wrapped transformation step function + return step_prepare_graph + + +# Applies the custom set of exhaustive streamlining transformations, also taking +# special topology like attention, residuals, splits and transposes into account +def step_streamline(model: ModelWrapper, cfg: DataflowBuildConfig): + # These should not be applied exhaustively with the other streamlining + # transformations to not end up in cycles. 
+ # Note: This is essential to allow some Add operations to be + # absorbed by the next round's AbsorbSignBiasIntoMultiThreshold + model = model.transform(MoveMulPastAdd()) + model = model.transform(AbsorbSignBiasIntoMultiThreshold()) + # Exhaustively apply the following set of transformations to streamline the + # graph with the overall goal of collecting scales and biases in front of + # MultiThreshold operations or, alternatively, at the end of the graph. + # Note: Contains some sets of nested exhaustive transformations meant for + # particular architectural patterns, e.g., residual topologies. + model = model.transform(Streamline()) + # If configured, run a verification of the transformed model on some + # sample inputs if (VerificationStepType.STREAMLINED_PYTHON in cfg._resolve_verification_steps()): # noqa verify_step( - model, cfg, "streamlined_positional_python", need_parent=False + model, cfg, "streamlined_python", need_parent=False ) - - # Return the streamlined model + # Return the transformed model return model -# Function running the InferScaledDotProductAttention transformation -def step_convert_attention_to_hw(model: ModelWrapper, _): +# Converts scaled dot-product attention operations to FINN hardware operations +# Note: This includes some necessary cleanup after converting the pattern, in +# particular squeezing the data layouts throughout the graph +def step_convert_attention_to_hw(model: ModelWrapper, _: DataflowBuildConfig): # Try to infer reshaping of attention heads model = model.transform(InferMultiHeads()) # noqa: Duplicate # Try to mode the mult-head splitting past the multi thresholds @@ -374,6 +898,40 @@ def step_convert_attention_to_hw(model: ModelWrapper, _): model = model.transform(MoveMergeMultiHeadsPastMultiThreshold()) # If applicable, absorb the final thresholds into the attention operator model = model.transform(AbsorbMultiThresholdIntoScaledDotProductAttention()) + # Squeeze (i.e., remove dimensions of size 1) the data layouts 
throughout + # the graph to treat the time dimension as the batch dimension for all MVU + # and Threshold operators + model = model.transform(Squeeze()) + # Squeezing might have turned further transpose and reshape operations into + # identities (those which just swapped around the dimensions of size 1) + model = model.transform(ComposedTransformation([ + # Move transposes around to some place where they could be removed + # later, i.e., where they collapse into identities + MoveTransposePastFork(), + MoveTransposePastSplit(), + MoveTransposePastJoinConcat(), + MoveTransposePastEltwise(), + MoveTransposePastJoinMul(), + MoveTransposePastJoinAdd(), + CollapseRepeatedTranspose(), + # Remove identity shape/layout transformations + RemoveIdentityTranspose(), + RemoveIdentityReshape(), + # Squeeze operators can be moved past MatMuls and thresholding + MoveSqueezePastMatMul(), + MoveSqueezePastMultiThreshold(), + ])) + # Squeezing might enable absorbing adds into thresholds once again + model = model.transform(AbsorbAddIntoMultiThreshold()) + # If applicable, absorb the final thresholds into the attention operator + # Note: Might be applicable again after squeezing a transpose away + model = model.transform(AbsorbMultiThresholdIntoScaledDotProductAttention()) + # We should do another round of streamlining to be sure and support more + # general architectural patterns, we are not aware of yet... 
+ model = model.transform(Streamline()) + # Convert Squeeze and Unsqueeze operators to hardware operations + model = model.transform(InferSqueeze()) + model = model.transform(InferUnsqueeze()) # Return the model with attention and multi-heads mapped to hardware # operators return model @@ -389,6 +947,11 @@ def step_convert_elementwise_binary_to_hw(model: ModelWrapper, _): )) +# Converts Split and Concat operations to hardware custom operators +def step_convert_split_concat_to_hw(model: ModelWrapper, _): + return model.transform(InferSplitLayer()).transform(InferConcatLayer()) + + # Function running the transformations to convert Gather, i.e., index lookup, # nodes to their hardware implementations def step_convert_lookup_to_hw(model: ModelWrapper, _): @@ -407,43 +970,18 @@ def step_convert_lookup_to_hw(model: ModelWrapper, _): return model.transform(InferLookupLayer()) +# Converts depth-wise convolution to hardware operator calling the +# InferVectorVectorActivation transformation +def step_convert_depth_wise_to_hw(model: ModelWrapper, _: DataflowBuildConfig): + return model.transform(InferVectorVectorActivation()) + + # Function running the InferReplicateStream transformation def step_replicate_streams(model: ModelWrapper, _): # Properly replicate the stream feeding the query, key and value projections return model.transform(InferReplicateStream()) -# Post-processing tidy-up squeezing dimensions and identity operators left over -# from mapping the attention operators -def step_tidy_up_post_attention(model: ModelWrapper, _): - # Remove dimensions of size 1 (single batch tensors) - model = model.transform(Squeeze()) - model = model.transform(RemoveIdentityTranspose()) - - # Squeezing might enable absorbing adds into thresholds once again - model = model.transform(AbsorbAddIntoMultiThreshold()) - # If applicable, absorb the final thresholds into the attention operator - # Note: Might be applicable again after squeezing a transpose away - model = 
model.transform(AbsorbMultiThresholdIntoScaledDotProductAttention()) - - # Squeezing might enable some more streamlining transformations once again - model = model.transform(ComposedTransformation([ - # Streamline the residual connections by moving scale factors past - # elementwise add nodes - MoveLinearPastEltwiseAdd(), - MoveLinearPastFork(), - MoveScalarLinearPastInvariants(), - # Do the normal streamlining flow once again - Streamline(), - ])) - - # Clean up the names for debugging - model = model.transform(GiveUniqueNodeNames()) - model = model.transform(GiveReadableTensorNames()) - # Return the tidied up model - return model - - # Custom step for setting the parallelism to meet the target of T^2 cycles per # sequence def set_target_parallelization(seq_len: int, @@ -736,49 +1274,6 @@ def step_apply_folding_config(model: ModelWrapper, cfg: DataflowBuildConfig): return model -# Runs a node-by-node Python simulation of the model saving the fill execution -# context -# Note: Assumes no execution mode to be set -def node_by_node_python(model: ModelWrapper, cfg: DataflowBuildConfig): - # Save the original model - original = model - # Copy the model - model = copy.deepcopy(model) - - # Load the verification input/output pair - inp = np.load(cfg.verify_input_npy) # noqa - out = np.load(cfg.verify_expected_output_npy) - - # Path to the parent model wrapping the streaming dataflow partition and the - # wrapped child model, i.e., the inside of the streaming dataflow partition - parent = f"{cfg.output_dir}/intermediate_models/dataflow_parent.onnx" - child = f"{cfg.output_dir}/intermediate_models/verify_cppsim.onnx" - # Save the child model prepared for python simulation - model.save(child) - # Load the parent model to pass to verification execution - parent_model = ModelWrapper(parent) - - # Reshape the input/output to match the model - inp = inp.reshape(parent_model.get_tensor_shape(model.graph.input[0].name)) - out = 
out.reshape(parent_model.get_tensor_shape(model.graph.output[0].name)) - - # Execute the onnx model to collect the result - # context = execute_onnx(model, context, return_full_exec_context=True) - context = execute_parent(parent, child, inp, return_full_ctx=True) - # Extract the output tensor from the execution context - model_out = context[parent_model.graph.output[0].name] - # Compare input to output - result = {True: "SUCCESS", False: "FAIL"}[ - np.allclose(out, model_out, atol=1e-3) - ] - # Save the verification outputs into the configured build directory - verification_output = f"{cfg.output_dir}/verification_output/" - # Save the verification execution context - np.savez(f"{verification_output}/verify_python_{result}.npz", **context) - # Return the original, unmodified model - return original - - # Runs a node-by-node C++ simulation of the model saving the fill execution # context def node_by_node_cppsim(model: ModelWrapper, cfg: DataflowBuildConfig): @@ -816,9 +1311,7 @@ def node_by_node_cppsim(model: ModelWrapper, cfg: DataflowBuildConfig): # Extract the output tensor from the execution context model_out = context[parent_model.graph.output[0].name] # Compare input to output - result = {True: "SUCCESS", False: "FAIL"}[ - np.allclose(out, model_out, atol=1e-3) - ] + result = {True: "SUCCESS", False: "FAIL"}[np.allclose(out, model_out)] # Save the verification outputs into the configured build directory verification_output = f"{cfg.output_dir}/verification_output/" # Save the verification execution context @@ -867,9 +1360,7 @@ def node_by_node_rtlsim(model: ModelWrapper, cfg: DataflowBuildConfig): # Extract the output tensor from the execution context model_out = context[parent_model.graph.output[0].name] # Compare input to output - result = {True: "SUCCESS", False: "FAIL"}[ - np.allclose(out, model_out, atol=1e-3) - ] + result = {True: "SUCCESS", False: "FAIL"}[np.allclose(out, model_out)] # Save the verification outputs into the configured build directory 
verification_output = f"{cfg.output_dir}/verification_output/" # Save the verification execution context diff --git a/benchmarking/dut/transformer_gpt.py b/benchmarking/dut/transformer_gpt.py deleted file mode 100644 index 5ee77483ab..0000000000 --- a/benchmarking/dut/transformer_gpt.py +++ /dev/null @@ -1,348 +0,0 @@ -# Adapted from Christoph's attention-dummy repository - -# PyTorch base package: Math and Tensor Stuff -import torch -# Brevitas wrapper around PyTorch tensors adding quantization information -from brevitas.quant_tensor import QuantTensor -# Brevitas: Quantized versions of PyTorch layers -from brevitas.nn import ( - QuantMultiheadAttention, - QuantEltwiseAdd, - QuantIdentity, - QuantLinear, - QuantReLU -) -from qonnx.core.modelwrapper import ModelWrapper -# Progressbar -from tqdm import trange -import numpy as np -from brevitas.export import export_qonnx -import random -import json -import subprocess -# FINN dataflow builder -import finn.builder.build_dataflow as build -import finn.builder.build_dataflow_config as build_cfg -from finn.builder.build_dataflow_config import AutoFIFOSizingMethod -from bench_base import bench, step_synth_harness -import os -from util import summarize_table, summarize_section, power_xml_to_dict, prepare_inputs, delete_dir_contents - -# Custom build steps required to streamline and convert the attention operator -from dut.transformer_custom_steps import ( - step_tidy_up_pre_attention, - step_tidy_up_post_attention, - step_streamline_attention, - step_streamline_residual, - step_streamline_norms, - step_streamline_positional, - step_convert_attention_to_hw, - step_convert_elementwise_binary_to_hw, - step_convert_lookup_to_hw, - step_replicate_streams, - set_target_parallelization, - set_fifo_depths, - step_apply_folding_config, - node_by_node_rtlsim, # noqa: Maybe unused, only for debugging - node_by_node_python, # noqa: Maybe unused, only for debugging - node_by_node_cppsim -) -from performance.platform_build_steps import( 
- test_step_gen_vitis_xo, - test_step_gen_instrumentation_wrapper, - test_step_gen_instrwrap_sim, - test_step_insert_tlastmarker, - test_step_export_xo, - test_step_build_platform, - test_step_run_instrwrap_sim -) - -### ADAPTED FROM utils.py -# Seeds all relevant random number generators to the same seed for -# reproducibility -def seed(s): - random.seed(s) - np.random.seed(s) - torch.manual_seed(s) - -template_folding_yaml = """ -# Per operator type default configurations -defaults: - # Scaled dot-product attention head implemented via HLS - ScaledDotProductAttention_hls: - # Type of memory to be used for internal buffer storage - # Options: auto, block, distributed, ultra - ram_style: block - # Type of memory to be used for threshold storage - # Options: auto, block, distributed - ram_style_thresholds: block - # Type of memory to be used fo the attention mask (if present) - # Options: auto, block, distributed - ram_style_mask: block - # Resource type to be used for implementing multiplications/MACs - # Options: auto, lut or dsp - mac_resource: lut - # Addition of two inputs (constants or streamed) implemented via HLS - ElementwiseAdd_hls: - # Type of memory to be used for internal buffer storage and/or constant - # parameter tensors - # Options: auto, block, distributed, ultra - ram_style: distributed - # Matrix vector activation unit implemented via HLS - MVAU_hls: - # Resource type to be used for implementing multiplications/MACs - # Options: auto, lut or dsp - resType: dsp - # Memory mode for weight storage - # Options: internal_embedded, internal_decoupled, external - mem_mode: internal_decoupled - # Type of memory to be used for weight storage if "internal_decoupled" - # Options: auto, block, distributed, ultra - ram_style: block - # Type of memory to be used for threshold storage - # Options: auto, block, distributed - ram_style_thresholds: block - # Makes weights writeable through AXI-lite interface at runtime - runtime_writeable_weights: 0 - # Matrix 
vector activation unit implemented via RTL - MVAU_rtl: - # Resource type to be used for implementing multiplications/MACs - # Options: auto, lut or dsp - # Note: RTL MVAU currently does not support LUT-based implementation - resType: dsp - # Memory mode for weight storage - # Options: internal_embedded, internal_decoupled, external - mem_mode: internal_decoupled - # Type of memory to be used for weight storage if "internal_decoupled" - # Options: auto, block, distributed, ultra - ram_style: block - # Makes weights writeable through AXI-lite interface at runtime - runtime_writeable_weights: 0 - # Multi-thresholds implemented via HLS (applies to standalone thresholds) - Thresholding_hls: - # Memory mode for threshold storage - # Options: internal_embedded, internal_decoupled - mem_mode: internal_decoupled - # Type of memory to be used for threshold storage if "internal_decoupled" - # Options: distributed, block - ram_style: distributed - # Makes thresholds writeable through AXI-lite interface at runtime - runtime_writeable_weights: 0 - # Multi-thresholds implemented via RTL (applies to standalone thresholds) - Thresholding_rtl: - # Decides to use BRAM, URAM or LUTs for threshold memory, depending on the - # depth of the thresholds - # Note: This combination forces "distributed" LUT implementation - depth_trigger_uram: 2147483647 # "infinity" - depth_trigger_bram: 2147483647 # "infinity" - # # Note: This combination forces "block" RAM implementation - # depth_trigger_uram: 0 - # depth_trigger_bram: 1 - # # Note: This combination forces "ultra" RAM implementation - # depth_trigger_uram: 1 - # depth_trigger_bram: 0 - # # Note: This combination is equivalent to "auto" - # depth_trigger_uram: 0 - # depth_trigger_bram: 0 - # Makes thresholds writeable through AXI-lite interface at runtime - runtime_writeable_weights: 0 - # FIFO implemented via RTL (there is no HLS FIFO implementation in FINN) - StreamingFIFO_rtl: - # RTL vs. 
IPI implementation of FIFOs - # Options: rtl, vivado - impl_style: rtl - # Resource type for FIFOs when impl_style is vivado - # Options: auto, block, distributed, ultra - ram_style: distributed - # Individual, named node-specific configurations here - # ... -""" - -class bench_transformer_gpt(bench): - def step_build(self): - #with open("params.yaml") as file: - # params = yaml.safe_load(file) - # Seed all RNGs - seed(self.params["seed"]) - - # Extract sequence length and embedding dimension from the output of the - # first quantizer in the model - # Note: Embedding and Sequence dimension flip later - model = ModelWrapper(self.build_inputs["onnx_path"]) - _, emb_dim, seq_len = model.get_tensor_shape( - "/emb_add/input_quant/export_handler/Quant_output_0" - ) - - # Prepare config files - # TODO: make configurable - # TODO: log intermediate files such as inp.npy, folding.yaml, or specialize_layers.jon as artifacts, maybe create in unique temp dirs - specialize_layers_dict = { - "Defaults": { - "preferred_impl_style": ["rtl", ["MVAU", "Thresholding"]] - }, - "": { - "preferred_impl_style": "" - } - } - with open("specialize_layers.json", "w") as f: - json.dump(specialize_layers_dict, f, indent=2) - with open("folding.yaml", "w") as f: - f.write(template_folding_yaml) - - #TODO: make configurable instead of hardcoding exception - self.board = "U280" - self.part = "xcu280-fsvh2892-2L-e" - - # Create a configuration for building the scaled dot-product attention - # operator to a hardware accelerator - cfg = build_cfg.DataflowBuildConfig( - # Unpack the build configuration parameters - #**params["build"], - output_dir = self.build_inputs["build_dir"], - stitched_ip_gen_dcp = True, - synth_clk_period_ns = self.clock_period_ns, - board = self.board, - shell_flow_type = "vitis_alveo", #TODO: proper Alveo support instead of hardcoding - folding_config_file = "folding.yaml", - specialize_layers_config_file = "specialize_layers.json", - standalone_thresholds = True, - 
max_multithreshold_bit_width = 16, - mvau_wwidth_max = 2048, - split_large_fifos = True, - - verbose=False, # if True prints stdout and stderr to console instead of build_dataflow.log - - generate_outputs=[ - build_cfg.DataflowOutputType.ESTIMATE_REPORTS, - build_cfg.DataflowOutputType.STITCHED_IP, # required for HarnessBuild, OOC_SYNTH, and RTLSIM - #build_cfg.DataflowOutputType.PYNQ_DRIVER, #TODO: currently broken (assert i_consumer.op_type == "StreamingDataflowPartition"), might be useful for functional verification on hw later - #build_cfg.DataflowOutputType.OOC_SYNTH, # requires stitched-ip, not needed because ZynqBuild/HarnessBuild is performed - #build_cfg.DataflowOutputType.BITFILE, # does not require stitched-ip, not needed because HarnessBuild is performed - #build_cfg.DataflowOutputType.RTLSIM_PERFORMANCE, # not possible due to float components - #build_cfg.DataflowOutputType.DEPLOYMENT_PACKAGE # not needed, just a copy operation - ], - - verify_steps=[ - # Verify the model after converting to the FINN onnx dialect - build_cfg.VerificationStepType.QONNX_TO_FINN_PYTHON, - # Verify the model again using python mode after the default - # streamlining step - build_cfg.VerificationStepType.STREAMLINED_PYTHON, - # Verify the model again after tidy up transformations, right before - # converting to HLS - build_cfg.VerificationStepType.TIDY_UP_PYTHON, - # Verify the model after generating C++ HLS and applying folding - build_cfg.VerificationStepType.FOLDED_HLS_CPPSIM, - ], - # File with test inputs for verification - verify_input_npy=self.build_inputs["input_npy_path"], - # File with expected test outputs for verification - verify_expected_output_npy=self.build_inputs["output_npy_path"], - # Save the intermediate model graphs - save_intermediate_models=True, - # Avoid RTL simulation for setting the FIFO sizes - auto_fifo_strategy=AutoFIFOSizingMethod.CHARACTERIZE, - # Do not automatically set FIFO sizes as this requires RTL simulation - # not implemented for the 
attention operator - auto_fifo_depths=False, - # Build steps to execute - steps=[ - # Need to apply some tidy-up transformations before converting to - # the finn dialect of onnx - step_tidy_up_pre_attention, - # Convert all QONNX Quant nodes to Multithreshold nodes - "step_qonnx_to_finn", - # Tidy up the graph after converting from QONNX to FINN format - # Note: Triggers a verification step - "step_tidy_up", - # Positional encoding needs to be streamlined first with slightly - # different order of certain streamlining transformations to avoid - # weird rounding issue of intermediate results - step_streamline_positional, - # Custom streamlining for models containing attention operators - step_streamline_attention, - # Streamlining of the residual branches - step_streamline_residual, - # Streamline the normalization layers, i.e., transposed batch norm - step_streamline_norms, - # Another round using the default streamlining steps - # Note: Triggers a verification step - "step_streamline", - # New conversion of the scaled dot-product attention pattern - step_convert_attention_to_hw, - # Another tidy-up step to remove unnecessary dimensions and - # operations after converting the attention operators to HLS - step_tidy_up_post_attention, - # Convert the elementwise binary operations to hardware operators. 
- # These include for example adding residual branches and positional - # encoding - step_convert_elementwise_binary_to_hw, - # Convert the Gather layer realizing the input token embedding to - # the FINN hardware implementation, i.e., the Lookup layer - step_convert_lookup_to_hw, - # Properly replicate the stream feeding the query, key and value - # projections - step_replicate_streams, - # Convert most other layers supported by FINN to HW operators - "step_convert_to_hw", - # Specialize HW layer implementations as either HLS or RTL - "step_specialize_layers", - "step_create_dataflow_partition", - # Set the folding configuration to meet the cycles per sequence - # target - set_target_parallelization(seq_len, emb_dim), - # Apply folding configuration, specifying hardware implementation - # details - # Note: This triggers a verification step - step_apply_folding_config, - "step_minimize_bit_width", - # The ScaledDotProductAttention custom op does not define any - # estimates - "step_generate_estimate_reports", - "step_hw_codegen", - "step_hw_ipgen", - # Set the attention- and residual-related FIFO depths insert FIFOs - # and apply folding configuration once again - # Note: Implement all FIFOs with a depth at least as deep as the - # sequence length in URAM. 
- set_fifo_depths(seq_len, emb_dim, uram_threshold=seq_len), - # Run additional node-by-node verification in RTL simulation of the - # model before creating the stitched IP - # Note: end-to-end verification of the stitched IP in RTL simulation - # is still not possible due to missing float IPs - node_by_node_cppsim, - # Only for debugging for now, does not work if "vivado" style - # StreamingFIFOs are used - # node_by_node_rtlsim, - - test_step_insert_tlastmarker, # required for instrumentation_wrapper - - "step_create_stitched_ip", - - # "step_measure_rtlsim_performance", # not possible due to float components - - step_synth_harness, #TODO: replace with instr wrapper (or port it into this step) - - #"step_out_of_context_synthesis", # for synthesis results (e.g. utilization) - - # normal deployment TODO: replace with instr wrapper (or port it into this step as an option) - #"step_synthesize_bitfile", - #"step_make_pynq_driver", - #"step_deployment_package", - - #test_step_gen_vitis_xo, # preparation step for original instr wrapper integration - #test_step_gen_instrumentation_wrapper, # preparation step for original instr wrapper integration - - #test_step_gen_instrwrap_sim, # preparation step for simulation of original instr wrapper integration - #test_step_run_instrwrap_sim, # simulation with instr wrapper, disabled for now due to extreme runtime - - #test_step_export_xo, # preparation step for original instr wrapper integration - #test_step_build_platform # synthesis with instr wrapper - ] - ) - # Run the build process on the dummy attention operator graph - # TODO: maybe let this function return the cfg only, so it can be modified by bench context - build.build_dataflow_cfg(self.build_inputs["onnx_path"], cfg) - - def run(self): - self.steps_full_build_flow() diff --git a/benchmarking/dut/transformer_radioml.py b/benchmarking/dut/transformer_radioml.py deleted file mode 100644 index 4d77cb4b8d..0000000000 --- a/benchmarking/dut/transformer_radioml.py +++ 
/dev/null @@ -1,336 +0,0 @@ -# Adapted from Christoph's attention-dummy repository - -# PyTorch base package: Math and Tensor Stuff -import torch -# Brevitas wrapper around PyTorch tensors adding quantization information -from brevitas.quant_tensor import QuantTensor -# Brevitas: Quantized versions of PyTorch layers -from brevitas.nn import ( - QuantMultiheadAttention, - QuantEltwiseAdd, - QuantIdentity, - QuantLinear, - QuantReLU -) -# Progressbar -from tqdm import trange -import numpy as np -from brevitas.export import export_qonnx -import random -import json -import subprocess -# FINN dataflow builder -import finn.builder.build_dataflow as build -import finn.builder.build_dataflow_config as build_cfg -from finn.builder.build_dataflow_config import AutoFIFOSizingMethod -from bench_base import bench, step_synth_harness -import os -from util import summarize_table, summarize_section, power_xml_to_dict, prepare_inputs, delete_dir_contents - -# Custom build steps required to streamline and convert the attention operator -from dut.transformer_custom_steps import ( - step_tidy_up_pre_attention, - step_tidy_up_post_attention, - step_streamline_attention, - step_streamline_residual, - step_streamline_norms, - step_streamline_positional, - step_convert_attention_to_hw, - step_convert_elementwise_binary_to_hw, - step_convert_lookup_to_hw, - step_replicate_streams, - set_target_parallelization, - set_fifo_depths, - step_apply_folding_config, - node_by_node_rtlsim, - node_by_node_cppsim -) -from performance.platform_build_steps import( - test_step_gen_vitis_xo, - test_step_gen_instrumentation_wrapper, - test_step_gen_instrwrap_sim, - test_step_insert_tlastmarker, - test_step_export_xo, - test_step_build_platform, - test_step_run_instrwrap_sim -) - -### ADAPTED FROM utils.py -# Seeds all relevant random number generators to the same seed for -# reproducibility -def seed(s): - random.seed(s) - np.random.seed(s) - torch.manual_seed(s) - -template_folding_yaml = """ -# Per 
operator type default configurations -defaults: - # Scaled dot-product attention head implemented via HLS - ScaledDotProductAttention_hls: - # Type of memory to be used for internal buffer storage - # Options: auto, block, distributed, ultra - ram_style: block - # Type of memory to be used for threshold storage - # Options: auto, block, distributed - ram_style_thresholds: block - # Type of memory to be used fo the attention mask (if present) - # Options: auto, block, distributed - ram_style_mask: block - # Resource type to be used for implementing multiplications/MACs - # Options: auto, lut or dsp - mac_resource: lut - # Addition of two inputs (constants or streamed) implemented via HLS - ElementwiseAdd_hls: - # Type of memory to be used for internal buffer storage and/or constant - # parameter tensors - # Options: auto, block, distributed, ultra - ram_style: distributed - # Matrix vector activation unit implemented via HLS - MVAU_hls: - # Resource type to be used for implementing multiplications/MACs - # Options: auto, lut or dsp - resType: dsp - # Memory mode for weight storage - # Options: internal_embedded, internal_decoupled, external - mem_mode: internal_decoupled - # Type of memory to be used for weight storage if "internal_decoupled" - # Options: auto, block, distributed, ultra - ram_style: block - # Type of memory to be used for threshold storage - # Options: auto, block, distributed - ram_style_thresholds: block - # Makes weights writeable through AXI-lite interface at runtime - runtime_writeable_weights: 0 - # Matrix vector activation unit implemented via RTL - MVAU_rtl: - # Resource type to be used for implementing multiplications/MACs - # Options: auto, lut or dsp - # Note: RTL MVAU currently does not support LUT-based implementation - resType: dsp - # Memory mode for weight storage - # Options: internal_embedded, internal_decoupled, external - mem_mode: internal_decoupled - # Type of memory to be used for weight storage if "internal_decoupled" - # 
Options: auto, block, distributed, ultra - ram_style: block - # Makes weights writeable through AXI-lite interface at runtime - runtime_writeable_weights: 0 - # Multi-thresholds implemented via HLS (applies to standalone thresholds) - Thresholding_hls: - # Memory mode for threshold storage - # Options: internal_embedded, internal_decoupled - mem_mode: internal_decoupled - # Type of memory to be used for threshold storage if "internal_decoupled" - # Options: distributed, block - ram_style: distributed - # Makes thresholds writeable through AXI-lite interface at runtime - runtime_writeable_weights: 0 - # Multi-thresholds implemented via RTL (applies to standalone thresholds) - Thresholding_rtl: - # Decides to use BRAM, URAM or LUTs for threshold memory, depending on the - # depth of the thresholds - # Note: This combination forces "distributed" LUT implementation - depth_trigger_uram: 2147483647 # "infinity" - depth_trigger_bram: 2147483647 # "infinity" - # # Note: This combination forces "block" RAM implementation - # depth_trigger_uram: 0 - # depth_trigger_bram: 1 - # # Note: This combination forces "ultra" RAM implementation - # depth_trigger_uram: 1 - # depth_trigger_bram: 0 - # # Note: This combination is equivalent to "auto" - # depth_trigger_uram: 0 - # depth_trigger_bram: 0 - # Makes thresholds writeable through AXI-lite interface at runtime - runtime_writeable_weights: 0 - # FIFO implemented via RTL (there is no HLS FIFO implementation in FINN) - StreamingFIFO_rtl: - # RTL vs. IPI implementation of FIFOs - # Options: rtl, vivado - impl_style: rtl - # Resource type for FIFOs when impl_style is vivado - # Options: auto, block, distributed, ultra - ram_style: distributed - # Individual, named node-specific configurations here - # ... 
-""" - -class bench_transformer_radioml(bench): - def step_build(self): - #with open("params.yaml") as file: - # params = yaml.safe_load(file) - # Seed all RNGs - seed(self.params["seed"]) - # Extract sequence length and embedding dimension from parameters - _, seq_len, emb_dim = np.load(self.build_inputs["input_npy_path"]).shape - - # Prepare config files - # TODO: make configurable - # TODO: log intermediate files such as inp.npy, folding.yaml, or specialize_layers.jon as artifacts, maybe create in unique temp dirs - specialize_layers_dict = { - "Defaults": { - "preferred_impl_style": ["rtl", ["MVAU", "Thresholding"]] - }, - "": { - "preferred_impl_style": "" - } - } - with open("specialize_layers.json", "w") as f: - json.dump(specialize_layers_dict, f, indent=2) - with open("folding.yaml", "w") as f: - f.write(template_folding_yaml) - - # Create a configuration for building the scaled dot-product attention - # operator to a hardware accelerator - cfg = build_cfg.DataflowBuildConfig( - # Unpack the build configuration parameters - #**params["build"], - output_dir = self.build_inputs["build_dir"], - stitched_ip_gen_dcp = True, - synth_clk_period_ns = self.clock_period_ns, - board = self.board, - shell_flow_type = "vivado_zynq", #TODO: Alveo support - folding_config_file = "folding.yaml", - specialize_layers_config_file = "specialize_layers.json", - standalone_thresholds = True, - max_multithreshold_bit_width = 16, - mvau_wwidth_max = 2048, - split_large_fifos = True, - - verbose=False, # if True prints stdout and stderr to console instead of build_dataflow.log - - generate_outputs=[ - build_cfg.DataflowOutputType.ESTIMATE_REPORTS, - build_cfg.DataflowOutputType.STITCHED_IP, # required for HarnessBuild, OOC_SYNTH, and RTLSIM - #build_cfg.DataflowOutputType.PYNQ_DRIVER, #TODO: currently broken (assert i_consumer.op_type == "StreamingDataflowPartition"), might be useful for functional verification on hw later - #build_cfg.DataflowOutputType.OOC_SYNTH, # requires 
stitched-ip, not needed because ZynqBuild/HarnessBuild is performed - #build_cfg.DataflowOutputType.BITFILE, # does not require stitched-ip, not needed because HarnessBuild is performed - #build_cfg.DataflowOutputType.RTLSIM_PERFORMANCE, # not possible due to float components - #build_cfg.DataflowOutputType.DEPLOYMENT_PACKAGE # not needed, just a copy operation - ], - - verify_steps=[ - # Verify the model after converting to the FINN onnx dialect - build_cfg.VerificationStepType.QONNX_TO_FINN_PYTHON, - # Verify the model again using python mode after the default - # streamlining step - build_cfg.VerificationStepType.STREAMLINED_PYTHON, - # Verify the model again after tidy up transformations, right before - # converting to HLS - build_cfg.VerificationStepType.TIDY_UP_PYTHON, - # Verify the model after generating C++ HLS and applying folding - build_cfg.VerificationStepType.FOLDED_HLS_CPPSIM, - ], - # File with test inputs for verification - verify_input_npy=self.build_inputs["input_npy_path"], - # File with expected test outputs for verification - verify_expected_output_npy=self.build_inputs["output_npy_path"], - # Save the intermediate model graphs - save_intermediate_models=True, - # Avoid RTL simulation for setting the FIFO sizes - auto_fifo_strategy=AutoFIFOSizingMethod.CHARACTERIZE, - # Do not automatically set FIFO sizes as this requires RTL simulation - # not implemented for the attention operator - auto_fifo_depths=False, - # Build steps to execute - steps=[ - # Need to apply some tidy-up transformations before converting to - # the finn dialect of onnx - step_tidy_up_pre_attention, - # Convert all QONNX Quant nodes to Multithreshold nodes - "step_qonnx_to_finn", - # Tidy up the graph after converting from QONNX to FINN format - # Note: Triggers a verification step - "step_tidy_up", - # Positional encoding needs to be streamlined first with slightly - # different order of certain streamlining transformations to avoid - # weird rounding issue of intermediate 
results - step_streamline_positional, - # Custom streamlining for models containing attention operators - step_streamline_attention, - # Streamlining of the residual branches - step_streamline_residual, - # Streamline the normalization layers, i.e., transposed batch norm - step_streamline_norms, - # Another round using the default streamlining steps - # Note: Triggers a verification step - "step_streamline", - # New conversion of the scaled dot-product attention pattern - step_convert_attention_to_hw, - # Another tidy-up step to remove unnecessary dimensions and - # operations after converting the attention operators to HLS - step_tidy_up_post_attention, - # Convert the elementwise binary operations to hardware operators. - # These include for example adding residual branches and positional - # encoding - step_convert_elementwise_binary_to_hw, - # Convert the Gather layer realizing the input token embedding to - # the FINN hardware implementation, i.e., the Lookup layer - step_convert_lookup_to_hw, - # Properly replicate the stream feeding the query, key and value - # projections - step_replicate_streams, - # Convert most other layers supported by FINN to HW operators - "step_convert_to_hw", - # Specialize HW layer implementations as either HLS or RTL - "step_specialize_layers", - "step_create_dataflow_partition", - # Set the folding configuration to meet the cycles per sequence - # target - set_target_parallelization(seq_len, emb_dim), - # Apply folding configuration, specifying hardware implementation - # details - # Note: This triggers a verification step - step_apply_folding_config, - "step_minimize_bit_width", - # The ScaledDotProductAttention custom op does not define any - # estimates - "step_generate_estimate_reports", - "step_hw_codegen", - "step_hw_ipgen", - # Set the attention- and residual-related FIFO depths insert FIFOs - # and apply folding configuration once again - # Note: Implement all FIFOs with a depth at least as deep as the - # sequence length 
in URAM. - set_fifo_depths(seq_len, emb_dim, uram_threshold=seq_len), - # Run additional node-by-node verification in RTL simulation of the - # model before creating the stitched IP - # Note: end-to-end verification of the stitched IP in RTL simulation - # is still not possible due to missing float IPs - node_by_node_cppsim, - # Only for debugging for now, does not work if "vivado" style - # StreamingFIFOs are used - # node_by_node_rtlsim, - - test_step_insert_tlastmarker, # required for instrumentation_wrapper - - "step_create_stitched_ip", - - # "step_measure_rtlsim_performance", # not possible due to float components - - step_synth_harness, #TODO: replace with instr wrapper (or port it into this step) - - #"step_out_of_context_synthesis", # for synthesis results (e.g. utilization) - - # normal deployment TODO: replace with instr wrapper (or port it into this step as an option) - #"step_synthesize_bitfile", - #"step_make_pynq_driver", - #"step_deployment_package", - - #test_step_gen_vitis_xo, # preparation step for original instr wrapper integration - #test_step_gen_instrumentation_wrapper, # preparation step for original instr wrapper integration - - #test_step_gen_instrwrap_sim, # preparation step for simulation of original instr wrapper integration - #test_step_run_instrwrap_sim, # simulation with instr wrapper, disabled for now due to extreme runtime - - #test_step_export_xo, # preparation step for original instr wrapper integration - #test_step_build_platform # synthesis with instr wrapper - ] - ) - # Run the build process on the dummy attention operator graph - # TODO: maybe let this function return the cfg only, so it can be modified by bench context - build.build_dataflow_cfg(self.build_inputs["onnx_path"], cfg) - - def run(self): - self.steps_full_build_flow() From 47cb5ac3eb387b3ab80b2b0cbb1ac40271ef5806 Mon Sep 17 00:00:00 2001 From: Felix Jentzsch Date: Wed, 29 Jan 2025 14:53:17 +0000 Subject: [PATCH 003/125] Fix imports --- 
benchmarking/dut/transformer_custom_steps.py | 679 +++++++++---------- 1 file changed, 328 insertions(+), 351 deletions(-) diff --git a/benchmarking/dut/transformer_custom_steps.py b/benchmarking/dut/transformer_custom_steps.py index e122f79a0d..2dc387a94a 100644 --- a/benchmarking/dut/transformer_custom_steps.py +++ b/benchmarking/dut/transformer_custom_steps.py @@ -4,226 +4,207 @@ # custom/composed_transformation.py # custom/streamline.py -# Python warning messages -import warnings -# Copies of python objects -from copy import deepcopy # Copies (deep-copies) python objects import copy + # Numpy for loading and comparing the verification input/output import numpy as np + +# Python warning messages +import warnings + # YAML for loading experiment configurations import yaml +# Copies of python objects +from copy import deepcopy + +# QONNX quantization data types +from qonnx.core.datatype import DataType + # QONNX wrapper of ONNX model graphs from qonnx.core.modelwrapper import ModelWrapper -# Range information structure for seeding the range analysis for converting -# quantized activations to MultiThreshold -from qonnx.util.range_analysis import RangeInfo + +# Converts ONNX graph nodes to QONNX custom-ops if possible +from qonnx.custom_op.registry import getCustomOp + +# Converts BatchNorm operation to affine transformation +from qonnx.transformation.batchnorm_to_affine import BatchNormToAffine + +# If we have a convolution with a bias tensors input, QONNX and later FINN +# expect the bias to be expressed as a standalone Add node following the Conv +# node. +from qonnx.transformation.extract_conv_bias import ExtractBiasFromConv + +# Collapses chains of constants into a single constant operation or even +# initializer tensors. 
+from qonnx.transformation.fold_constants import FoldConstants + +# Converts Gemm operation to MatMul with extracted standalone bias op +from qonnx.transformation.gemm_to_matmul import GemmToMatMul # QONNX graph transformations for renaming and cleaning up from qonnx.transformation.general import ( - Transformation, - GiveUniqueNodeNames, + ConvertDivToMul, + ConvertSubToAdd, GiveReadableTensorNames, + GiveUniqueNodeNames, GiveUniqueParameterTensors, RemoveStaticGraphInputs, RemoveUnusedTensors, + Transformation, ) + # QONNX graph transformations for annotating the graph with datatype and shape # information from qonnx.transformation.infer_datatypes import InferDataTypes from qonnx.transformation.infer_shapes import InferShapes -# If we have a convolution with a bias tensors input, QONNX and later FINN -# expect the bias to be expressed as a standalone Add node following the Conv -# node. -from qonnx.transformation.extract_conv_bias import ExtractBiasFromConv -# Converts BatchNorm operation to affine transformation -from qonnx.transformation.batchnorm_to_affine import BatchNormToAffine -# Converts Gemm operation to MatMul with extracted standalone bias op -from qonnx.transformation.gemm_to_matmul import GemmToMatMul # Converts Conv to Im2Col and MatMul with extracted standalone bias op from qonnx.transformation.lower_convs_to_matmul import LowerConvsToMatMul + # Transposes the initializer tensors of a Quant node instead of having a # standalone Transpose following -from qonnx.transformation.quant_constant_folding import ( - FoldTransposeIntoQuantInit -) -# Collapses chains of constants into a single constant operation or even -# initializer tensors. 
-from qonnx.transformation.fold_constants import FoldConstants -# Folds quantizers into weight tensor initializers, needed for lowering -# convolutions to MatMuls -from finn.transformation.qonnx.fold_quant_weights import FoldQuantWeights -# FINN streamlining transformations reordering the graph -from finn.transformation.streamline.reorder import ( - MoveTransposePastFork, - MoveTransposePastEltwise, - MoveTransposePastJoinMul, - MoveTransposePastJoinAdd, - MoveTransposePastSplit, - MoveTransposePastJoinConcat, - MoveSqueezePastMultiThreshold, - MoveSqueezePastMatMul -) -# FINN streamlining transformations absorbing tensors/nodes into others -from finn.transformation.streamline.absorb import ( - AbsorbAddIntoMultiThreshold, - AbsorbSignBiasIntoMultiThreshold, -) -# FINN streamlining transformations fusing/collapsing operations of the same -# kind -from finn.transformation.streamline.collapse_repeated import ( - CollapseRepeatedTranspose -) -# FINN streamlining transformations removing nodes without real effect from the -# graph -from finn.transformation.streamline.remove import ( - RemoveIdentityTranspose, - RemoveIdentityReshape, - RemoveIdentityOps -) -# Cleanup transformation getting rid of 3d data layout -from finn.transformation.squeeze import Squeeze +from qonnx.transformation.quant_constant_folding import FoldTransposeIntoQuantInit +from qonnx.transformation.remove import RemoveIdentityOps + +# Range information structure for seeding the range analysis for converting +# quantized activations to MultiThreshold +from qonnx.util.range_analysis import RangeInfo + +# FINN dataflow builder configuration +from finn.builder.build_dataflow_config import DataflowBuildConfig, VerificationStepType + +# FINN verification after build/graph transformation steps +from finn.builder.build_dataflow_steps import verify_step + # Detects the attention pattern and converts to hardware custom op from finn.transformation.fpgadataflow.attention import ( + 
AbsorbMultiThresholdIntoScaledDotProductAttention, InferScaledDotProductAttention, - AbsorbMultiThresholdIntoScaledDotProductAttention ) + # Mult-Head Attention support from finn.transformation.fpgadataflow.attention_heads import ( InferMultiHeads, - UnrollMultiHeadAttention, + MoveMergeMultiHeadsPastMultiThreshold, MoveSplitMultiHeadsPastMultiThreshold, - MoveMergeMultiHeadsPastMultiThreshold + UnrollMultiHeadAttention, ) +from finn.transformation.fpgadataflow.compile_cppsim import CompileCppSim + # Converts (infers) ONNX and QONNX nodes to FINN hardware CustomOps from finn.transformation.fpgadataflow.convert_to_hw_layers import ( - InferSqueeze, - InferUnsqueeze, - InferElementwiseBinaryOperation, - InferSplitLayer, InferConcatLayer, + InferElementwiseBinaryOperation, InferLookupLayer, - InferVectorVectorActivation -) -# Converts fork-nodes to ReplicateStream hardware operator -from finn.transformation.fpgadataflow.replicate_stream import ( - InferReplicateStream -) -# Standard QONNX to FINN conversion function -from finn.transformation.qonnx.convert_qonnx_to_finn import ConvertQONNXtoFINN -from finn.transformation.qonnx.quant_act_to_multithreshold import ( - default_filter_function_generator, + InferSplitLayer, + InferSqueeze, + InferUnsqueeze, + InferVectorVectorActivation, ) -# QONNX quantization data types -from qonnx.core.datatype import DataType -# Converts ONNX graph nodes to QONNX custom-ops if possible -from qonnx.custom_op.registry import getCustomOp +from finn.transformation.fpgadataflow.hlssynth_ip import HLSSynthIP + # Inserts data-width converter and FIFO nodes into the model graph from finn.transformation.fpgadataflow.insert_dwc import InsertDWC from finn.transformation.fpgadataflow.insert_fifo import InsertFIFO + +# Transformations preparing the operators for synthesis and simulation +from finn.transformation.fpgadataflow.prepare_cppsim import PrepareCppSim +from finn.transformation.fpgadataflow.prepare_ip import PrepareIP +from 
finn.transformation.fpgadataflow.prepare_rtlsim import PrepareRTLSim + +# Converts fork-nodes to ReplicateStream hardware operator +from finn.transformation.fpgadataflow.replicate_stream import InferReplicateStream +from finn.transformation.fpgadataflow.set_exec_mode import SetExecMode + # Splitting and removing of FIFOs from the model graph from finn.transformation.fpgadataflow.set_fifo_depths import ( RemoveShallowFIFOs, SplitLargeFIFOs, ) -# Specializes each layer's implementation style: HLS or RTL implementation -from finn.transformation.fpgadataflow.specialize_layers import SpecializeLayers -# FINN dataflow builder configuration -from finn.builder.build_dataflow_config import ( - VerificationStepType, DataflowBuildConfig -) + # Graph transformation setting the folding, i.e., parallelization configuration from finn.transformation.fpgadataflow.set_folding import SetFolding -# FINN verification after build/graph transformation steps -from finn.builder.build_dataflow_steps import verify_step -# Transformations preparing the operators for synthesis and simulation -from finn.transformation.fpgadataflow.prepare_cppsim import PrepareCppSim -from finn.transformation.fpgadataflow.compile_cppsim import CompileCppSim -from finn.transformation.fpgadataflow.hlssynth_ip import HLSSynthIP -from finn.transformation.fpgadataflow.prepare_ip import PrepareIP -from finn.transformation.fpgadataflow.set_exec_mode import SetExecMode -from finn.transformation.fpgadataflow.prepare_rtlsim import PrepareRTLSim +# Specializes each layer's implementation style: HLS or RTL implementation +from finn.transformation.fpgadataflow.specialize_layers import SpecializeLayers -# Execute onnx model graphs from the dataflow parent for verification -from finn.util.test import execute_parent +# Standard QONNX to FINN conversion function +from finn.transformation.qonnx.convert_qonnx_to_finn import ConvertQONNXtoFINN -# Base class for all QONNX graph transformations and some basic cleanup -# 
transformations -from qonnx.transformation.general import ( - Transformation, - ConvertDivToMul, - ConvertSubToAdd, +# Folds quantizers into weight tensor initializers, needed for lowering +# convolutions to MatMuls +from finn.transformation.qonnx.fold_quant_weights import FoldQuantWeights +from finn.transformation.qonnx.quant_act_to_multithreshold import ( + default_filter_function_generator, ) -# QONNX graph transformations for annotating the graph with datatype and shape -# information -from qonnx.transformation.infer_datatypes import InferDataTypes -from qonnx.transformation.infer_shapes import InferShapes -# Converts BatchNorm operation to affine transformation -from qonnx.transformation.batchnorm_to_affine import BatchNormToAffine - -# Groups node inputs by dynamic vs. initializer category -from finn.transformation.streamline.absorb import group_inputs_by_category +# Cleanup transformation getting rid of 3d data layout +from finn.transformation.squeeze import Squeeze # FINN streamlining transformations converting and rounding values -from finn.transformation.streamline import ( - ConvertSignToThres, - RoundAndClipThresholds -) -# FINN streamlining transformations reordering the graph -from finn.transformation.streamline.reorder import ( - MoveMulPastFork, - MoveTransposePastFork, - MoveLinearPastEltwiseAdd, - MoveScalarLinearPastInvariants, - MoveTransposePastEltwise, - MoveMulPastMaxPool, - MoveAddPastMul, - MoveScalarAddPastMatMul, - MoveAddPastConv, - MoveScalarMulPastMatMul, - MoveScalarMulPastConv, - MoveTransposePastJoinMul, - MoveTransposePastJoinAdd, - MoveMulPastJoinAdd, - MoveAddPastJoinAdd, - MoveScalarLinearPastSplit, - MoveAffinePastJoinConcat, - MoveMulPastJoinConcat, - MoveAddPastJoinConcat, - MoveTransposePastSplit, - MoveTransposePastJoinConcat, - MoveSqueezePastMultiThreshold, - is_scalar -) -# FINN streamlining transformations absorbing tensors/nodes into others +from finn.transformation.streamline import ConvertSignToThres, 
RoundAndClipThresholds from finn.transformation.streamline.absorb import ( + Absorb1BitMulIntoConv, + Absorb1BitMulIntoMatMul, AbsorbAddIntoMultiThreshold, + AbsorbMulIntoMultiThreshold, AbsorbSignBiasIntoMultiThreshold, + AbsorbTransposeIntoMultiThreshold, FactorOutMulSignMagnitude, - AbsorbMulIntoMultiThreshold, - Absorb1BitMulIntoMatMul, - Absorb1BitMulIntoConv, - AbsorbTransposeIntoMultiThreshold + group_inputs_by_category, ) + # FINN streamlining transformations fusing/collapsing operations of the same # kind from finn.transformation.streamline.collapse_repeated import ( + CollapseRepeatedAdd, CollapseRepeatedMul, CollapseRepeatedTranspose, - CollapseRepeatedAdd ) + # FINN streamlining transformations removing nodes without real effect from the # graph from finn.transformation.streamline.remove import ( + RemoveIdentityReshape, RemoveIdentityTranspose, - RemoveIdentityReshape ) +# FINN streamlining transformations reordering the graph +from finn.transformation.streamline.reorder import ( + MoveAddPastConv, + MoveAddPastJoinAdd, + MoveAddPastJoinConcat, + MoveAddPastMul, + MoveAffinePastJoinConcat, + MoveLinearPastEltwiseAdd, + MoveMulPastFork, + MoveMulPastJoinAdd, + MoveMulPastJoinConcat, + MoveMulPastMaxPool, + MoveScalarAddPastMatMul, + MoveScalarLinearPastInvariants, + MoveScalarLinearPastSplit, + MoveScalarMulPastConv, + MoveScalarMulPastMatMul, + MoveSqueezePastMatMul, + MoveSqueezePastMultiThreshold, + MoveTransposePastEltwise, + MoveTransposePastFork, + MoveTransposePastJoinAdd, + MoveTransposePastJoinConcat, + MoveTransposePastJoinMul, + MoveTransposePastSplit, + is_scalar, +) + +# Execute onnx model graphs from the dataflow parent for verification +from finn.util.test import execute_parent + +# FINN streamlining transformations absorbing tensors/nodes into others + + # Composes graph transformations such that each individual transformation as # well as the whole sequence is applied exhaustively class ComposedTransformation(Transformation): @@ -269,12 
+250,14 @@ def apply(self, model: ModelWrapper): # noqa # sequence of transformations will be reapplied return model, graph_modified + # # Custom conversion from Quant to MultiThreshold # TODO: Enable once fixed... # from custom.quant_activation_to_multithreshold import ( # QuantActivationToMultiThreshold # ) + # Moves scale factor, i.e., scalar Mul and Div, past Im2Col (and Col2Im): These # cannot be handled by MoveScalarLinearPastInvariants as potential padding makes # Add-Im2Col not commute to Im2Col-Add @@ -350,6 +333,7 @@ def apply(self, model: ModelWrapper): # noqa # needs to be applied again return model, graph_modified + # Moves scalar linear elementwise operations past fork nodes, applies to Add, # Mul, Sub, Div, etc. class MoveScalarLinearPastFork(Transformation): @@ -401,6 +385,7 @@ def apply(self, model: ModelWrapper): # noqa # needs to be applied again return model, graph_modified + # Moves constant elementwise multiplication past another joining multiplication class MoveConstMulPastJoinMul(Transformation): # Applies the transform to a whole model graph # noqa: Duplicate @@ -474,7 +459,8 @@ def apply(self, model: ModelWrapper): # noqa # Return the transformed model and indicate whether the transformation # needs to be applied again return model, graph_modified - + + # Moves elementwise additions past MatMul operations: Applicable if each # operation has one initializer input class MoveAddPastMatMul(Transformation): @@ -620,10 +606,8 @@ def apply(self, model: ModelWrapper): # noqa # Skip without warning ok? 
continue # There must be exactly one constant per operations - assert len(s_name) == 1, \ - f"To many constant inputs for {node}" - assert len(b_name) == 1, \ - f"To many constant inputs for {successor}" + assert len(s_name) == 1, f"To many constant inputs for {node}" + assert len(b_name) == 1, f"To many constant inputs for {successor}" # Now read the initializer tensors s = model.get_initializer(*s_name) b = model.get_initializer(*b_name) @@ -663,93 +647,102 @@ def apply(self, model: ModelWrapper): # noqa # needs to be applied again return model, graph_modified + # Define a set of custom streamlining transformations: These are applied once # during the actual streamlining step and once after converting attention to # hardware (the associated cleanup afterward might enable some Streamlining # transformations once again) def Streamline(): # noqa: Uppercase # Return a set of exhaustively applies transformations - return ComposedTransformation([ - # On skip-connections: prefer pushing scalar multiplication forward - # before MoveAddPastMul - MoveMulPastFork(), - # The "standard" set of FINN streamlining transformations or at least - # inspired by them but applied exhaustively until none of them changes - # the graph anymore. 
- # Note: Covers most parts of non-branching linear topologies - ComposedTransformation([ - ConvertSubToAdd(), - ConvertDivToMul(), - BatchNormToAffine(), - ConvertSignToThres(), - MoveMulPastMaxPool(), - AbsorbSignBiasIntoMultiThreshold(), - MoveScalarLinearPastInvariants(), - MoveAddPastMul(), - MoveScalarAddPastMatMul(), - MoveAddPastConv(), - MoveScalarMulPastMatMul(), - MoveScalarMulPastConv(), - MoveAddPastMul(), - CollapseRepeatedAdd(), - CollapseRepeatedMul(), - MoveMulPastMaxPool(), - AbsorbAddIntoMultiThreshold(), - FactorOutMulSignMagnitude(), - AbsorbMulIntoMultiThreshold(), - Absorb1BitMulIntoMatMul(), - Absorb1BitMulIntoConv(), - ]), - # Streamlining scales and biases forward through residual topologies - # Note: This mostly covers forking and joining operations - ComposedTransformation([ - # Note: This is probably the most common way of joining skip - # connections, i.e., this corresponds to the original residual - # addition, i.e., y = f(x) + x - MoveLinearPastEltwiseAdd(), - MoveScalarLinearPastFork(), - MoveScalarLinearPastInvariants(), + return ComposedTransformation( + [ + # On skip-connections: prefer pushing scalar multiplication forward + # before MoveAddPastMul MoveMulPastFork(), - MoveMulPastJoinAdd(), - MoveAddPastJoinAdd(), - # Note: This brings constant Muls (i.e., quantizer scales to be - # removed) forward through joining Muls (i.e., those ending up - # as actual hardware operators). 
- MoveConstMulPastJoinMul() - ]), - # Streamlining scales and biases forward through shape/layout changing - # operations, i.e., mostly transposes - ComposedTransformation([ - # Convolution inputs and padding - MoveScalesPastIm2Col(), - # Streamlining for Split and Concat operations - MoveScalarLinearPastSplit(), - MoveAffinePastJoinConcat(), - MoveMulPastJoinConcat(), - MoveAddPastJoinConcat(), - # Move transposes around to some place where they could be removed - # later, i.e., where they collapse into identities - MoveTransposePastFork(), - MoveTransposePastSplit(), - MoveTransposePastJoinConcat(), - MoveTransposePastEltwise(), - MoveTransposePastJoinMul(), - MoveTransposePastJoinAdd(), - CollapseRepeatedTranspose(), - # Remove identity shape/layout transformations - RemoveIdentityTranspose(), - RemoveIdentityReshape(), - # Squeeze operators can be moved past the thresholding - MoveSqueezePastMultiThreshold(), - # A certain type of 4d-layout transpose can be absorbed (actually - # moved past) MultiThreshold operations - AbsorbTransposeIntoMultiThreshold(), - ]), - # Only round and clip after all streamlining transformations have - # been applied exhaustively. - # Note: Might still enable another round of streamlining. - RoundAndClipThresholds(), - ]) + # The "standard" set of FINN streamlining transformations or at least + # inspired by them but applied exhaustively until none of them changes + # the graph anymore. 
+ # Note: Covers most parts of non-branching linear topologies + ComposedTransformation( + [ + ConvertSubToAdd(), + ConvertDivToMul(), + BatchNormToAffine(), + ConvertSignToThres(), + MoveMulPastMaxPool(), + AbsorbSignBiasIntoMultiThreshold(), + MoveScalarLinearPastInvariants(), + MoveAddPastMul(), + MoveScalarAddPastMatMul(), + MoveAddPastConv(), + MoveScalarMulPastMatMul(), + MoveScalarMulPastConv(), + MoveAddPastMul(), + CollapseRepeatedAdd(), + CollapseRepeatedMul(), + MoveMulPastMaxPool(), + AbsorbAddIntoMultiThreshold(), + FactorOutMulSignMagnitude(), + AbsorbMulIntoMultiThreshold(), + Absorb1BitMulIntoMatMul(), + Absorb1BitMulIntoConv(), + ] + ), + # Streamlining scales and biases forward through residual topologies + # Note: This mostly covers forking and joining operations + ComposedTransformation( + [ + # Note: This is probably the most common way of joining skip + # connections, i.e., this corresponds to the original residual + # addition, i.e., y = f(x) + x + MoveLinearPastEltwiseAdd(), + MoveScalarLinearPastFork(), + MoveScalarLinearPastInvariants(), + MoveMulPastFork(), + MoveMulPastJoinAdd(), + MoveAddPastJoinAdd(), + # Note: This brings constant Muls (i.e., quantizer scales to be + # removed) forward through joining Muls (i.e., those ending up + # as actual hardware operators). 
+ MoveConstMulPastJoinMul(), + ] + ), + # Streamlining scales and biases forward through shape/layout changing + # operations, i.e., mostly transposes + ComposedTransformation( + [ + # Convolution inputs and padding + MoveScalesPastIm2Col(), + # Streamlining for Split and Concat operations + MoveScalarLinearPastSplit(), + MoveAffinePastJoinConcat(), + MoveMulPastJoinConcat(), + MoveAddPastJoinConcat(), + # Move transposes around to some place where they could be removed + # later, i.e., where they collapse into identities + MoveTransposePastFork(), + MoveTransposePastSplit(), + MoveTransposePastJoinConcat(), + MoveTransposePastEltwise(), + MoveTransposePastJoinMul(), + MoveTransposePastJoinAdd(), + CollapseRepeatedTranspose(), + # Remove identity shape/layout transformations + RemoveIdentityTranspose(), + RemoveIdentityReshape(), + # Squeeze operators can be moved past the thresholding + MoveSqueezePastMultiThreshold(), + # A certain type of 4d-layout transpose can be absorbed (actually + # moved past) MultiThreshold operations + AbsorbTransposeIntoMultiThreshold(), + ] + ), + # Only round and clip after all streamlining transformations have + # been applied exhaustively. + # Note: Might still enable another round of streamlining. 
+ RoundAndClipThresholds(), + ] + ) # Prepares the graph to be consumed by FINN: @@ -763,62 +756,64 @@ def prepare_graph(range_info: RangeInfo): # Wrap the actual transformation/build step function def step_prepare_graph(model: ModelWrapper, cfg: DataflowBuildConfig): # Exhaustively apply the set of cleanup transformations - model = model.transform(ComposedTransformation([ - # Adds shape and datatype annotations to all tensors in this graph - InferDataTypes(), - InferShapes(), - # Cleanup the graph by removing redundant, unnecessary and constant - # nodes and tensors and give unique names to everything remaining - GiveUniqueNodeNames(), - GiveReadableTensorNames(), - RemoveStaticGraphInputs(), - RemoveUnusedTensors(), - GiveUniqueParameterTensors(), - FoldConstants(), - # Remove unnecessary shape and layout transformations - RemoveIdentityReshape(), - RemoveIdentityTranspose(), - # Redo shape and datatype annotations after removing nodes and - # tensors - InferShapes(), - InferDataTypes(), - ])) + model = model.transform( + ComposedTransformation( + [ + # Adds shape and datatype annotations to all tensors in this graph + InferDataTypes(), + InferShapes(), + # Cleanup the graph by removing redundant, unnecessary and constant + # nodes and tensors and give unique names to everything remaining + GiveUniqueNodeNames(), + GiveReadableTensorNames(), + RemoveStaticGraphInputs(), + RemoveUnusedTensors(), + GiveUniqueParameterTensors(), + FoldConstants(), + # Remove unnecessary shape and layout transformations + RemoveIdentityReshape(), + RemoveIdentityTranspose(), + # Redo shape and datatype annotations after removing nodes and + # tensors + InferShapes(), + InferDataTypes(), + ] + ) + ) # If configured, run a verification of the transformed model on some # sample inputs - if (VerificationStepType.TIDY_UP_PYTHON in - cfg._resolve_verification_steps()): # noqa - verify_step( - model, cfg, "tidied_up_python", need_parent=False - ) + if VerificationStepType.TIDY_UP_PYTHON in 
cfg._resolve_verification_steps(): # noqa + verify_step(model, cfg, "tidied_up_python", need_parent=False) # Exhaustively apply the lowering transformations - model = model.transform(ComposedTransformation([ - # Moves the bias input to the Conv operator as a separate Add node - # behind the Conv node - ExtractBiasFromConv(), - # Converts Gemm nodes to MatMul (+ bias) - GemmToMatMul(), - # Need to do some constant and weight folding first - FoldConstants(), - FoldTransposeIntoQuantInit(), - FoldQuantWeights(), - # Annotate the graph with shape and data type information - InferShapes(), - InferDataTypes(), - # Converts Conv layers to MatMul - LowerConvsToMatMul(), - # Converts BatchNorm to affine scale and bias - BatchNormToAffine(), - # Annotate the graph with shape and data type information - InferShapes(), - InferDataTypes(), - ])) + model = model.transform( + ComposedTransformation( + [ + # Moves the bias input to the Conv operator as a separate Add node + # behind the Conv node + ExtractBiasFromConv(), + # Converts Gemm nodes to MatMul (+ bias) + GemmToMatMul(), + # Need to do some constant and weight folding first + FoldConstants(), + FoldTransposeIntoQuantInit(), + FoldQuantWeights(), + # Annotate the graph with shape and data type information + InferShapes(), + InferDataTypes(), + # Converts Conv layers to MatMul + LowerConvsToMatMul(), + # Converts BatchNorm to affine scale and bias + BatchNormToAffine(), + # Annotate the graph with shape and data type information + InferShapes(), + InferDataTypes(), + ] + ) + ) # If configured, run a verification of the transformed model on some # sample inputs - if (VerificationStepType.QONNX_TO_FINN_PYTHON in - cfg._resolve_verification_steps()): # noqa - verify_step( - model, cfg, "lowered_python", need_parent=False - ) + if VerificationStepType.QONNX_TO_FINN_PYTHON in cfg._resolve_verification_steps(): # noqa + verify_step(model, cfg, "lowered_python", need_parent=False) # Apply the quantizer to MultiThreshold 
conversion # Note: This is exhaustive as well as single .transform reapplies as # long as possible. @@ -826,26 +821,22 @@ def step_prepare_graph(model: ModelWrapper, cfg: DataflowBuildConfig): # model = model.transform(QuantActivationToMultiThreshold(range_info)) # If configured, run a verification of the transformed model on some # sample inputs - if (VerificationStepType.QONNX_TO_FINN_PYTHON in - cfg._resolve_verification_steps()): # noqa - verify_step( - model, cfg, "quant_to_thresholds_ra_python", need_parent=False - ) + if VerificationStepType.QONNX_TO_FINN_PYTHON in cfg._resolve_verification_steps(): # noqa + verify_step(model, cfg, "quant_to_thresholds_ra_python", need_parent=False) # Apply the standard QONNX to FINN conversion step to convert the # remaining quantizers not yet covered by the new range analysis based # method - model = model.transform(ConvertQONNXtoFINN( - filter_function=default_filter_function_generator( - max_multithreshold_bit_width=cfg.max_multithreshold_bit_width + model = model.transform( + ConvertQONNXtoFINN( + filter_function=default_filter_function_generator( + max_multithreshold_bit_width=cfg.max_multithreshold_bit_width + ) ) - )) + ) # If configured, run a verification of the transformed model on some # sample inputs - if (VerificationStepType.QONNX_TO_FINN_PYTHON in - cfg._resolve_verification_steps()): # noqa - verify_step( - model, cfg, "prepared_graph_python", need_parent=False - ) + if VerificationStepType.QONNX_TO_FINN_PYTHON in cfg._resolve_verification_steps(): # noqa + verify_step(model, cfg, "prepared_graph_python", need_parent=False) # Return the transformed model return model @@ -870,11 +861,8 @@ def step_streamline(model: ModelWrapper, cfg: DataflowBuildConfig): model = model.transform(Streamline()) # If configured, run a verification of the transformed model on some # sample inputs - if (VerificationStepType.STREAMLINED_PYTHON in - cfg._resolve_verification_steps()): # noqa - verify_step( - model, cfg, 
"streamlined_python", need_parent=False - ) + if VerificationStepType.STREAMLINED_PYTHON in cfg._resolve_verification_steps(): # noqa + verify_step(model, cfg, "streamlined_python", need_parent=False) # Return the transformed model return model @@ -904,23 +892,27 @@ def step_convert_attention_to_hw(model: ModelWrapper, _: DataflowBuildConfig): model = model.transform(Squeeze()) # Squeezing might have turned further transpose and reshape operations into # identities (those which just swapped around the dimensions of size 1) - model = model.transform(ComposedTransformation([ - # Move transposes around to some place where they could be removed - # later, i.e., where they collapse into identities - MoveTransposePastFork(), - MoveTransposePastSplit(), - MoveTransposePastJoinConcat(), - MoveTransposePastEltwise(), - MoveTransposePastJoinMul(), - MoveTransposePastJoinAdd(), - CollapseRepeatedTranspose(), - # Remove identity shape/layout transformations - RemoveIdentityTranspose(), - RemoveIdentityReshape(), - # Squeeze operators can be moved past MatMuls and thresholding - MoveSqueezePastMatMul(), - MoveSqueezePastMultiThreshold(), - ])) + model = model.transform( + ComposedTransformation( + [ + # Move transposes around to some place where they could be removed + # later, i.e., where they collapse into identities + MoveTransposePastFork(), + MoveTransposePastSplit(), + MoveTransposePastJoinConcat(), + MoveTransposePastEltwise(), + MoveTransposePastJoinMul(), + MoveTransposePastJoinAdd(), + CollapseRepeatedTranspose(), + # Remove identity shape/layout transformations + RemoveIdentityTranspose(), + RemoveIdentityReshape(), + # Squeeze operators can be moved past MatMuls and thresholding + MoveSqueezePastMatMul(), + MoveSqueezePastMultiThreshold(), + ] + ) + ) # Squeezing might enable absorbing adds into thresholds once again model = model.transform(AbsorbAddIntoMultiThreshold()) # If applicable, absorb the final thresholds into the attention operator @@ -942,9 +934,9 @@ def 
step_convert_attention_to_hw(model: ModelWrapper, _: DataflowBuildConfig): def step_convert_elementwise_binary_to_hw(model: ModelWrapper, _): # Convert elementwise operations to hardware operators # Note: Do not convert the final Mul operator at the output - return model.transform(InferElementwiseBinaryOperation( - InferElementwiseBinaryOperation.reject_output_dequant - )) + return model.transform( + InferElementwiseBinaryOperation(InferElementwiseBinaryOperation.reject_output_dequant) + ) # Converts Split and Concat operations to hardware custom operators @@ -984,13 +976,10 @@ def step_replicate_streams(model: ModelWrapper, _): # Custom step for setting the parallelism to meet the target of T^2 cycles per # sequence -def set_target_parallelization(seq_len: int, - emb_dim: int): # noqa: emb_dim +def set_target_parallelization(seq_len: int, emb_dim: int): # noqa: emb_dim # The wrapping function is a generator and this is the actual build step # function taking the model and build configuration - def step_set_target_parallelization( - model: ModelWrapper, cfg: DataflowBuildConfig - ): + def step_set_target_parallelization(model: ModelWrapper, cfg: DataflowBuildConfig): # Run over all nodes in the model graph to look for attention operators, # which are currently not handled by the SetFolding transformation for index, node in enumerate(model.graph.node): @@ -1006,9 +995,9 @@ def step_set_target_parallelization( inst.set_nodeattr("SeqFold", seq_len) # Apply the built-in folding configuration transformation with the # T^2 target cycles - model = model.transform(SetFolding( - seq_len ** 2, cfg.mvau_wwidth_max, cfg.folding_two_pass_relaxation - )) + model = model.transform( + SetFolding(seq_len**2, cfg.mvau_wwidth_max, cfg.folding_two_pass_relaxation) + ) # TODO: Extract the folding configuration # Return the model with configured parallelization return model @@ -1033,8 +1022,7 @@ def apply(self, model: ModelWrapper): # noqa # Iterate all nodes in the graph keeping track 
of the index for index, node in enumerate(graph.node): # A node should not be named "defaults"... - assert node.name != "defaults", \ - "Node has reserved name 'defaults'" + assert node.name != "defaults", "Node has reserved name 'defaults'" # Convert this to the custom-op instance for easy access to node # attributes inst = getCustomOp(node) @@ -1059,9 +1047,7 @@ def apply(self, model: ModelWrapper): # noqa # Custom build step trying to set appropriate FIFO sizes for the transformer -def set_fifo_depths( - seq_len: int, emb_dim: int, uram_threshold: int = 32 # noqa: emb_dim -): +def set_fifo_depths(seq_len: int, emb_dim: int, uram_threshold: int = 32): # noqa: emb_dim # The wrapping function is a generator and this is the actual build step # function taking the model and build configuration def step_set_fifo_depths(model: ModelWrapper, cfg: DataflowBuildConfig): @@ -1091,9 +1077,7 @@ def step_set_fifo_depths(model: ModelWrapper, cfg: DataflowBuildConfig): # Each folded input stream needs to be buffered completely # TODO: Not exactly sure whether this is always correct or just # the worst-case - in_depths = [ - inst.get_number_input_values(i) for i in range(num_inputs) - ] + in_depths = [inst.get_number_input_values(i) for i in range(num_inputs)] # Note: No special treatment of the output FIFO # out_depths = ... @@ -1113,7 +1097,7 @@ def step_set_fifo_depths(model: ModelWrapper, cfg: DataflowBuildConfig): # figuring out which of the two is the longer/deeper branch # in terms of cycles to set a corresponding buffer only to # the shorter branch. - in_depths = [seq_len ** 2, seq_len ** 2] + in_depths = [seq_len**2, seq_len**2] # Note: No special treatment of the output FIFO # out_depths = ... 
@@ -1131,16 +1115,14 @@ def step_set_fifo_depths(model: ModelWrapper, cfg: DataflowBuildConfig): # no other depth is specified) model = model.transform(InsertFIFO(create_shallow_fifos=True)) # Specialize the implementation variant of the (newly added FIFO) layers - model = model.transform( - SpecializeLayers(cfg._resolve_fpga_part()) # noqa: Access _ method - ) + model = model.transform(SpecializeLayers(cfg._resolve_fpga_part())) # noqa: Access _ method model = model.transform(GiveUniqueNodeNames()) model = model.transform(GiveReadableTensorNames()) # Only applies if a configuration file is given if cfg.folding_config_file is not None: # Load the configuration dictionary form YAML file - with (open(cfg.folding_config_file, "r") as file): + with open(cfg.folding_config_file, "r") as file: # Load YAML string config = yaml.safe_load(file) # Assign unique names to the nodes which can be matched by @@ -1232,9 +1214,7 @@ def step_set_fifo_depths(model: ModelWrapper, cfg: DataflowBuildConfig): # After FIFOs are ready to go, call PrepareIP and HLSSynthIP again # this will only run for the new nodes (e.g. 
FIFOs and DWCs) model = model.transform( - PrepareIP( - cfg._resolve_fpga_part(), cfg._resolve_hls_clk_period() # noqa - ) + PrepareIP(cfg._resolve_fpga_part(), cfg._resolve_hls_clk_period()) # noqa ) model = model.transform(HLSSynthIP()) @@ -1250,7 +1230,7 @@ def step_apply_folding_config(model: ModelWrapper, cfg: DataflowBuildConfig): # Only applies if a configuration file is given if cfg.folding_config_file is not None: # Load the configuration dictionary form YAML file - with (open(cfg.folding_config_file, "r") as file): + with open(cfg.folding_config_file, "r") as file: # Load YAML string config = yaml.safe_load(file) # Assign unique names to the nodes which can be matched by @@ -1260,8 +1240,7 @@ def step_apply_folding_config(model: ModelWrapper, cfg: DataflowBuildConfig): model = model.transform(ApplyConfig(config)) # If configured, run a verification of the transformed model on some sample # inputs - if (VerificationStepType.FOLDED_HLS_CPPSIM in - cfg._resolve_verification_steps()): # noqa + if VerificationStepType.FOLDED_HLS_CPPSIM in cfg._resolve_verification_steps(): # noqa # Prepare C++ Simulation for verification model = model.transform(PrepareCppSim()) model = model.transform(CompileCppSim()) @@ -1331,9 +1310,7 @@ def node_by_node_rtlsim(model: ModelWrapper, cfg: DataflowBuildConfig): model = model.transform(SetExecMode("rtlsim")) # Generates the C++ source and compiles the RTL simulation model = model.transform(GiveUniqueNodeNames()) - model = model.transform(PrepareIP( - cfg._resolve_fpga_part(), cfg.synth_clk_period_ns) # noqa - ) + model = model.transform(PrepareIP(cfg._resolve_fpga_part(), cfg.synth_clk_period_ns)) # noqa model = model.transform(HLSSynthIP()) model = model.transform(PrepareRTLSim()) From cb7152939516fc341d718edcff16b28e6c1672a1 Mon Sep 17 00:00:00 2001 From: Felix Jentzsch Date: Wed, 29 Jan 2025 15:04:24 +0000 Subject: [PATCH 004/125] Fix imports --- benchmarking/bench.py | 2 -- 1 file changed, 2 deletions(-) diff --git 
a/benchmarking/bench.py b/benchmarking/bench.py index db6f00c159..b34951f34b 100644 --- a/benchmarking/bench.py +++ b/benchmarking/bench.py @@ -8,8 +8,6 @@ from dut.mvau import bench_mvau from dut.transformer import bench_transformer -from dut.transformer_radioml import bench_transformer_radioml -from dut.transformer_gpt import bench_transformer_gpt from dut.fifosizing import bench_fifosizing, bench_metafi_fifosizing, bench_resnet50_fifosizing From 7d8a5f153f16f854ef9a227c3baac2552ca9c914 Mon Sep 17 00:00:00 2001 From: Felix Jentzsch Date: Wed, 29 Jan 2025 15:40:50 +0000 Subject: [PATCH 005/125] Add convformer, workaround streamlining --- benchmarking/cfg/transformer_radioml_all.json | 5 +++++ benchmarking/dut/transformer_custom_steps.py | 3 ++- 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/benchmarking/cfg/transformer_radioml_all.json b/benchmarking/cfg/transformer_radioml_all.json index 7dbdc217d7..f2000fb9c3 100644 --- a/benchmarking/cfg/transformer_radioml_all.json +++ b/benchmarking/cfg/transformer_radioml_all.json @@ -3,5 +3,10 @@ "seed": [12], "model_dir": ["/pc2/groups/hpc-prf-ekiapp/felix/ci_models/radioml_0"], "dut_duplication": [1] + }, + { + "seed": [12], + "model_dir": ["/pc2/groups/hpc-prf-ekiapp/felix/ci_models/radioml_convformer"], + "dut_duplication": [1] } ] diff --git a/benchmarking/dut/transformer_custom_steps.py b/benchmarking/dut/transformer_custom_steps.py index 2dc387a94a..91bdebb206 100644 --- a/benchmarking/dut/transformer_custom_steps.py +++ b/benchmarking/dut/transformer_custom_steps.py @@ -179,6 +179,7 @@ MoveAddPastMul, MoveAffinePastJoinConcat, MoveLinearPastEltwiseAdd, + MoveLinearPastFork, MoveMulPastFork, MoveMulPastJoinAdd, MoveMulPastJoinConcat, @@ -696,7 +697,7 @@ def Streamline(): # noqa: Uppercase # connections, i.e., this corresponds to the original residual # addition, i.e., y = f(x) + x MoveLinearPastEltwiseAdd(), - MoveScalarLinearPastFork(), + MoveLinearPastFork(), #DEBUG for positional encoding 
streamlining, MoveScalarLinearPastFork() MoveScalarLinearPastInvariants(), MoveMulPastFork(), MoveMulPastJoinAdd(), From 51a5fdf21355760e5f40efaa849e37ebe77b8af6 Mon Sep 17 00:00:00 2001 From: Felix Jentzsch Date: Wed, 29 Jan 2025 21:22:31 +0000 Subject: [PATCH 006/125] Combine test and benchmark CI defs --- .gitlab-ci.yml | 152 +++++++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 141 insertions(+), 11 deletions(-) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index ebfa2f6f88..b44a26cdc1 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -1,37 +1,107 @@ stages: - - update - - build + - sync + - singularity_build - load_deps - test - - trigger_benchmarks variables: PIPELINE_NAME: description: "Optional name to better identify this pipeline" value: "" + TEST_SUITE: + description: "Select test suite to run" + value: "full" + options: + - "none" + - "quicktest" + - "main" + - "rtlsim" + - "end2end" + - "full" CPU_CORES: description: "Select number of CPU cores and test workers" value: "8" PARALLEL_JOBS: - description: "Number of parallel Slurm array jobs per CI job" + description: "Number of parallel Slurm array jobs per Benchmark job" value: "2" SLURM_TIMEOUT: - description: "Timeout" - value: "2-0" # [days-hours] - MANUAL_CFG_PATH: - description: "Use this config file instead of configs stored in the repo. Path must be accessible to runner" - value: "" + description: "Select SLURM timeout" + value: "3-0" # [days-hours] SLURM_PARTITION: description: "Slurm partition (e.g., normal, largemem, fpga, gpu)" value: "normal" SLURM_QOS: description: "Optional QoS option (include --qos, e.g., --qos express)" value: "" + MANUAL_CFG_PATH: + description: "Use this config file instead of configs stored in the repo. 
Path must be accessible to runner" + value: "" FINN_XILINX_VERSION: value: "2022.2" + SINGULARITY_IMG_SELECT: + value: "finn_dev.sif" workflow: name: '$PIPELINE_NAME' + rules: + # Run pipeline for GitHub PRs to dev (does not support PRs from forks) + - if: $CI_PIPELINE_SOURCE == "external_pull_request_event" && $CI_EXTERNAL_PULL_REQUEST_TARGET_BRANCH_NAME == "dev" + # Run pipeline for pushes to dev + - if: $CI_COMMIT_BRANCH == "dev" + # Run pipeline if manually triggered via API or web GUI + - if: $CI_PIPELINE_SOURCE == "api" + - if: $CI_PIPELINE_SOURCE == "web" + # Run pipeline if scheduled (only for nightly sync of finn-dev) + - if: $CI_PIPELINE_SOURCE == "schedule" + +Sync finn-dev: + id_tokens: + CI_JOB_JWT: + aud: https://git.uni-paderborn.de + stage: sync + tags: + # Run where full Docker + Singularity is available + - image_build + rules: + # Only run on a schedule + - if: $CI_PIPELINE_SOURCE == "schedule" + script: + - mkdir -p ../github_clone && cd ../github_clone + - rm -rf finn-plus # Ensure we do a fresh clone (TODO: better way to handle this on job level?) + - git clone git@github.com:eki-project/finn-plus.git && cd finn-plus + - git remote add upstream https://github.com/Xilinx/finn.git + - git checkout finn-dev + - git pull upstream dev + - git push origin finn-dev + +Singularity Image Build: + id_tokens: + CI_JOB_JWT: + aud: https://git.uni-paderborn.de + stage: singularity_build + tags: + # Run where full Docker + Singularity is available + - image_build + rules: + # Do not run on a schedule + - if: $CI_PIPELINE_SOURCE == "schedule" + when: never + # Only run if relevant files changed relative to dev branch + - changes: + paths: + - requirements.txt + - docker/Dockerfile.finn + - docker/finn_entrypoint.sh + - docker/quicktest.sh + compare_to: "dev" + script: + - docker build --no-cache -f docker/Dockerfile.finn --tag=finn_docker_export . 
+ - apptainer build --force finn_singularity_image.sif docker-daemon://finn_docker_export:latest + - rsync -vh finn_singularity_image.sif $PATH_SINGULARITY_IMG_BUILD/finn-plus/finn_$CI_COMMIT_REF_SLUG.sif + after_script: # Clean caches + - echo 'y' | docker image prune + - echo 'y' | docker builder prune + - echo 'y' | apptainer cache clean Fetch Repos: id_tokens: @@ -40,6 +110,12 @@ Fetch Repos: stage: load_deps tags: - login + rules: + # Do not run on a schedule + - if: $CI_PIPELINE_SOURCE == "schedule" + when: never + # Otherwise run + - when: always cache: key: $CI_COMMIT_SHA paths: @@ -47,9 +123,58 @@ Fetch Repos: script: - ./fetch-repos.sh +FINN Test Suite 2022.2: + id_tokens: + CI_JOB_JWT: + aud: https://git.uni-paderborn.de + stage: test + rules: + # Do not run on a schedule + - if: $CI_PIPELINE_SOURCE == "schedule" + when: never + # Do not run if test suite has been deselected + - if: $TEST_SUITE == "none" + when: never + # Select different Singularity image if it deviates from default (dev branch) + - changes: + paths: + - requirements.txt + - docker/Dockerfile.finn + - docker/finn_entrypoint.sh + - docker/quicktest.sh + compare_to: "dev" + variables: + SINGULARITY_IMG_SELECT: "finn_$CI_COMMIT_REF_SLUG.sif" + # Always run, as long as there was no prior failure + - when: on_success + cache: + key: $CI_COMMIT_SHA + policy: pull + paths: + - deps + variables: + SCHEDULER_PARAMETERS: "-A $PROJECT_ACCOUNT -p $SLURM_PARTITION -t $SLURM_TIMEOUT $SLURM_QOS --nodes 1 --ntasks 1 --cpus-per-task $CPU_CORES --exclusive" + PYTEST_PARALLEL: "$CPU_CORES" + FINN_SINGULARITY: "$PATH_SINGULARITY_IMG/finn-plus/$SINGULARITY_IMG_SELECT" + FINN_XILINX_VERSION: "2022.2" + before_script: + - cp -dfR .. $PATH_WORKDIR # Copy to working directory (e.g. 
RAMdisk) + - cd $PATH_WORKDIR/finn-plus + - module load system singularity + script: + - ./run-docker.sh quicktest.sh $TEST_SUITE + +FINN Test Suite 2024.1: + extends: FINN Test Suite 2022.2 + variables: + FINN_XILINX_VERSION: "2024.1" + Bench (Manual): - stage: trigger_benchmarks + stage: test rules: + # Do not run on a schedule + - if: $CI_PIPELINE_SOURCE == "schedule" + when: never - if: $MANUAL_CFG_PATH != "" trigger: include: benchmarking/bench-ci.yml @@ -60,8 +185,11 @@ Bench (Manual): BENCH_CFG: "manual" Bench: - stage: trigger_benchmarks + stage: test rules: + # Do not run on a schedule + - if: $CI_PIPELINE_SOURCE == "schedule" + when: never - if: $MANUAL_CFG_PATH == "" trigger: include: benchmarking/bench-ci.yml @@ -76,6 +204,8 @@ Bench: #fifo: fifosizing_test, metafi_fifosizing_test, resnet50_fifosizing_test #transformer: transformer_test, transformer_radioml_all +#TODO: add selector for none, reduced, full benchmark suite + #TODO: introduce result collect job on parent level for easier visualization/excel interfacing #TODO: more control via (optional) variables #TODO: move power measurement from polling-based script to its own job/runner From 941984e6f5116ec1318fddb278b53ca1437bc50c Mon Sep 17 00:00:00 2001 From: Felix Jentzsch Date: Thu, 30 Jan 2025 15:14:37 +0000 Subject: [PATCH 007/125] Refactor DUTs --- .gitlab-ci.yml | 2 +- benchmarking/bench.py | 41 +-- benchmarking/bench_base.py | 246 ++++++++++++---- benchmarking/cfg/fifosizing_test.json | 5 +- benchmarking/cfg/metafi_fifosizing_test.json | 7 +- benchmarking/cfg/metafi_test.json | 10 + benchmarking/cfg/mvau_test.json | 1 + .../cfg/resnet50_fifosizing_test.json | 8 +- benchmarking/cfg/resnet50_test.json | 13 + benchmarking/cfg/transformer_gpt_all.json | 4 + benchmarking/cfg/transformer_radioml_all.json | 2 + benchmarking/cfg/transformer_sweep.json | 5 + benchmarking/cfg/transformer_test.json | 1 + benchmarking/dut/metafi.py | 83 ++++++ benchmarking/dut/resnet50.py | 57 ++++ .../{fifosizing.py => 
synthetic_nonlinear.py} | 263 +----------------- benchmarking/dut/transformer.py | 47 ++-- 17 files changed, 430 insertions(+), 365 deletions(-) create mode 100644 benchmarking/cfg/metafi_test.json create mode 100644 benchmarking/cfg/resnet50_test.json create mode 100644 benchmarking/dut/metafi.py create mode 100644 benchmarking/dut/resnet50.py rename benchmarking/dut/{fifosizing.py => synthetic_nonlinear.py} (50%) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 2d28f34602..066a7dc289 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -198,7 +198,7 @@ Bench: pipeline_variables: true parallel: matrix: - - BENCH_CFG: [mvau_test] + - BENCH_CFG: [mvau_test, resnet50_test, metafi_test] #dev: mvau_test #fifo: fifosizing_test, metafi_fifosizing_test, resnet50_fifosizing_test diff --git a/benchmarking/bench.py b/benchmarking/bench.py index b34951f34b..f3a4c0f424 100644 --- a/benchmarking/bench.py +++ b/benchmarking/bench.py @@ -7,9 +7,22 @@ import onnxruntime as ort from dut.mvau import bench_mvau -from dut.transformer import bench_transformer -from dut.fifosizing import bench_fifosizing, bench_metafi_fifosizing, bench_resnet50_fifosizing - +from dut.resnet50 import bench_resnet50 +from dut.metafi import bench_metafi +from dut.synthetic_nonlinear import bench_synthetic_nonlinear + +dut = dict() +dut["mvau"] = bench_mvau +dut["resnet50"] = bench_resnet50 +dut["metafi"] = bench_metafi +dut["synthetic_nonlinear"] = bench_synthetic_nonlinear + +# TODO: remove guard once transformer support has been fully merged +try: + from dut.transformer import bench_transformer + dut["transformer"] = bench_transformer +except ImportError: + pass def main(config_name): exit_code = 0 @@ -124,20 +137,16 @@ def get_default_session_options_new(): log_dict = {"run_id": run_id, "task_id": task_id, "params": params} - # Determine which DUT to run TODO: do this lookup more generically? - # give bench subclass name directly in config? 
- if config_select.startswith("mvau"): - bench_object = bench_mvau(params, task_id, run_id, artifacts_dir, save_dir) - elif config_select.startswith("transformer"): - bench_object = bench_transformer(params, task_id, run_id, artifacts_dir, save_dir) - elif config_select.startswith("fifosizing"): - bench_object = bench_fifosizing(params, task_id, run_id, artifacts_dir, save_dir) - elif config_select.startswith("metafi_fifosizing"): - bench_object = bench_metafi_fifosizing(params, task_id, run_id, artifacts_dir, save_dir) - elif config_select.startswith("resnet50_fifosizing"): - bench_object = bench_resnet50_fifosizing(params, task_id, run_id, artifacts_dir, save_dir) + # Create bench object for respective DUT + if "dut" in params: + if params.dut in dut: + bench_object = dut[params.dut](params, task_id, run_id, artifacts_dir, save_dir) + else: + print("ERROR: unknown DUT specified") + return 1 else: - print("ERROR: unknown DUT specified") + print("ERROR: no DUT specified") + return 1 start_time = time.time() try: diff --git a/benchmarking/bench_base.py b/benchmarking/bench_base.py index 5c191d911f..0bd7be6907 100644 --- a/benchmarking/bench_base.py +++ b/benchmarking/bench_base.py @@ -6,6 +6,7 @@ import time import traceback import glob +import numpy as np from shutil import copy as shcopy from shutil import copytree import finn.core.onnx_exec as oxe @@ -33,6 +34,7 @@ gen_finn_dt_tensor, roundup_to_integer_multiple, ) +import finn.builder.build_dataflow as build from finn.analysis.fpgadataflow.post_synth_res import post_synth_res from qonnx.core.modelwrapper import ModelWrapper from finn.builder.build_dataflow_config import DataflowBuildConfig @@ -613,21 +615,22 @@ def save_local_artifacts_collection(self): for (name, source_path) in self.local_artifacts_collection: self.save_local_artifact(name, source_path) + # only used in simple flow (TODO: unify) def step_make_model(self): - # may be implemented in subclass pass - + + # only used in full build flow def 
step_export_onnx(self): - # may be implemented in subclass pass - def step_build(self): - # may be implemented in subclass + # only used in full build flow + def step_build_setup(self): pass + # defaults to full build flow + # may be overwritten by subclass (e.g., to call simple flow instead) def run(self): - # must be implemented in subclass - pass + self.steps_full_build_flow() def step_finn_estimate(self): # Gather FINN estimates @@ -813,51 +816,172 @@ def step_synth_power(self): def step_parse_builder_output(self, build_dir): # Used to parse selected reports/logs into the output json dict for DUTs that use a full FINN builder flow - # COPY bitstreams and other outputs - # TODO: integrate better (e.g. as artifact) and remove redundant copy - # TODO: make this more configurable or switch to job/artifact based power measurement - # TODO: make compatible to new instr wrapper (or however we generate these outputs) - shcopy(os.path.join(build_dir, "harness/top_wrapper.bit"), - os.path.join(self.save_dir_bitstreams, "run_%d.bit" % self.run_id)) - shcopy(os.path.join(build_dir, "harness/top.hwh"), - os.path.join(self.save_dir_bitstreams, "run_%d.hwh" % self.run_id)) - shcopy(os.path.join(build_dir, "harness/synth_report.xml"), - os.path.join(self.save_dir_bitstreams, "run_%d.xml" % self.run_id)) - clock_period_mhz = int(1.0 / self.clock_period_ns * 1000.0) - measurement_settings = {"freq_mhz": clock_period_mhz} - with open(os.path.join(self.save_dir_bitstreams, "run_%d_settings.json"%self.run_id), "w") as f: - json.dump(measurement_settings, f, indent=2) + ### SAVE BITSTREAMS ### + if (os.path.exists(os.path.join(build_dir, "harness"))): + # TODO: integrate better (e.g. 
as artifact) and remove redundant copy + # TODO: make this more configurable or switch to job/artifact based power measurement + # TODO: make compatible to new instr wrapper (or however we generate these outputs) + shcopy(os.path.join(build_dir, "harness/top_wrapper.bit"), + os.path.join(self.save_dir_bitstreams, "run_%d.bit" % self.run_id)) + shcopy(os.path.join(build_dir, "harness/top.hwh"), + os.path.join(self.save_dir_bitstreams, "run_%d.hwh" % self.run_id)) + shcopy(os.path.join(build_dir, "harness/synth_report.xml"), + os.path.join(self.save_dir_bitstreams, "run_%d.xml" % self.run_id)) + clock_period_mhz = int(1.0 / self.clock_period_ns * 1000.0) + measurement_settings = {"freq_mhz": clock_period_mhz} + with open(os.path.join(self.save_dir_bitstreams, "run_%d_settings.json"%self.run_id), "w") as f: + json.dump(measurement_settings, f, indent=2) + else: + pass #TODO: warn/skip? + + ### CHECK FOR VERIFICATION STEP SUCCESS ### + if (os.path.exists(os.path.join(build_dir, "verification_output"))): + # Collect all verification output filenames + outputs = glob.glob(os.path.join(build_dir, "verification_output/*.npy")) + # Extract the verification status for each verification output by matching + # to the SUCCESS string contained in the filename + status = all([ + out.split("_")[-1].split(".")[0] == "SUCCESS" for out in outputs + ]) + + # Construct a dictionary reporting the verification status as string + self.output_dict["builder_verification"] = {"verification": {True: "success", False: "fail"}[status]} + # TODO: mark job as failed if verification fails + else: + pass #TODO: warn/skip? + + ### PARSE SYNTH RESOURCE REPORT ### + if (os.path.exists(os.path.join(build_dir, "harness/post_synth_resources.json"))): + report_path = os.path.join(build_dir, "harness/post_synth_resources.json") + # TODO: check multiple possible sources for this log (e.g. 
if OOC synth or Zynbuild was run) + report_filter = "(top)" + # Open the report file + with open(report_path) as file: + # Load the JSON formatted report + report = pd.read_json(file, orient="index") + # Filter the reported rows according to some regex filter rule + report = report.filter(regex=report_filter, axis="rows") + # Generate a summary of the total resources + summary = report.sum() + + #TODO: parse finn estimates, hls estimates, step times, rtlsim performance(rtlsim n=1, n=100) + #TODO: optional simulation of instr wrapper instead of running on hw + + self.output_dict["builder"] = summary.to_dict() + else: + pass #TODO: warn/skip? + + ### ANALYZE FIFOs ### + fifo_info = {} + # TODO: skip if not present + model_final = ModelWrapper(build_dir + "/intermediate_models/step_create_stitched_ip.onnx") + + fifo_info["fifo_depths"] = {} + fifo_info["fifo_sizes"] = {} + total_fifo_size = 0 + for node in model_final.get_nodes_by_op_type("StreamingFIFO_rtl"): + node_inst = getCustomOp(node) + fifo_info["fifo_depths"][node.name] = node_inst.get_nodeattr("depth") + fifo_info["fifo_sizes"][node.name] = node_inst.get_instream_width() * node_inst.get_nodeattr("depth") + total_fifo_size += fifo_info["fifo_sizes"][node.name] + fifo_info["total_fifo_size_kB"] = int(total_fifo_size / 8.0 / 1000.0) + + self.output_dict["fifos"] = fifo_info + + def step_fifotest(self, onnx_path, cfg, build_dir): + # requires certain output products (e.g., ESTIMATE_REPORTS, RTLSIM_PERFORMANCE) + # TODO: check them and skip/warn if missing + log = {} + # load performance reports + with open(build_dir + "/report/estimate_network_performance.json") as f: + est_data = json.load(f) + with open(build_dir + "/report/rtlsim_performance.json") as f: + sim_data = json.load(f) + + # check for deadlock + model_final = ModelWrapper(build_dir + "/intermediate_models/step_create_stitched_ip.onnx") + first_node = getCustomOp(model_final.find_consumer(model_final.graph.input[0].name)) + last_node = 
getCustomOp(model_final.find_producer(model_final.graph.output[0].name)) + input_txns_expected = np.prod(first_node.get_folded_input_shape()[:-1]) * self.params["fifo_rtlsim_n"] + output_txns_expected = np.prod(last_node.get_folded_output_shape()[:-1]) * self.params["fifo_rtlsim_n"] + deadlock = sim_data["N_IN_TXNS"] != input_txns_expected or sim_data["N_OUT_TXNS"] != output_txns_expected + log["deadlock"] = deadlock.tolist() + + # check rtlsim throughput + throughput = sim_data["throughput[images/s]"] + stable_throughput = sim_data["stable_throughput[images/s]"] + estimated_throughput = est_data["estimated_throughput_fps"] + throughput_factor = throughput / estimated_throughput + stable_throughput_factor = stable_throughput / estimated_throughput + + # TODO: Take throughput or stable_throughput? + throughput_pass = throughput_factor > self.params["fifo_throughput_factor_threshold"] + + log["throughput_pass"] = throughput_pass + log["throughput"] = throughput + log["stable_throughput"] = stable_throughput + log["estimated_throughput"] = estimated_throughput + + # reduce individual FIFO sizes by some amount and observe throughput drop or deadlock appear + fifo_reduction_pass = [] + log["fifo_reduction_results"] = {} + model_orig = ModelWrapper(build_dir + "/intermediate_models/step_hw_ipgen.onnx") + for node_orig in model_orig.get_nodes_by_op_type("StreamingFIFO_rtl"): + model = copy.deepcopy(model_orig) + node = model.get_node_from_name(node_orig.name) + node_inst = getCustomOp(node) + + # skip shallow FIFOs + # TODO: do we need to consider rounding-up of FIFO depths for impl_style=vivado? 
+ if node_inst.get_nodeattr("depth") <= self.params["fifo_reduction_skip_threshold"]: + log["fifo_reduction_results"][node.name] = "skip" + continue + + # reduce depth of current FIFO and reset generated code + node_inst.set_nodeattr("depth", int(node_inst.get_nodeattr("depth") * self.params["fifo_reduction_factor"])) + node_inst.set_nodeattr("code_gen_dir_ipgen", "") + node_inst.set_nodeattr("ip_path", "") + node_inst.set_nodeattr("ipgen_path", "") + + # save model variation + tmp_output_dir_var = build_dir + "/variations/" + node.name + os.makedirs(tmp_output_dir_var) + model.save(tmp_output_dir_var + "/model.onnx") + + # build again, only re-run necessary steps to save time + cfg.output_dir = tmp_output_dir_var + cfg.steps = ["step_hw_codegen", "step_create_stitched_ip", "step_measure_rtlsim_performance"] + build.build_dataflow_cfg(tmp_output_dir_var + "/model.onnx", cfg) + + # load performance report + with open(tmp_output_dir_var + "/report/rtlsim_performance.json") as f: + sim_data = json.load(f) + + # check for deadlock + model_final = ModelWrapper(tmp_output_dir_var + "/intermediate_models/step_create_stitched_ip.onnx") + first_node = getCustomOp(model_final.find_consumer(model_final.graph.input[0].name)) + last_node = getCustomOp(model_final.find_producer(model_final.graph.output[0].name)) + input_txns_expected = np.prod(first_node.get_folded_input_shape()[:-1]) * self.params["fifo_rtlsim_n"] + output_txns_expected = np.prod(last_node.get_folded_output_shape()[:-1]) * self.params["fifo_rtlsim_n"] + var_deadlock = sim_data["N_IN_TXNS"] != input_txns_expected or sim_data["N_OUT_TXNS"] != output_txns_expected + + # check rtlsim throughput + var_throughput = sim_data["throughput[images/s]"] + var_stable_throughput = sim_data["stable_throughput[images/s]"] + # TODO: take throughput or stable_throughput? 
+ throughput_drop = (throughput - var_throughput) / throughput + + if var_deadlock: + fifo_reduction_pass.append(True) + log["fifo_reduction_results"][node.name] = 1.0 + elif throughput_drop > self.params["fifo_reduction_throughput_drop_threshold"]: + fifo_reduction_pass.append(True) + log["fifo_reduction_results"][node.name] = throughput_drop + else: + fifo_reduction_pass.append(False) + log["fifo_reduction_results"][node.name] = "fail (no drop)" - # CHECK FOR VERIFICATION STEP SUCCESS - # Collect all verification output filenames - outputs = glob.glob(os.path.join(build_dir, "verification_output/*.npy")) - # Extract the verification status for each verification output by matching - # to the SUCCESS string contained in the filename - status = all([ - out.split("_")[-1].split(".")[0] == "SUCCESS" for out in outputs - ]) - - # Construct a dictionary reporting the verification status as string - self.output_dict["builder_verification"] = {"verification": {True: "success", False: "fail"}[status]} - # TODO: mark job as failed if verification fails - - # PARSE LOGS - report_path = os.path.join(build_dir, "harness/post_synth_resources.json") - # TODO: check multiple possible sources for this log (e.g. 
if OOC synth or Zynbuild was run) - report_filter = "(top)" - # Open the report file - with open(report_path) as file: - # Load the JSON formatted report - report = pd.read_json(file, orient="index") - # Filter the reported rows according to some regex filter rule - report = report.filter(regex=report_filter, axis="rows") - # Generate a summary of the total resources - summary = report.sum() - - #TODO: parse finn estimates, hls estimates, step times, (rtlsim n=1, n=100) - #TODO: add vivado latency simulation for special transformer case - - self.output_dict["builder"] = summary.to_dict() + self.output_dict["fifos"]["fifotest"] = log def steps_simple_model_flow(self): # Default step sequence for benchmarking a simple model (mostly single operators/custom_ops) @@ -898,6 +1022,7 @@ def steps_simple_model_flow(self): def steps_full_build_flow(self): # Default step sequence for benchmarking a full FINN builder flow + ### SETUP ### # Use a temporary dir for buildflow-related files (next to FINN_BUILD_DIR) # Ensure it exists but is empty (clear potential artifacts from previous runs) tmp_buildflow_dir = os.path.join(os.environ["PATH_WORKDIR"], "buildflow") @@ -907,6 +1032,7 @@ def steps_full_build_flow(self): os.makedirs(self.build_inputs["build_dir"], exist_ok=True) self.local_artifacts_collection.append(("build_output", self.build_inputs["build_dir"])) + ### MODEL CREATION/IMPORT ### if "model_dir" in self.params: # input ONNX model and verification input/output pairs are provided model_dir = self.params["model_dir"] @@ -928,6 +1054,22 @@ def steps_full_build_flow(self): if "floorplan_path" in self.params: self.build_inputs["floorplan_path"] = self.params["floorplan_path"] - self.step_build() + ### BUILD SETUP ### + cfg = self.step_build_setup() + cfg.board = self.board + if "folding_path" in self.build_inputs: + cfg.folding_config_file = self.build_inputs["folding_path"] + if "specialize_path" in self.build_inputs: + cfg.specialize_layers_config_file = 
self.build_inputs["specialize_path"] + if "floorplan_path" in self.build_inputs: + cfg.floorplan_path = self.build_inputs["floorplan_path"] + ### BUILD ### + build.build_dataflow_cfg(self.build_inputs["onnx_path"], cfg) + + ### ANALYSIS ### self.step_parse_builder_output(self.build_inputs["build_dir"]) + + # Only run in-depth FIFO test if selected + if "fifo_rtlsim_n" in self.params: + self.step_fifotest(self.build_inputs["onnx_path"], cfg, self.build_inputs["build_dir"]) diff --git a/benchmarking/cfg/fifosizing_test.json b/benchmarking/cfg/fifosizing_test.json index 890f4c5b66..519b7fe430 100644 --- a/benchmarking/cfg/fifosizing_test.json +++ b/benchmarking/cfg/fifosizing_test.json @@ -1,5 +1,6 @@ [ { + "dut": "synthetic_nonlinear", "dim": [32], "kernel_size": [5], "ch": [4], @@ -12,8 +13,8 @@ "strategy": ["analytical", "rtlsim"], - "rtlsim_n": [10], - "throughput_factor_threshold": [0.9], + "fifo_rtlsim_n": [10], + "fifo_throughput_factor_threshold": [0.9], "fifo_reduction_skip_threshold": [64], "fifo_reduction_factor": [0.5], "fifo_reduction_throughput_drop_threshold": [0.01] diff --git a/benchmarking/cfg/metafi_fifosizing_test.json b/benchmarking/cfg/metafi_fifosizing_test.json index 2a3aa895ab..7540949eaf 100644 --- a/benchmarking/cfg/metafi_fifosizing_test.json +++ b/benchmarking/cfg/metafi_fifosizing_test.json @@ -1,15 +1,14 @@ [ { + "dut": "metafi", "model_path": ["/pc2/groups/hpc-prf-ekiapp/felix/ci_models/metafi/model.onnx"], "folding_path": ["/pc2/groups/hpc-prf-ekiapp/felix/ci_models/metafi/auto_folding_config _metaFi_f25.json"], "board": ["RFSoC2x2"], "clock_period_ns": [10], - "strategy": ["analytical"], - - "rtlsim_n": [10], - "throughput_factor_threshold": [0.9], + "fifo_rtlsim_n": [10], + "fifo_throughput_factor_threshold": [0.9], "fifo_reduction_skip_threshold": [1024], "fifo_reduction_factor": [0.5], "fifo_reduction_throughput_drop_threshold": [0.01] diff --git a/benchmarking/cfg/metafi_test.json b/benchmarking/cfg/metafi_test.json new file mode 
100644 index 0000000000..63a26d0dbc --- /dev/null +++ b/benchmarking/cfg/metafi_test.json @@ -0,0 +1,10 @@ +[ + { + "dut": "metafi", + "model_path": ["/pc2/groups/hpc-prf-ekiapp/felix/ci_models/metafi/model.onnx"], + "folding_path": ["/pc2/groups/hpc-prf-ekiapp/felix/ci_models/metafi/auto_folding_config _metaFi_f25.json"], + + "board": ["RFSoC2x2"], + "clock_period_ns": [10] + } + ] \ No newline at end of file diff --git a/benchmarking/cfg/mvau_test.json b/benchmarking/cfg/mvau_test.json index 0c3abdb574..e9fc3358b5 100644 --- a/benchmarking/cfg/mvau_test.json +++ b/benchmarking/cfg/mvau_test.json @@ -1,5 +1,6 @@ [ { + "dut": ["mvau"], "idt": ["INT4","INT2"], "wdt": ["INT4"], "act": ["INT4"], diff --git a/benchmarking/cfg/resnet50_fifosizing_test.json b/benchmarking/cfg/resnet50_fifosizing_test.json index fbb0075dae..9ded5630f0 100644 --- a/benchmarking/cfg/resnet50_fifosizing_test.json +++ b/benchmarking/cfg/resnet50_fifosizing_test.json @@ -1,5 +1,7 @@ [ { + "dut": "resnet50", + "model_path": ["/pc2/groups/hpc-prf-ekiapp/felix/ci_models/ResNet-50/resnet50_w1a2_exported.onnx"], "folding_path": ["/pc2/groups/hpc-prf-ekiapp/felix/ci_models/ResNet-50/U250_folding_config.json"], "specialize_path": ["/pc2/groups/hpc-prf-ekiapp/felix/ci_models/ResNet-50/U250_specialize_layers.json"], @@ -8,10 +10,8 @@ "board": ["U280"], "clock_period_ns": [4], - "strategy": ["analytical"], - - "rtlsim_n": [2], - "throughput_factor_threshold": [0.9], + "fifo_rtlsim_n": [2], + "fifo_throughput_factor_threshold": [0.9], "fifo_reduction_skip_threshold": [1024], "fifo_reduction_factor": [0.5], "fifo_reduction_throughput_drop_threshold": [0.01] diff --git a/benchmarking/cfg/resnet50_test.json b/benchmarking/cfg/resnet50_test.json new file mode 100644 index 0000000000..bb9a65873e --- /dev/null +++ b/benchmarking/cfg/resnet50_test.json @@ -0,0 +1,13 @@ +[ + { + "dut": "resnet50", + + "model_path": ["/pc2/groups/hpc-prf-ekiapp/felix/ci_models/ResNet-50/resnet50_w1a2_exported.onnx"], + 
"folding_path": ["/pc2/groups/hpc-prf-ekiapp/felix/ci_models/ResNet-50/U250_folding_config.json"], + "specialize_path": ["/pc2/groups/hpc-prf-ekiapp/felix/ci_models/ResNet-50/U250_specialize_layers.json"], + "floorplan_path": ["/pc2/groups/hpc-prf-ekiapp/felix/ci_models/ResNet-50/floorplan_resnet50.json"], + + "board": ["U280"], + "clock_period_ns": [4] + } + ] \ No newline at end of file diff --git a/benchmarking/cfg/transformer_gpt_all.json b/benchmarking/cfg/transformer_gpt_all.json index 27c426606e..fd228710f1 100644 --- a/benchmarking/cfg/transformer_gpt_all.json +++ b/benchmarking/cfg/transformer_gpt_all.json @@ -1,20 +1,24 @@ [ { + "dut": "transformer", "seed": [12], "model_dir": ["/pc2/groups/hpc-prf-ekiapp/felix/ci_models/gpt_a"], "dut_duplication": [1] }, { + "dut": "transformer", "seed": [12], "model_dir": ["/pc2/groups/hpc-prf-ekiapp/felix/ci_models/gpt_b"], "dut_duplication": [1] }, { + "dut": "transformer", "seed": [12], "model_dir": ["/pc2/groups/hpc-prf-ekiapp/felix/ci_models/gpt_c"], "dut_duplication": [1] }, { + "dut": "transformer", "seed": [12], "model_dir": ["/pc2/groups/hpc-prf-ekiapp/felix/ci_models/gpt_d"], "dut_duplication": [1] diff --git a/benchmarking/cfg/transformer_radioml_all.json b/benchmarking/cfg/transformer_radioml_all.json index f2000fb9c3..207839f5d5 100644 --- a/benchmarking/cfg/transformer_radioml_all.json +++ b/benchmarking/cfg/transformer_radioml_all.json @@ -1,10 +1,12 @@ [ { + "dut": "transformer", "seed": [12], "model_dir": ["/pc2/groups/hpc-prf-ekiapp/felix/ci_models/radioml_0"], "dut_duplication": [1] }, { + "dut": "transformer", "seed": [12], "model_dir": ["/pc2/groups/hpc-prf-ekiapp/felix/ci_models/radioml_convformer"], "dut_duplication": [1] diff --git a/benchmarking/cfg/transformer_sweep.json b/benchmarking/cfg/transformer_sweep.json index d10c4d94ca..d30df90b87 100644 --- a/benchmarking/cfg/transformer_sweep.json +++ b/benchmarking/cfg/transformer_sweep.json @@ -1,5 +1,6 @@ [ { + "dut": "transformer", "seed": [12], 
"calibration_passes": [32], @@ -18,6 +19,7 @@ "dut_duplication": [1] }, { + "dut": "transformer", "seed": [12], "calibration_passes": [32], @@ -36,6 +38,7 @@ "dut_duplication": [1] }, { + "dut": "transformer", "seed": [12], "calibration_passes": [32], @@ -54,6 +57,7 @@ "dut_duplication": [1] }, { + "dut": "transformer", "seed": [12], "calibration_passes": [32], @@ -72,6 +76,7 @@ "dut_duplication": [1] }, { + "dut": "transformer", "seed": [12], "calibration_passes": [32], diff --git a/benchmarking/cfg/transformer_test.json b/benchmarking/cfg/transformer_test.json index 784d96f93d..d7346e6068 100644 --- a/benchmarking/cfg/transformer_test.json +++ b/benchmarking/cfg/transformer_test.json @@ -1,5 +1,6 @@ [ { + "dut": "transformer", "seed": [12], "calibration_passes": [32], diff --git a/benchmarking/dut/metafi.py b/benchmarking/dut/metafi.py new file mode 100644 index 0000000000..94bb4b068c --- /dev/null +++ b/benchmarking/dut/metafi.py @@ -0,0 +1,83 @@ +import finn.builder.build_dataflow_config as build_cfg + +from bench_base import bench + +# # custom steps +# from custom_steps import ( +# step_extract_absorb_bias, +# step_pre_streamline, +# step_residual_convert_to_hw, +# step_residual_streamline, +# step_residual_tidy, +# step_residual_topo, +# step_set_preferred_impl_style, +# step_convert_final_layers +# ) + +class bench_metafi(bench): + def step_build_setup(self): + # create build config for MetaFi models + + steps = [ + # step_residual_tidy, + # step_extract_absorb_bias, + # step_residual_topo, + # step_pre_streamline, + # step_residual_streamline, + # step_residual_convert_to_hw, + "step_create_dataflow_partition", + # step_set_preferred_impl_style, + "step_specialize_layers", + "step_target_fps_parallelization", + "step_apply_folding_config", + "step_minimize_bit_width", + "step_generate_estimate_reports", + "step_set_fifo_depths", + "step_hw_codegen", + "step_hw_ipgen", + "step_create_stitched_ip", + "step_measure_rtlsim_performance", + 
"step_out_of_context_synthesis", + "step_synthesize_bitfile", + "step_make_pynq_driver", + "step_deployment_package", + ] + + cfg = build_cfg.DataflowBuildConfig( + output_dir = self.build_inputs["build_dir"], + synth_clk_period_ns = self.clock_period_ns, + steps=steps, + verbose=False, + target_fps=None, #23 + shell_flow_type=build_cfg.ShellFlowType.VIVADO_ZYNQ, # TODO: generalize/adapt to new back-end + #vitis_platform=vitis_platform, + + auto_fifo_depths=False, + split_large_fifos=False, # probably needed #TODO: account for this in FIFO reduction test + + # general rtlsim settings + force_python_rtlsim=False, + rtlsim_batch_size=self.params["rtlsim_n"], + + # folding_config_file=folding_config_file, + # folding_config_file="/home/rz/project/finn-examples/build/vgg10-radioml/folding_config/auto_folding_config.json", + # specialize_layers_config_file = "output_%s_%s" % (model_name, release_platform_name) + "/template_specialize_layers_config.json", + # specialize_layers_config_file = "/home/rz/project/finn-examples/build/vgg10-radioml/specialize_layers_config/template_specialize_layers_config.json", + auto_fifo_strategy="characterize", + characteristic_function_strategy=self.params["strategy"], + #large_fifo_mem_style=build_cfg.LargeFIFOMemStyle.AUTO, + # standalone_thresholds=True, + # enable extra performance optimizations (physopt) + vitis_opt_strategy=build_cfg.VitisOptStrategyCfg.PERFORMANCE_BEST, + generate_outputs=[ + build_cfg.DataflowOutputType.ESTIMATE_REPORTS, + build_cfg.DataflowOutputType.STITCHED_IP, + build_cfg.DataflowOutputType.RTLSIM_PERFORMANCE, + build_cfg.DataflowOutputType.OOC_SYNTH, # not required for FIFO test, include for general testing + ], + ) + + # where is this used and why? 
+ cfg.use_conv_rtl = True, # use rtl for conv layers (MVAU cannot use rtl in our model) + + return cfg \ No newline at end of file diff --git a/benchmarking/dut/resnet50.py b/benchmarking/dut/resnet50.py new file mode 100644 index 0000000000..701f7f65e2 --- /dev/null +++ b/benchmarking/dut/resnet50.py @@ -0,0 +1,57 @@ +import finn.builder.build_dataflow_config as build_cfg +from finn.util.basic import alveo_default_platform + +from dut.resnet50_custom_steps import ( + step_resnet50_tidy, + step_resnet50_streamline, + step_resnet50_convert_to_hw, + step_resnet50_slr_floorplan, + ) + +from bench_base import bench + +class bench_resnet50(bench): + def step_build_setup(self): + # create build config for ResNet-50 (based on finn-examples) + + resnet50_build_steps = [ + step_resnet50_tidy, + step_resnet50_streamline, + step_resnet50_convert_to_hw, + "step_create_dataflow_partition", + "step_specialize_layers", + "step_apply_folding_config", + "step_minimize_bit_width", + "step_generate_estimate_reports", + "step_set_fifo_depths", + "step_hw_codegen", + "step_hw_ipgen", + step_resnet50_slr_floorplan, + "step_create_stitched_ip", # was not in finn-examples + "step_measure_rtlsim_performance", # was not in finn-examples + "step_out_of_context_synthesis", # was not in finn-examples + "step_synthesize_bitfile", + "step_make_pynq_driver", + "step_deployment_package", + ] + + cfg = build_cfg.DataflowBuildConfig( + output_dir = self.build_inputs["build_dir"], + synth_clk_period_ns = self.clock_period_ns, + steps=resnet50_build_steps, + shell_flow_type=build_cfg.ShellFlowType.VITIS_ALVEO, # TODO: generalize/adapt to new back-end + auto_fifo_depths=False, + split_large_fifos=True, + vitis_platform=alveo_default_platform[self.board], # TODO: generalize/adapt to new back-end + + # enable extra performance optimizations (physopt) + vitis_opt_strategy=build_cfg.VitisOptStrategyCfg.PERFORMANCE_BEST, + generate_outputs=[ + build_cfg.DataflowOutputType.ESTIMATE_REPORTS, + 
build_cfg.DataflowOutputType.STITCHED_IP, + build_cfg.DataflowOutputType.RTLSIM_PERFORMANCE, + build_cfg.DataflowOutputType.OOC_SYNTH, # not required for FIFO test, include for general testing + ], + ) + + return cfg \ No newline at end of file diff --git a/benchmarking/dut/fifosizing.py b/benchmarking/dut/synthetic_nonlinear.py similarity index 50% rename from benchmarking/dut/fifosizing.py rename to benchmarking/dut/synthetic_nonlinear.py index 46b972deb0..3193432798 100644 --- a/benchmarking/dut/fifosizing.py +++ b/benchmarking/dut/synthetic_nonlinear.py @@ -28,12 +28,7 @@ from finn.util.test import get_trained_network_and_ishape from finn.util.basic import alveo_default_platform -from dut.resnet50_custom_steps import ( - step_resnet50_tidy, - step_resnet50_streamline, - step_resnet50_convert_to_hw, - step_resnet50_slr_floorplan, - ) + from bench_base import bench @@ -251,13 +246,11 @@ def combine_blocks(lb, rb, ifm_dim, ch, pe): model = model.transform(GiveReadableTensorNames()) return model -class bench_fifosizing(bench): +class bench_synthetic_nonlinear(bench): def step_export_onnx(self, onnx_export_path): np.random.seed(0) tmp_output_dir = make_build_dir("test_fifosizing") - #TODO: generalize FIFO test so it can be used by other FIFO-related unit tests - # or make into a build flow output product "fifo_report" #TODO: allow manual folding/fifo config as input #TODO: is a scenario possible where reducing depth of a single FIFO at a time is not sufficient for testing tightness? 
@@ -318,259 +311,7 @@ def step_build_setup(self): ) return cfg - - def step_fifotest(self, onnx_path, cfg, build_dir): - log = {} - build.build_dataflow_cfg(onnx_path, cfg) - - # load performance reports - with open(build_dir + "/report/estimate_network_performance.json") as f: - est_data = json.load(f) - with open(build_dir + "/report/rtlsim_performance.json") as f: - sim_data = json.load(f) - - # check for deadlock - model_final = ModelWrapper(build_dir + "/intermediate_models/step_create_stitched_ip.onnx") - first_node = getCustomOp(model_final.find_consumer(model_final.graph.input[0].name)) - last_node = getCustomOp(model_final.find_producer(model_final.graph.output[0].name)) - input_txns_expected = np.prod(first_node.get_folded_input_shape()[:-1]) * self.params["rtlsim_n"] - output_txns_expected = np.prod(last_node.get_folded_output_shape()[:-1]) * self.params["rtlsim_n"] - deadlock = sim_data["N_IN_TXNS"] != input_txns_expected or sim_data["N_OUT_TXNS"] != output_txns_expected - log["deadlock"] = deadlock.tolist() - - # check rtlsim throughput - throughput = sim_data["throughput[images/s]"] - stable_throughput = sim_data["stable_throughput[images/s]"] - estimated_throughput = est_data["estimated_throughput_fps"] - throughput_factor = throughput / estimated_throughput - stable_throughput_factor = stable_throughput / estimated_throughput - - # TODO: Take throughput or stable_throughput? 
- throughput_pass = throughput_factor > self.params["throughput_factor_threshold"] - - log["throughput_pass"] = throughput_pass - log["throughput"] = throughput - log["stable_throughput"] = stable_throughput - log["estimated_throughput"] = estimated_throughput - - # log FIFO sizes for easier inspection - log["fifo_depths"] = {} - log["fifo_sizes"] = {} - total_fifo_size = 0 - for node in model_final.get_nodes_by_op_type("StreamingFIFO_rtl"): - node_inst = getCustomOp(node) - log["fifo_depths"][node.name] = node_inst.get_nodeattr("depth") - log["fifo_sizes"][node.name] = node_inst.get_instream_width() * node_inst.get_nodeattr("depth") - total_fifo_size += log["fifo_sizes"][node.name] - log["total_fifo_size_kB"] = int(total_fifo_size / 8.0 / 1000.0) - - # reduce individual FIFO sizes by some amount and observe throughput drop or deadlock appear - fifo_reduction_pass = [] - log["fifo_reduction_results"] = {} - model_orig = ModelWrapper(build_dir + "/intermediate_models/step_hw_ipgen.onnx") - for node_orig in model_orig.get_nodes_by_op_type("StreamingFIFO_rtl"): - model = copy.deepcopy(model_orig) - node = model.get_node_from_name(node_orig.name) - node_inst = getCustomOp(node) - - # skip shallow FIFOs - # TODO: do we need to consider rounding-up of FIFO depths for impl_style=vivado? 
- if node_inst.get_nodeattr("depth") <= self.params["fifo_reduction_skip_threshold"]: - log["fifo_reduction_results"][node.name] = "skip" - continue - - # reduce depth of current FIFO and reset generated code - node_inst.set_nodeattr("depth", int(node_inst.get_nodeattr("depth") * self.params["fifo_reduction_factor"])) - node_inst.set_nodeattr("code_gen_dir_ipgen", "") - node_inst.set_nodeattr("ip_path", "") - node_inst.set_nodeattr("ipgen_path", "") - - # save model variation - tmp_output_dir_var = build_dir + "/variations/" + node.name - os.makedirs(tmp_output_dir_var) - model.save(tmp_output_dir_var + "/model.onnx") - - # build again, only re-run necessary steps to save time - cfg.output_dir = tmp_output_dir_var - cfg.steps = ["step_hw_codegen", "step_create_stitched_ip", "step_measure_rtlsim_performance"] - build.build_dataflow_cfg(tmp_output_dir_var + "/model.onnx", cfg) - - # load performance report - with open(tmp_output_dir_var + "/report/rtlsim_performance.json") as f: - sim_data = json.load(f) - - # check for deadlock - model_final = ModelWrapper(tmp_output_dir_var + "/intermediate_models/step_create_stitched_ip.onnx") - first_node = getCustomOp(model_final.find_consumer(model_final.graph.input[0].name)) - last_node = getCustomOp(model_final.find_producer(model_final.graph.output[0].name)) - input_txns_expected = np.prod(first_node.get_folded_input_shape()[:-1]) * self.params["rtlsim_n"] - output_txns_expected = np.prod(last_node.get_folded_output_shape()[:-1]) * self.params["rtlsim_n"] - var_deadlock = sim_data["N_IN_TXNS"] != input_txns_expected or sim_data["N_OUT_TXNS"] != output_txns_expected - - # check rtlsim throughput - var_throughput = sim_data["throughput[images/s]"] - var_stable_throughput = sim_data["stable_throughput[images/s]"] - # TODO: take throughput or stable_throughput? 
- throughput_drop = (throughput - var_throughput) / throughput - - if var_deadlock: - fifo_reduction_pass.append(True) - log["fifo_reduction_results"][node.name] = 1.0 - elif throughput_drop > self.params["fifo_reduction_throughput_drop_threshold"]: - fifo_reduction_pass.append(True) - log["fifo_reduction_results"][node.name] = throughput_drop - else: - fifo_reduction_pass.append(False) - log["fifo_reduction_results"][node.name] = "fail (no drop)" - - self.output_dict["fifosizing_testresults"] = log - - def step_build(self): - # TODO: rename steps to model three phases: model creation/import, dataflow build, analysis - # dataflow build should be easily swappable and adpaptable to finn-examples - cfg = self.step_build_setup() - cfg.board = self.board - if "folding_path" in self.build_inputs: - cfg.folding_config_file = self.build_inputs["folding_path"] - if "specialize_path" in self.build_inputs: - cfg.specialize_layers_config_file = self.build_inputs["specialize_path"] - self.step_fifotest(self.build_inputs["onnx_path"], cfg, self.build_inputs["build_dir"]) def step_parse_builder_output(self, build_dir): # build output itself is not relevant here (yet) pass - - def run(self): - self.steps_full_build_flow() - - -# # custom steps -# from custom_steps import ( -# step_extract_absorb_bias, -# step_pre_streamline, -# step_residual_convert_to_hw, -# step_residual_streamline, -# step_residual_tidy, -# step_residual_topo, -# step_set_preferred_impl_style, -# step_convert_final_layers -# ) - -# TODO: put these definitions into separate files/classes so we can use them for other types of benchmaks as well -class bench_metafi_fifosizing(bench_fifosizing): - def step_build_setup(self): - # create build config for MetaFi models - - steps = [ - # step_residual_tidy, - # step_extract_absorb_bias, - # step_residual_topo, - # step_pre_streamline, - # step_residual_streamline, - # step_residual_convert_to_hw, - "step_create_dataflow_partition", - # step_set_preferred_impl_style, - 
"step_specialize_layers", - "step_target_fps_parallelization", - "step_apply_folding_config", - "step_minimize_bit_width", - "step_generate_estimate_reports", - "step_set_fifo_depths", - "step_hw_codegen", - "step_hw_ipgen", - "step_create_stitched_ip", - "step_measure_rtlsim_performance", - "step_out_of_context_synthesis", - "step_synthesize_bitfile", - "step_make_pynq_driver", - "step_deployment_package", - ] - - cfg = build_cfg.DataflowBuildConfig( - output_dir = self.build_inputs["build_dir"], - synth_clk_period_ns = self.clock_period_ns, - steps=steps, - verbose=False, - target_fps=None, #23 - shell_flow_type=build_cfg.ShellFlowType.VIVADO_ZYNQ, # TODO: generalize/adapt to new back-end - #vitis_platform=vitis_platform, - - auto_fifo_depths=False, - split_large_fifos=False, # probably needed #TODO: account for this in FIFO reduction test - - # general rtlsim settings - force_python_rtlsim=False, - rtlsim_batch_size=self.params["rtlsim_n"], - - # folding_config_file=folding_config_file, - # folding_config_file="/home/rz/project/finn-examples/build/vgg10-radioml/folding_config/auto_folding_config.json", - # specialize_layers_config_file = "output_%s_%s" % (model_name, release_platform_name) + "/template_specialize_layers_config.json", - # specialize_layers_config_file = "/home/rz/project/finn-examples/build/vgg10-radioml/specialize_layers_config/template_specialize_layers_config.json", - auto_fifo_strategy="characterize", - characteristic_function_strategy=self.params["strategy"], - #large_fifo_mem_style=build_cfg.LargeFIFOMemStyle.AUTO, - # standalone_thresholds=True, - # enable extra performance optimizations (physopt) - vitis_opt_strategy=build_cfg.VitisOptStrategyCfg.PERFORMANCE_BEST, - generate_outputs=[ - build_cfg.DataflowOutputType.ESTIMATE_REPORTS, - build_cfg.DataflowOutputType.STITCHED_IP, - build_cfg.DataflowOutputType.RTLSIM_PERFORMANCE, - build_cfg.DataflowOutputType.OOC_SYNTH, # not required for FIFO test, include for general testing - ], - ) - - # 
where is this used and why? - cfg.use_conv_rtl = True, # use rtl for conv layers (MVAU cannot use rtl in our model) - - return cfg - - -class bench_resnet50_fifosizing(bench_fifosizing): - def step_build_setup(self): - # create build config for ResNet-50 (based on finn-examples) - - resnet50_build_steps = [ - step_resnet50_tidy, - step_resnet50_streamline, - step_resnet50_convert_to_hw, - "step_create_dataflow_partition", - "step_specialize_layers", - "step_apply_folding_config", - "step_minimize_bit_width", - "step_generate_estimate_reports", - "step_set_fifo_depths", - "step_hw_codegen", - "step_hw_ipgen", - step_resnet50_slr_floorplan, - "step_create_stitched_ip", # was not in finn-examples - "step_measure_rtlsim_performance", # was not in finn-examples - "step_out_of_context_synthesis", # was not in finn-examples - "step_synthesize_bitfile", - "step_make_pynq_driver", - "step_deployment_package", - ] - - cfg = build_cfg.DataflowBuildConfig( - output_dir = self.build_inputs["build_dir"], - synth_clk_period_ns = self.clock_period_ns, - steps=resnet50_build_steps, - shell_flow_type=build_cfg.ShellFlowType.VITIS_ALVEO, # TODO: generalize/adapt to new back-end - auto_fifo_depths=False, - split_large_fifos=True, - vitis_platform=alveo_default_platform[self.board], # TODO: generalize/adapt to new back-end - - # enable extra performance optimizations (physopt) - vitis_opt_strategy=build_cfg.VitisOptStrategyCfg.PERFORMANCE_BEST, - generate_outputs=[ - build_cfg.DataflowOutputType.ESTIMATE_REPORTS, - build_cfg.DataflowOutputType.STITCHED_IP, - build_cfg.DataflowOutputType.RTLSIM_PERFORMANCE, - build_cfg.DataflowOutputType.OOC_SYNTH, # not required for FIFO test, include for general testing - ], - ) - - # non-standard build parameter for custom step - cfg.floorplan_path = self.build_inputs["floorplan_path"] - - return cfg \ No newline at end of file diff --git a/benchmarking/dut/transformer.py b/benchmarking/dut/transformer.py index ed9991100b..305cac8188 100644 --- 
a/benchmarking/dut/transformer.py +++ b/benchmarking/dut/transformer.py @@ -858,7 +858,7 @@ def step_export_onnx(self, output_onnx_path): opset_version = 14, do_constant_folding = True) - def step_build(self): + def step_build_setup(self): #with open("params.yaml") as file: # params = yaml.safe_load(file) # Seed all RNGs @@ -910,7 +910,6 @@ def step_build(self): output_dir = self.build_inputs["build_dir"], stitched_ip_gen_dcp = False, # only needed for further manual integration synth_clk_period_ns = self.clock_period_ns, - board = self.board, shell_flow_type = shell_flow, folding_config_file = "folding.yaml", specialize_layers_config_file = "specialize_layers.json", @@ -928,7 +927,7 @@ def step_build(self): #build_cfg.DataflowOutputType.PYNQ_DRIVER, #TODO: currently broken (assert i_consumer.op_type == "StreamingDataflowPartition"), might be useful for functional verification on hw later #build_cfg.DataflowOutputType.OOC_SYNTH, # requires stitched-ip, not needed because ZynqBuild/HarnessBuild is performed #build_cfg.DataflowOutputType.BITFILE, # does not require stitched-ip, not needed because HarnessBuild is performed - #build_cfg.DataflowOutputType.RTLSIM_PERFORMANCE, # not possible due to float components + #build_cfg.DataflowOutputType.RTLSIM_PERFORMANCE, # not possible due to float components TODO: try with pyXSI #build_cfg.DataflowOutputType.DEPLOYMENT_PACKAGE # not needed, just a copy operation ], @@ -1041,25 +1040,23 @@ def step_build(self): #test_step_build_platform # synthesis with instr wrapper ] ) - # Run the build process on the dummy attention operator graph - # TODO: maybe let this function return the cfg only, so it can be modified by bench context - build.build_dataflow_cfg(self.build_inputs["onnx_path"], cfg) - - def run(self): - self.steps_full_build_flow() - - # DEBUG code for live logging of long instr wrapper simulation: - # live_log_dir_path = os.path.join(self.save_dir, "vivado_sim_log", "run_%d" % (self.run_id), "vivado.log") - # 
os.makedirs(os.path.join(self.save_dir, "vivado_sim_log", "run_%d" % (self.run_id)), exist_ok=True) - # sim_output_dir = build_dir + "/instrwrap_sim" - # # Prepare bash script - # bash_script = os.getcwd() + "/run_vivado_sim.sh" - # with open(bash_script, "w") as script: - # script.write("#!/bin/bash\n") - # script.write("cd %s\n"%(sim_output_dir)) - # script.write("vivado -mode batch -source make_instrwrap_sim_proj.tcl &> %s\n"%(live_log_dir_path)) - # # Run script - # print("Running Vivado simulation of instrumentation wrapper") - # sub_proc = subprocess.Popen(["bash", bash_script]) - # sub_proc.communicate() - ####### + + return cfg + + #def run(self): + # self.steps_full_build_flow() + # DEBUG code for live logging of long instr wrapper simulation: + # live_log_dir_path = os.path.join(self.save_dir, "vivado_sim_log", "run_%d" % (self.run_id), "vivado.log") + # os.makedirs(os.path.join(self.save_dir, "vivado_sim_log", "run_%d" % (self.run_id)), exist_ok=True) + # sim_output_dir = build_dir + "/instrwrap_sim" + # # Prepare bash script + # bash_script = os.getcwd() + "/run_vivado_sim.sh" + # with open(bash_script, "w") as script: + # script.write("#!/bin/bash\n") + # script.write("cd %s\n"%(sim_output_dir)) + # script.write("vivado -mode batch -source make_instrwrap_sim_proj.tcl &> %s\n"%(live_log_dir_path)) + # # Run script + # print("Running Vivado simulation of instrumentation wrapper") + # sub_proc = subprocess.Popen(["bash", bash_script]) + # sub_proc.communicate() + ####### From f6d196b69249d405fac5e2003c68ead216c42139 Mon Sep 17 00:00:00 2001 From: Felix Jentzsch Date: Thu, 30 Jan 2025 15:36:31 +0000 Subject: [PATCH 008/125] Fix bench class lookup --- benchmarking/bench.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/benchmarking/bench.py b/benchmarking/bench.py index f3a4c0f424..7e38a2f0c8 100644 --- a/benchmarking/bench.py +++ b/benchmarking/bench.py @@ -139,8 +139,8 @@ def get_default_session_options_new(): # Create bench object 
for respective DUT if "dut" in params: - if params.dut in dut: - bench_object = dut[params.dut](params, task_id, run_id, artifacts_dir, save_dir) + if params["dut"] in dut: + bench_object = dut[params["dut"]](params, task_id, run_id, artifacts_dir, save_dir) else: print("ERROR: unknown DUT specified") return 1 From c6ae70fa5f929f0176befb0c919fd6fdc5e7bf0b Mon Sep 17 00:00:00 2001 From: Felix Jentzsch Date: Thu, 30 Jan 2025 15:54:50 +0000 Subject: [PATCH 009/125] Fix cfgs --- benchmarking/cfg/fifosizing_test.json | 2 +- benchmarking/cfg/metafi_fifosizing_test.json | 2 +- benchmarking/cfg/metafi_test.json | 2 +- benchmarking/cfg/resnet50_fifosizing_test.json | 2 +- benchmarking/cfg/resnet50_test.json | 2 +- benchmarking/cfg/transformer_gpt_all.json | 8 ++++---- benchmarking/cfg/transformer_radioml_all.json | 4 ++-- benchmarking/cfg/transformer_sweep.json | 10 +++++----- benchmarking/cfg/transformer_test.json | 2 +- 9 files changed, 17 insertions(+), 17 deletions(-) diff --git a/benchmarking/cfg/fifosizing_test.json b/benchmarking/cfg/fifosizing_test.json index 519b7fe430..d3d4559e43 100644 --- a/benchmarking/cfg/fifosizing_test.json +++ b/benchmarking/cfg/fifosizing_test.json @@ -1,6 +1,6 @@ [ { - "dut": "synthetic_nonlinear", + "dut": ["synthetic_nonlinear"], "dim": [32], "kernel_size": [5], "ch": [4], diff --git a/benchmarking/cfg/metafi_fifosizing_test.json b/benchmarking/cfg/metafi_fifosizing_test.json index 7540949eaf..a98089d046 100644 --- a/benchmarking/cfg/metafi_fifosizing_test.json +++ b/benchmarking/cfg/metafi_fifosizing_test.json @@ -1,6 +1,6 @@ [ { - "dut": "metafi", + "dut": ["metafi"], "model_path": ["/pc2/groups/hpc-prf-ekiapp/felix/ci_models/metafi/model.onnx"], "folding_path": ["/pc2/groups/hpc-prf-ekiapp/felix/ci_models/metafi/auto_folding_config _metaFi_f25.json"], diff --git a/benchmarking/cfg/metafi_test.json b/benchmarking/cfg/metafi_test.json index 63a26d0dbc..2d382d3a61 100644 --- a/benchmarking/cfg/metafi_test.json +++ 
b/benchmarking/cfg/metafi_test.json @@ -1,6 +1,6 @@ [ { - "dut": "metafi", + "dut": ["metafi"], "model_path": ["/pc2/groups/hpc-prf-ekiapp/felix/ci_models/metafi/model.onnx"], "folding_path": ["/pc2/groups/hpc-prf-ekiapp/felix/ci_models/metafi/auto_folding_config _metaFi_f25.json"], diff --git a/benchmarking/cfg/resnet50_fifosizing_test.json b/benchmarking/cfg/resnet50_fifosizing_test.json index 9ded5630f0..82b3d36659 100644 --- a/benchmarking/cfg/resnet50_fifosizing_test.json +++ b/benchmarking/cfg/resnet50_fifosizing_test.json @@ -1,6 +1,6 @@ [ { - "dut": "resnet50", + "dut": ["resnet50"], "model_path": ["/pc2/groups/hpc-prf-ekiapp/felix/ci_models/ResNet-50/resnet50_w1a2_exported.onnx"], "folding_path": ["/pc2/groups/hpc-prf-ekiapp/felix/ci_models/ResNet-50/U250_folding_config.json"], diff --git a/benchmarking/cfg/resnet50_test.json b/benchmarking/cfg/resnet50_test.json index bb9a65873e..19c555dd9d 100644 --- a/benchmarking/cfg/resnet50_test.json +++ b/benchmarking/cfg/resnet50_test.json @@ -1,6 +1,6 @@ [ { - "dut": "resnet50", + "dut": ["resnet50"], "model_path": ["/pc2/groups/hpc-prf-ekiapp/felix/ci_models/ResNet-50/resnet50_w1a2_exported.onnx"], "folding_path": ["/pc2/groups/hpc-prf-ekiapp/felix/ci_models/ResNet-50/U250_folding_config.json"], diff --git a/benchmarking/cfg/transformer_gpt_all.json b/benchmarking/cfg/transformer_gpt_all.json index fd228710f1..4b1ee011c1 100644 --- a/benchmarking/cfg/transformer_gpt_all.json +++ b/benchmarking/cfg/transformer_gpt_all.json @@ -1,24 +1,24 @@ [ { - "dut": "transformer", + "dut": ["transformer"], "seed": [12], "model_dir": ["/pc2/groups/hpc-prf-ekiapp/felix/ci_models/gpt_a"], "dut_duplication": [1] }, { - "dut": "transformer", + "dut": ["transformer"], "seed": [12], "model_dir": ["/pc2/groups/hpc-prf-ekiapp/felix/ci_models/gpt_b"], "dut_duplication": [1] }, { - "dut": "transformer", + "dut": ["transformer"], "seed": [12], "model_dir": ["/pc2/groups/hpc-prf-ekiapp/felix/ci_models/gpt_c"], "dut_duplication": [1] }, { - 
"dut": "transformer", + "dut": ["transformer"], "seed": [12], "model_dir": ["/pc2/groups/hpc-prf-ekiapp/felix/ci_models/gpt_d"], "dut_duplication": [1] diff --git a/benchmarking/cfg/transformer_radioml_all.json b/benchmarking/cfg/transformer_radioml_all.json index 207839f5d5..f2c8733c20 100644 --- a/benchmarking/cfg/transformer_radioml_all.json +++ b/benchmarking/cfg/transformer_radioml_all.json @@ -1,12 +1,12 @@ [ { - "dut": "transformer", + "dut": ["transformer"], "seed": [12], "model_dir": ["/pc2/groups/hpc-prf-ekiapp/felix/ci_models/radioml_0"], "dut_duplication": [1] }, { - "dut": "transformer", + "dut": ["transformer"], "seed": [12], "model_dir": ["/pc2/groups/hpc-prf-ekiapp/felix/ci_models/radioml_convformer"], "dut_duplication": [1] diff --git a/benchmarking/cfg/transformer_sweep.json b/benchmarking/cfg/transformer_sweep.json index d30df90b87..e1795ff3f8 100644 --- a/benchmarking/cfg/transformer_sweep.json +++ b/benchmarking/cfg/transformer_sweep.json @@ -1,6 +1,6 @@ [ { - "dut": "transformer", + "dut": ["transformer"], "seed": [12], "calibration_passes": [32], @@ -19,7 +19,7 @@ "dut_duplication": [1] }, { - "dut": "transformer", + "dut": ["transformer"], "seed": [12], "calibration_passes": [32], @@ -38,7 +38,7 @@ "dut_duplication": [1] }, { - "dut": "transformer", + "dut": ["transformer"], "seed": [12], "calibration_passes": [32], @@ -57,7 +57,7 @@ "dut_duplication": [1] }, { - "dut": "transformer", + "dut": ["transformer"], "seed": [12], "calibration_passes": [32], @@ -76,7 +76,7 @@ "dut_duplication": [1] }, { - "dut": "transformer", + "dut": ["transformer"], "seed": [12], "calibration_passes": [32], diff --git a/benchmarking/cfg/transformer_test.json b/benchmarking/cfg/transformer_test.json index d7346e6068..a740a447b6 100644 --- a/benchmarking/cfg/transformer_test.json +++ b/benchmarking/cfg/transformer_test.json @@ -1,6 +1,6 @@ [ { - "dut": "transformer", + "dut": ["transformer"], "seed": [12], "calibration_passes": [32], From 
3d4e7a618d0dbb5413e0ba34b56f190dc033f34a Mon Sep 17 00:00:00 2001 From: Felix Jentzsch Date: Thu, 30 Jan 2025 16:16:02 +0000 Subject: [PATCH 010/125] Fix misc --- benchmarking/bench_base.py | 7 ++++++- benchmarking/cfg/metafi_test.json | 4 +++- benchmarking/cfg/resnet50_test.json | 4 +++- benchmarking/dut/metafi.py | 3 +-- benchmarking/dut/resnet50.py | 2 +- benchmarking/dut/synthetic_nonlinear.py | 1 - 6 files changed, 14 insertions(+), 7 deletions(-) diff --git a/benchmarking/bench_base.py b/benchmarking/bench_base.py index 0bd7be6907..5ed6750820 100644 --- a/benchmarking/bench_base.py +++ b/benchmarking/bench_base.py @@ -1055,8 +1055,13 @@ def steps_full_build_flow(self): self.build_inputs["floorplan_path"] = self.params["floorplan_path"] ### BUILD SETUP ### + # TODO: select output products here, depending on what shall be tested + # TODO: set as much as possible here, e.g. verbose, debug, force_python, vitisopt, shell_flow cfg = self.step_build_setup() cfg.board = self.board + if "fifo_rtlsim_n" in self.params: + # TODO: determine automatically or replace by exact instr wrapper sim + cfg.rtlsim_batch_size=self.params["fifo_rtlsim_n"] if "folding_path" in self.build_inputs: cfg.folding_config_file = self.build_inputs["folding_path"] if "specialize_path" in self.build_inputs: @@ -1071,5 +1076,5 @@ def steps_full_build_flow(self): self.step_parse_builder_output(self.build_inputs["build_dir"]) # Only run in-depth FIFO test if selected - if "fifo_rtlsim_n" in self.params: + if "fifo_throughput_factor_threshold" in self.params: self.step_fifotest(self.build_inputs["onnx_path"], cfg, self.build_inputs["build_dir"]) diff --git a/benchmarking/cfg/metafi_test.json b/benchmarking/cfg/metafi_test.json index 2d382d3a61..b0989eabca 100644 --- a/benchmarking/cfg/metafi_test.json +++ b/benchmarking/cfg/metafi_test.json @@ -5,6 +5,8 @@ "folding_path": ["/pc2/groups/hpc-prf-ekiapp/felix/ci_models/metafi/auto_folding_config _metaFi_f25.json"], "board": ["RFSoC2x2"], - 
"clock_period_ns": [10] + "clock_period_ns": [10], + + "fifo_rtlsim_n": [10] } ] \ No newline at end of file diff --git a/benchmarking/cfg/resnet50_test.json b/benchmarking/cfg/resnet50_test.json index 19c555dd9d..30131923a4 100644 --- a/benchmarking/cfg/resnet50_test.json +++ b/benchmarking/cfg/resnet50_test.json @@ -8,6 +8,8 @@ "floorplan_path": ["/pc2/groups/hpc-prf-ekiapp/felix/ci_models/ResNet-50/floorplan_resnet50.json"], "board": ["U280"], - "clock_period_ns": [4] + "clock_period_ns": [4], + + "fifo_rtlsim_n": [2] } ] \ No newline at end of file diff --git a/benchmarking/dut/metafi.py b/benchmarking/dut/metafi.py index 94bb4b068c..52e31eabee 100644 --- a/benchmarking/dut/metafi.py +++ b/benchmarking/dut/metafi.py @@ -57,7 +57,6 @@ def step_build_setup(self): # general rtlsim settings force_python_rtlsim=False, - rtlsim_batch_size=self.params["rtlsim_n"], # folding_config_file=folding_config_file, # folding_config_file="/home/rz/project/finn-examples/build/vgg10-radioml/folding_config/auto_folding_config.json", @@ -68,7 +67,7 @@ def step_build_setup(self): #large_fifo_mem_style=build_cfg.LargeFIFOMemStyle.AUTO, # standalone_thresholds=True, # enable extra performance optimizations (physopt) - vitis_opt_strategy=build_cfg.VitisOptStrategyCfg.PERFORMANCE_BEST, + vitis_opt_strategy=build_cfg.VitisOptStrategy.PERFORMANCE_BEST, generate_outputs=[ build_cfg.DataflowOutputType.ESTIMATE_REPORTS, build_cfg.DataflowOutputType.STITCHED_IP, diff --git a/benchmarking/dut/resnet50.py b/benchmarking/dut/resnet50.py index 701f7f65e2..c4f80737c0 100644 --- a/benchmarking/dut/resnet50.py +++ b/benchmarking/dut/resnet50.py @@ -45,7 +45,7 @@ def step_build_setup(self): vitis_platform=alveo_default_platform[self.board], # TODO: generalize/adapt to new back-end # enable extra performance optimizations (physopt) - vitis_opt_strategy=build_cfg.VitisOptStrategyCfg.PERFORMANCE_BEST, + vitis_opt_strategy=build_cfg.VitisOptStrategy.PERFORMANCE_BEST, generate_outputs=[ 
build_cfg.DataflowOutputType.ESTIMATE_REPORTS, build_cfg.DataflowOutputType.STITCHED_IP, diff --git a/benchmarking/dut/synthetic_nonlinear.py b/benchmarking/dut/synthetic_nonlinear.py index 3193432798..852d47012f 100644 --- a/benchmarking/dut/synthetic_nonlinear.py +++ b/benchmarking/dut/synthetic_nonlinear.py @@ -301,7 +301,6 @@ def step_build_setup(self): target_fps=None, # general rtlsim settings force_python_rtlsim=False, - rtlsim_batch_size=self.params["rtlsim_n"], shell_flow_type=build_cfg.ShellFlowType.VIVADO_ZYNQ, # TODO: generalize/adapt to new back-end generate_outputs=[ build_cfg.DataflowOutputType.ESTIMATE_REPORTS, From a5bd7ab2a9ab235425833ea2be01cc4f8c1268ad Mon Sep 17 00:00:00 2001 From: Felix Jentzsch Date: Thu, 30 Jan 2025 16:53:51 +0000 Subject: [PATCH 011/125] Unify fifosizing settings --- benchmarking/bench_base.py | 18 +++++++++++++++++- benchmarking/cfg/fifosizing_test.json | 3 ++- benchmarking/cfg/metafi_fifosizing_test.json | 2 ++ benchmarking/cfg/metafi_test.json | 2 ++ benchmarking/cfg/resnet50_fifosizing_test.json | 2 ++ benchmarking/cfg/resnet50_test.json | 2 ++ benchmarking/dut/metafi.py | 9 ++------- benchmarking/dut/resnet50.py | 1 - benchmarking/dut/synthetic_nonlinear.py | 9 ++------- benchmarking/dut/transformer.py | 3 --- 10 files changed, 31 insertions(+), 20 deletions(-) diff --git a/benchmarking/bench_base.py b/benchmarking/bench_base.py index 5ed6750820..04583c1652 100644 --- a/benchmarking/bench_base.py +++ b/benchmarking/bench_base.py @@ -1059,9 +1059,25 @@ def steps_full_build_flow(self): # TODO: set as much as possible here, e.g. 
verbose, debug, force_python, vitisopt, shell_flow cfg = self.step_build_setup() cfg.board = self.board + cfg.verbose = False + cfg.enable_build_pdb_debug = False + cfg.force_python_rtlsim = False + + # "manual or "characterize" or "largefifo_rtlsim" + if "fifo_method" in self.params: + if self.params["fifo_method"] == "manual": + cfg.auto_fifo_depths = False + else: + cfg.auto_fifo_depths = True + cfg.auto_fifo_strategy = self.params["fifo_method"] + # only relevant for "characterize" method: "rtlsim" or "analytical" + if "fifo_strategy" in self.params: + cfg.characteristic_function_strategy = self.params["fifo_strategy"] + + # TODO: determine automatically or replace by exact instr wrapper sim if "fifo_rtlsim_n" in self.params: - # TODO: determine automatically or replace by exact instr wrapper sim cfg.rtlsim_batch_size=self.params["fifo_rtlsim_n"] + if "folding_path" in self.build_inputs: cfg.folding_config_file = self.build_inputs["folding_path"] if "specialize_path" in self.build_inputs: diff --git a/benchmarking/cfg/fifosizing_test.json b/benchmarking/cfg/fifosizing_test.json index d3d4559e43..20e2588282 100644 --- a/benchmarking/cfg/fifosizing_test.json +++ b/benchmarking/cfg/fifosizing_test.json @@ -11,7 +11,8 @@ "lb_num_layers": [1], "rb_num_layers": [3], - "strategy": ["analytical", "rtlsim"], + "fifo_method": ["characterize"], + "fifo_strategy": ["analytical", "rtlsim"], "fifo_rtlsim_n": [10], "fifo_throughput_factor_threshold": [0.9], diff --git a/benchmarking/cfg/metafi_fifosizing_test.json b/benchmarking/cfg/metafi_fifosizing_test.json index a98089d046..6a441cbcd5 100644 --- a/benchmarking/cfg/metafi_fifosizing_test.json +++ b/benchmarking/cfg/metafi_fifosizing_test.json @@ -7,6 +7,8 @@ "board": ["RFSoC2x2"], "clock_period_ns": [10], + "fifo_method": ["largefifo_rtlsim"], + "fifo_rtlsim_n": [10], "fifo_throughput_factor_threshold": [0.9], "fifo_reduction_skip_threshold": [1024], diff --git a/benchmarking/cfg/metafi_test.json 
b/benchmarking/cfg/metafi_test.json index b0989eabca..7ede065c76 100644 --- a/benchmarking/cfg/metafi_test.json +++ b/benchmarking/cfg/metafi_test.json @@ -7,6 +7,8 @@ "board": ["RFSoC2x2"], "clock_period_ns": [10], + "fifo_method": ["manual"], + "fifo_rtlsim_n": [10] } ] \ No newline at end of file diff --git a/benchmarking/cfg/resnet50_fifosizing_test.json b/benchmarking/cfg/resnet50_fifosizing_test.json index 82b3d36659..b4dddc24f9 100644 --- a/benchmarking/cfg/resnet50_fifosizing_test.json +++ b/benchmarking/cfg/resnet50_fifosizing_test.json @@ -10,6 +10,8 @@ "board": ["U280"], "clock_period_ns": [4], + "fifo_method": ["largefifo_rtlsim"], + "fifo_rtlsim_n": [2], "fifo_throughput_factor_threshold": [0.9], "fifo_reduction_skip_threshold": [1024], diff --git a/benchmarking/cfg/resnet50_test.json b/benchmarking/cfg/resnet50_test.json index 30131923a4..df81e83661 100644 --- a/benchmarking/cfg/resnet50_test.json +++ b/benchmarking/cfg/resnet50_test.json @@ -10,6 +10,8 @@ "board": ["U280"], "clock_period_ns": [4], + "fifo_method": ["manual"], + "fifo_rtlsim_n": [2] } ] \ No newline at end of file diff --git a/benchmarking/dut/metafi.py b/benchmarking/dut/metafi.py index 52e31eabee..462314c2ec 100644 --- a/benchmarking/dut/metafi.py +++ b/benchmarking/dut/metafi.py @@ -47,23 +47,18 @@ def step_build_setup(self): output_dir = self.build_inputs["build_dir"], synth_clk_period_ns = self.clock_period_ns, steps=steps, - verbose=False, + target_fps=None, #23 shell_flow_type=build_cfg.ShellFlowType.VIVADO_ZYNQ, # TODO: generalize/adapt to new back-end #vitis_platform=vitis_platform, - auto_fifo_depths=False, split_large_fifos=False, # probably needed #TODO: account for this in FIFO reduction test - # general rtlsim settings - force_python_rtlsim=False, - # folding_config_file=folding_config_file, # folding_config_file="/home/rz/project/finn-examples/build/vgg10-radioml/folding_config/auto_folding_config.json", # specialize_layers_config_file = "output_%s_%s" % (model_name, 
release_platform_name) + "/template_specialize_layers_config.json", # specialize_layers_config_file = "/home/rz/project/finn-examples/build/vgg10-radioml/specialize_layers_config/template_specialize_layers_config.json", - auto_fifo_strategy="characterize", - characteristic_function_strategy=self.params["strategy"], + #large_fifo_mem_style=build_cfg.LargeFIFOMemStyle.AUTO, # standalone_thresholds=True, # enable extra performance optimizations (physopt) diff --git a/benchmarking/dut/resnet50.py b/benchmarking/dut/resnet50.py index c4f80737c0..87c6e04e2e 100644 --- a/benchmarking/dut/resnet50.py +++ b/benchmarking/dut/resnet50.py @@ -40,7 +40,6 @@ def step_build_setup(self): synth_clk_period_ns = self.clock_period_ns, steps=resnet50_build_steps, shell_flow_type=build_cfg.ShellFlowType.VITIS_ALVEO, # TODO: generalize/adapt to new back-end - auto_fifo_depths=False, split_large_fifos=True, vitis_platform=alveo_default_platform[self.board], # TODO: generalize/adapt to new back-end diff --git a/benchmarking/dut/synthetic_nonlinear.py b/benchmarking/dut/synthetic_nonlinear.py index 852d47012f..a3039d6c5f 100644 --- a/benchmarking/dut/synthetic_nonlinear.py +++ b/benchmarking/dut/synthetic_nonlinear.py @@ -291,16 +291,11 @@ def step_build_setup(self): cfg = build_cfg.DataflowBuildConfig( output_dir = self.build_inputs["build_dir"], synth_clk_period_ns = self.clock_period_ns, - verbose=False, - # only works with characterization-based FIFO-sizing - auto_fifo_depths=True, - auto_fifo_strategy="characterize", - characteristic_function_strategy=self.params["strategy"], + split_large_fifos=False, # manual folding target_fps=None, - # general rtlsim settings - force_python_rtlsim=False, + shell_flow_type=build_cfg.ShellFlowType.VIVADO_ZYNQ, # TODO: generalize/adapt to new back-end generate_outputs=[ build_cfg.DataflowOutputType.ESTIMATE_REPORTS, diff --git a/benchmarking/dut/transformer.py b/benchmarking/dut/transformer.py index 305cac8188..014da2e13e 100644 --- 
a/benchmarking/dut/transformer.py +++ b/benchmarking/dut/transformer.py @@ -918,9 +918,6 @@ def step_build_setup(self): mvau_wwidth_max = 2048, split_large_fifos = True, - verbose = False, # if True prints stdout and stderr to console instead of build_dataflow.log - enable_build_pdb_debug = False, - generate_outputs=[ build_cfg.DataflowOutputType.ESTIMATE_REPORTS, build_cfg.DataflowOutputType.STITCHED_IP, # required for HarnessBuild, OOC_SYNTH, and RTLSIM From e6998fb09df332b6fa8d73275ccee0d92574ae0e Mon Sep 17 00:00:00 2001 From: Felix Jentzsch Date: Thu, 30 Jan 2025 20:33:16 +0000 Subject: [PATCH 012/125] Use correct Singularity image for benchmarks --- .gitlab-ci.yml | 20 ++++++++++++++++++++ benchmarking/bench-ci.yml | 1 - 2 files changed, 20 insertions(+), 1 deletion(-) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 066a7dc289..d30b08becc 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -175,6 +175,16 @@ Bench (Manual): # Do not run on a schedule - if: $CI_PIPELINE_SOURCE == "schedule" when: never + # Select different Singularity image if it deviates from default (dev branch) + - changes: + paths: + - requirements.txt + - docker/Dockerfile.finn + - docker/finn_entrypoint.sh + - docker/quicktest.sh + compare_to: "dev" + variables: + SINGULARITY_IMG_SELECT: "finn_$CI_COMMIT_REF_SLUG.sif" - if: $MANUAL_CFG_PATH != "" trigger: include: benchmarking/bench-ci.yml @@ -190,6 +200,16 @@ Bench: # Do not run on a schedule - if: $CI_PIPELINE_SOURCE == "schedule" when: never + # Select different Singularity image if it deviates from default (dev branch) + - changes: + paths: + - requirements.txt + - docker/Dockerfile.finn + - docker/finn_entrypoint.sh + - docker/quicktest.sh + compare_to: "dev" + variables: + SINGULARITY_IMG_SELECT: "finn_$CI_COMMIT_REF_SLUG.sif" - if: $MANUAL_CFG_PATH == "" trigger: include: benchmarking/bench-ci.yml diff --git a/benchmarking/bench-ci.yml b/benchmarking/bench-ci.yml index f50bd1d3f8..5fdcd360f2 100644 --- 
a/benchmarking/bench-ci.yml +++ b/benchmarking/bench-ci.yml @@ -19,7 +19,6 @@ FINN Build: variables: SCHEDULER_PARAMETERS: "-A $PROJECT_ACCOUNT -p $SLURM_PARTITION -t $SLURM_TIMEOUT $SLURM_QOS --nodes 1 --ntasks 1 --cpus-per-task $CPU_CORES --mem 128G --array 0-$( expr $PARALLEL_JOBS - 1 )" PYTEST_PARALLEL: "$CPU_CORES" - FINN_SINGULARITY: "$PATH_SINGULARITY_IMG/xilinx/finn_dev.sif" before_script: - cp -dfR .. $PATH_WORKDIR # Copy to working directory (e.g. RAMdisk) - cd $PATH_WORKDIR/finn-plus From be19a1af1555c9dc498524e064b6fab9b9133b19 Mon Sep 17 00:00:00 2001 From: Felix Jentzsch Date: Thu, 30 Jan 2025 20:39:55 +0000 Subject: [PATCH 013/125] Select Singularity image in child pipeline --- .gitlab-ci.yml | 20 -------------------- benchmarking/bench-ci.yml | 13 +++++++++++++ 2 files changed, 13 insertions(+), 20 deletions(-) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index d30b08becc..066a7dc289 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -175,16 +175,6 @@ Bench (Manual): # Do not run on a schedule - if: $CI_PIPELINE_SOURCE == "schedule" when: never - # Select different Singularity image if it deviates from default (dev branch) - - changes: - paths: - - requirements.txt - - docker/Dockerfile.finn - - docker/finn_entrypoint.sh - - docker/quicktest.sh - compare_to: "dev" - variables: - SINGULARITY_IMG_SELECT: "finn_$CI_COMMIT_REF_SLUG.sif" - if: $MANUAL_CFG_PATH != "" trigger: include: benchmarking/bench-ci.yml @@ -200,16 +190,6 @@ Bench: # Do not run on a schedule - if: $CI_PIPELINE_SOURCE == "schedule" when: never - # Select different Singularity image if it deviates from default (dev branch) - - changes: - paths: - - requirements.txt - - docker/Dockerfile.finn - - docker/finn_entrypoint.sh - - docker/quicktest.sh - compare_to: "dev" - variables: - SINGULARITY_IMG_SELECT: "finn_$CI_COMMIT_REF_SLUG.sif" - if: $MANUAL_CFG_PATH == "" trigger: include: benchmarking/bench-ci.yml diff --git a/benchmarking/bench-ci.yml b/benchmarking/bench-ci.yml index 
5fdcd360f2..3485ebfdfe 100644 --- a/benchmarking/bench-ci.yml +++ b/benchmarking/bench-ci.yml @@ -19,6 +19,19 @@ FINN Build: variables: SCHEDULER_PARAMETERS: "-A $PROJECT_ACCOUNT -p $SLURM_PARTITION -t $SLURM_TIMEOUT $SLURM_QOS --nodes 1 --ntasks 1 --cpus-per-task $CPU_CORES --mem 128G --array 0-$( expr $PARALLEL_JOBS - 1 )" PYTEST_PARALLEL: "$CPU_CORES" + rules: + # Select different Singularity image if it deviates from default (dev branch) + - changes: + paths: + - requirements.txt + - docker/Dockerfile.finn + - docker/finn_entrypoint.sh + - docker/quicktest.sh + compare_to: "dev" + variables: + SINGULARITY_IMG_SELECT: "finn_$CI_COMMIT_REF_SLUG.sif" + # Always run (when triggered), as long as there was no prior failure + - when: on_success before_script: - cp -dfR .. $PATH_WORKDIR # Copy to working directory (e.g. RAMdisk) - cd $PATH_WORKDIR/finn-plus From 5db60fad0f7da3c59fd0c54b340e7ee457a280b7 Mon Sep 17 00:00:00 2001 From: Felix Jentzsch Date: Thu, 30 Jan 2025 21:14:48 +0000 Subject: [PATCH 014/125] Fix img --- benchmarking/bench-ci.yml | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/benchmarking/bench-ci.yml b/benchmarking/bench-ci.yml index 3485ebfdfe..05980e689f 100644 --- a/benchmarking/bench-ci.yml +++ b/benchmarking/bench-ci.yml @@ -16,9 +16,6 @@ FINN Build: CI_JOB_JWT: aud: https://git.uni-paderborn.de stage: synth - variables: - SCHEDULER_PARAMETERS: "-A $PROJECT_ACCOUNT -p $SLURM_PARTITION -t $SLURM_TIMEOUT $SLURM_QOS --nodes 1 --ntasks 1 --cpus-per-task $CPU_CORES --mem 128G --array 0-$( expr $PARALLEL_JOBS - 1 )" - PYTEST_PARALLEL: "$CPU_CORES" rules: # Select different Singularity image if it deviates from default (dev branch) - changes: @@ -32,6 +29,10 @@ FINN Build: SINGULARITY_IMG_SELECT: "finn_$CI_COMMIT_REF_SLUG.sif" # Always run (when triggered), as long as there was no prior failure - when: on_success + variables: + SCHEDULER_PARAMETERS: "-A $PROJECT_ACCOUNT -p $SLURM_PARTITION -t $SLURM_TIMEOUT $SLURM_QOS --nodes 1 
--ntasks 1 --cpus-per-task $CPU_CORES --mem 128G --array 0-$( expr $PARALLEL_JOBS - 1 )" + PYTEST_PARALLEL: "$CPU_CORES" + FINN_SINGULARITY: "$PATH_SINGULARITY_IMG/finn-plus/$SINGULARITY_IMG_SELECT" before_script: - cp -dfR .. $PATH_WORKDIR # Copy to working directory (e.g. RAMdisk) - cd $PATH_WORKDIR/finn-plus From 98176098d9a3d10c7a8ee9bd3393da2bd72490d5 Mon Sep 17 00:00:00 2001 From: Felix Jentzsch Date: Fri, 31 Jan 2025 08:09:03 +0000 Subject: [PATCH 015/125] Try fix for Transformer streamlining --- benchmarking/dut/transformer_custom_steps.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/benchmarking/dut/transformer_custom_steps.py b/benchmarking/dut/transformer_custom_steps.py index 91bdebb206..28f23ded7c 100644 --- a/benchmarking/dut/transformer_custom_steps.py +++ b/benchmarking/dut/transformer_custom_steps.py @@ -687,6 +687,10 @@ def Streamline(): # noqa: Uppercase AbsorbMulIntoMultiThreshold(), Absorb1BitMulIntoMatMul(), Absorb1BitMulIntoConv(), + MoveMulPastAdd(), + AbsorbMulIntoMultiThreshold(), + AbsorbAddIntoMultiThreshold(), + MoveAddPastMul() ] ), # Streamlining scales and biases forward through residual topologies From dbeb3a0e0b8bd210ac71056cc0251e8f1f9daad7 Mon Sep 17 00:00:00 2001 From: Felix Jentzsch Date: Fri, 31 Jan 2025 09:16:25 +0000 Subject: [PATCH 016/125] Display .sif file name --- benchmarking/bench.py | 7 +++---- run-docker.sh | 2 +- 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/benchmarking/bench.py b/benchmarking/bench.py index 7e38a2f0c8..855f57cd50 100644 --- a/benchmarking/bench.py +++ b/benchmarking/bench.py @@ -63,14 +63,13 @@ def get_default_session_options_new(): os.makedirs(os.path.join(artifacts_dir, "tasks_output"), exist_ok=True) log_path = os.path.join(artifacts_dir, "tasks_output", "task_%d.json" % (task_id)) - # save dir for saving bitstreams (and optionally full build artifacts for debugging (TODO)) - # TODO: make this more configurable or switch to job/artifact based power measurement + # local 
save dir for large artifacts (e.g., build output, tmp dir dump for debugging) if job_id == 0: #DEBUG mode save_dir = experiment_dir + "_save" else: - save_dir = os.path.join("/scratch/hpc-prf-radioml/felix/jobs/", - "CI_" + os.environ.get("CI_PIPELINE_IID") + "_" + os.environ.get("CI_PIPELINE_NAME")) + save_dir = os.path.join(os.environ.get("LOCAL_ARTIFACT_DIR"), + "CI_" + os.environ.get("CI_PIPELINE_ID") + "_" + os.environ.get("CI_PIPELINE_NAME")) print("Saving additional artifacts in path: %s" % save_dir) os.makedirs(save_dir, exist_ok=True) diff --git a/run-docker.sh b/run-docker.sh index 4047205e57..b99615e2e8 100755 --- a/run-docker.sh +++ b/run-docker.sh @@ -318,7 +318,7 @@ else SINGULARITY_EXEC="${SINGULARITY_EXEC//"-v "/"-B "}" SINGULARITY_EXEC="${SINGULARITY_EXEC//"-w "/"--pwd "}" CMD_TO_RUN="$SINGULARITY_BASE $SINGULARITY_EXEC $FINN_SINGULARITY /usr/local/bin/finn_entrypoint.sh $DOCKER_CMD" - gecho "FINN_SINGULARITY is set, launching Singularity container instead of Docker" + gecho "FINN_SINGULARITY is set, launching Singularity container instead of Docker: $FINN_SINGULARITY" fi $CMD_TO_RUN From c4dbd34de4e912674722d2b690c9f6b436de85e7 Mon Sep 17 00:00:00 2001 From: Felix Jentzsch Date: Fri, 31 Jan 2025 10:15:17 +0000 Subject: [PATCH 017/125] Disable fifo reduction testing for big models --- benchmarking/bench.py | 1 + benchmarking/cfg/metafi_fifosizing_test.json | 2 +- benchmarking/cfg/resnet50_fifosizing_test.json | 4 ++-- benchmarking/cfg/resnet50_test.json | 2 +- 4 files changed, 5 insertions(+), 4 deletions(-) diff --git a/benchmarking/bench.py b/benchmarking/bench.py index 855f57cd50..efc38eed41 100644 --- a/benchmarking/bench.py +++ b/benchmarking/bench.py @@ -163,6 +163,7 @@ def get_default_session_options_new(): log_dict["status"] = "failed" print("Run failed: " + traceback.format_exc()) exit_code = 1 + # TODO: exception catch all in builder prevents internal failures from being caught here log_dict["total_time"] = int(time.time() - start_time) 
log_dict["output"] = output_dict diff --git a/benchmarking/cfg/metafi_fifosizing_test.json b/benchmarking/cfg/metafi_fifosizing_test.json index 6a441cbcd5..7e7ff45de9 100644 --- a/benchmarking/cfg/metafi_fifosizing_test.json +++ b/benchmarking/cfg/metafi_fifosizing_test.json @@ -11,7 +11,7 @@ "fifo_rtlsim_n": [10], "fifo_throughput_factor_threshold": [0.9], - "fifo_reduction_skip_threshold": [1024], + "fifo_reduction_skip_threshold": [99999999999], "fifo_reduction_factor": [0.5], "fifo_reduction_throughput_drop_threshold": [0.01] } diff --git a/benchmarking/cfg/resnet50_fifosizing_test.json b/benchmarking/cfg/resnet50_fifosizing_test.json index b4dddc24f9..10806ef1a6 100644 --- a/benchmarking/cfg/resnet50_fifosizing_test.json +++ b/benchmarking/cfg/resnet50_fifosizing_test.json @@ -7,14 +7,14 @@ "specialize_path": ["/pc2/groups/hpc-prf-ekiapp/felix/ci_models/ResNet-50/U250_specialize_layers.json"], "floorplan_path": ["/pc2/groups/hpc-prf-ekiapp/felix/ci_models/ResNet-50/floorplan_resnet50.json"], - "board": ["U280"], + "board": ["U250"], "clock_period_ns": [4], "fifo_method": ["largefifo_rtlsim"], "fifo_rtlsim_n": [2], "fifo_throughput_factor_threshold": [0.9], - "fifo_reduction_skip_threshold": [1024], + "fifo_reduction_skip_threshold": [99999999999], "fifo_reduction_factor": [0.5], "fifo_reduction_throughput_drop_threshold": [0.01] } diff --git a/benchmarking/cfg/resnet50_test.json b/benchmarking/cfg/resnet50_test.json index df81e83661..8cef76af87 100644 --- a/benchmarking/cfg/resnet50_test.json +++ b/benchmarking/cfg/resnet50_test.json @@ -7,7 +7,7 @@ "specialize_path": ["/pc2/groups/hpc-prf-ekiapp/felix/ci_models/ResNet-50/U250_specialize_layers.json"], "floorplan_path": ["/pc2/groups/hpc-prf-ekiapp/felix/ci_models/ResNet-50/floorplan_resnet50.json"], - "board": ["U280"], + "board": ["U250"], "clock_period_ns": [4], "fifo_method": ["manual"], From 349995fbfd8a5e6e908ed495a70c23e9f540a4a3 Mon Sep 17 00:00:00 2001 From: Felix Jentzsch Date: Fri, 31 Jan 2025 
12:35:46 +0000 Subject: [PATCH 018/125] Try 2nd streamlining fix --- benchmarking/bench_base.py | 1 + benchmarking/dut/transformer_custom_steps.py | 9 +++++---- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/benchmarking/bench_base.py b/benchmarking/bench_base.py index 04583c1652..24d8369055 100644 --- a/benchmarking/bench_base.py +++ b/benchmarking/bench_base.py @@ -1062,6 +1062,7 @@ def steps_full_build_flow(self): cfg.verbose = False cfg.enable_build_pdb_debug = False cfg.force_python_rtlsim = False + #rtlsim_use_vivado_comps # TODO ? # "manual or "characterize" or "largefifo_rtlsim" if "fifo_method" in self.params: diff --git a/benchmarking/dut/transformer_custom_steps.py b/benchmarking/dut/transformer_custom_steps.py index 28f23ded7c..9c2a07d05e 100644 --- a/benchmarking/dut/transformer_custom_steps.py +++ b/benchmarking/dut/transformer_custom_steps.py @@ -687,10 +687,6 @@ def Streamline(): # noqa: Uppercase AbsorbMulIntoMultiThreshold(), Absorb1BitMulIntoMatMul(), Absorb1BitMulIntoConv(), - MoveMulPastAdd(), - AbsorbMulIntoMultiThreshold(), - AbsorbAddIntoMultiThreshold(), - MoveAddPastMul() ] ), # Streamlining scales and biases forward through residual topologies @@ -864,6 +860,11 @@ def step_streamline(model: ModelWrapper, cfg: DataflowBuildConfig): # Note: Contains some sets of nested exhaustive transformations meant for # particular architectural patterns, e.g., residual topologies. 
model = model.transform(Streamline()) + # DEBUG for streamlining after moving to MoveLinearPastFork with workaround applied + model = model.transform(MoveMulPastAdd()) + model = model.transform(AbsorbMulIntoMultiThreshold()) + model = model.transform(AbsorbAddIntoMultiThreshold()) + model = model.transform(MoveAddPastMul()) # If configured, run a verification of the transformed model on some # sample inputs if VerificationStepType.STREAMLINED_PYTHON in cfg._resolve_verification_steps(): # noqa From 139c62448a3ae7f555f3f44f4e748fcfb3eb40b1 Mon Sep 17 00:00:00 2001 From: Felix Jentzsch Date: Fri, 31 Jan 2025 14:08:55 +0000 Subject: [PATCH 019/125] ResNet disable inferdatalayouts --- benchmarking/dut/resnet50_custom_steps.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/benchmarking/dut/resnet50_custom_steps.py b/benchmarking/dut/resnet50_custom_steps.py index ddf8b0d0de..e808072baa 100644 --- a/benchmarking/dut/resnet50_custom_steps.py +++ b/benchmarking/dut/resnet50_custom_steps.py @@ -175,7 +175,7 @@ def step_resnet50_streamline(model: ModelWrapper, cfg: DataflowBuildConfig): def step_resnet50_convert_to_hw(model: ModelWrapper, cfg: DataflowBuildConfig): model.set_tensor_datatype(model.graph.input[0].name, DataType["UINT8"]) - model = model.transform(InferDataLayouts()) + #model = model.transform(InferDataLayouts()) model = model.transform(DoubleToSingleFloat()) model = model.transform(InferDataTypes()) model = model.transform(SortGraph()) @@ -196,7 +196,7 @@ def step_resnet50_convert_to_hw(model: ModelWrapper, cfg: DataflowBuildConfig): ] for trn in to_hw_transformations: model = model.transform(trn()) - model = model.transform(InferDataLayouts()) + #model = model.transform(InferDataLayouts()) model = model.transform(GiveUniqueNodeNames()) model = model.transform(InferDataTypes()) From 358a2c6b155d7fe65ea1402c87e881a689e8c446 Mon Sep 17 00:00:00 2001 From: Felix Jentzsch Date: Fri, 31 Jan 2025 16:01:36 +0000 Subject: [PATCH 020/125] Use 
dotenv artifact --- .gitlab-ci.yml | 23 ++++++++++------------- benchmarking/bench-ci.yml | 20 ++++++-------------- 2 files changed, 16 insertions(+), 27 deletions(-) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 066a7dc289..773d0ebb42 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -38,8 +38,6 @@ variables: value: "" FINN_XILINX_VERSION: value: "2022.2" - SINGULARITY_IMG_SELECT: - value: "finn_dev.sif" workflow: name: '$PIPELINE_NAME' @@ -98,10 +96,15 @@ Singularity Image Build: - docker build --no-cache -f docker/Dockerfile.finn --tag=finn_docker_export . - apptainer build --force finn_singularity_image.sif docker-daemon://finn_docker_export:latest - rsync -vh finn_singularity_image.sif $PATH_SINGULARITY_IMG_BUILD/finn-plus/finn_$CI_COMMIT_REF_SLUG.sif + - echo SINGULARITY_IMG_SELECT=finn_$CI_COMMIT_REF_SLUG.sif > FINN_environment.env after_script: # Clean caches - echo 'y' | docker image prune - echo 'y' | docker builder prune - echo 'y' | apptainer cache clean + # Save env var selecting Singularity image to be used in subsequent jobs + artifacts: + reports: + dotenv: FINN_environment.env Fetch Repos: id_tokens: @@ -135,16 +138,6 @@ FINN Test Suite 2022.2: # Do not run if test suite has been deselected - if: $TEST_SUITE == "none" when: never - # Select different Singularity image if it deviates from default (dev branch) - - changes: - paths: - - requirements.txt - - docker/Dockerfile.finn - - docker/finn_entrypoint.sh - - docker/quicktest.sh - compare_to: "dev" - variables: - SINGULARITY_IMG_SELECT: "finn_$CI_COMMIT_REF_SLUG.sif" # Always run, as long as there was no prior failure - when: on_success cache: @@ -155,12 +148,13 @@ FINN Test Suite 2022.2: variables: SCHEDULER_PARAMETERS: "-A $PROJECT_ACCOUNT -p $SLURM_PARTITION -t $SLURM_TIMEOUT $SLURM_QOS --nodes 1 --ntasks 1 --cpus-per-task $CPU_CORES --exclusive" PYTEST_PARALLEL: "$CPU_CORES" - FINN_SINGULARITY: "$PATH_SINGULARITY_IMG/finn-plus/$SINGULARITY_IMG_SELECT" + SINGULARITY_IMG_SELECT: 
"finn_dev.sif" # may be overwritten by dotenv artifact FINN_XILINX_VERSION: "2022.2" before_script: - cp -dfR .. $PATH_WORKDIR # Copy to working directory (e.g. RAMdisk) - cd $PATH_WORKDIR/finn-plus - module load system singularity + - export FINN_SINGULARITY=$PATH_SINGULARITY_IMG/finn-plus/$SINGULARITY_IMG_SELECT script: - ./run-docker.sh quicktest.sh $TEST_SUITE @@ -182,6 +176,7 @@ Bench (Manual): forward: pipeline_variables: true variables: + PARENT_PIPELINE_ID: $CI_PIPELINE_ID BENCH_CFG: "manual" Bench: @@ -196,6 +191,8 @@ Bench: strategy: depend forward: pipeline_variables: true + variables: + PARENT_PIPELINE_ID: $CI_PIPELINE_ID parallel: matrix: - BENCH_CFG: [mvau_test, resnet50_test, metafi_test] diff --git a/benchmarking/bench-ci.yml b/benchmarking/bench-ci.yml index 05980e689f..877caee30d 100644 --- a/benchmarking/bench-ci.yml +++ b/benchmarking/bench-ci.yml @@ -16,27 +16,19 @@ FINN Build: CI_JOB_JWT: aud: https://git.uni-paderborn.de stage: synth - rules: - # Select different Singularity image if it deviates from default (dev branch) - - changes: - paths: - - requirements.txt - - docker/Dockerfile.finn - - docker/finn_entrypoint.sh - - docker/quicktest.sh - compare_to: "dev" - variables: - SINGULARITY_IMG_SELECT: "finn_$CI_COMMIT_REF_SLUG.sif" - # Always run (when triggered), as long as there was no prior failure - - when: on_success + needs: + - pipeline: $PARENT_PIPELINE_ID + job: Singularity Image Build + optional: true variables: SCHEDULER_PARAMETERS: "-A $PROJECT_ACCOUNT -p $SLURM_PARTITION -t $SLURM_TIMEOUT $SLURM_QOS --nodes 1 --ntasks 1 --cpus-per-task $CPU_CORES --mem 128G --array 0-$( expr $PARALLEL_JOBS - 1 )" PYTEST_PARALLEL: "$CPU_CORES" - FINN_SINGULARITY: "$PATH_SINGULARITY_IMG/finn-plus/$SINGULARITY_IMG_SELECT" + SINGULARITY_IMG_SELECT: "finn_dev.sif" # may be overwritten by dotenv artifact before_script: - cp -dfR .. $PATH_WORKDIR # Copy to working directory (e.g. 
RAMdisk) - cd $PATH_WORKDIR/finn-plus - module load system singularity + - export FINN_SINGULARITY=$PATH_SINGULARITY_IMG/finn-plus/$SINGULARITY_IMG_SELECT script: - ./run-docker.sh python benchmarking/bench.py $BENCH_CFG cache: From cda98a665d7d443887354903400716ae08474c0b Mon Sep 17 00:00:00 2001 From: Felix Jentzsch Date: Fri, 31 Jan 2025 16:21:49 +0000 Subject: [PATCH 021/125] Try without optional --- benchmarking/bench-ci.yml | 1 - 1 file changed, 1 deletion(-) diff --git a/benchmarking/bench-ci.yml b/benchmarking/bench-ci.yml index 877caee30d..388cd18e73 100644 --- a/benchmarking/bench-ci.yml +++ b/benchmarking/bench-ci.yml @@ -19,7 +19,6 @@ FINN Build: needs: - pipeline: $PARENT_PIPELINE_ID job: Singularity Image Build - optional: true variables: SCHEDULER_PARAMETERS: "-A $PROJECT_ACCOUNT -p $SLURM_PARTITION -t $SLURM_TIMEOUT $SLURM_QOS --nodes 1 --ntasks 1 --cpus-per-task $CPU_CORES --mem 128G --array 0-$( expr $PARALLEL_JOBS - 1 )" PYTEST_PARALLEL: "$CPU_CORES" From 91da4f5bc490c1eb52d96b67bbc477bec35125b3 Mon Sep 17 00:00:00 2001 From: Felix Jentzsch Date: Fri, 31 Jan 2025 16:56:49 +0000 Subject: [PATCH 022/125] Try optional again --- benchmarking/bench-ci.yml | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/benchmarking/bench-ci.yml b/benchmarking/bench-ci.yml index 388cd18e73..44ceda3265 100644 --- a/benchmarking/bench-ci.yml +++ b/benchmarking/bench-ci.yml @@ -17,8 +17,9 @@ FINN Build: aud: https://git.uni-paderborn.de stage: synth needs: - - pipeline: $PARENT_PIPELINE_ID - job: Singularity Image Build + - job: Singularity Image Build + pipeline: $PARENT_PIPELINE_ID + optional: true variables: SCHEDULER_PARAMETERS: "-A $PROJECT_ACCOUNT -p $SLURM_PARTITION -t $SLURM_TIMEOUT $SLURM_QOS --nodes 1 --ntasks 1 --cpus-per-task $CPU_CORES --mem 128G --array 0-$( expr $PARALLEL_JOBS - 1 )" PYTEST_PARALLEL: "$CPU_CORES" From 0b92591a97f5bec677cad44c4afdccf821d0d922 Mon Sep 17 00:00:00 2001 From: Felix Jentzsch Date: Fri, 31 Jan 2025 17:18:40 
+0000 Subject: [PATCH 023/125] Workaround optional artifact --- .gitlab-ci.yml | 10 +++++++++- benchmarking/bench-ci.yml | 4 +--- 2 files changed, 10 insertions(+), 4 deletions(-) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 773d0ebb42..e8249863bf 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -123,8 +123,17 @@ Fetch Repos: key: $CI_COMMIT_SHA paths: - deps + variables: + SINGULARITY_IMG_SELECT: "finn_dev.sif" # default, may be overwritten by dotenv artifact script: - ./fetch-repos.sh + # Workaround for https://gitlab.com/gitlab-org/gitlab/-/issues/349538 + # Passing artifacts from optional parent jobs to child pipelines is not supported + # Therefore, we pass the dotenv artifact from "Singularity Image Build" through this job + - echo SINGULARITY_IMG_SELECT=$SINGULARITY_IMG_SELECT > FINN_environment_passthrough.env + artifacts: + reports: + dotenv: FINN_environment_passthrough.env FINN Test Suite 2022.2: id_tokens: @@ -148,7 +157,6 @@ FINN Test Suite 2022.2: variables: SCHEDULER_PARAMETERS: "-A $PROJECT_ACCOUNT -p $SLURM_PARTITION -t $SLURM_TIMEOUT $SLURM_QOS --nodes 1 --ntasks 1 --cpus-per-task $CPU_CORES --exclusive" PYTEST_PARALLEL: "$CPU_CORES" - SINGULARITY_IMG_SELECT: "finn_dev.sif" # may be overwritten by dotenv artifact FINN_XILINX_VERSION: "2022.2" before_script: - cp -dfR .. $PATH_WORKDIR # Copy to working directory (e.g. 
RAMdisk) diff --git a/benchmarking/bench-ci.yml b/benchmarking/bench-ci.yml index 44ceda3265..5cf0568c31 100644 --- a/benchmarking/bench-ci.yml +++ b/benchmarking/bench-ci.yml @@ -17,13 +17,11 @@ FINN Build: aud: https://git.uni-paderborn.de stage: synth needs: - - job: Singularity Image Build + - job: Fetch Repos pipeline: $PARENT_PIPELINE_ID - optional: true variables: SCHEDULER_PARAMETERS: "-A $PROJECT_ACCOUNT -p $SLURM_PARTITION -t $SLURM_TIMEOUT $SLURM_QOS --nodes 1 --ntasks 1 --cpus-per-task $CPU_CORES --mem 128G --array 0-$( expr $PARALLEL_JOBS - 1 )" PYTEST_PARALLEL: "$CPU_CORES" - SINGULARITY_IMG_SELECT: "finn_dev.sif" # may be overwritten by dotenv artifact before_script: - cp -dfR .. $PATH_WORKDIR # Copy to working directory (e.g. RAMdisk) - cd $PATH_WORKDIR/finn-plus From b7145aa1cd2dc1d96a358196d544b63560268acc Mon Sep 17 00:00:00 2001 From: Felix Jentzsch Date: Sun, 2 Feb 2025 16:04:04 +0000 Subject: [PATCH 024/125] Revert RN-50 removal of inferdatalayouts --- benchmarking/dut/resnet50_custom_steps.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/benchmarking/dut/resnet50_custom_steps.py b/benchmarking/dut/resnet50_custom_steps.py index e808072baa..ddf8b0d0de 100644 --- a/benchmarking/dut/resnet50_custom_steps.py +++ b/benchmarking/dut/resnet50_custom_steps.py @@ -175,7 +175,7 @@ def step_resnet50_streamline(model: ModelWrapper, cfg: DataflowBuildConfig): def step_resnet50_convert_to_hw(model: ModelWrapper, cfg: DataflowBuildConfig): model.set_tensor_datatype(model.graph.input[0].name, DataType["UINT8"]) - #model = model.transform(InferDataLayouts()) + model = model.transform(InferDataLayouts()) model = model.transform(DoubleToSingleFloat()) model = model.transform(InferDataTypes()) model = model.transform(SortGraph()) @@ -196,7 +196,7 @@ def step_resnet50_convert_to_hw(model: ModelWrapper, cfg: DataflowBuildConfig): ] for trn in to_hw_transformations: model = model.transform(trn()) - #model = 
model.transform(InferDataLayouts()) + model = model.transform(InferDataLayouts()) model = model.transform(GiveUniqueNodeNames()) model = model.transform(InferDataTypes()) From 503f73ee53d917500641c63161fbe7b45bd7db60 Mon Sep 17 00:00:00 2001 From: Felix Jentzsch Date: Sun, 2 Feb 2025 16:20:12 +0000 Subject: [PATCH 025/125] Sweep over fifosim n --- benchmarking/bench_base.py | 19 +++++++++++++------ benchmarking/cfg/fifosizing_test.json | 2 +- benchmarking/cfg/metafi_fifosizing_test.json | 4 +++- benchmarking/cfg/metafi_test.json | 2 +- .../cfg/resnet50_fifosizing_test.json | 4 +++- benchmarking/cfg/resnet50_test.json | 2 +- benchmarking/dut/metafi.py | 2 +- 7 files changed, 23 insertions(+), 12 deletions(-) diff --git a/benchmarking/bench_base.py b/benchmarking/bench_base.py index 24d8369055..18797579f7 100644 --- a/benchmarking/bench_base.py +++ b/benchmarking/bench_base.py @@ -902,8 +902,8 @@ def step_fifotest(self, onnx_path, cfg, build_dir): model_final = ModelWrapper(build_dir + "/intermediate_models/step_create_stitched_ip.onnx") first_node = getCustomOp(model_final.find_consumer(model_final.graph.input[0].name)) last_node = getCustomOp(model_final.find_producer(model_final.graph.output[0].name)) - input_txns_expected = np.prod(first_node.get_folded_input_shape()[:-1]) * self.params["fifo_rtlsim_n"] - output_txns_expected = np.prod(last_node.get_folded_output_shape()[:-1]) * self.params["fifo_rtlsim_n"] + input_txns_expected = np.prod(first_node.get_folded_input_shape()[:-1]) * self.params["rtlsim_n"] + output_txns_expected = np.prod(last_node.get_folded_output_shape()[:-1]) * self.params["rtlsim_n"] deadlock = sim_data["N_IN_TXNS"] != input_txns_expected or sim_data["N_OUT_TXNS"] != output_txns_expected log["deadlock"] = deadlock.tolist() @@ -961,8 +961,8 @@ def step_fifotest(self, onnx_path, cfg, build_dir): model_final = ModelWrapper(tmp_output_dir_var + "/intermediate_models/step_create_stitched_ip.onnx") first_node = 
getCustomOp(model_final.find_consumer(model_final.graph.input[0].name)) last_node = getCustomOp(model_final.find_producer(model_final.graph.output[0].name)) - input_txns_expected = np.prod(first_node.get_folded_input_shape()[:-1]) * self.params["fifo_rtlsim_n"] - output_txns_expected = np.prod(last_node.get_folded_output_shape()[:-1]) * self.params["fifo_rtlsim_n"] + input_txns_expected = np.prod(first_node.get_folded_input_shape()[:-1]) * self.params["rtlsim_n"] + output_txns_expected = np.prod(last_node.get_folded_output_shape()[:-1]) * self.params["rtlsim_n"] var_deadlock = sim_data["N_IN_TXNS"] != input_txns_expected or sim_data["N_OUT_TXNS"] != output_txns_expected # check rtlsim throughput @@ -1063,6 +1063,8 @@ def steps_full_build_flow(self): cfg.enable_build_pdb_debug = False cfg.force_python_rtlsim = False #rtlsim_use_vivado_comps # TODO ? + #cfg.default_swg_exception + #cfg.large_fifo_mem_style # "manual or "characterize" or "largefifo_rtlsim" if "fifo_method" in self.params: @@ -1075,9 +1077,14 @@ def steps_full_build_flow(self): if "fifo_strategy" in self.params: cfg.characteristic_function_strategy = self.params["fifo_strategy"] + # Batch size used for RTLSim performance measurement (and in-depth FIFO test here) # TODO: determine automatically or replace by exact instr wrapper sim - if "fifo_rtlsim_n" in self.params: - cfg.rtlsim_batch_size=self.params["fifo_rtlsim_n"] + if "rtlsim_n" in self.params: + cfg.rtlsim_batch_size=self.params["rtlsim_n"] + + # Batch size used for FIFO sizing (largefifo_rtlsim only) + if "fifo_rtlsim_n": + cfg.fifosim_n_inferences=self.params["fifo_rtlsim_n"] if "folding_path" in self.build_inputs: cfg.folding_config_file = self.build_inputs["folding_path"] diff --git a/benchmarking/cfg/fifosizing_test.json b/benchmarking/cfg/fifosizing_test.json index 20e2588282..cf49aa80a7 100644 --- a/benchmarking/cfg/fifosizing_test.json +++ b/benchmarking/cfg/fifosizing_test.json @@ -14,7 +14,7 @@ "fifo_method": ["characterize"], 
"fifo_strategy": ["analytical", "rtlsim"], - "fifo_rtlsim_n": [10], + "rtlsim_n": [10], "fifo_throughput_factor_threshold": [0.9], "fifo_reduction_skip_threshold": [64], "fifo_reduction_factor": [0.5], diff --git a/benchmarking/cfg/metafi_fifosizing_test.json b/benchmarking/cfg/metafi_fifosizing_test.json index 7e7ff45de9..02116cfeb5 100644 --- a/benchmarking/cfg/metafi_fifosizing_test.json +++ b/benchmarking/cfg/metafi_fifosizing_test.json @@ -7,9 +7,11 @@ "board": ["RFSoC2x2"], "clock_period_ns": [10], + "rtlsim_n": [10], + "fifo_method": ["largefifo_rtlsim"], - "fifo_rtlsim_n": [10], + "fifo_rtlsim_n": [2, 4, 8], "fifo_throughput_factor_threshold": [0.9], "fifo_reduction_skip_threshold": [99999999999], "fifo_reduction_factor": [0.5], diff --git a/benchmarking/cfg/metafi_test.json b/benchmarking/cfg/metafi_test.json index 7ede065c76..0ee1339441 100644 --- a/benchmarking/cfg/metafi_test.json +++ b/benchmarking/cfg/metafi_test.json @@ -9,6 +9,6 @@ "fifo_method": ["manual"], - "fifo_rtlsim_n": [10] + "rtlsim_n": [3] } ] \ No newline at end of file diff --git a/benchmarking/cfg/resnet50_fifosizing_test.json b/benchmarking/cfg/resnet50_fifosizing_test.json index 10806ef1a6..c4dc4daf78 100644 --- a/benchmarking/cfg/resnet50_fifosizing_test.json +++ b/benchmarking/cfg/resnet50_fifosizing_test.json @@ -10,9 +10,11 @@ "board": ["U250"], "clock_period_ns": [4], + "rtlsim_n": [10], + "fifo_method": ["largefifo_rtlsim"], - "fifo_rtlsim_n": [2], + "fifo_rtlsim_n": [2, 4, 8], "fifo_throughput_factor_threshold": [0.9], "fifo_reduction_skip_threshold": [99999999999], "fifo_reduction_factor": [0.5], diff --git a/benchmarking/cfg/resnet50_test.json b/benchmarking/cfg/resnet50_test.json index 8cef76af87..4937cb8395 100644 --- a/benchmarking/cfg/resnet50_test.json +++ b/benchmarking/cfg/resnet50_test.json @@ -12,6 +12,6 @@ "fifo_method": ["manual"], - "fifo_rtlsim_n": [2] + "rtlsim_n": [3] } ] \ No newline at end of file diff --git a/benchmarking/dut/metafi.py 
b/benchmarking/dut/metafi.py index 462314c2ec..7808f11856 100644 --- a/benchmarking/dut/metafi.py +++ b/benchmarking/dut/metafi.py @@ -52,7 +52,7 @@ def step_build_setup(self): shell_flow_type=build_cfg.ShellFlowType.VIVADO_ZYNQ, # TODO: generalize/adapt to new back-end #vitis_platform=vitis_platform, - split_large_fifos=False, # probably needed #TODO: account for this in FIFO reduction test + split_large_fifos=True, # probably needed #TODO: account for this in FIFO reduction test # folding_config_file=folding_config_file, # folding_config_file="/home/rz/project/finn-examples/build/vgg10-radioml/folding_config/auto_folding_config.json", From 2c0903d3cbe260ebd7ca5e18af48013e14a7205c Mon Sep 17 00:00:00 2001 From: Felix Jentzsch Date: Sun, 2 Feb 2025 17:15:04 +0000 Subject: [PATCH 026/125] Log partial results in failure --- benchmarking/bench.py | 7 ++----- benchmarking/bench_base.py | 8 +++++++- benchmarking/dut/synthetic_nonlinear.py | 4 ---- 3 files changed, 9 insertions(+), 10 deletions(-) diff --git a/benchmarking/bench.py b/benchmarking/bench.py index efc38eed41..686c97ddc2 100644 --- a/benchmarking/bench.py +++ b/benchmarking/bench.py @@ -150,23 +150,20 @@ def get_default_session_options_new(): start_time = time.time() try: bench_object.run() - output_dict = bench_object.output_dict - if output_dict is None: - output_dict = {} + if not bench_object.output_dict: log_dict["status"] = "skipped" print("Run skipped") else: log_dict["status"] = "ok" print("Run completed") except Exception: - output_dict = {} log_dict["status"] = "failed" print("Run failed: " + traceback.format_exc()) exit_code = 1 # TODO: exception catch all in builder prevents internal failures from being caught here log_dict["total_time"] = int(time.time() - start_time) - log_dict["output"] = output_dict + log_dict["output"] = bench_object.output_dict log.append(log_dict) # overwrite output log file every time to allow early abort with open(log_path, "w") as f: diff --git 
a/benchmarking/bench_base.py b/benchmarking/bench_base.py index 18797579f7..8565dfb57f 100644 --- a/benchmarking/bench_base.py +++ b/benchmarking/bench_base.py @@ -981,6 +981,8 @@ def step_fifotest(self, onnx_path, cfg, build_dir): fifo_reduction_pass.append(False) log["fifo_reduction_results"][node.name] = "fail (no drop)" + if "fifos" not in self.output_dict: + self.output_dict["fifos"] = {} self.output_dict["fifos"]["fifotest"] = log def steps_simple_model_flow(self): @@ -992,7 +994,11 @@ def steps_simple_model_flow(self): do_synth_power = self.params["do_synth_power"] if "do_synth_power" in self.params else False # Perform steps - model, dut_info = self.step_make_model() + make_model_result = self.step_make_model() + if make_model_result is None: + return + else: + model, dut_info = make_model_result # Save model for logging purposes # TODO: benchmarking infrastructure could be integrated deeper into ONNX IR and FINN custom_op/transformation infrastructure diff --git a/benchmarking/dut/synthetic_nonlinear.py b/benchmarking/dut/synthetic_nonlinear.py index a3039d6c5f..19ba3a6ce0 100644 --- a/benchmarking/dut/synthetic_nonlinear.py +++ b/benchmarking/dut/synthetic_nonlinear.py @@ -305,7 +305,3 @@ def step_build_setup(self): ) return cfg - - def step_parse_builder_output(self, build_dir): - # build output itself is not relevant here (yet) - pass From 9d71a4ab42c0220a3d2f5b4c5a8538f5e2a6479d Mon Sep 17 00:00:00 2001 From: Felix Jentzsch Date: Sun, 2 Feb 2025 17:49:44 +0000 Subject: [PATCH 027/125] Fix typo --- benchmarking/bench_base.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/benchmarking/bench_base.py b/benchmarking/bench_base.py index 8565dfb57f..895e849a53 100644 --- a/benchmarking/bench_base.py +++ b/benchmarking/bench_base.py @@ -1089,7 +1089,7 @@ def steps_full_build_flow(self): cfg.rtlsim_batch_size=self.params["rtlsim_n"] # Batch size used for FIFO sizing (largefifo_rtlsim only) - if "fifo_rtlsim_n": + if "fifo_rtlsim_n" in 
self.params: cfg.fifosim_n_inferences=self.params["fifo_rtlsim_n"] if "folding_path" in self.build_inputs: From 6c744f85f84605cc04b00e1a505d49b44acbb94f Mon Sep 17 00:00:00 2001 From: Felix Jentzsch Date: Tue, 4 Feb 2025 15:07:08 +0000 Subject: [PATCH 028/125] Fifo testcase extension --- benchmarking/bench_base.py | 4 ++ benchmarking/cfg/metafi_fifosizing_test.json | 39 +++++++++++++++- .../cfg/resnet50_fifosizing_test.json | 45 ++++++++++++++++++- 3 files changed, 86 insertions(+), 2 deletions(-) diff --git a/benchmarking/bench_base.py b/benchmarking/bench_base.py index 895e849a53..7374e4007e 100644 --- a/benchmarking/bench_base.py +++ b/benchmarking/bench_base.py @@ -1099,6 +1099,10 @@ def steps_full_build_flow(self): if "floorplan_path" in self.build_inputs: cfg.floorplan_path = self.build_inputs["floorplan_path"] + # Default of 1M cycles is insufficient for MetaFi (6M) and RN-50 (2.5M) + # TODO: make configurable or set on pipeline level? + os.environ["LIVENESS_THRESHOLD"] = "10000000" + ### BUILD ### build.build_dataflow_cfg(self.build_inputs["onnx_path"], cfg) diff --git a/benchmarking/cfg/metafi_fifosizing_test.json b/benchmarking/cfg/metafi_fifosizing_test.json index 02116cfeb5..f61ec93217 100644 --- a/benchmarking/cfg/metafi_fifosizing_test.json +++ b/benchmarking/cfg/metafi_fifosizing_test.json @@ -2,16 +2,53 @@ { "dut": ["metafi"], "model_path": ["/pc2/groups/hpc-prf-ekiapp/felix/ci_models/metafi/model.onnx"], - "folding_path": ["/pc2/groups/hpc-prf-ekiapp/felix/ci_models/metafi/auto_folding_config _metaFi_f25.json"], + "folding_path": ["/pc2/groups/hpc-prf-ekiapp/felix/ci_models/metafi/metafi_fifosizing_xsi_n2.json"], "board": ["RFSoC2x2"], "clock_period_ns": [10], "rtlsim_n": [10], + "fifo_method": ["manual"], + + "fifo_rtlsim_n": [2], + "fifo_throughput_factor_threshold": [0.9], + "fifo_reduction_skip_threshold": [99999999999], + "fifo_reduction_factor": [0.5], + "fifo_reduction_throughput_drop_threshold": [0.01] + }, + { + "dut": ["metafi"], + 
"model_path": ["/pc2/groups/hpc-prf-ekiapp/felix/ci_models/metafi/model.onnx"], + "folding_path": ["/pc2/groups/hpc-prf-ekiapp/felix/ci_models/metafi/auto_folding_config _metaFi_f25.json"], + + "board": ["RFSoC2x2"], + "clock_period_ns": [10], + + "rtlsim_n": [5], + "fifo_method": ["largefifo_rtlsim"], "fifo_rtlsim_n": [2, 4, 8], + "fifo_throttle_factor": [0.5, 2], + "fifo_throughput_factor_threshold": [0.9], + "fifo_reduction_skip_threshold": [99999999999], + "fifo_reduction_factor": [0.5], + "fifo_reduction_throughput_drop_threshold": [0.01] + }, + { + "dut": ["metafi"], + "model_path": ["/pc2/groups/hpc-prf-ekiapp/felix/ci_models/metafi/model.onnx"], + "folding_path": ["/pc2/groups/hpc-prf-ekiapp/felix/ci_models/metafi/auto_folding_config _metaFi_f25.json"], + + "board": ["RFSoC2x2"], + "clock_period_ns": [10], + + "rtlsim_n": [5], + + "fifo_method": ["characterize"], + "fifo_strategy": ["rtlsim", "analytical"], + "fifo_throughput_factor_threshold": [0.9], "fifo_reduction_skip_threshold": [99999999999], "fifo_reduction_factor": [0.5], diff --git a/benchmarking/cfg/resnet50_fifosizing_test.json b/benchmarking/cfg/resnet50_fifosizing_test.json index c4dc4daf78..075acda981 100644 --- a/benchmarking/cfg/resnet50_fifosizing_test.json +++ b/benchmarking/cfg/resnet50_fifosizing_test.json @@ -3,7 +3,7 @@ "dut": ["resnet50"], "model_path": ["/pc2/groups/hpc-prf-ekiapp/felix/ci_models/ResNet-50/resnet50_w1a2_exported.onnx"], - "folding_path": ["/pc2/groups/hpc-prf-ekiapp/felix/ci_models/ResNet-50/U250_folding_config.json"], + "folding_path": ["/pc2/groups/hpc-prf-ekiapp/felix/ci_models/ResNet-50/rn-50_fifosizing_xsi_n2.json"], "specialize_path": ["/pc2/groups/hpc-prf-ekiapp/felix/ci_models/ResNet-50/U250_specialize_layers.json"], "floorplan_path": ["/pc2/groups/hpc-prf-ekiapp/felix/ci_models/ResNet-50/floorplan_resnet50.json"], @@ -12,9 +12,52 @@ "rtlsim_n": [10], + "fifo_method": ["manual"], + + "fifo_rtlsim_n": [2], + "fifo_throughput_factor_threshold": [0.9], + 
"fifo_reduction_skip_threshold": [99999999999], + "fifo_reduction_factor": [0.5], + "fifo_reduction_throughput_drop_threshold": [0.01] + }, + { + "dut": ["resnet50"], + + "model_path": ["/pc2/groups/hpc-prf-ekiapp/felix/ci_models/ResNet-50/resnet50_w1a2_exported.onnx"], + "folding_path": ["/pc2/groups/hpc-prf-ekiapp/felix/ci_models/ResNet-50/U250_folding_config.json"], + "specialize_path": ["/pc2/groups/hpc-prf-ekiapp/felix/ci_models/ResNet-50/U250_specialize_layers.json"], + "floorplan_path": ["/pc2/groups/hpc-prf-ekiapp/felix/ci_models/ResNet-50/floorplan_resnet50.json"], + + "board": ["U250"], + "clock_period_ns": [4], + + "rtlsim_n": [5], + "fifo_method": ["largefifo_rtlsim"], "fifo_rtlsim_n": [2, 4, 8], + "fifo_throttle_factor": [0.5, 2], + "fifo_throughput_factor_threshold": [0.9], + "fifo_reduction_skip_threshold": [99999999999], + "fifo_reduction_factor": [0.5], + "fifo_reduction_throughput_drop_threshold": [0.01] + }, + { + "dut": ["resnet50"], + + "model_path": ["/pc2/groups/hpc-prf-ekiapp/felix/ci_models/ResNet-50/resnet50_w1a2_exported.onnx"], + "folding_path": ["/pc2/groups/hpc-prf-ekiapp/felix/ci_models/ResNet-50/U250_folding_config.json"], + "specialize_path": ["/pc2/groups/hpc-prf-ekiapp/felix/ci_models/ResNet-50/U250_specialize_layers.json"], + "floorplan_path": ["/pc2/groups/hpc-prf-ekiapp/felix/ci_models/ResNet-50/floorplan_resnet50.json"], + + "board": ["U250"], + "clock_period_ns": [4], + + "rtlsim_n": [5], + + "fifo_method": ["characterize"], + "fifo_strategy": ["rtlsim", "analytical"], + "fifo_throughput_factor_threshold": [0.9], "fifo_reduction_skip_threshold": [99999999999], "fifo_reduction_factor": [0.5], From b17cc23b93808b4b06b0092e9c4b40a725c37331 Mon Sep 17 00:00:00 2001 From: Felix Jentzsch Date: Wed, 5 Feb 2025 10:40:18 +0000 Subject: [PATCH 029/125] Missing change from merge branch --- benchmarking/bench_base.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/benchmarking/bench_base.py b/benchmarking/bench_base.py index 
7374e4007e..9493a12786 100644 --- a/benchmarking/bench_base.py +++ b/benchmarking/bench_base.py @@ -1092,6 +1092,10 @@ def steps_full_build_flow(self): if "fifo_rtlsim_n" in self.params: cfg.fifosim_n_inferences=self.params["fifo_rtlsim_n"] + # Manual correction factor for FIFO-Sim input throttling + if "fifo_throttle_factor" in self.params: + cfg.fifo_throttle_factor = self.params["fifo_throttle_factor"] + if "folding_path" in self.build_inputs: cfg.folding_config_file = self.build_inputs["folding_path"] if "specialize_path" in self.build_inputs: From 7956a58ebe0ed91a4eb7a6fcc2f3242bba0361e6 Mon Sep 17 00:00:00 2001 From: Felix Jentzsch Date: Thu, 6 Feb 2025 15:58:02 +0000 Subject: [PATCH 030/125] Increase stack size, NUM_WORKERS --- benchmarking/bench-ci.yml | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/benchmarking/bench-ci.yml b/benchmarking/bench-ci.yml index 5cf0568c31..c3c40d4b0e 100644 --- a/benchmarking/bench-ci.yml +++ b/benchmarking/bench-ci.yml @@ -21,15 +21,17 @@ FINN Build: pipeline: $PARENT_PIPELINE_ID variables: SCHEDULER_PARAMETERS: "-A $PROJECT_ACCOUNT -p $SLURM_PARTITION -t $SLURM_TIMEOUT $SLURM_QOS --nodes 1 --ntasks 1 --cpus-per-task $CPU_CORES --mem 128G --array 0-$( expr $PARALLEL_JOBS - 1 )" + NUM_DEFAULT_WORKERS: "$CPU_CORES" PYTEST_PARALLEL: "$CPU_CORES" before_script: - cp -dfR .. $PATH_WORKDIR # Copy to working directory (e.g. 
RAMdisk) - cd $PATH_WORKDIR/finn-plus - module load system singularity + - ulimit -s unlimited # Increase stack size limit - export FINN_SINGULARITY=$PATH_SINGULARITY_IMG/finn-plus/$SINGULARITY_IMG_SELECT script: - ./run-docker.sh python benchmarking/bench.py $BENCH_CFG - cache: + cache: key: $CI_COMMIT_SHA policy: pull paths: From 76a780b9e6097d2947b304e16d006332cc16a563 Mon Sep 17 00:00:00 2001 From: Felix Jentzsch Date: Thu, 6 Feb 2025 16:00:54 +0000 Subject: [PATCH 031/125] Adapt transformer flow to new FINN+ dev --- benchmarking/dut/transformer_custom_steps.py | 598 +------------------ 1 file changed, 7 insertions(+), 591 deletions(-) diff --git a/benchmarking/dut/transformer_custom_steps.py b/benchmarking/dut/transformer_custom_steps.py index 9c2a07d05e..1a96117e22 100644 --- a/benchmarking/dut/transformer_custom_steps.py +++ b/benchmarking/dut/transformer_custom_steps.py @@ -1,8 +1,6 @@ # ADAPTED FROM Christoph's radioml-transformer repository, specifically these files: # build_steps.py # custom/apply_config.py -# custom/composed_transformation.py -# custom/streamline.py # Copies (deep-copies) python objects import copy @@ -10,15 +8,9 @@ # Numpy for loading and comparing the verification input/output import numpy as np -# Python warning messages -import warnings - # YAML for loading experiment configurations import yaml -# Copies of python objects -from copy import deepcopy - # QONNX quantization data types from qonnx.core.datatype import DataType @@ -31,6 +23,9 @@ # Converts BatchNorm operation to affine transformation from qonnx.transformation.batchnorm_to_affine import BatchNormToAffine +# Transformation for exhaustively composing transformations +from qonnx.transformation.composed import ComposedTransformation + # If we have a convolution with a bias tensors input, QONNX and later FINN # expect the bias to be expressed as a standalone Add node following the Conv # node. 
@@ -45,8 +40,6 @@ # QONNX graph transformations for renaming and cleaning up from qonnx.transformation.general import ( - ConvertDivToMul, - ConvertSubToAdd, GiveReadableTensorNames, GiveUniqueNodeNames, GiveUniqueParameterTensors, @@ -66,7 +59,6 @@ # Transposes the initializer tensors of a Quant node instead of having a # standalone Transpose following from qonnx.transformation.quant_constant_folding import FoldTransposeIntoQuantInit -from qonnx.transformation.remove import RemoveIdentityOps # Range information structure for seeding the range analysis for converting # quantized activations to MultiThreshold @@ -142,27 +134,15 @@ # Cleanup transformation getting rid of 3d data layout from finn.transformation.squeeze import Squeeze - -# FINN streamlining transformations converting and rounding values -from finn.transformation.streamline import ConvertSignToThres, RoundAndClipThresholds from finn.transformation.streamline.absorb import ( - Absorb1BitMulIntoConv, - Absorb1BitMulIntoMatMul, AbsorbAddIntoMultiThreshold, AbsorbMulIntoMultiThreshold, AbsorbSignBiasIntoMultiThreshold, - AbsorbTransposeIntoMultiThreshold, - FactorOutMulSignMagnitude, - group_inputs_by_category, ) # FINN streamlining transformations fusing/collapsing operations of the same # kind -from finn.transformation.streamline.collapse_repeated import ( - CollapseRepeatedAdd, - CollapseRepeatedMul, - CollapseRepeatedTranspose, -) +from finn.transformation.streamline.collapse_repeated import CollapseRepeatedTranspose # FINN streamlining transformations removing nodes without real effect from the # graph @@ -173,22 +153,8 @@ # FINN streamlining transformations reordering the graph from finn.transformation.streamline.reorder import ( - MoveAddPastConv, - MoveAddPastJoinAdd, - MoveAddPastJoinConcat, MoveAddPastMul, - MoveAffinePastJoinConcat, - MoveLinearPastEltwiseAdd, - MoveLinearPastFork, - MoveMulPastFork, - MoveMulPastJoinAdd, - MoveMulPastJoinConcat, - MoveMulPastMaxPool, - MoveScalarAddPastMatMul, - 
MoveScalarLinearPastInvariants, - MoveScalarLinearPastSplit, - MoveScalarMulPastConv, - MoveScalarMulPastMatMul, + MoveMulPastAdd, MoveSqueezePastMatMul, MoveSqueezePastMultiThreshold, MoveTransposePastEltwise, @@ -197,554 +163,12 @@ MoveTransposePastJoinConcat, MoveTransposePastJoinMul, MoveTransposePastSplit, - is_scalar, ) +from finn.transformation.streamline.streamline_plus import StreamlinePlus as Streamline # Execute onnx model graphs from the dataflow parent for verification from finn.util.test import execute_parent -# FINN streamlining transformations absorbing tensors/nodes into others - - -# Composes graph transformations such that each individual transformation as -# well as the whole sequence is applied exhaustively -class ComposedTransformation(Transformation): - # Initializes the transformation given a list of transformations - def __init__(self, transformations: list[Transformation]): - # Initialize the transformation base class - super().__init__() - # Register the list of transformations to be applied in apply() - self.transformations = transformations - - # Applies the transform to a whole model graph - def apply(self, model: ModelWrapper): # noqa - # Keep track of whether the graph has been modified - graph_modified = False - # Iterate all transformations to be applied - for transformation in self.transformations: - # Start each transformation on a deep copy of the model to mimic the - # behavior of ModelWrapper.transform() - model = copy.deepcopy(model) - # Exhaustively apply the transformation until it no longer modifies - # the graph - while True: - # Apply the transformation once, reporting back whether any node - # or pattern has been modified - model, _graph_modified = transformation.apply(model) - # Keep track whether the graph has been modified at least once - graph_modified = graph_modified or _graph_modified - # Break the loop if this transformation did not change anything - if not _graph_modified: - break - # Apply the cleanup 
transformations of the ModelWrapper - model.cleanup() - # Apply some further cleanup transformations to the model graph - # removing some clutter and keeping all names readable and ordered - # at any time - model = model.transform(RemoveIdentityOps()) - model = model.transform(GiveUniqueNodeNames()) - model = model.transform(GiveReadableTensorNames()) - model = model.transform(InferShapes()) - model = model.transform(InferDataTypes()) - # Return the transformed model and indicate whether the graph actually - # has been transformed by at least one transformation so the whole - # sequence of transformations will be reapplied - return model, graph_modified - - -# # Custom conversion from Quant to MultiThreshold -# TODO: Enable once fixed... -# from custom.quant_activation_to_multithreshold import ( -# QuantActivationToMultiThreshold -# ) - - -# Moves scale factor, i.e., scalar Mul and Div, past Im2Col (and Col2Im): These -# cannot be handled by MoveScalarLinearPastInvariants as potential padding makes -# Add-Im2Col not commute to Im2Col-Add -class MoveScalesPastIm2Col(Transformation): - # Applies the transform to a whole model graph - def apply(self, model: ModelWrapper): # noqa - # Get the model graph out of the model wrapper object - graph = model.graph - # Keep track of whether the graph has been modified - graph_modified = False - # Iterate all nodes in the graph keeping track of the index - for index, node in enumerate(graph.node): - # Applies to Mul operation types - if node.op_type in {"Mul", "Div"}: - # Cannot handle fork- or join-multiplications - if model.is_fork_node(node) or model.is_join_node(node): - # Softly skip this node - continue - # Only handles one forking output for now - if len(node.output) > 1: - # Softly skip this node - continue - # The first input must be dynamically received from upstream - if model.get_initializer(node.input[0]) is not None: - # Softly skip this node - continue - # Test whether the node initializer is a scalar... 
- if not is_scalar(model.get_initializer(node.input[1])): - # Softly skip this node - continue - # As this is not a fork-node, there can be at most one successor - successor = model.find_direct_successors(node) - # If this is the final operation in the graph, there might be no - # successor - if successor is None: - # Softly skip this node - continue - # Now there is exactly one successor which needs to be extracted - # from the list - successor = successor[0] - # Handle both, Im2Col and the inverse Col2Im, as well as padding - if successor.op_type in {"Im2Col", "Col2Im", "Pad"}: - # Get names of all tensors involved in connecting the - # nodes - inp = node.input[0] # noqa: Duplicate - mid = node.output[0] - out = successor.output[0] - # Rewire the graph to feed original input into the - # Add node first - successor.input[0] = inp - # Repurpose the middle tensor for the output of the Add - successor.output[0] = mid - # The Mul operator now gets the middle tensor as its - # input - node.input[0] = mid - # Mul now produces the original output tensor - node.output[0] = out - # Delete the shape annotation of the connecting tensors - # to be re-done later - model.set_tensor_shape(mid, None) - model.set_tensor_shape(out, None) - # Track whether the graph has been modified, never - # resets to False - graph_modified = True - # Break the loop after deleting shape annotations to - # immediately re-do these before changing the next - # operator - break - # Redo datatype and shape annotations - model = model.transform(InferShapes()) - model = model.transform(InferDataTypes()) - # Return the transformed model and indicate whether the transformation - # needs to be applied again - return model, graph_modified - - -# Moves scalar linear elementwise operations past fork nodes, applies to Add, -# Mul, Sub, Div, etc. 
-class MoveScalarLinearPastFork(Transformation): - # Applies the transform to a whole model graph - def apply(self, model: ModelWrapper): # noqa - # Get the model graph out of the model wrapper object - graph = model.graph - # Keep track of whether the graph has been modified - graph_modified = False - # Iterate all nodes in the graph keeping track of the index - for index, node in enumerate(graph.node): - # Applies to Mul-like and Add-like operation types - if node.op_type in {"Add", "Sub", "Mul", "Div"}: - # Only handles non-joining forks for now - if not model.is_fork_node(node) or model.is_join_node(node): - # Softly skip this node - continue - # Only handles one forking output for now - if len(node.output) > 1: - # Softly skip this node - continue - # Test whether the node initializer is a scalar... - if not is_scalar(model.get_initializer(node.input[1])): - # Softly skip this node - continue - # We need to insert a replica of this operation in front of each - # consumer node - for consumer in model.find_direct_successors(node): - # Create an exact replica of this operator - copy = deepcopy(node) - # Insert a new unique tensor connecting the output of the - # copy to the consumer - copy.output[0] = model.make_new_valueinfo_name() - # The original node might be connecting to multiple inputs - # of the consumer... 
- for idx, inp in enumerate(consumer.input): - # Find each instance of connection from original node - if inp == node.output[0]: - # Rewire to connect to the replica - consumer.input[idx] = copy.output[0] - # Insert the new replica node into the graph - graph.node.insert(index + 1, copy) - # Remove the original node from the graph - graph.node.remove(node) - # Redo datatype and shape annotations - model = model.transform(InferShapes()) - model = model.transform(InferDataTypes()) - # Return the transformed model and indicate whether the transformation - # needs to be applied again - return model, graph_modified - - -# Moves constant elementwise multiplication past another joining multiplication -class MoveConstMulPastJoinMul(Transformation): - # Applies the transform to a whole model graph # noqa: Duplicate - def apply(self, model: ModelWrapper): # noqa - # Get the model graph out of the model wrapper object - graph = model.graph - # Keep track of whether the graph has been modified - graph_modified = False - # Iterate all nodes in the graph keeping track of the index - for index, node in enumerate(graph.node): - # Applies to Mul operation types - if node.op_type == "Mul": - # Currently does not handle fork- or join-nodes - if model.is_fork_node(node) or model.is_join_node(node): - # Softly skip this node - continue - # As this is not a fork-node, there can be at most one successor - successor = model.find_direct_successors(node) - # If Squeeze is the final operation in the graph, there might - # be no successor - if successor is None: - # Softly skip this node - continue - # Now there is exactly one successor which needs to be extracted - # from the list - successor = successor[0] - # Applies to Multiplications - if successor.op_type in {"Mul"}: - # Applies only if the second multiplication is a join-node - if model.is_join_node(successor): - # Get names of all tensors involved in connecting the - # nodes - inp = node.input[0] # noqa: Duplicate - mid = 
node.output[0] - out = successor.output[0] - # Need to match the correct input of the joining second - # multiplication - for i, name in enumerate(successor.input): - # If the successors input currently matches the - # intermediate tensors, this input needs to be - # rewired - if name == mid: - # Rewire the graph to feed original into the - # second Mul node first - successor.input[i] = inp - # Note: Do not break here as it is perfectly - # legal to connect the same tensor multiple - # times to different inputs - # Repurpose the middle tensor for the output of the - # second Mul - successor.output[0] = mid - # The first Mul operator now gets the middle tensor as - # its input - node.input[0] = mid - # The first Mul now produces the original output tensor - node.output[0] = out - # Delete the shape annotation of the connecting tensors - # to be re-done later - model.set_tensor_shape(mid, None) - model.set_tensor_shape(out, None) - # Track whether the graph has been modified, never - # resets to False - graph_modified = True - # Break the loop after deleting shape annotations to - # immediately re-do these before changing the next - # operator - break - # Redo datatype and shape annotations - model = model.transform(InferShapes()) - model = model.transform(InferDataTypes()) - # Return the transformed model and indicate whether the transformation - # needs to be applied again - return model, graph_modified - - -# Moves elementwise additions past MatMul operations: Applicable if each -# operation has one initializer input -class MoveAddPastMatMul(Transformation): - # Applies the transform to a whole model graph # noqa: Duplicate - def apply(self, model: ModelWrapper): # noqa - # Get the model graph out of the model wrapper object - graph = model.graph - # Keep track of whether the graph has been modified - graph_modified = False - # Iterate all nodes in the graph keeping track of the index - for index, node in enumerate(graph.node): - # Applies to Add operations - if 
node.op_type == "Add": - # If the add is a join operation, we do not have a constant - # added to the input - if model.is_join_node(node): - # Skip transforming this - continue - # If the Add is a fork operation we should first distribute the - # Add into the branches - if model.is_fork_node(node): - # Issue a warning to make the use aware of this potential - # transformation if the fork is moved first - warnings.warn( - f"{self.__class__.__name__}:" - f" Skipping near match: {node.name} is a fork-node," - f" try MoveLinearPastFork first" - ) - # Skip transforming this node as moving this would lead - # to messed up or detached graph - continue - # Decompose the inputs into the dynamic and the constant - # initializer input - (x_name,), (c_name,) = group_inputs_by_category(node, model) - # Now check the successor node which must be a MatMul - consumer = model.find_direct_successors(node) - # If there is no consumer, this Add seems to be last node of the - # graph - if not consumer: - # Skip transforming this - continue - # There must be exactly one consumer now - consumer = consumer[0] - # This transformation only applies to Add in front of MatMul - if not consumer.op_type == "MatMul": - # Skip this if not MatMul - continue - # MatMul may not be a join operation to apply this - # transformation - if model.is_join_node(consumer): - # Skip transforming without warning (there is nothing we can - # do about this) - continue - # Decompose the inputs to the MatMul to get the weight tensor - # name (the other input is the output of the Add) - _, (w_name,) = group_inputs_by_category(consumer, model) - # Read the weights and the constant addition tensor - w = model.get_initializer(w_name) - c = model.get_initializer(c_name) - # Determine whether the weights are the left or right input to - # the MatMul - left = w_name == consumer.input[0] - # Apply the weights to the constant tensor - c = np.matmul(w, c) if left else np.matmul(c, w) - # Insert the transformed tensor back 
into the mode as an - # initializer - model.set_initializer(c_name, c) - # The connecting tensors of this pattern - inp = x_name - mid = node.output[0] - out = consumer.output[0] - # Rewire the graph pattern connecting the input to the MatMul - # and the MatMul output to the Add node - consumer.input[1 if left else 0] = inp - # The Add now produces the original MatMul output - node.output[0] = out - # The middel tensor connects to the Add input - node.input[0 if node.input[0] == x_name else 1] = mid - # The MatMul feeds the middle tensors - consumer.output[0] = mid - # Delete the shape annotation of the connecting tensors - # to be re-done later - model.set_tensor_shape(mid, None) - model.set_tensor_shape(out, None) - # Delete the type annotations of the connecting tensors - # to be re-done later - # model.set_tensor_datatype(mid, None) - # model.set_tensor_datatype(out, None) - # Track whether the graph has been modified, never - # resets to False - graph_modified = True - # Break the loop after deleting shape annotations to - # immediately re-do these before changing the next - # operator - break - # Redo datatype and shape annotations - model = model.transform(InferShapes()) - model = model.transform(InferDataTypes()) - # Return the transformed model and indicate whether the transformation - # needs to be applied again - return model, graph_modified - - -# Moves elementwise multiplication past elementwise addition if one input to -# each of the operators is a known constant -# Note: Reverse of MoveAddPastMul -class MoveMulPastAdd(Transformation): - # Applies the transform to a whole model graph - def apply(self, model: ModelWrapper): # noqa - # Get the model graph out of the model wrapper object - graph = model.graph - # Keep track of whether the graph has been modified - graph_modified = False - # Iterate all nodes in the graph keeping track of the index - for index, node in enumerate(graph.node): - # Applies to Mul operation types - if node.op_type == "Mul": - 
# Currently does not handle fork- or join-nodes - if model.is_fork_node(node) or model.is_join_node(node): - # Softly skip this node - continue - # As this is not a fork-node, there can be at most one successor - successor = model.find_direct_successors(node) - # If Squeeze is the final operation in the graph, there might - # be no successor - if successor is None: - # Softly skip this node - continue - # Now there is exactly one successor which needs to be extracted - # from the list - successor = successor[0] - # Applies to additions - if successor.op_type in {"Add"}: - # The addition may not join as we need to know the second - # input - if not model.is_join_node(successor): - # Get the constant initializer tensors for both - # operations: y = s * x + b - _, s_name = group_inputs_by_category(node, model) - _, b_name = group_inputs_by_category(successor, model) - # Skip if either node has no constant initializer - if not s_name or not b_name: - # Skip without warning ok? - continue - # There must be exactly one constant per operations - assert len(s_name) == 1, f"To many constant inputs for {node}" - assert len(b_name) == 1, f"To many constant inputs for {successor}" - # Now read the initializer tensors - s = model.get_initializer(*s_name) - b = model.get_initializer(*b_name) - # Update the addition initializer according to the - # distributive law - model.set_initializer(*b_name, b / s) - # Get names of all tensors involved in connecting the - # nodes - inp = node.input[0] # noqa: Duplicate - mid = node.output[0] - out = successor.output[0] - # Rewire the graph to feed original input into the - # Add node first - successor.input[0] = inp - # Repurpose the middle tensor for the output of the Add - successor.output[0] = mid - # The Mul operator now gets the middle tensor as its - # input - node.input[0] = mid - # Mul now produces the original output tensor - node.output[0] = out - # Delete the shape annotation of the connecting tensors - # to be re-done later - 
model.set_tensor_shape(mid, None) - model.set_tensor_shape(out, None) - # Track whether the graph has been modified, never - # resets to False - graph_modified = True - # Break the loop after deleting shape annotations to - # immediately re-do these before changing the next - # operator - break - # Redo datatype and shape annotations - model = model.transform(InferShapes()) - model = model.transform(InferDataTypes()) - # Return the transformed model and indicate whether the transformation - # needs to be applied again - return model, graph_modified - - -# Define a set of custom streamlining transformations: These are applied once -# during the actual streamlining step and once after converting attention to -# hardware (the associated cleanup afterward might enable some Streamlining -# transformations once again) -def Streamline(): # noqa: Uppercase - # Return a set of exhaustively applies transformations - return ComposedTransformation( - [ - # On skip-connections: prefer pushing scalar multiplication forward - # before MoveAddPastMul - MoveMulPastFork(), - # The "standard" set of FINN streamlining transformations or at least - # inspired by them but applied exhaustively until none of them changes - # the graph anymore. 
- # Note: Covers most parts of non-branching linear topologies - ComposedTransformation( - [ - ConvertSubToAdd(), - ConvertDivToMul(), - BatchNormToAffine(), - ConvertSignToThres(), - MoveMulPastMaxPool(), - AbsorbSignBiasIntoMultiThreshold(), - MoveScalarLinearPastInvariants(), - MoveAddPastMul(), - MoveScalarAddPastMatMul(), - MoveAddPastConv(), - MoveScalarMulPastMatMul(), - MoveScalarMulPastConv(), - MoveAddPastMul(), - CollapseRepeatedAdd(), - CollapseRepeatedMul(), - MoveMulPastMaxPool(), - AbsorbAddIntoMultiThreshold(), - FactorOutMulSignMagnitude(), - AbsorbMulIntoMultiThreshold(), - Absorb1BitMulIntoMatMul(), - Absorb1BitMulIntoConv(), - ] - ), - # Streamlining scales and biases forward through residual topologies - # Note: This mostly covers forking and joining operations - ComposedTransformation( - [ - # Note: This is probably the most common way of joining skip - # connections, i.e., this corresponds to the original residual - # addition, i.e., y = f(x) + x - MoveLinearPastEltwiseAdd(), - MoveLinearPastFork(), #DEBUG for positional encoding streamlining, MoveScalarLinearPastFork() - MoveScalarLinearPastInvariants(), - MoveMulPastFork(), - MoveMulPastJoinAdd(), - MoveAddPastJoinAdd(), - # Note: This brings constant Muls (i.e., quantizer scales to be - # removed) forward through joining Muls (i.e., those ending up - # as actual hardware operators). 
- MoveConstMulPastJoinMul(), - ] - ), - # Streamlining scales and biases forward through shape/layout changing - # operations, i.e., mostly transposes - ComposedTransformation( - [ - # Convolution inputs and padding - MoveScalesPastIm2Col(), - # Streamlining for Split and Concat operations - MoveScalarLinearPastSplit(), - MoveAffinePastJoinConcat(), - MoveMulPastJoinConcat(), - MoveAddPastJoinConcat(), - # Move transposes around to some place where they could be removed - # later, i.e., where they collapse into identities - MoveTransposePastFork(), - MoveTransposePastSplit(), - MoveTransposePastJoinConcat(), - MoveTransposePastEltwise(), - MoveTransposePastJoinMul(), - MoveTransposePastJoinAdd(), - CollapseRepeatedTranspose(), - # Remove identity shape/layout transformations - RemoveIdentityTranspose(), - RemoveIdentityReshape(), - # Squeeze operators can be moved past the thresholding - MoveSqueezePastMultiThreshold(), - # A certain type of 4d-layout transpose can be absorbed (actually - # moved past) MultiThreshold operations - AbsorbTransposeIntoMultiThreshold(), - ] - ), - # Only round and clip after all streamlining transformations have - # been applied exhaustively. - # Note: Might still enable another round of streamlining. - RoundAndClipThresholds(), - ] - ) - # Prepares the graph to be consumed by FINN: # 1. Some graph cleanup removing unused tensors, nodes without effect and @@ -815,15 +239,7 @@ def step_prepare_graph(model: ModelWrapper, cfg: DataflowBuildConfig): # sample inputs if VerificationStepType.QONNX_TO_FINN_PYTHON in cfg._resolve_verification_steps(): # noqa verify_step(model, cfg, "lowered_python", need_parent=False) - # Apply the quantizer to MultiThreshold conversion - # Note: This is exhaustive as well as single .transform reapplies as - # long as possible. - # TODO: Enable once fixed... 
- # model = model.transform(QuantActivationToMultiThreshold(range_info)) - # If configured, run a verification of the transformed model on some - # sample inputs - if VerificationStepType.QONNX_TO_FINN_PYTHON in cfg._resolve_verification_steps(): # noqa - verify_step(model, cfg, "quant_to_thresholds_ra_python", need_parent=False) + # Apply the standard QONNX to FINN conversion step to convert the # remaining quantizers not yet covered by the new range analysis based # method From e1671b22f9ae7c977c89f740229c31bad25b4558 Mon Sep 17 00:00:00 2001 From: Felix Jentzsch Date: Thu, 6 Feb 2025 17:20:06 +0000 Subject: [PATCH 032/125] Enable Transformer benchmarks --- .gitlab-ci.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index ff3187f25d..7cf1f91e39 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -211,7 +211,7 @@ Bench: PARENT_PIPELINE_ID: $CI_PIPELINE_ID parallel: matrix: - - BENCH_CFG: [mvau_test, resnet50_test, metafi_test] + - BENCH_CFG: [mvau_test, resnet50_test, metafi_test, transformer_test, transformer_radioml_all] #dev: mvau_test #fifo: fifosizing_test, metafi_fifosizing_test, resnet50_fifosizing_test From 2cdfd86be7744820ffcd434b0a430e1efc334615 Mon Sep 17 00:00:00 2001 From: Felix Jentzsch Date: Fri, 14 Feb 2025 00:33:36 +0000 Subject: [PATCH 033/125] Add virtual HLS FIFO --- custom_hls/virtual_fifo.hpp | 81 +++++++ src/finn/builder/build_dataflow_config.py | 3 + src/finn/builder/build_dataflow_steps.py | 23 ++ .../custom_op/fpgadataflow/hls/__init__.py | 2 + .../fpgadataflow/hls/streamingfifo_hls.py | 208 ++++++++++++++++++ .../transformation/fpgadataflow/templates.py | 6 +- 6 files changed, 320 insertions(+), 3 deletions(-) create mode 100644 custom_hls/virtual_fifo.hpp create mode 100644 src/finn/custom_op/fpgadataflow/hls/streamingfifo_hls.py diff --git a/custom_hls/virtual_fifo.hpp b/custom_hls/virtual_fifo.hpp new file mode 100644 index 0000000000..85d71280bc --- /dev/null +++ 
b/custom_hls/virtual_fifo.hpp @@ -0,0 +1,81 @@ +#ifndef VIRTUAL_FIFO_HPP +#define VIRTUAL_FIFO_HPP + +#include +#include +#include + +// Utility Functions, taken from instrumentation wrapper +template +static void move( + hls::stream &src, + hls::stream &dst +) { +#pragma HLS pipeline II=1 style=flp + dst.write(src.read()); +} + +template +static void move( + hls::stream> &src, + hls::stream &dst +) { +#pragma HLS pipeline II=1 style=flp + dst.write(src.read().data); +} + +template +class Payload { +public: + using type = T; +}; +template +class Payload> { +public: + using type = T; +}; + +template +void VirtualFIFO(hls::stream > &in, hls::stream > &out, + ap_uint<32> mode, + ap_uint<32> depth, + ap_uint<32> &occupancy, + ap_uint<32> &max_occupancy) +{ + #pragma HLS pipeline II=1 style=flp + + static ap_uint<32> c_occupancy = 0; + static ap_uint<32> c_max_occupancy = 0; + #pragma HLS reset variable=c_occupancy + #pragma HLS reset variable=c_max_occupancy + + ap_uint inElem; + + bool read = mode == 0 || c_occupancy != depth; + bool write = c_occupancy != 0; + + // INPUT + if(read) + { + if(in.read_nb(inElem)) //disregard input data + { + c_occupancy++; + c_max_occupancy = (c_occupancy > c_max_occupancy) ? c_occupancy : c_max_occupancy; + } + } + + // OUTPUT + if(write) + { + if(out.write_nb(0)) //write dummy output data + { + c_occupancy--; + } + } + + // Update output status registers + occupancy = c_occupancy; + max_occupancy = c_max_occupancy; +} + +#endif diff --git a/src/finn/builder/build_dataflow_config.py b/src/finn/builder/build_dataflow_config.py index d6437a2e5c..c5e3995943 100644 --- a/src/finn/builder/build_dataflow_config.py +++ b/src/finn/builder/build_dataflow_config.py @@ -265,6 +265,9 @@ class DataflowBuildConfig: #: for each FIFO. auto_fifo_depths: Optional[bool] = True + # Enables experimental live FIFO sizing + live_fifo_sizing: Optional[bool] = False + #: Whether FIFO nodes with depth larger than 32768 will be split. 
#: Allow to configure very large FIFOs in the folding_config_file. split_large_fifos: Optional[bool] = False diff --git a/src/finn/builder/build_dataflow_steps.py b/src/finn/builder/build_dataflow_steps.py index 5163b2dbdb..fe0cb68a88 100644 --- a/src/finn/builder/build_dataflow_steps.py +++ b/src/finn/builder/build_dataflow_steps.py @@ -549,6 +549,29 @@ def step_set_fifo_depths(model: ModelWrapper, cfg: DataflowBuildConfig): `GiveUniqueNodeNames`. """ + # Experimental live FIFO-sizing, overwrites all other FIFO-related behavior + if cfg.live_fifo_sizing: + # Create all DWCs and FIFOs normally + model = model.transform(InsertDWC()) + model = model.transform(InsertFIFO(create_shallow_fifos=True)) + + # Specialize FIFOs to HLS back-end instead of default RTL back-end + for node in model.get_nodes_by_op_type("StreamingFIFO"): + node_inst = getCustomOp(node) + node_inst.set_nodeattr("preferred_impl_style", "hls") + model = model.transform(SpecializeLayers(cfg._resolve_fpga_part())) + + # Fix impl_style attribute + for node in model.get_nodes_by_op_type("StreamingFIFO_hls"): + node_inst = getCustomOp(node) + node_inst.set_nodeattr("impl_style", "virtual") + + # Clean up model + model = model.transform(GiveUniqueNodeNames()) + model = model.transform(GiveReadableTensorNames()) + + return model + if cfg.auto_fifo_depths: if cfg.auto_fifo_strategy == "characterize": model = model.transform(InsertDWC()) diff --git a/src/finn/custom_op/fpgadataflow/hls/__init__.py b/src/finn/custom_op/fpgadataflow/hls/__init__.py index 405c47a08d..d753fffa2e 100644 --- a/src/finn/custom_op/fpgadataflow/hls/__init__.py +++ b/src/finn/custom_op/fpgadataflow/hls/__init__.py @@ -47,6 +47,7 @@ StreamingDataWidthConverter_hls, ) from finn.custom_op.fpgadataflow.hls.streamingeltwise_hls import StreamingEltwise_hls +from finn.custom_op.fpgadataflow.hls.streamingfifo_hls import StreamingFIFO_hls from finn.custom_op.fpgadataflow.hls.streamingmaxpool_hls import StreamingMaxPool_hls from 
finn.custom_op.fpgadataflow.hls.thresholding_hls import Thresholding_hls from finn.custom_op.fpgadataflow.hls.tlastmarker_hls import TLastMarker_hls @@ -74,6 +75,7 @@ custom_op["StreamingEltwise_hls"] = StreamingEltwise_hls custom_op["StreamingDataWidthConverter_hls"] = StreamingDataWidthConverter_hls custom_op["StreamingMaxPool_hls"] = StreamingMaxPool_hls +custom_op["StreamingFIFO_hls"] = StreamingFIFO_hls custom_op["Thresholding_hls"] = Thresholding_hls custom_op["TLastMarker_hls"] = TLastMarker_hls custom_op["UpsampleNearestNeighbour_hls"] = UpsampleNearestNeighbour_hls diff --git a/src/finn/custom_op/fpgadataflow/hls/streamingfifo_hls.py b/src/finn/custom_op/fpgadataflow/hls/streamingfifo_hls.py new file mode 100644 index 0000000000..f17bc48fc6 --- /dev/null +++ b/src/finn/custom_op/fpgadataflow/hls/streamingfifo_hls.py @@ -0,0 +1,208 @@ +# Copyright (C) 2023, Advanced Micro Devices, Inc. +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# * Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# * Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# * Neither the name of FINN nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import numpy as np +import os +from qonnx.core.datatype import DataType + +from finn.custom_op.fpgadataflow.hlsbackend import HLSBackend +from finn.custom_op.fpgadataflow.streamingfifo import StreamingFIFO +from finn.util.data_packing import npy_to_rtlsim_input, rtlsim_output_to_npy + + +class StreamingFIFO_hls(StreamingFIFO, HLSBackend): + """HLS-based FIFO implementation. Currently only used as virtual FIFO for live FIFO-sizing.""" + + def get_nodeattr_types(self): + my_attrs = { + # Only purpose of this CustomOp for now: virtual FIFO for live FIFO-sizing + "impl_style": ("s", False, "virtual", {"virtual"}), + } + my_attrs.update(StreamingFIFO.get_nodeattr_types(self)) + my_attrs.update(HLSBackend.get_nodeattr_types(self)) + return my_attrs + + def global_includes(self): + self.code_gen_dict["$GLOBALS$"] = ['#include "virtual_fifo.hpp"'] + + def defines(self, var): + numReps = 1 + width = self.get_instream_width() + self.code_gen_dict["$DEFINES$"] = [ + "#define Width %d " % width, + "#define numReps %d" % numReps, + ] + + def strm_decl(self): + self.code_gen_dict["$STREAMDECLARATIONS$"] = [] + self.code_gen_dict["$STREAMDECLARATIONS$"].append( + 'hls::stream> in0_{} ("in0_{}");'.format( + self.get_instream_width(), self.hls_sname(), self.hls_sname() + ) + ) + self.code_gen_dict["$STREAMDECLARATIONS$"].append( + 'hls::stream> out_{} ("out_{}");'.format( + self.get_outstream_width(), self.hls_sname(), self.hls_sname() + ) + ) + + def 
docompute(self): + self.code_gen_dict["$DOCOMPUTE$"] = [ + """ + #pragma HLS dataflow disable_start_propagation + + static hls::stream> in_fifo; + static hls::stream>::type> out_fifo; + #pragma HLS stream variable=in_fifo depth=2 + #pragma HLS stream variable=out_fifo depth=2 + + // AXI-Stream -> FIFO + move(in0_%s, in_fifo); + + // Main + VirtualFIFO(in_fifo, out_fifo, mode, depth, occupancy, max_occupancy); + + // FIFO -> AXI-Stream + move(out_fifo, out_%s); + """ + % (self.hls_sname(), self.hls_sname()) + ] + + def blackboxfunction(self): + in_packed_bits = self.get_instream_width() + in_packed_hls_type = "ap_uint<%d>" % in_packed_bits + out_packed_bits = self.get_outstream_width() + out_packed_hls_type = "ap_uint<%d>" % out_packed_bits + self.code_gen_dict["$BLACKBOXFUNCTION$"] = [ + """void %s(hls::stream<%s > &in0_%s, hls::stream<%s > &out_%s, ap_uint<32> mode, + ap_uint<32> depth, ap_uint<32> &occupancy, ap_uint<32> &max_occupancy)""" + % ( + self.onnx_node.name, + in_packed_hls_type, + self.hls_sname(), + out_packed_hls_type, + self.hls_sname(), + ) + ] + + def pragmas(self): + self.code_gen_dict["$PRAGMAS$"] = [ + "#pragma HLS INTERFACE axis port=in0_" + self.hls_sname() + ] + self.code_gen_dict["$PRAGMAS$"].append( + "#pragma HLS INTERFACE axis port=out_" + self.hls_sname() + ) + self.code_gen_dict["$PRAGMAS$"].append("#pragma HLS INTERFACE s_axilite port=mode") + self.code_gen_dict["$PRAGMAS$"].append("#pragma HLS INTERFACE s_axilite port=depth") + self.code_gen_dict["$PRAGMAS$"].append("#pragma HLS INTERFACE s_axilite port=occupancy") + self.code_gen_dict["$PRAGMAS$"].append("#pragma HLS INTERFACE s_axilite port=max_occupancy") + self.code_gen_dict["$PRAGMAS$"].append("#pragma HLS INTERFACE ap_ctrl_none port=return") + + def get_verilog_top_module_intf_names(self): + # Overload default HWCustomOp implementation to add axilite control IF + intf_names = super().get_verilog_top_module_intf_names() + intf_names["axilite"] = ["s_axi_control"] + return 
intf_names + + def execute_node(self, context, graph): + mode = self.get_nodeattr("exec_mode") + node = self.onnx_node + exp_shape = self.get_normal_input_shape() + folded_ishape = self.get_folded_input_shape() + + # TODO ensure codegen dir exists + if mode == "cppsim": + code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim") + elif mode == "rtlsim": + code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen") + else: + raise Exception( + """Invalid value for attribute exec_mode! Is currently set to: {} + has to be set to one of the following value ("cppsim", "rtlsim")""".format( + mode + ) + ) + + inp = context[node.input[0]] + assert str(inp.dtype) == "float32", "Input datatype is not float32" + assert inp.shape == tuple(exp_shape), "Input shape does not match expected shape." + + if self.get_input_datatype() == DataType["BIPOLAR"]: + # store bipolar activations as binary + inp = (inp + 1) / 2 + export_idt = DataType["BINARY"] + else: + export_idt = self.get_input_datatype() + # reshape input into folded shape + reshaped_input = inp.reshape(folded_ishape) + # make copy before saving array + reshaped_input = reshaped_input.copy() + np.save(os.path.join(code_gen_dir, "input_0.npy"), reshaped_input) + + if mode == "cppsim": + output = inp + output = np.asarray([output], dtype=np.float32).reshape(*exp_shape) + context[node.output[0]] = output + + elif mode == "rtlsim": + sim = self.get_rtlsim() + nbits = self.get_instream_width() + rtlsim_inp = npy_to_rtlsim_input( + "{}/input_0.npy".format(code_gen_dir), export_idt, nbits + ) + super().reset_rtlsim(sim) + super().toggle_clk(sim) + rtlsim_output = self.rtlsim(sim, rtlsim_inp) + odt = export_idt + target_bits = odt.bitwidth() + packed_bits = self.get_outstream_width() + out_npy_path = "{}/output.npy".format(code_gen_dir) + out_shape = self.get_folded_output_shape() + rtlsim_output_to_npy( + rtlsim_output, out_npy_path, odt, out_shape, packed_bits, target_bits + ) + # load and reshape output + output = 
np.load(out_npy_path) + output = np.asarray([output], dtype=np.float32).reshape(exp_shape) + context[node.output[0]] = output + else: + raise Exception( + """Invalid value for attribute exec_mode! Is currently set to: {} + has to be set to "rtlsim" """.format( + mode + ) + ) + # binary -> bipolar if needed + if self.get_output_datatype() == DataType["BIPOLAR"]: + out = context[node.output[0]] + out = 2 * out - 1 + context[node.output[0]] = out + assert context[node.output[0]].shape == tuple( + exp_shape + ), """Output + shape doesn't match expected shape, should be same as input shape""" diff --git a/src/finn/transformation/fpgadataflow/templates.py b/src/finn/transformation/fpgadataflow/templates.py index ccf4e7a943..5c521720c4 100644 --- a/src/finn/transformation/fpgadataflow/templates.py +++ b/src/finn/transformation/fpgadataflow/templates.py @@ -92,9 +92,9 @@ custom_zynq_shell_template = """ set FREQ_MHZ %s set NUM_AXILITE %d -if {$NUM_AXILITE > 9} { - error "Maximum 10 AXI-Lite interfaces supported" -} +#if {$NUM_AXILITE > 9} { +# error "Maximum 10 AXI-Lite interfaces supported" +#} set NUM_AXIMM %d set BOARD %s set FPGA_PART %s From 7c04eb6e628cd21820bcef02ff624edfa3702b22 Mon Sep 17 00:00:00 2001 From: Felix Jentzsch Date: Fri, 14 Feb 2025 16:31:29 +0000 Subject: [PATCH 034/125] Integrate instrumentation into ZynqBuild --- custom_hls/instrumentation.template.cpp | 307 ++++++++++++++++++ custom_hls/instrumentation_sim.template.tcl | 67 ++++ custom_hls/instrumentation_tb.template.sv | 172 ++++++++++ src/finn/builder/build_dataflow_config.py | 4 + src/finn/builder/build_dataflow_steps.py | 22 ++ .../transformation/fpgadataflow/floorplan.py | 8 +- .../fpgadataflow/instrumentation.py | 203 ++++++++++++ .../fpgadataflow/make_zynq_proj.py | 88 ++++- 8 files changed, 860 insertions(+), 11 deletions(-) create mode 100644 custom_hls/instrumentation.template.cpp create mode 100644 custom_hls/instrumentation_sim.template.tcl create mode 100644 
custom_hls/instrumentation_tb.template.sv create mode 100644 src/finn/transformation/fpgadataflow/instrumentation.py diff --git a/custom_hls/instrumentation.template.cpp b/custom_hls/instrumentation.template.cpp new file mode 100644 index 0000000000..bf15d77a87 --- /dev/null +++ b/custom_hls/instrumentation.template.cpp @@ -0,0 +1,307 @@ +/****************************************************************************** + * Copyright (c) 2023, Xilinx, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, + * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION). 
HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, + * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR + * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF + * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + ******************************************************************************* + * @brief Instrumentation wrapper module for FINN IP characterization. + * @author Thomas B. Preusser + * @details + * Instrumentation wrapper intercepting the feature map input to and + * the feature map output from a FINN IP to measure processing latency and + * initiation interval in terms of clock cycles. The most recent readings + * are exposed via AXI-light. + * This wrapper can run the FINN IP detached from an external data source + * and sink by feeding LFSR-generated data and sinking the output without + * backpressure. + * This module is currently not integrated with the FINN compiler. It must + * be instantiated and integrated with the rest of the system in a manual + * process. + * + * @param PENDING maximum number of feature maps in the FINN dataflow pipeline + * @param ILEN number of input transactions per IFM + * @param OLEN number of output transactions per OFM + * @param KO number of subwords within output payload vector + * @param TI type of input payload vector + * @param TO type of output payload vector + *******************************************************************************/ + + #include + #include + #include + #include + + // Module Configuration + constexpr unsigned PENDING = @PENDING@; // Max. 
feature maps in flight + constexpr unsigned ILEN = @ILEN@; // Input words per IFM + constexpr unsigned OLEN = @OLEN@; // Output words per OFM + constexpr unsigned KO = @KO@; // Subwords within OFM transaction word + using TI = @TI@; // IFM transaction word + using TO = @TO@; // OFM transaction word + + //--------------------------------------------------------------------------- + // Utility Functions + static constexpr unsigned clog2 (unsigned x) { return x<2? 0 : 1+clog2((x+1)/2); } + static constexpr unsigned clog2nz(unsigned x) { return std::max(1u, clog2(x)); } + + template + static void move( + hls::stream &src, + hls::stream &dst + ) { + #pragma HLS pipeline II=1 style=flp + dst.write(src.read()); + } + + template + static void move( + hls::stream> &src, + hls::stream &dst + ) { + #pragma HLS pipeline II=1 style=flp + dst.write(src.read().data); + } + + template + class Payload { + public: + using type = T; + }; + template + class Payload> { + public: + using type = T; + }; + + /** + * Computes a checksum over a forwarded stream assumed to carry frames of + * N words further subdivided into K subwords. + * - Subword slicing can be customized typically by using a lambda. + * The provided DefaultSubwordSlicer assumes an `ap_(u)int`-like word + * type with a member `width` and a range-based slicing operator. It + * further assumes a little-endian arrangement of subwords within words + * for the canonical subword stream order. + * - Subwords wider than 23 bits are folded using bitwise XOR across + * slices of 23 bits starting from the LSB. + * - The folded subword values are weighted according to their position + * in the stream relative to the start of frame by a periodic weight + * sequence 1, 2, 3, ... + * - The weighted folded subword values are reduced to a checksum by an + * accumulation module 2^24. + * - A checksum is emitted for each completed frame. It is the concatenation + * of an 8-bit (modulo 256) frame counter and the 24-bit frame checksum. 
+ */ + template + class DefaultSubwordSlicer { + static_assert(T::width%K == 0, "Word size must be subword multiple."); + static constexpr unsigned W = T::width/K; + public: + ap_uint operator()(T const &x, unsigned const j) const { + #pragma HLS inline + return x((j+1)*W-1, j*W); + } + }; + + //--------------------------------------------------------------------------- + // Instrumentation Core + template< + unsigned PENDING, + unsigned ILEN, + unsigned OLEN, + unsigned KO, + typename TI, + typename TO + > + void instrument( + hls::stream &finnix, + hls::stream &finnox, + ap_uint<32> cfg, // [0] - 0:hold, 1:lfsr; [31:16] - LFSR seed + ap_uint<32> &status, // [0] - timestamp overflow; [1] - timestamp underflow + ap_uint<32> &latency, + ap_uint<32> &interval, + ap_uint<32> &checksum, + ap_uint<32> &min_latency + ) { + #pragma HLS pipeline II=1 style=flp + + // Timestamp Management State + using clock_t = ap_uint<32>; + static clock_t cnt_clk = 0; + #pragma HLS reset variable=cnt_clk + hls::stream timestamps; + #pragma HLS stream variable=timestamps depth=PENDING + static bool timestamp_ovf = false; + static bool timestamp_unf = false; + #pragma HLS reset variable=timestamp_ovf + #pragma HLS reset variable=timestamp_unf + + // Input Feed & Generation + constexpr unsigned LFSR_WIDTH = (TI::width+15)/16 * 16; + static ap_uint icnt = 0; + static ap_uint lfsr; + #pragma HLS reset variable=icnt + #pragma HLS reset variable=lfsr off + if(!finnix.full()) { + + bool const first = icnt == 0; + bool wr; + if(first) { + // Start of new feature map + wr = cfg[0]; + for(unsigned i = 0; i < LFSR_WIDTH; i += 16) { + #pragma HLS unroll + lfsr(15+i, i) = cfg(31, 16) ^ (i>>4)*33331; + } + } + else { + // Advance LFSR + wr = true; + for(unsigned i = 0; i < LFSR_WIDTH; i += 16) { + #pragma HLS unroll + lfsr(15+i, i) = (lfsr(15+i, i) >> 1) ^ ap_uint<16>(lfsr[i]? 
0 : 0x8805); + } + } + + if(wr) { + finnix.write_nb(lfsr); + if(first) timestamp_ovf |= !timestamps.write_nb(cnt_clk); + icnt = icnt == ILEN-1? decltype(icnt)(0) : decltype(icnt)(icnt + 1); + } + } + + // Output Tracking + static ap_uint ocnt = 0; + #pragma HLS reset variable=ocnt + static clock_t ts1 = 0; // last output timestamp + static clock_t last_latency = 0; + static clock_t last_interval = 0; + static clock_t cur_min_latency = ~0; + #pragma HLS reset variable=ts1 + #pragma HLS reset variable=last_latency + #pragma HLS reset variable=last_interval + #pragma HLS reset variable=cur_min_latency + + static ap_uint<8> pkts = 0; + #pragma HLS reset variable=pkts + static ap_uint< 2> coeff[3]; + static ap_uint<24> psum; + static ap_uint<32> last_checksum = 0; + #pragma HLS reset variable=coeff off + #pragma HLS reset variable=psum off + #pragma HLS reset variable=last_checksum + + TO oval; + if(finnox.read_nb(oval)) { + // Start of new output feature map + if(ocnt == 0) { + for(unsigned i = 0; i < 3; i++) coeff[i] = i+1; + psum = 0; + } + + // Update checksum + for(unsigned j = 0; j < KO; j++) { + #pragma HLS unroll + auto const v0 = DefaultSubwordSlicer()(oval, j); + constexpr unsigned W = 1 + (decltype(v0)::width-1)/23; + ap_uint v = v0; + ap_uint< 23> w = 0; + for(unsigned k = 0; k < W; k++) w ^= v(23*k+22, 23*k); + psum += (coeff[j%3][1]? (w, ap_uint<1>(0)) : ap_uint<24>(0)) + (coeff[j%3][0]? 
w : ap_uint<23>(0)); + } + + // Re-align coefficients + for(unsigned j = 0; j < 3; j++) { + #pragma HLS unroll + ap_uint<3> const cc = coeff[j] + ap_uint<3>(KO%3); + coeff[j] = cc(1, 0) + cc[2]; + } + + // Track frame position + if(ocnt != OLEN-1) ocnt++; + else { + clock_t ts0; + if(!timestamps.read_nb(ts0)) timestamp_unf = true; + else { + last_latency = cnt_clk - ts0; // completion - start + last_interval = cnt_clk - ts1; // completion - previous completion + cur_min_latency = std::min(cur_min_latency, last_latency); + ts1 = cnt_clk; // mark completion ^ + } + ocnt = 0; + + last_checksum = (pkts++, psum); + } + } + + // Advance Timestamp Counter + cnt_clk++; + + // Copy Status Outputs + status = timestamp_ovf | (timestamp_unf << 1); + latency = last_latency; + interval = last_interval; + checksum = last_checksum; + min_latency = cur_min_latency; + + } // instrument() + + void instrumentation_wrapper( + hls::stream &finnix, + hls::stream &finnox, + ap_uint<32> cfg, + ap_uint<32> &status, + ap_uint<32> &latency, + ap_uint<32> &interval, + ap_uint<32> &checksum, + ap_uint<32> &min_latency + ) { + #pragma HLS interface axis port=finnix + #pragma HLS interface axis port=finnox + #pragma HLS interface s_axilite bundle=ctrl port=cfg + #pragma HLS interface s_axilite bundle=ctrl port=status + #pragma HLS interface s_axilite bundle=ctrl port=latency + #pragma HLS interface s_axilite bundle=ctrl port=interval + #pragma HLS interface s_axilite bundle=ctrl port=checksum + #pragma HLS interface s_axilite bundle=ctrl port=min_latency + #pragma HLS interface ap_ctrl_none port=return + + #pragma HLS dataflow disable_start_propagation + static hls::stream finnix0; + static hls::stream::type> finnox0; + #pragma HLS stream variable=finnix0 depth=2 + #pragma HLS stream variable=finnox0 depth=2 + + // AXI-Stream -> FIFO + move(finnox, finnox0); + + // Main + instrument(finnix0, finnox0, cfg, status, latency, interval, checksum, min_latency); + + // FIFO -> AXI-Stream + move(finnix0, 
finnix); + + } // instrumentation_wrapper diff --git a/custom_hls/instrumentation_sim.template.tcl b/custom_hls/instrumentation_sim.template.tcl new file mode 100644 index 0000000000..4875d799e2 --- /dev/null +++ b/custom_hls/instrumentation_sim.template.tcl @@ -0,0 +1,67 @@ +# Copyright (c) 2023 Advanced Micro Devices, Inc. +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# * Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# * Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# * Neither the name of AMD nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +set fpga_part @FPGA_PART@ +#set output_root ".." 
+# path to IP folder for instrumentation wrapper, change as needed +#set instrwrp_ip_dir "$output_root/instrumentation_wrapper/project_instrwrap/sol1/impl/ip" +# path to IP folder for FINN IP, change as needed +#set finn_ip_dir "$output_root/stitched_ip/ip" + +create_project -force instr_sim_proj instr_sim_proj/ -part $fpga_part +create_bd_design "dut" +update_compile_order -fileset sources_1 +#set_property ip_repo_paths [list $instrwrp_ip_dir] [current_project] +set_property ip_repo_paths [concat [get_property ip_repo_paths [current_project]] @IP_DIRS_STR@] [current_project] +update_ip_catalog + + +create_bd_cell -type ip -vlnv xilinx_finn:finn:finn_design:1.0 finn_design_0 +create_bd_cell -type ip -vlnv xilinx.com:hls:instrumentation_wrapper:1.0 instrumentation_wrap_0 +connect_bd_intf_net [get_bd_intf_pins instrumentation_wrap_0/finnix] [get_bd_intf_pins finn_design_0/s_axis_0] +connect_bd_intf_net [get_bd_intf_pins finn_design_0/m_axis_0] [get_bd_intf_pins instrumentation_wrap_0/finnox] +make_bd_intf_pins_external [get_bd_intf_pins instrumentation_wrap_0/s_axi_ctrl] +make_bd_pins_external [get_bd_pins instrumentation_wrap_0/ap_clk] +make_bd_pins_external [get_bd_pins instrumentation_wrap_0/ap_rst_n] +connect_bd_net [get_bd_ports ap_clk_0] [get_bd_pins finn_design_0/ap_clk] +connect_bd_net [get_bd_ports ap_rst_n_0] [get_bd_pins finn_design_0/ap_rst_n] + +save_bd_design + +update_compile_order -fileset sources_1 +make_wrapper -files [get_files instr_sim_proj/instr_sim_proj.srcs/sources_1/bd/dut/dut.bd] -top +add_files -norecurse instr_sim_proj/instr_sim_proj.gen/sources_1/bd/dut/hdl/dut_wrapper.v + +set_property SOURCE_SET sources_1 [get_filesets sim_1] +add_files -fileset sim_1 ./instrwrap_testbench.sv +update_compile_order -fileset sim_1 + +set_property synth_checkpoint_mode None [get_files instr_sim_proj/instr_sim_proj.srcs/sources_1/bd/dut/dut.bd] +generate_target Simulation [get_files instr_sim_proj/instr_sim_proj.srcs/sources_1/bd/dut/dut.bd] 
+launch_simulation -simset sim_1 -mode behavioral +run all diff --git a/custom_hls/instrumentation_tb.template.sv b/custom_hls/instrumentation_tb.template.sv new file mode 100644 index 0000000000..933104c623 --- /dev/null +++ b/custom_hls/instrumentation_tb.template.sv @@ -0,0 +1,172 @@ +// Copyright (c) 2023 Advanced Micro Devices, Inc. +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// +// * Redistributions of source code must retain the above copyright notice, this +// list of conditions and the following disclaimer. +// +// * Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// +// * Neither the name of AMD nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +// DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +// FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +// DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +// SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ + +module tb #( + // sampling period (in cycles) for reading instrumentation wrapper registers + // TODO: make configurable or adjust automatically? + int unsigned INSTR_READ_PERIOD = 10000, + // 16-bit LFSR seed for generating fixed random data + int unsigned LFSR_SEED = 1 +)(); + + +// Clock & Reset +logic ap_clk = 0; +always #5ns ap_clk = !ap_clk; +logic ap_rst_n = 0; +uwire ap_rst = !ap_rst_n; + +// wires for instrumentation wrapper AXI lite interface +logic [31:0] axilite_ctrl_araddr = 'x; +uwire axilite_ctrl_arready; +logic axilite_ctrl_arvalid = 0; +logic [31:0] axilite_ctrl_awaddr = 'x; +uwire axilite_ctrl_awready; +logic axilite_ctrl_awvalid = 0; +uwire axilite_ctrl_bready = 1; +uwire [1:0]axilite_ctrl_bresp; +uwire axilite_ctrl_bvalid; +uwire [31:0]axilite_ctrl_rdata; +logic axilite_ctrl_rready = 1; +uwire [1:0]axilite_ctrl_rresp; +uwire axilite_ctrl_rvalid; +logic [31:0] axilite_ctrl_wdata = 'x; +uwire axilite_ctrl_wready; +uwire [3:0]axilite_ctrl_wstrb = 4'b1111; +logic axilite_ctrl_wvalid = 0; + + + + +dut_wrapper dut_wrapper_inst ( + .ap_clk_0(ap_clk), .ap_rst_n_0(ap_rst_n), + .s_axi_ctrl_0_araddr(axilite_ctrl_araddr), + .s_axi_ctrl_0_arready(axilite_ctrl_arready), + .s_axi_ctrl_0_arvalid(axilite_ctrl_arvalid), + .s_axi_ctrl_0_awaddr(axilite_ctrl_awaddr), + .s_axi_ctrl_0_awready(axilite_ctrl_awready), + .s_axi_ctrl_0_awvalid(axilite_ctrl_awvalid), + .s_axi_ctrl_0_bready(axilite_ctrl_bready), + .s_axi_ctrl_0_bresp(axilite_ctrl_bresp), + .s_axi_ctrl_0_bvalid(axilite_ctrl_bvalid), + .s_axi_ctrl_0_rdata(axilite_ctrl_rdata), + .s_axi_ctrl_0_rready(axilite_ctrl_rready), + .s_axi_ctrl_0_rresp(axilite_ctrl_rresp), + .s_axi_ctrl_0_rvalid(axilite_ctrl_rvalid), + .s_axi_ctrl_0_wdata(axilite_ctrl_wdata), + .s_axi_ctrl_0_wready(axilite_ctrl_wready), + .s_axi_ctrl_0_wstrb(axilite_ctrl_wstrb), + .s_axi_ctrl_0_wvalid(axilite_ctrl_wvalid) +); + +//--------------------------------------------------------------------------- + +initial begin + $timeformat(-9, 2, " 
ns"); + // perform reset + repeat(100) @(posedge ap_clk); + ap_rst_n <= 1; + $display("Reset complete"); + repeat(100) @(posedge ap_clk); + // instrumentation wrapper configuration: + // set up LFSR seed + start data generation + output sink + axilite_ctrl_awaddr <= 'h10; + axilite_ctrl_awvalid <= 1; + axilite_ctrl_wdata <= (LFSR_SEED << 16) | 'b11; + axilite_ctrl_wvalid <= 1; + repeat(8) begin + @(posedge ap_clk); + if(axilite_ctrl_wready && axilite_ctrl_awready) break; + end + axilite_ctrl_wvalid <= 0; + axilite_ctrl_awvalid <= 0; + axilite_ctrl_awaddr <= 'x; + axilite_ctrl_wdata <= 'x; + while(1) begin + axilite_ctrl_araddr <= 'h18; + axilite_ctrl_arvalid <= 1; + repeat(8) begin + @(posedge ap_clk); + if(axilite_ctrl_rvalid) begin + $display("[t=%0t] STATUS_I = %0d", $time, axilite_ctrl_rdata); + break; + end + end + axilite_ctrl_araddr <= 'h20; + axilite_ctrl_arvalid <= 1; + repeat(8) begin + @(posedge ap_clk); + if(axilite_ctrl_rvalid) begin + $display("[t=%0t] STATUS_O = %0d", $time, axilite_ctrl_rdata); + break; + end + end + axilite_ctrl_araddr <= 'h28; + axilite_ctrl_arvalid <= 1; + repeat(8) begin + @(posedge ap_clk); + if(axilite_ctrl_rvalid) begin + $display("[t=%0t] LATENCY = %0d", $time, axilite_ctrl_rdata); + break; + end + end + axilite_ctrl_araddr <= 'h38; + axilite_ctrl_arvalid <= 1; + repeat(8) begin + @(posedge ap_clk); + if(axilite_ctrl_rvalid) begin + $display("[t=%0t] INTERVAL = %0d", $time, axilite_ctrl_rdata); + break; + end + end + axilite_ctrl_araddr <= 'h48; + axilite_ctrl_arvalid <= 1; + repeat(8) begin + @(posedge ap_clk); + if(axilite_ctrl_rvalid) begin + $display("[t=%0t] CHECKSUM = %8x", $time, axilite_ctrl_rdata); + if(axilite_ctrl_rdata) begin + $display("Nonzero checksum detected, stopping simulation"); + $finish; + // TODO: simulate for configurable number of frames, like this: + // if(axilite_ctrl_rdata[31:24] == 47) begin + // $display("Frame number 48 detected, stopping simulation"); + // $finish; + // end + end + break; + 
end + end + axilite_ctrl_arvalid <= 0; + repeat(INSTR_READ_PERIOD) @(posedge ap_clk); + end +end + + +endmodule : tb diff --git a/src/finn/builder/build_dataflow_config.py b/src/finn/builder/build_dataflow_config.py index d6437a2e5c..08545ebc14 100644 --- a/src/finn/builder/build_dataflow_config.py +++ b/src/finn/builder/build_dataflow_config.py @@ -314,6 +314,10 @@ class DataflowBuildConfig: #: debug signals in the generated hardware) enable_hw_debug: Optional[bool] = False + #: Whether the accelerator will be simulated and synthesized with an + #: instrumentation wrapper attached to accurately measure performance. + enable_instrumentation: Optional[bool] = False + #: Whether pdb postmortem debuggig will be launched when the build fails enable_build_pdb_debug: Optional[bool] = True diff --git a/src/finn/builder/build_dataflow_steps.py b/src/finn/builder/build_dataflow_steps.py index 5163b2dbdb..a4481ed778 100644 --- a/src/finn/builder/build_dataflow_steps.py +++ b/src/finn/builder/build_dataflow_steps.py @@ -89,6 +89,7 @@ from finn.transformation.fpgadataflow.hlssynth_ip import HLSSynthIP from finn.transformation.fpgadataflow.insert_dwc import InsertDWC from finn.transformation.fpgadataflow.insert_fifo import InsertFIFO +from finn.transformation.fpgadataflow.insert_tlastmarker import InsertTLastMarker from finn.transformation.fpgadataflow.make_pynq_driver import MakePYNQDriver from finn.transformation.fpgadataflow.make_zynq_proj import ZynqBuild from finn.transformation.fpgadataflow.minimize_accumulator_width import ( @@ -644,6 +645,26 @@ def step_create_stitched_ip(model: ModelWrapper, cfg: DataflowBuildConfig): """Create stitched IP for a graph after all HLS IP blocks have been generated. 
Depends on the DataflowOutputType.STITCHED_IP output product.""" + # introduce tLAST marker, required for instrumentation + if cfg.enable_instrumentation: + model = model.transform( + InsertTLastMarker( + # only insert marker on output (input TLAST is ignored for these use-cases anyway) + both=False, + # use ap_axiu instead of qdma_axis + external=False, + # static number of iterations (based on what the compiler/folding sets up) + dynamic=False, + ) + ) + # give a proper name to the inserted node, important for codegen + # TODO: deal with multi-I/O accelerators? + model.graph.node[-1].name = "TLastMarker_0" + # re-run codegen and HLS IP gen, will affect only the new TLastMarker layer assuming + # all other IPs have been generated already + model = model.transform(PrepareIP(cfg._resolve_fpga_part(), cfg._resolve_hls_clk_period())) + model = model.transform(HLSSynthIP()) + if DataflowOutputType.STITCHED_IP in cfg.generate_outputs: stitched_ip_dir = cfg.output_dir + "/stitched_ip" model = model.transform( @@ -806,6 +827,7 @@ def step_synthesize_bitfile(model: ModelWrapper, cfg: DataflowBuildConfig): cfg.board, cfg.synth_clk_period_ns, cfg.enable_hw_debug, + cfg.enable_instrumentation, partition_model_dir=partition_model_dir, ) ) diff --git a/src/finn/transformation/fpgadataflow/floorplan.py b/src/finn/transformation/fpgadataflow/floorplan.py index b24145afcb..7d93ff88fc 100644 --- a/src/finn/transformation/fpgadataflow/floorplan.py +++ b/src/finn/transformation/fpgadataflow/floorplan.py @@ -99,9 +99,13 @@ def apply(self, model): # if we have SLR assignment already. 
use that if node_slr != -1: continue + # if available, use the SLR of the preceding node srcnode = model.find_producer(node.input[0]) - node_slr = getCustomOp(srcnode).get_nodeattr("slr") - node_inst.set_nodeattr("slr", node_slr) + if srcnode is not None: + node_slr = getCustomOp(srcnode).get_nodeattr("slr") + node_inst.set_nodeattr("slr", node_slr) + else: + node_inst.set_nodeattr("slr", default_slr) if unassigned_nodes > 0: warnings.warn( diff --git a/src/finn/transformation/fpgadataflow/instrumentation.py b/src/finn/transformation/fpgadataflow/instrumentation.py new file mode 100644 index 0000000000..7f37c5ed14 --- /dev/null +++ b/src/finn/transformation/fpgadataflow/instrumentation.py @@ -0,0 +1,203 @@ +import numpy as np +import os +import subprocess +from qonnx.custom_op.registry import getCustomOp +from qonnx.transformation.base import Transformation + +from finn.custom_op.fpgadataflow.templates import ipgentcl_template +from finn.util.basic import make_build_dir +from finn.util.hls import CallHLS + + +# TODO: duplicate function from make_zynq_proj.py +def collect_ip_dirs(model, ipstitch_path): + # collect list of all IP dirs + ip_dirs = [] + need_memstreamer = False + for node in model.graph.node: + node_inst = getCustomOp(node) + ip_dir_value = node_inst.get_nodeattr("ip_path") + assert os.path.isdir( + ip_dir_value + ), """The directory that should + contain the generated ip blocks doesn't exist.""" + ip_dirs += [ip_dir_value] + if node.op_type.startswith("MVAU") or node.op_type == "Thresholding_hls": + if node_inst.get_nodeattr("mem_mode") == "internal_decoupled": + need_memstreamer = True + ip_dirs += [ipstitch_path + "/ip"] + if need_memstreamer: + # add RTL streamer IP + ip_dirs.append("$::env(FINN_ROOT)/finn-rtllib/memstream") + return ip_dirs + + +class GenerateInstrumentationIP(Transformation): + def __init__( + self, + fpga_part, + clk_period_ns, + format="ip", # "ip" for Vivado (Zynq) or "xo" for Vitis (Alveo/Versal) + ): + super().__init__() + 
self.fpga_part = fpga_part + self.clk_period_ns = clk_period_ns + self.format = format + + def apply(self, model): + # Create directory for code-gen and HLS of instrumentation IP + wrapper_output_dir = make_build_dir(prefix="code_gen_ipgen_Instrumentation_") + model.set_metadata_prop("instrumentation_ipgen", wrapper_output_dir) + + # conservative max for pending feature maps: number of layers + pending = len(model.graph.node) + # query the parallelism-dependent folded input shape from the + # node consuming the graph input + inp_name = model.graph.input[0].name + inp_node = getCustomOp(model.find_consumer(inp_name)) + inp_shape_folded = list(inp_node.get_folded_input_shape()) + inp_stream_width = inp_node.get_instream_width_padded() + # number of beats per input is given by product of folded input + # shape except the last dim (which is the stream width) + ilen = np.prod(inp_shape_folded[:-1]) + ti = "ap_uint<%d>" % inp_stream_width + # perform the same for the output + out_name = model.graph.output[0].name + out_node = getCustomOp(model.find_producer(out_name)) + out_shape_folded = list(out_node.get_folded_output_shape()) + out_stream_width = out_node.get_outstream_width_padded() + olen = np.prod(out_shape_folded[:-1]) + to = "ap_uint<%d>" % out_stream_width + ko = out_shape_folded[-1] + # fill out instrumentation wrapper template + with open( + os.path.join(os.environ["FINN_ROOT"], "custom_hls", "instrumentation.template.cpp"), "r" + ) as f: + instrwrp_cpp = f.read() + instrwrp_cpp = instrwrp_cpp.replace("@PENDING@", str(pending)) + instrwrp_cpp = instrwrp_cpp.replace("@ILEN@", str(ilen)) + instrwrp_cpp = instrwrp_cpp.replace("@OLEN@", str(olen)) + instrwrp_cpp = instrwrp_cpp.replace("@TI@", str(ti)) + instrwrp_cpp = instrwrp_cpp.replace("@TO@", str(to)) + instrwrp_cpp = instrwrp_cpp.replace("@KO@", str(ko)) + with open(wrapper_output_dir + "/top_instrumentation_wrapper.cpp", "w") as f: + f.write(instrwrp_cpp) + # fill out HLS synthesis tcl template + prjname = 
"project_instrwrap" + ipgentcl = ipgentcl_template + ipgentcl = ipgentcl.replace("$PROJECTNAME$", prjname) + ipgentcl = ipgentcl.replace("$HWSRCDIR$", wrapper_output_dir) + ipgentcl = ipgentcl.replace("$TOPFXN$", "instrumentation_wrapper") + ipgentcl = ipgentcl.replace("$FPGAPART$", self.fpga_part) + ipgentcl = ipgentcl.replace("$CLKPERIOD$", str(self.clk_period_ns)) + ipgentcl = ipgentcl.replace("$DEFAULT_DIRECTIVES$", "") + if self.format == "xo": + # use Vitis RTL kernel (.xo) output instead of IP-XACT + ipgentcl = ipgentcl.replace("$EXTRA_DIRECTIVES$", "config_export -format xo") + ipgentcl = ipgentcl.replace( + "export_design -format ip_catalog", "export_design -format xo" + ) + else: + ipgentcl = ipgentcl.replace("$EXTRA_DIRECTIVES$", "") + with open(wrapper_output_dir + "/hls_syn.tcl", "w") as f: + f.write(ipgentcl) + # build bash script to launch HLS synth and call it + code_gen_dir = wrapper_output_dir + builder = CallHLS() + builder.append_tcl(code_gen_dir + "/hls_syn.tcl") + builder.set_ipgen_path(code_gen_dir + "/{}".format(prjname)) + builder.build(code_gen_dir) + ipgen_path = builder.ipgen_path + assert os.path.isdir(ipgen_path), "HLS IPGen failed: %s not found" % (ipgen_path) + ip_path = ipgen_path + "/sol1/impl/ip" + assert os.path.isdir(ip_path), "HLS IPGen failed: %s not found. 
Check log under %s" % ( + ip_path, + code_gen_dir, + ) + if self.format == "xo": + assert False, "Not implemented" + # TODO: export for use in VitisBuild or VersalBuild + # xo_dir = self.output_dir + "/xo" + # xo_dir = str(os.path.abspath(xo_dir)) + # os.makedirs(xo_dir, exist_ok=True) + # xo_path = code_gen_dir + "/{}/sol1/impl/export.xo".format(prjname) + # xo_instr_path = xo_dir + "/instrumentation_wrapper.xo" + # shutil.copy(xo_path, xo_instr_path) + else: + # shutil.move(ip_path, self.output_dir) + pass + + return (model, False) + + +class PrepareInstrumentationSim(Transformation): + def __init__(self, fpga_part): + super().__init__() + self.fpga_part = fpga_part + + def apply(self, model): + # Create directory for simulation of instrumentation IP + FINN IP + sim_output_dir = make_build_dir(prefix="sim_Instrumentation_") + model.set_metadata_prop("instrumentation_sim", sim_output_dir) + + # check if instrumentation IP was generated + instr_ip_dir = model.get_metadata_prop("instrumentation_ipgen") + if instr_ip_dir is None or (not os.path.isdir(instr_ip_dir)): + raise Exception( + "Instrumentation IP not generated, run GenerateInstrumentationIP first." 
+ ) + + # TODO: Support simulation with AXI-lite control interfaces (e.g., for dynamic pipelines) + # fill in testbench template + with open( + os.path.join(os.environ["FINN_ROOT"], "custom_hls", "instrumentation_tb.template.sv"), + "r", + ) as f: + testbench_sv = f.read() + with open(sim_output_dir + "/instrwrap_testbench.sv", "w") as f: + f.write(testbench_sv) + # fill in testbench project creator template + with open( + os.path.join(os.environ["FINN_ROOT"], "custom_hls", "instrumentation_sim.template.tcl"), + "r", + ) as f: + testbench_tcl = f.read() + + # collect ip repo paths for finn accelerator sub cores so Vivado can find them + ipstitch_path = model.get_metadata_prop("vivado_stitch_proj") + ip_dirs = ["list"] + ip_dirs += collect_ip_dirs(model, ipstitch_path) + ip_dirs += [instr_ip_dir] + ip_dirs_str = "[%s]" % (" ".join(ip_dirs)) + testbench_tcl = testbench_tcl.replace("@FPGA_PART@", self.fpga_part) + testbench_tcl = testbench_tcl.replace("@IP_DIRS_STR@", ip_dirs_str) + with open(sim_output_dir + "/make_instrwrap_sim_proj.tcl", "w") as f: + f.write(testbench_tcl) + + return (model, False) + + +class RunInstrumentationSim(Transformation): + def __init__(self): + super().__init__() + + def apply(self, model): + sim_output_dir = model.get_metadata_prop("instrumentation_sim") + if sim_output_dir is None or (not os.path.isdir(sim_output_dir)): + raise Exception( + "Instrumentation sim not prepared, run PrepareInstrumentationSim first." 
+ ) + + # Prepare bash script + bash_script = os.getcwd() + "/report_power.sh" + with open(bash_script, "w") as script: + script.write("#!/bin/bash\n") + script.write("cd %s\n" % (sim_output_dir)) + script.write("vivado -mode batch -source make_instrwrap_sim_proj.tcl\n") + + # Run script + print("Running Vivado simulation of instrumentation wrapper") + sub_proc = subprocess.Popen(["bash", bash_script]) + sub_proc.communicate() + + return (model, False) diff --git a/src/finn/transformation/fpgadataflow/make_zynq_proj.py b/src/finn/transformation/fpgadataflow/make_zynq_proj.py index 63ce2d3cbf..8192c09bae 100644 --- a/src/finn/transformation/fpgadataflow/make_zynq_proj.py +++ b/src/finn/transformation/fpgadataflow/make_zynq_proj.py @@ -45,6 +45,7 @@ from finn.transformation.fpgadataflow.insert_dwc import InsertDWC from finn.transformation.fpgadataflow.insert_fifo import InsertFIFO from finn.transformation.fpgadataflow.insert_iodma import InsertIODMA +from finn.transformation.fpgadataflow.instrumentation import GenerateInstrumentationIP from finn.transformation.fpgadataflow.prepare_ip import PrepareIP from finn.transformation.fpgadataflow.specialize_layers import SpecializeLayers from finn.util.basic import make_build_dir, pynq_native_port_width, pynq_part_map @@ -102,6 +103,42 @@ def apply(self, model): axilite_idx = 0 global_clk_ns = 0 instance_names = {} + + # instantiate instrumentation IP if it was generated + instr_ip_dir = model.get_metadata_prop("instrumentation_ipgen") + if instr_ip_dir is not None and os.path.isdir(instr_ip_dir): + use_instrumentation = True + # update IP repository + config.append( + "set_property ip_repo_paths " + "[concat [get_property ip_repo_paths [current_project]] [list %s]] " + "[current_project]" % instr_ip_dir + ) + config.append("update_ip_catalog -rebuild -scan_changes") + # create instance + config.append( + "create_bd_cell -type ip -vlnv %s %s" + % ("xilinx.com:hls:instrumentation_wrapper:1.0", "instrumentation_wrap_0") + ) + # 
connect clock % reset + config.append( + "connect_bd_net [get_bd_pins instrumentation_wrap_0/ap_clk] " + "[get_bd_pins smartconnect_0/aclk]" + ) + config.append( + "connect_bd_net [get_bd_pins instrumentation_wrap_0/ap_rst_n] " + "[get_bd_pins smartconnect_0/aresetn]" + ) + # connect AXI-lite control interface + config.append( + "connect_bd_intf_net [get_bd_intf_pins instrumentation_wrap_0/s_axi_ctrl] " + "[get_bd_intf_pins axi_interconnect_0/M%02d_AXI]" % (axilite_idx) + ) + config.append("assign_axi_addr_proc instrumentation_wrap_0/s_axi_ctrl") + axilite_idx += 1 + else: + use_instrumentation = False + for node in model.graph.node: assert node.op_type == "StreamingDataflowPartition", "Invalid link graph" sdp_node = getCustomOp(node) @@ -150,7 +187,8 @@ def apply(self, model): # define kernel instances # name kernels connected to graph inputs as idmaxx # name kernels connected to graph outputs as odmaxx - if (producer is None) or (consumer == []): + # do not expect IDMA/ODMA when instrumentation is enabled + if not use_instrumentation and ((producer is None) or (consumer == [])): # TODO not a good way of checking for external inp&out # should look at the list of top-level in/out instead if producer is None: @@ -228,6 +266,26 @@ def apply(self, model): ) ) + # connect first/last dataflow partition to instrumentation wrapper + if use_instrumentation: + if producer is None: + config.append( + "connect_bd_intf_net [get_bd_intf_pins %s/s_axis_0] " + "[get_bd_intf_pins instrumentation_wrap_0/finnix]" + % (instance_names[node.name]) + ) + if consumer == []: + config.append( + "connect_bd_intf_net [get_bd_intf_pins %s/m_axis_0] " + "[get_bd_intf_pins instrumentation_wrap_0/finnox]" + % (instance_names[node.name]) + ) + + # TODO: WORKAROUND, do not instantiate smartconnect when not needed! 
+ if use_instrumentation: + config.append("delete_bd_objs [get_bd_cells smartconnect_0]") + aximm_idx = 1 + # create a temporary folder for the project vivado_pynq_proj_dir = make_build_dir(prefix="vivado_zynq_proj_") model.set_metadata_prop("vivado_pynq_proj", vivado_pynq_proj_dir) @@ -305,6 +363,7 @@ def __init__( platform, period_ns, enable_debug=False, + enable_instrumentation=False, partition_model_dir=None, ): super().__init__() @@ -313,19 +372,27 @@ def __init__( self.period_ns = period_ns self.platform = platform self.enable_debug = enable_debug + self.enable_instrumentation = enable_instrumentation self.partition_model_dir = partition_model_dir def apply(self, model): # first infer layouts model = model.transform(InferDataLayouts()) # prepare at global level, then break up into kernels - prep_transforms = [ - InsertIODMA(self.axi_port_width), - InsertDWC(), - SpecializeLayers(self.fpga_part), - Floorplan(), - CreateDataflowPartition(partition_model_dir=self.partition_model_dir), - ] + if self.enable_instrumentation: + prep_transforms = [ + GenerateInstrumentationIP(self.fpga_part, self.period_ns), + Floorplan(), + CreateDataflowPartition(partition_model_dir=self.partition_model_dir), + ] + else: + prep_transforms = [ + InsertIODMA(self.axi_port_width), + InsertDWC(), + SpecializeLayers(self.fpga_part), + Floorplan(), + CreateDataflowPartition(partition_model_dir=self.partition_model_dir), + ] for trn in prep_transforms: model = model.transform(trn) model = model.transform(GiveUniqueNodeNames()) @@ -337,7 +404,10 @@ def apply(self, model): sdp_node = getCustomOp(sdp_node) dataflow_model_filename = sdp_node.get_nodeattr("model") kernel_model = ModelWrapper(dataflow_model_filename) - kernel_model = kernel_model.transform(InsertFIFO()) + # InsertFIFO at this stage interferes with tLastMarker + # TODO: is this really needed here at all? 
+ if not self.enable_instrumentation: + kernel_model = kernel_model.transform(InsertFIFO()) kernel_model = kernel_model.transform(SpecializeLayers(self.fpga_part)) kernel_model = kernel_model.transform(GiveUniqueNodeNames(prefix)) kernel_model.save(dataflow_model_filename) From 419e18f65d67e3b8f498a9f4620123f1170582bf Mon Sep 17 00:00:00 2001 From: Felix Jentzsch Date: Wed, 19 Feb 2025 16:10:48 +0000 Subject: [PATCH 035/125] Nest AXI interconnects if required --- .../fpgadataflow/make_zynq_proj.py | 94 +++++++++++++++++-- 1 file changed, 87 insertions(+), 7 deletions(-) diff --git a/src/finn/transformation/fpgadataflow/make_zynq_proj.py b/src/finn/transformation/fpgadataflow/make_zynq_proj.py index 8192c09bae..5e86a58b6e 100644 --- a/src/finn/transformation/fpgadataflow/make_zynq_proj.py +++ b/src/finn/transformation/fpgadataflow/make_zynq_proj.py @@ -27,6 +27,7 @@ # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+import math import os import subprocess from qonnx.core.modelwrapper import ModelWrapper @@ -100,6 +101,9 @@ def apply(self, model): idma_idx = 0 odma_idx = 0 aximm_idx = 0 + nested_interconnect_count = 0 + master_axilite_idx = 0 + axilite_interconnect_idx = 0 axilite_idx = 0 global_clk_ns = 0 instance_names = {} @@ -132,13 +136,62 @@ def apply(self, model): # connect AXI-lite control interface config.append( "connect_bd_intf_net [get_bd_intf_pins instrumentation_wrap_0/s_axi_ctrl] " - "[get_bd_intf_pins axi_interconnect_0/M%02d_AXI]" % (axilite_idx) + "[get_bd_intf_pins axi_interconnect_0/M%02d_AXI]" % (master_axilite_idx) ) config.append("assign_axi_addr_proc instrumentation_wrap_0/s_axi_ctrl") - axilite_idx += 1 + master_axilite_idx += 1 else: use_instrumentation = False + # instantiate nested AXI interconnects if required + # only the nested interconnects and all interfaces connected before this line + # will be connected to the original (master) interconnect + total_axilite_count = 0 + for node in model.graph.node: + sdp_node = getCustomOp(node) + dataflow_model_filename = sdp_node.get_nodeattr("model") + kernel_model = ModelWrapper(dataflow_model_filename) + ifnames = eval(kernel_model.get_metadata_prop("vivado_stitch_ifnames")) + total_axilite_count += len(ifnames["axilite"]) + if total_axilite_count > (64 - master_axilite_idx): + nested_interconnect_count = math.ceil(total_axilite_count / 64.0) + for i in range(1, nested_interconnect_count + 1): + # create instance + config.append( + "create_bd_cell -type ip -vlnv $interconnect_vlnv axi_interconnect_%d" % (i) + ) + # configure instance + config.append( + "set_property -dict [list CONFIG.NUM_MI %d] [get_bd_cells axi_interconnect_%d]" + % (max(64, total_axilite_count), i) + ) + # connect to master interconnect + config.append( + "connect_bd_intf_net [get_bd_intf_pins axi_interconnect_0/M%02d_AXI] -boundary_type upper [get_bd_intf_pins axi_interconnect_%d/S00_AXI]" + % (master_axilite_idx, i) + ) + # connect 
clocks TODO: suppport zynq_7000 + config.append( + "apply_bd_automation -rule xilinx.com:bd_rule:clkrst -config { Clk {/zynq_ps/pl_clk0} Freq {} Ref_Clk0 {} Ref_Clk1 {} Ref_Clk2 {}} [get_bd_pins axi_interconnect_%d/ACLK]" + % (i) + ) + config.append( + "apply_bd_automation -rule xilinx.com:bd_rule:clkrst -config { Clk {/zynq_ps/pl_clk0} Freq {} Ref_Clk0 {} Ref_Clk1 {} Ref_Clk2 {}} [get_bd_pins axi_interconnect_%d/S00_ACLK]" + % (i) + ) + # connect reset + config.append( + "connect_bd_net [get_bd_pins axi_interconnect_%d/ARESETN] [get_bd_pins axi_interconnect_0/ARESETN]" + % (i) + ) + master_axilite_idx += 1 + total_axilite_count = min(0, total_axilite_count - 64) + + assert total_axilite_count == 0, "Not all AXI-lite interfaces connected!" + + # start populating the first nested interconnect + axilite_interconnect_idx = 1 + for node in model.graph.node: assert node.op_type == "StreamingDataflowPartition", "Invalid link graph" sdp_node = getCustomOp(node) @@ -211,8 +264,13 @@ def apply(self, model): assert axilite_intf_name is not None config.append( "connect_bd_intf_net [get_bd_intf_pins %s/%s] " - "[get_bd_intf_pins axi_interconnect_0/M%02d_AXI]" - % (instance_names[node.name], axilite_intf_name, axilite_idx) + "[get_bd_intf_pins axi_interconnect_%d/M%02d_AXI]" + % ( + instance_names[node.name], + axilite_intf_name, + axilite_interconnect_idx, + axilite_idx, + ) ) # assign_bd_address with appropriate range/offset config.append( @@ -221,6 +279,11 @@ def apply(self, model): aximm_idx += 1 axilite_idx += 1 + if axilite_idx == 64: + axilite_interconnect_idx += 1 + axilite_idx = 0 + if axilite_interconnect_idx == 0: + master_axilite_idx += 1 else: instance_names[node.name] = node.name config.append( @@ -230,8 +293,13 @@ def apply(self, model): for axilite_intf_name in ifnames["axilite"]: config.append( "connect_bd_intf_net [get_bd_intf_pins %s/%s] " - "[get_bd_intf_pins axi_interconnect_0/M%02d_AXI]" - % (instance_names[node.name], axilite_intf_name, axilite_idx) + 
"[get_bd_intf_pins axi_interconnect_%d/M%02d_AXI]" + % ( + instance_names[node.name], + axilite_intf_name, + axilite_interconnect_idx, + axilite_idx, + ) ) # assign_bd_address with appropriate range/offset config.append( @@ -239,6 +307,11 @@ def apply(self, model): % (instance_names[node.name], axilite_intf_name) ) axilite_idx += 1 + if axilite_idx == 64: + axilite_interconnect_idx += 1 + axilite_idx = 0 + if axilite_interconnect_idx == 0: + master_axilite_idx += 1 sdp_node.set_nodeattr("instance_name", instance_names[node.name]) config.append( @@ -286,6 +359,13 @@ def apply(self, model): config.append("delete_bd_objs [get_bd_cells smartconnect_0]") aximm_idx = 1 + # finalize nested interconnect clock TODO: support zynq_7000 + for i in range(1, nested_interconnect_count + 1): + config.append( + "apply_bd_automation -rule xilinx.com:bd_rule:clkrst -config { Clk {/zynq_ps/pl_clk0} } [get_bd_pins axi_interconnect_%d/M*_ACLK]" + % (i) + ) + # create a temporary folder for the project vivado_pynq_proj_dir = make_build_dir(prefix="vivado_zynq_proj_") model.set_metadata_prop("vivado_pynq_proj", vivado_pynq_proj_dir) @@ -300,7 +380,7 @@ def apply(self, model): templates.custom_zynq_shell_template % ( fclk_mhz, - axilite_idx, + master_axilite_idx, aximm_idx, self.platform, pynq_part_map[self.platform], From 5628ab2a1a2505ad4014626e885ddc11c8e59238 Mon Sep 17 00:00:00 2001 From: Felix Jentzsch Date: Wed, 19 Feb 2025 16:25:07 +0000 Subject: [PATCH 036/125] Fix AXI interconnect connection --- src/finn/transformation/fpgadataflow/make_zynq_proj.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/finn/transformation/fpgadataflow/make_zynq_proj.py b/src/finn/transformation/fpgadataflow/make_zynq_proj.py index 5e86a58b6e..8c990a8b3d 100644 --- a/src/finn/transformation/fpgadataflow/make_zynq_proj.py +++ b/src/finn/transformation/fpgadataflow/make_zynq_proj.py @@ -191,6 +191,8 @@ def apply(self, model): # start populating the first nested interconnect 
axilite_interconnect_idx = 1 + else: + axilite_idx = master_axilite_idx for node in model.graph.node: assert node.op_type == "StreamingDataflowPartition", "Invalid link graph" From 0c57d1b373527337f80ede1714a739cb83771bad Mon Sep 17 00:00:00 2001 From: Felix Jentzsch Date: Wed, 19 Feb 2025 22:19:16 +0000 Subject: [PATCH 037/125] Make floorplan partitioning of AXI-lite interfaces more consistent --- .../transformation/fpgadataflow/floorplan.py | 39 ++++++++++++------- .../fpgadataflow/make_zynq_proj.py | 4 +- 2 files changed, 28 insertions(+), 15 deletions(-) diff --git a/src/finn/transformation/fpgadataflow/floorplan.py b/src/finn/transformation/fpgadataflow/floorplan.py index 7d93ff88fc..0b806ff44a 100644 --- a/src/finn/transformation/fpgadataflow/floorplan.py +++ b/src/finn/transformation/fpgadataflow/floorplan.py @@ -134,25 +134,27 @@ def apply(self, model): ) non_dma_nodes = list(filter(lambda x: x not in dyn_tlastmarker_nodes, non_dma_nodes)) + # assign every DMA node to its own partition for node in dma_nodes: node_inst = getCustomOp(node) node_inst.set_nodeattr("partition_id", partition_cnt) partition_cnt += 1 + # assign every dynamic tLastMarker node to its own partition for node in dyn_tlastmarker_nodes: node_inst = getCustomOp(node) node_inst.set_nodeattr("partition_id", partition_cnt) partition_cnt += 1 + # handle remaining nodes for node in non_dma_nodes: pre_node = model.find_producer(node.input[0]) node_inst = getCustomOp(node) if pre_node not in non_dma_nodes: - # input node + # input node -> start new partition node_inst.set_nodeattr("partition_id", partition_cnt) partition_cnt += 1 continue - elif not ( node.op_type.startswith("MVAU") and node_inst.get_nodeattr("mem_mode") is not None @@ -160,25 +162,36 @@ def apply(self, model): ): pre_nodes = model.find_direct_predecessors(node) else: + # exception for external weight MVAU: only consider primary input + # TODO: (why) is this necessary? should we consider such exceptions for other cases? 
pre_nodes = [pre_node] + axilite_intf_name = node_inst.get_verilog_top_module_intf_names()["axilite"] + if len(axilite_intf_name) != 0: + # This node has an AXI-Lite interface -> start new partition + node_inst.set_nodeattr("partition_id", partition_cnt) + partition_cnt += 1 + continue + + # examine all predecessor nodes to determine partition id for this node node_slr = node_inst.get_nodeattr("slr") + slr_mismatch_count = 0 for pre_node in pre_nodes: pre_inst = getCustomOp(pre_node) pre_slr = pre_inst.get_nodeattr("slr") if node_slr == pre_slr: - axilite_intf_name = pre_inst.get_verilog_top_module_intf_names()["axilite"] - if len(axilite_intf_name) != 0: - node_inst.set_nodeattr("partition_id", partition_cnt) - partition_cnt += 1 - else: - partition_id = pre_inst.get_nodeattr("partition_id") - node_inst.set_nodeattr("partition_id", partition_id) - + # Default case -> assign to same partition as predecessor + partition_id = pre_inst.get_nodeattr("partition_id") + node_inst.set_nodeattr("partition_id", partition_id) + break else: - # no matching, new partition - node_inst.set_nodeattr("partition_id", partition_cnt) - partition_cnt += 1 + # SLR mismatch with predecessor, can't assign same partition + slr_mismatch_count += 1 + + if slr_mismatch_count == len(pre_nodes): + # SLR mismatch with ALL predecessors -> start new partition + node_inst.set_nodeattr("partition_id", partition_cnt) + partition_cnt += 1 # save the updated floorplan floorplan = model.analysis(floorplan_params) diff --git a/src/finn/transformation/fpgadataflow/make_zynq_proj.py b/src/finn/transformation/fpgadataflow/make_zynq_proj.py index 8c990a8b3d..4d2ee3d50e 100644 --- a/src/finn/transformation/fpgadataflow/make_zynq_proj.py +++ b/src/finn/transformation/fpgadataflow/make_zynq_proj.py @@ -163,7 +163,7 @@ def apply(self, model): # configure instance config.append( "set_property -dict [list CONFIG.NUM_MI %d] [get_bd_cells axi_interconnect_%d]" - % (max(64, total_axilite_count), i) + % (min(64, 
total_axilite_count), i) ) # connect to master interconnect config.append( @@ -185,7 +185,7 @@ def apply(self, model): % (i) ) master_axilite_idx += 1 - total_axilite_count = min(0, total_axilite_count - 64) + total_axilite_count = max(0, total_axilite_count - 64) assert total_axilite_count == 0, "Not all AXI-lite interfaces connected!" From 684459c76189c22b9aa004a7c0028ee1c77a5a0d Mon Sep 17 00:00:00 2001 From: Felix Jentzsch Date: Wed, 19 Feb 2025 22:56:06 +0000 Subject: [PATCH 038/125] Add GPIO IP for reset --- .../transformation/fpgadataflow/make_zynq_proj.py | 14 +++++++++++--- src/finn/transformation/fpgadataflow/templates.py | 11 +++++++++++ 2 files changed, 22 insertions(+), 3 deletions(-) diff --git a/src/finn/transformation/fpgadataflow/make_zynq_proj.py b/src/finn/transformation/fpgadataflow/make_zynq_proj.py index 4d2ee3d50e..456441bca8 100644 --- a/src/finn/transformation/fpgadataflow/make_zynq_proj.py +++ b/src/finn/transformation/fpgadataflow/make_zynq_proj.py @@ -94,6 +94,7 @@ def __init__(self, platform, enable_debug=False): super().__init__() self.platform = platform self.enable_debug = 1 if enable_debug else 0 + self.enable_gpio_reset = 0 def apply(self, model): # create a config file and empty list of xo files @@ -112,6 +113,12 @@ def apply(self, model): instr_ip_dir = model.get_metadata_prop("instrumentation_ipgen") if instr_ip_dir is not None and os.path.isdir(instr_ip_dir): use_instrumentation = True + + # instantiate GPIO IP to trigger reset + self.enable_gpio_reset = 1 + # in the template this will connect to first port of interconnect_0 + master_axilite_idx += 1 + # update IP repository config.append( "set_property ip_repo_paths " @@ -170,7 +177,7 @@ def apply(self, model): "connect_bd_intf_net [get_bd_intf_pins axi_interconnect_0/M%02d_AXI] -boundary_type upper [get_bd_intf_pins axi_interconnect_%d/S00_AXI]" % (master_axilite_idx, i) ) - # connect clocks TODO: suppport zynq_7000 + # connect clocks/reset TODO: suppport zynq_7000 
config.append( "apply_bd_automation -rule xilinx.com:bd_rule:clkrst -config { Clk {/zynq_ps/pl_clk0} Freq {} Ref_Clk0 {} Ref_Clk1 {} Ref_Clk2 {}} [get_bd_pins axi_interconnect_%d/ACLK]" % (i) @@ -179,7 +186,7 @@ def apply(self, model): "apply_bd_automation -rule xilinx.com:bd_rule:clkrst -config { Clk {/zynq_ps/pl_clk0} Freq {} Ref_Clk0 {} Ref_Clk1 {} Ref_Clk2 {}} [get_bd_pins axi_interconnect_%d/S00_ACLK]" % (i) ) - # connect reset + # connect reset TODO: probably unneeded config.append( "connect_bd_net [get_bd_pins axi_interconnect_%d/ARESETN] [get_bd_pins axi_interconnect_0/ARESETN]" % (i) @@ -361,7 +368,7 @@ def apply(self, model): config.append("delete_bd_objs [get_bd_cells smartconnect_0]") aximm_idx = 1 - # finalize nested interconnect clock TODO: support zynq_7000 + # finalize nested interconnect clock/reset TODO: support zynq_7000 for i in range(1, nested_interconnect_count + 1): config.append( "apply_bd_automation -rule xilinx.com:bd_rule:clkrst -config { Clk {/zynq_ps/pl_clk0} } [get_bd_pins axi_interconnect_%d/M*_ACLK]" @@ -388,6 +395,7 @@ def apply(self, model): pynq_part_map[self.platform], config, self.enable_debug, + self.enable_gpio_reset, ) ) diff --git a/src/finn/transformation/fpgadataflow/templates.py b/src/finn/transformation/fpgadataflow/templates.py index ccf4e7a943..0f6ba7c3c4 100644 --- a/src/finn/transformation/fpgadataflow/templates.py +++ b/src/finn/transformation/fpgadataflow/templates.py @@ -218,6 +218,17 @@ ] } +# set up GPIO to trigger reset +if {%d == 1} { + create_bd_cell -type ip -vlnv xilinx.com:ip:axi_gpio:2.0 axi_gpio_0 + set_property -dict [list CONFIG.C_ALL_OUTPUTS {1} CONFIG.C_DOUT_DEFAULT {0x00000001} CONFIG.C_GPIO_WIDTH {1}] [get_bd_cells axi_gpio_0] + connect_bd_intf_net [get_bd_intf_pins axi_gpio_0/S_AXI] -boundary_type upper [get_bd_intf_pins axi_interconnect_0/M00_AXI] + assign_axi_addr_proc axi_gpio_0/S_AXI + connect_bd_net [get_bd_pins axi_gpio_0/s_axi_aclk] [get_bd_pins axi_interconnect_0/ACLK] + connect_bd_net 
[get_bd_pins axi_gpio_0/s_axi_aresetn] [get_bd_pins axi_interconnect_0/ARESETN] + connect_bd_net [get_bd_pins axi_gpio_0/gpio_io_o] [get_bd_pins rst_zynq_ps_*/aux_reset_in] +} + #finalize clock and reset connections for interconnects if {$ZYNQ_TYPE == "zynq_us+"} { apply_bd_automation -rule xilinx.com:bd_rule:clkrst -config { Clk {/zynq_ps/pl_clk0} } [get_bd_pins axi_interconnect_0/M*_ACLK] From 8d454886c16f7495106d4ec477c54f5ba99bcb3d Mon Sep 17 00:00:00 2001 From: Felix Jentzsch Date: Thu, 20 Feb 2025 07:55:52 +0000 Subject: [PATCH 039/125] Remove unneeded connect_bd_net --- src/finn/transformation/fpgadataflow/make_zynq_proj.py | 5 ----- 1 file changed, 5 deletions(-) diff --git a/src/finn/transformation/fpgadataflow/make_zynq_proj.py b/src/finn/transformation/fpgadataflow/make_zynq_proj.py index 456441bca8..d462dc9d6b 100644 --- a/src/finn/transformation/fpgadataflow/make_zynq_proj.py +++ b/src/finn/transformation/fpgadataflow/make_zynq_proj.py @@ -186,11 +186,6 @@ def apply(self, model): "apply_bd_automation -rule xilinx.com:bd_rule:clkrst -config { Clk {/zynq_ps/pl_clk0} Freq {} Ref_Clk0 {} Ref_Clk1 {} Ref_Clk2 {}} [get_bd_pins axi_interconnect_%d/S00_ACLK]" % (i) ) - # connect reset TODO: probably unneeded - config.append( - "connect_bd_net [get_bd_pins axi_interconnect_%d/ARESETN] [get_bd_pins axi_interconnect_0/ARESETN]" - % (i) - ) master_axilite_idx += 1 total_axilite_count = max(0, total_axilite_count - 64) From 960a7f46a48519d4d63183a4de234bd0b12857bf Mon Sep 17 00:00:00 2001 From: Felix Jentzsch Date: Thu, 20 Feb 2025 18:01:02 +0000 Subject: [PATCH 040/125] Fix redundant bd_automation --- src/finn/transformation/fpgadataflow/make_zynq_proj.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/src/finn/transformation/fpgadataflow/make_zynq_proj.py b/src/finn/transformation/fpgadataflow/make_zynq_proj.py index d462dc9d6b..846d95a11b 100644 --- a/src/finn/transformation/fpgadataflow/make_zynq_proj.py +++ 
b/src/finn/transformation/fpgadataflow/make_zynq_proj.py @@ -182,10 +182,6 @@ def apply(self, model): "apply_bd_automation -rule xilinx.com:bd_rule:clkrst -config { Clk {/zynq_ps/pl_clk0} Freq {} Ref_Clk0 {} Ref_Clk1 {} Ref_Clk2 {}} [get_bd_pins axi_interconnect_%d/ACLK]" % (i) ) - config.append( - "apply_bd_automation -rule xilinx.com:bd_rule:clkrst -config { Clk {/zynq_ps/pl_clk0} Freq {} Ref_Clk0 {} Ref_Clk1 {} Ref_Clk2 {}} [get_bd_pins axi_interconnect_%d/S00_ACLK]" - % (i) - ) master_axilite_idx += 1 total_axilite_count = max(0, total_axilite_count - 64) From 76ef35d988611261142395633eb2eeb28886f9c8 Mon Sep 17 00:00:00 2001 From: Felix Jentzsch Date: Fri, 21 Feb 2025 11:12:12 +0000 Subject: [PATCH 041/125] Remove tcl.collectionResultDisplayLimit --- src/finn/transformation/fpgadataflow/templates.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/finn/transformation/fpgadataflow/templates.py b/src/finn/transformation/fpgadataflow/templates.py index 0f6ba7c3c4..d9040d83f2 100644 --- a/src/finn/transformation/fpgadataflow/templates.py +++ b/src/finn/transformation/fpgadataflow/templates.py @@ -100,6 +100,10 @@ set FPGA_PART %s create_project finn_zynq_link ./ -part $FPGA_PART +# Prevent limitation on number of elements for string representations of Vivado collections of objects +# Otherwise we might run into the default limit of 500 if we have many IP_REPO_PATHS +set_param tcl.collectionResultDisplayLimit 0 + # set board part repo paths to find PYNQ-Z1/Z2 set paths_prop [get_property BOARD_PART_REPO_PATHS [current_project]] set paths_param [get_param board.repoPaths] From 9c6c3cd8439ee162c3c5f153ec2123ea6591211a Mon Sep 17 00:00:00 2001 From: Felix Jentzsch Date: Sat, 22 Feb 2025 22:46:47 +0000 Subject: [PATCH 042/125] Add driver for iterative live FIFO-sizing --- driver/iterative_live_fifosizing_driver.ipynb | 833 ++++++++++++++++++ 1 file changed, 833 insertions(+) create mode 100644 driver/iterative_live_fifosizing_driver.ipynb diff --git 
a/driver/iterative_live_fifosizing_driver.ipynb b/driver/iterative_live_fifosizing_driver.ipynb new file mode 100644 index 0000000000..83a329d263 --- /dev/null +++ b/driver/iterative_live_fifosizing_driver.ipynb @@ -0,0 +1,833 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "id": "0ee21ecb", + "metadata": {}, + "outputs": [ + { + "data": { + "application/javascript": [ + "\n", + "try {\n", + "require(['notebook/js/codecell'], function(codecell) {\n", + " codecell.CodeCell.options_default.highlight_modes[\n", + " 'magic_text/x-csrc'] = {'reg':[/^%%microblaze/]};\n", + " Jupyter.notebook.events.one('kernel_ready.Kernel', function(){\n", + " Jupyter.notebook.get_cells().map(function(cell){\n", + " if (cell.cell_type == 'code'){ cell.auto_highlight(); } }) ;\n", + " });\n", + "});\n", + "} catch (e) {};\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "application/javascript": [ + "\n", + "try {\n", + "require(['notebook/js/codecell'], function(codecell) {\n", + " codecell.CodeCell.options_default.highlight_modes[\n", + " 'magic_text/x-csrc'] = {'reg':[/^%%pybind11/]};\n", + " Jupyter.notebook.events.one('kernel_ready.Kernel', function(){\n", + " Jupyter.notebook.get_cells().map(function(cell){\n", + " if (cell.cell_type == 'code'){ cell.auto_highlight(); } }) ;\n", + " });\n", + "});\n", + "} catch (e) {};\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "import time\n", + "import json\n", + "import matplotlib as mpl\n", + "import matplotlib.pyplot as plt\n", + "from IPython.display import clear_output\n", + "import numpy as np\n", + "from pynq import Overlay\n", + "\n", + "path = \"bitstreams/resnet50/live_instrumentation\"\n", + "bitstream = path + \"/finn-accel.bit\"\n", + "\n", + "# Program FPGA\n", + "ol = Overlay(bitstream, download=True, device=None)" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "f476fd87", + "metadata": {}, + "outputs": [ + 
{ + "name": "stdout", + "output_type": "stream", + "text": [ + "#FIFO IP detected: 266\n", + "#FIFO width information found: 266\n" + ] + } + ], + "source": [ + "### Sanity checks\n", + "# We expect 3 AXI-Lite peripherals next to the virtual FIFOs: instrumentation_wrap_0, axi_gpio_0 (for reset), zynq_ps\n", + "# We don't expect any additional FINN SDPs with AXI-Lite interface, such as runtime-writable weights\n", + "print(\"#FIFO IP detected: %d\" % (len(ol.ip_dict.keys()) - 3))\n", + "\n", + "# We expect a fifo_widths.json file exported by FINN listing the width of each FIFO, e.g.,\n", + "# {'fifo_widths': {'StreamingFIFO_hls_0': 8, 'StreamingFIFO_hls_1': 32, 'StreamingFIFO_hls_2': 24}}\n", + "with open(path + \"/fifo_widths.json\", \"r\") as f:\n", + " fifo_info = json.load(f)\n", + "print(\"#FIFO width information found: %d\" % len(fifo_info[\"fifo_widths\"]))" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "e419656f", + "metadata": {}, + "outputs": [], + "source": [ + "### Instrumentation driver\n", + "# Register map\n", + "#ap_uint<32> cfg, \t// [0] - 0:hold, 1:lfsr; [31:16] - LFSR seed\n", + "#ap_uint<32> &status,\t// [0] - timestamp overflow; [1] - timestamp underflow\n", + "#ap_uint<32> &latency,\n", + "#ap_uint<32> &interval,\n", + "#ap_uint<32> &checksum,\n", + "#ap_uint<32> &min_latency\n", + "\n", + "def read_register(ol, name):\n", + " return ol.instrumentation_wrap_0.read(offset=ol.ip_dict[\"instrumentation_wrap_0\"][\"registers\"][name][\"address_offset\"])\n", + "\n", + "def write_register(ol, name, value):\n", + " return ol.instrumentation_wrap_0.write(offset=ol.ip_dict[\"instrumentation_wrap_0\"][\"registers\"][name][\"address_offset\"], value=value)\n", + "\n", + "def observe_instrumentation(debug_print=True):\n", + " status_reg = read_register(ol, \"status\")\n", + " chksum_reg = read_register(ol, \"checksum\")\n", + " min_latency = read_register(ol, \"min_latency\")\n", + " latency = read_register(ol, \"latency\")\n", + " 
interval = read_register(ol, \"interval\")\n", + "\n", + " frame = (chksum_reg >> 24) & 0x000000ff\n", + " checksum = chksum_reg & 0x00ffffff\n", + " overflow_err = (status_reg & 0x00000001) != 0\n", + " underflow_err = (status_reg & 0x00000002) != 0\n", + "\n", + " if debug_print:\n", + " print(\"---INSTRUMENTATION_REPORT---\")\n", + " if overflow_err or underflow_err:\n", + " print(\"Status ERROR\")\n", + " print(\"Overflow error: %s\" % overflow_err)\n", + " print(\"Underflow error: %s\" % underflow_err)\n", + " else:\n", + " print(\"Status OK\")\n", + " print(\"Frame number (8-bit): %d\" % frame)\n", + " print(\"Checksum: 0x%06x\" % checksum)\n", + " print(\"Min Latency (cycles): %d\" % min_latency)\n", + " print(\"Latency (cycles): %d\" % latency)\n", + " print(\"Interval (cycles): %d\" % interval)\n", + " print(\"----------------------------\")\n", + "\n", + " return (overflow_err, underflow_err, frame, checksum, min_latency, latency, interval)\n", + "\n", + "def start_accelerator():\n", + " lfsr_seed = 0x00010000 # upper 16 bits\n", + " write_register(ol, \"cfg\", lfsr_seed + 1) # start operation\n", + "\n", + "### Virtual FIFO driver\n", + "# Register map\n", + "mode_offset = 0x10\n", + "depth_offset = 0x18\n", + "occupancy_offset = 0x20\n", + "occupancy_ctrl_offset = 0x24\n", + "max_occupancy_offset = 0x30\n", + "max_occupancy_ctrl_offset = 0x34\n", + "\n", + "def configure_fifo(ol, i, mode, depth = 2):\n", + " ip_name = \"StreamingDataflowPartition_%d\" % i\n", + " getattr(ol, ip_name).write(offset=mode_offset, value = mode)\n", + " getattr(ol, ip_name).write(offset=depth_offset, value = depth)\n", + "\n", + "def total_fifo_size(depths):\n", + " # Assuming FIFO SDP/AXI-Lite interfaces are ordered consistently with FIFO IDs\n", + " total_size_bits = 0\n", + " for i, depth in enumerate(depths):\n", + " total_size_bits += depth * fifo_info[\"fifo_widths\"][\"StreamingFIFO_hls_%d\" % i]\n", + " total_size_kB = total_size_bits / 8.0 / 1000.0\n", + " return 
total_size_kB\n", + "\n", + "### GPIO Reset Driver\n", + "def reset_accelerator():\n", + " ol.axi_gpio_0.write(offset=ol.ip_dict[\"axi_gpio_0\"][\"registers\"][\"GPIO_DATA\"][\"address_offset\"], value=0)" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "2e2a4b88", + "metadata": {}, + "outputs": [], + "source": [ + "### Iterative FIFO-sizing function\n", + "def size_iteratively(start_depth, iteration_runtime, reduction_factor = 0.5):\n", + " num_fifos = len(fifo_info[\"fifo_widths\"])\n", + " fifo_minimum_reached = [False] * num_fifos\n", + " \n", + " if isinstance(start_depth, list):\n", + " # Individual start depth for each FIFO has been supplied\n", + " fifo_depths = start_depth\n", + " else:\n", + " # Initialize all depths to the same start depth\n", + " fifo_depths = [start_depth] * num_fifos\n", + " \n", + " # Reset accelerator and configure FIFOs\n", + " reset_accelerator()\n", + " for i in range(0, num_fifos):\n", + " configure_fifo(ol, i, mode = 1, depth = fifo_depths[i])\n", + "\n", + " # Run once to determine target interval\n", + " start_accelerator()\n", + " time.sleep(1)\n", + " (overflow_err, underflow_err, frame, checksum, min_latency, latency, interval) = observe_instrumentation(False)\n", + " log_total_fifo_size = [int(total_fifo_size(fifo_depths))]\n", + " log_interval = [interval]\n", + " log_min_latency = [min_latency]\n", + " log_latency = [latency]\n", + " target_interval = interval\n", + " \n", + " # Iteratively reduce FIFO depth until all FIFOs are minimized\n", + " iteration = 0\n", + " start_time = time.time()\n", + " while not all(fifo_minimum_reached):\n", + " for fifo_id in range(0, num_fifos):\n", + " if not fifo_minimum_reached[fifo_id]:\n", + " fifo_depth_before = fifo_depths[fifo_id]\n", + " fifo_depths[fifo_id] = int(fifo_depths[fifo_id] * reduction_factor)\n", + "\n", + " # Reset accelerator\n", + " reset_accelerator()\n", + "\n", + " # Configure all FIFOs\n", + " for i in range(0, num_fifos):\n", + " 
configure_fifo(ol, i, mode = 1, depth = fifo_depths[i])\n", + "\n", + " # Start accelerator\n", + " start_accelerator()\n", + "\n", + " # Let it run\n", + " time.sleep(iteration_runtime)\n", + "\n", + " # Check if throughput dropped or deadlock occured \n", + " (overflow_err, underflow_err, frame, checksum, min_latency, latency, interval) = observe_instrumentation(False)\n", + "\n", + " if interval > target_interval or interval == 0 or overflow_err or underflow_err:\n", + " # Revert depth reduction and mark FIFO as minimized\n", + " fifo_depths[fifo_id] = fifo_depth_before\n", + " fifo_minimum_reached[fifo_id] = True\n", + " else:\n", + " log_total_fifo_size.append(int(total_fifo_size(fifo_depths)))\n", + " log_interval.append(interval)\n", + " log_min_latency.append(min_latency)\n", + " log_latency.append(latency) \n", + "\n", + " if fifo_depths[fifo_id] == 1:\n", + " fifo_minimum_reached[fifo_id] = True\n", + "\n", + " # Report status\n", + " clear_output(wait=True)\n", + " print(\"Iteration: %d\" % iteration)\n", + " print(\"Reducing depth of FIFO: %d/%d\" % (fifo_id, num_fifos))\n", + " print(\"Numer of minimized FIFOs: %d/%d\" % (sum(fifo_minimum_reached), num_fifos))\n", + " print(\"Interval: %d\" % log_interval[-1])\n", + " print(\"Min. 
latency / latency: %d/%d\" % (log_min_latency[-1], log_latency[-1]))\n", + " print(\"Total FIFO Size (kB): %d\" % log_total_fifo_size[-1])\n", + "\n", + " iteration += 1\n", + "\n", + " end_time = time.time()\n", + " print(\"Done (%d seconds)\" % int(end_time - start_time))\n", + " \n", + " return fifo_depths, log_total_fifo_size, log_interval, log_min_latency, log_latency" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "2ebb2aa3", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Testing start depth of 64\n", + "---INSTRUMENTATION_REPORT---\n", + "Status OK\n", + "Frame number (8-bit): 0\n", + "Checksum: 0x000000\n", + "Min Latency (cycles): 4294967295\n", + "Latency (cycles): 0\n", + "Interval (cycles): 0\n", + "----------------------------\n", + "Testing start depth of 128\n", + "---INSTRUMENTATION_REPORT---\n", + "Status OK\n", + "Frame number (8-bit): 0\n", + "Checksum: 0x000000\n", + "Min Latency (cycles): 4294967295\n", + "Latency (cycles): 0\n", + "Interval (cycles): 0\n", + "----------------------------\n", + "Testing start depth of 256\n", + "---INSTRUMENTATION_REPORT---\n", + "Status OK\n", + "Frame number (8-bit): 0\n", + "Checksum: 0x000000\n", + "Min Latency (cycles): 4294967295\n", + "Latency (cycles): 0\n", + "Interval (cycles): 0\n", + "----------------------------\n", + "Testing start depth of 512\n", + "---INSTRUMENTATION_REPORT---\n", + "Status OK\n", + "Frame number (8-bit): 0\n", + "Checksum: 0x000000\n", + "Min Latency (cycles): 4294967295\n", + "Latency (cycles): 0\n", + "Interval (cycles): 0\n", + "----------------------------\n", + "Testing start depth of 1024\n", + "---INSTRUMENTATION_REPORT---\n", + "Status OK\n", + "Frame number (8-bit): 0\n", + "Checksum: 0x000000\n", + "Min Latency (cycles): 4294967295\n", + "Latency (cycles): 0\n", + "Interval (cycles): 0\n", + "----------------------------\n", + "Testing start depth of 2048\n", + "---INSTRUMENTATION_REPORT---\n", 
+ "Status OK\n", + "Frame number (8-bit): 0\n", + "Checksum: 0x000000\n", + "Min Latency (cycles): 4294967295\n", + "Latency (cycles): 0\n", + "Interval (cycles): 0\n", + "----------------------------\n", + "Testing start depth of 4096\n", + "---INSTRUMENTATION_REPORT---\n", + "Status OK\n", + "Frame number (8-bit): 0\n", + "Checksum: 0x000000\n", + "Min Latency (cycles): 4294967295\n", + "Latency (cycles): 0\n", + "Interval (cycles): 0\n", + "----------------------------\n", + "Testing start depth of 8192\n", + "---INSTRUMENTATION_REPORT---\n", + "Status OK\n", + "Frame number (8-bit): 108\n", + "Checksum: 0x000000\n", + "Min Latency (cycles): 2548522\n", + "Latency (cycles): 5030984\n", + "Interval (cycles): 903174\n", + "----------------------------\n", + "Testing start depth of 16384\n", + "---INSTRUMENTATION_REPORT---\n", + "Status OK\n", + "Frame number (8-bit): 108\n", + "Checksum: 0x000000\n", + "Min Latency (cycles): 2548522\n", + "Latency (cycles): 7496520\n", + "Interval (cycles): 903174\n", + "----------------------------\n", + "Determined start depth for all FIFOs: 8192\n", + "Determined iteration runtime based on performance: 0.127426 s\n" + ] + } + ], + "source": [ + "### Attempt to determine start depth for all FIFOs automatically\n", + "# If it doesn't find a working setting, start depth must be set manually, potentially on per-FIFO basis\n", + "start_depth = 64\n", + "last_interval = 0\n", + "start_depth_found = False\n", + "\n", + "while not start_depth_found:\n", + " print(\"Testing start depth of %d\" % start_depth)\n", + " reset_accelerator()\n", + "\n", + " # Configure FIFOs\n", + " num_fifos = len(fifo_info[\"fifo_widths\"])\n", + " for i in range(0, num_fifos):\n", + " configure_fifo(ol, i, mode = 1, depth = start_depth)\n", + " \n", + " # Start accelerator and let it run for a long time\n", + " start_accelerator()\n", + " time.sleep(1)\n", + " \n", + " # Examine performance\n", + " (overflow_err, underflow_err, frame, checksum, 
min_latency, latency, interval) = observe_instrumentation()\n", + " if interval > 0 and interval == last_interval and not overflow_err and not underflow_err:\n", + " # Accelerator runs with stable interval, reset to previous start depth\n", + " start_depth_found = True\n", + " start_depth = last_start_depth\n", + " else:\n", + " # Start depth is still too small, increase for next try\n", + " last_start_depth = start_depth\n", + " start_depth = start_depth * 2\n", + " \n", + " last_interval = interval\n", + " \n", + "# Determine runtime per iteration based on performance, so that stable-state is guaranteed\n", + "# Use a simple overestimation for now to be safe\n", + "iteration_runtime = max(0.01, (min_latency * 5) * 10 / 1000 / 1000 / 1000)\n", + "\n", + "print(\"Determined start depth for all FIFOs: %d\" % start_depth)\n", + "print(\"Determined iteration runtime based on performance: %f s\" % iteration_runtime)" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "4ba40f96", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Iteration: 12\n", + "Reducing depth of FIFO: 265/266\n", + "Numer of minimized FIFOs: 266/266\n", + "Interval: 903174\n", + "Min. 
latency / latency: 2549314/2580777\n", + "Total FIFO Size (kB): 244\n", + "Done (389 seconds)\n" + ] + } + ], + "source": [ + "### First pass\n", + "(fifo_depths,\n", + " log_total_fifo_size,\n", + " log_interval,\n", + " log_min_latency,\n", + " log_latency) = size_iteratively(start_depth, iteration_runtime)" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "ebf027a4", + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "iVBORw0KGgoAAAANSUhEUgAAAdgAAAE3CAYAAAAJy1DOAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjUuMSwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy/YYfK9AAAACXBIWXMAAAxOAAAMTgF/d4wjAABNoElEQVR4nO3dd5wU5f3A8c+ze527oyPlhKHpDjZEUEFRMRjLGjTRoCZijMZIJImKbWPys0XjGiOaWGLFCnZAdAELitgQVCAis1KXKkXKHe3a7vz+mNljOa7M7u3eXvm+X6993e48U76znnzveeYpyjRNhBBCCJFcrnQHIIQQQrREkmCFEEKIFJAEK4QQQqSAJFghhBAiBSTBCiGEECkgCVYIIYRIAUmwQgghRApkpDsAIYQQLYPmC/wHGAX0Ao4K+b1LHByTDTwAnAmUAwtDfu+lKQ20kUgNVgghRLK8AZwMrInjGD8QAQ4L+b1HADelIrB0UDKTkxBCiGTSfIEQcG60Bqv5Av2Bh4AuQBbwRMjvfUzzBdoAG4CikN+7O03hpow0EQshhEgZzRdwA5OBMSG/N6j5AnnAPM0XmAdUAtuAv2m+wEhgH3BHyO+dnb6Ik0eaiIUQQqTS4cARwCuaL7AI+BwoAAYAmUAfYGnI7x0M/NHer3OaYk0qqcEKIYRIJQX8GPJ7B1Yv0HyBTljPXycBhPzexZovsBorIc9pxBhTQmqwQgghUul7YK/mC1wW3aD5Av00X6BDyO/9EZiN1YMYzRfoBfS2j2n2pJOTEEKIpNB8gUeB84CuwI/A7pDf28/u5PQg0BNwA1uBX4f83g2aL9AHmAh0BMLAnSG/d2pabiDJJMEKIYQQKSBNxEIIIUQKtKpOTkop0+VK8G8K07ReiR4vhBAiLpFIBNM0VbrjSFSrSrAul4twOJzQsfv+9z9Coy+i0zV/oPOf/5zkyIQQQlSnlErsH+wmQqpjDmX17AlA5dYf0xyJEEKI5kASrEOuNm0ACO/cmd5AhBBCNAuSYB1SmZlk9evLrg8+YNfsFjGLlxBCiBRqVcN03G63megzWIDydetYecZPyR8xgkP/+1gSIxMtkWmaVS8hxMGUUtTV8VQpFTZNs9n2FWq2gadD1qGHkqVp7J0/n/DOnbjbtUt3SKIJikQibNmyhZ07d0pyFaIemZmZ9OzZk6ysrHSHknSSYOPU7sIL2PKvB9j+wgvSm1jUaM2aNbhcLjRNIzMzM93hCNFkmabJtm3bWLt2Lf369Ut3OEknCTZO7UaPZsu/HqBy69Z0hyKaoEgkQmlpKf379ycjQ/73EqI+HTt2ZPv27UQikTqbi5ujlnU3jcCVlwdAxcYf0hyJaIqiTcJKNdux8UI0quj/Ky3xcYokWIciEZPSijCmy01G926Ur1mT7pCEEEI0YZJgHZrw/jI8/zeLVT/uIW/gQCrWr2fntGnpDksIRzRNo0uXLlRUVFRt+/DDD1FKceONNwIwffp0brrppnrPtXHjRkaMGJGyWBMxfvx4XnnlFQAWLFjAsGHDyMv
L48ILL3R0/LRp05g/f36t5StXrmTQoEEce+yxPPvss0mJOV433HADL7/8cq3lp512Gu+8805c57zjjjsoLy9vaGiiFvKQyKEMt9WMURmJ0OWWW9i74Cu2PvgQ7c4/P72BCeFQz549mT59OhdccAEAEydOZPDgwVXlo0aNYtSoUfWep3v37nz00UcpizNeGzZsYObMmTzwwAMAdOvWjYceeoiFCxfy/vvvOzrHtGnTGDx4MMcff3yN5W+88QZDhw7l0UcfPaissrKyUZ6333LLLQwfPpyLLrooac8q77zzTm688cZm34NX8wVCQKn9Arg35Pe+WsN+VwI+rMrlbOCakN9bmaq4JME6lOm2fqErwyaZPQ4hb8hgSmbMZO/CheQde2yaoxNN2aiXR7Fyx8qUnLtv+75Mv2S6o32vuOIKJk6cyAUXXEBxcTHz5s3jkksuYd++fQA899xzvPPOO7zxxhvMmTOH6667jmHDhvHZZ59RWVnJ888/z+DBgwmFQgwePJgff7SmDVVK8Y9//IOpU6fy448/8uSTTzJ79mxmzZpFeXk5r732GkcccQRz5szhxhtv5KuvvgJgyZIlnHvuuYRCoapzjh07lkAgwL59+3jppZd48sknmTdvHjk5OUybNo3u3bsfdF8TJ07kwgsvrHqWV1RURFFREUuXLj1o33nz5jFu3DjC4TCVlZWMGzeOXr16MX36dD744AOefvpp/vjHP/K73/2u6pgXXniBBx98kEgkwmeffcbkyZO55pprOOmkk5g3bx4A7777Ll6vl23btrFv3z4GDhzIU089RV5eHs899xyTJ0+mQ4cOLFq0iO7du/Pwww9z8803s3z5cgYNGsTkyZNxuVzs2rWL8ePHs3jxYkpLSxk2bBgPP/wwmZmZdOnShd69ezN79mzOOOMMx78jEyZM4OWXX6ayspLMzEwefvhhTjjhBMaOHQvAsGHDcLlcvPfee+Tm5tZ6/dNOO40TTjiBzz//nI0bN3LGGWfw+OOPA1BcXMwNN9zAl19+icvl4rjjjuOxxx5D0zQWLFjAoYceCsBf/vIXIpEI9913n+P443BhyO9dUluh5gv0Bv4OHAtsAd4CrgSeSEUwIE3EjmW4rP95K8IRANpfeikAJTNmpi0mIeJxyimnsGrVKjZs2MDLL7/ML3/5S9xud637f/fdd1xxxRUsXryYP/3pT/z1r3+tdd/CwkLmz5/Pfffdx3nnncfJJ5/MwoUL+c1vfsM999zjKL5t27YxdOhQFi5cyJVXXsnIkSO55ppr+N///sfgwYN55JFHajxuzpw5DBs2zNE17r33Xm644QYWLVrEkiVLuPjiiznnnHMYNWoUPp+PRYsWHZBcAS677DLGjh3LZZddxqJFixgwYAAAixYtYtasWcyePRu3283kyZP56quvWLJkCYWFhTz22P7JaBYsWMC//vUvgsEgeXl5/OpXv2Ly5MksXbqUpUuX8sEHHwBWM/App5zC/PnzWbx4MZWVlQfc97Bhw5gd50xyY8aMYcGCBSxcuJD//Oc/XHnllQBVyfHzzz9n0aJFdOnSpd7rr1y5kjlz5rBkyRLeffddvvjiCwCuu+46cnNzWbx4MYsXL+a+++4jJyeHK6+8kieesPJXWVkZzz77LH/4wx/iij+JLgSmhvzezSG/1wQeBy5J5QWlButQRrQGG7F6uuUcfjgAkV270haTaB6c1jAbw5gxY3j++eeZNm0akyZNYtKkSbXue/jhh1c1IQ8dOpR//etfte570UUXATBo0CBcLhderxeA4447jilTpjiKLT8/v+q4QYMGUVRUxMCBA6vOU1tz7/r16+natauja4wYMYK7776bFStWcPrpp3PyySc7Oq4mY8aMqRrnbJomDz74IIFAgMrKSoqLiznllFOq9j3ppJMoKioC4Nhjj0XTNNq2bQvAMcccw6pVqwCrqXrevHlVzd379u07oPm2a9euzJ07N644Fy5cyD333MO2bdvIyMh
g6dKllJeX19gsXN/1L774YtxuN7m5uQwcOJCVK1cydOhQ3nnnHb7++uuqpuvOnTsDcM0113DCCSdw22238corr3DCCSegaVo84buUUutjPk8wTXNCLftO0nwBF/Al8JeQ31t9LGVPILZ3asjeljKSYB3KdB9Yg1V5eZCZSdmKFekMS4i4XH755QwaNIjDDjuM/v3717lvTk5O1Xu3201lZe2PqqL7ut1usrOzazwuIyPjgOUiS0tLDzhH9eOcXj8vL6+qmbs+1113HaNGjWL27NnceuutHHnkkQfUNOORn59f9X7y5Ml8/PHHzJ07l4KCAv7zn/8ckAir30tt92aaJtOmTaNPnz41XrO0tJTc3FzHMZaXl3PBBRcwZ84cjjvuOEpKSmjbtm2tCba+68fzOwHQo0cPhg8fzhtvvMGjjz7quDUjRsQ0zSIH+50S8nvXar5AJnA38DxwTg37xY4FSvlYOmkidijDtf8ZLFjPnbL79aNi3Toi1f6hEKKp6t69O/fee2+qnoHVqXfv3qxevZpt27YB8OKLLyblvEcffTTBYNDRvt9//z19+vThqquu4tZbb616hlpYWEhxcXHCMezYsYOOHTtSUFDArl27eO655xI6z6hRo/D7/VWJa8eOHayI+SPeMAyOOeYYx+crLS2loqKi6hnoww8/fEB5QUHBAfdd3/Xrivv+++8nErEqIFtjJuK59tprueWWWygpKWHkyJGOY49HyO9da/+sAB4Chtew21pAi/ncy96WMpJgHYrtRRxVeOaZhIuL2XTHnekKS4i4/fa3v2Xo0KGNft0ePXpw4403MnjwYEaMGEG7JM3lfeGFFzJz5v6+ECtXrqSoqIjx48czY8YMioqKqmqpDz/8MEcccQTHHnssf/vb36qaQseMGcPkyZMZOHAgTz/9dNwxXHbZZezevZsBAwbwi1/8guHDa/r3vX4PPfQQGRkZDBw4kKOPPpqRI0cSCoUAq3Y5e/ZszjvvvFqPv/zyy6s6eRUVFfHdd99x1113cfzxx3PKKacc0EoA1jPf008/nYEDB7Jly5Y6r1+XBx98kL1793LkkUcycOBAbr311qqyE088kXbt2jFu3LiUTMCi+QJtNF+gXcymS4CFNez6JvBzzRc4RPMFFDAWeCXpAcWQ1XQcmrpwPde/upgnxhzHmUdYz3vMSITQL0dTtmwZhy9aiKqjw4hoHcLhMMuWLeOwww6rswORSJ5IJMKQIUN46623qp5ztkSzZs1i0qRJSav5N5Z169Zx/PHHs2zZMgoKCg4qr+v/GSer6Wi+QB+s5OnGavZdBVwb8ntDmi/wNDA95PdOt/e9CrgFq3L5IfAHu9abEvIM1qHqTcQAyuUi74QTKP3uO3a88godfv3rdIUnRKvlcrl44oknCIVCLTrBFhcXp6VpvyFuu+02Jk6ciN/vrzG5JkPI712FNfSmprLfVfv8FPBUSgKpgSRYhzJraCIG6HDZGLZPnCidnYRIo9gJM1qqaE/t5uSuu+7irrvuSncYaSPPYB2K1mArwgc2qbvbtwcgvG17o8ckhBCi6Up5Ddbw6NnAA8CZQDmwUA8alxoevQvwAtAXKAPG6kHjU/uYPOAZYAgQAXx60Jhil7mAf2N1wTaBCXrQSKyffRyqOjmFD6zBqqwsXHl5lK9NaWc00Uy05JVBhEiFlrwCVWM0EfuxkuRhetAwDY/eLWb7PD1onGV49CHAG4ZH76sHjUrgRqBMDxr9DI/eG/jC8Ogf6UFjB3ApMAA4DGgLfGN49A/1oOGsn36ColMlVkQO/IdTKUWWphHeuTOVlxfNhMvlIicnhw0bNnDIIYfIgutC1CG64HpmZmaLWwsWUpxgDY/eBvgtUKQHDRNADxrRhVRHA73tbQsMj74ZOBmYA1wEXG6XrTY8+lzgPOA5u+xxPWiEge2GR38NuBi4I5X3kpNp9W7bU3bwwGpXQQFlq1dTsWkTmQ5nlBEtV69evdiyZQuhUEhqskL
UIzMzk549UzqhUtqkugbbF9gG/M3w6COBfViJcBHg0oNG7FRWIfZPW1XXlFY1ldXYw0EpNR4YH/M5kXsAoGtbawaTLSVlB5W1u+AX7P3ySzaMvwFtcu1Tz4nWweVy0bVrVw455BBM05QkK0QtlFItsuYaleoEmwn0AZbqQcNnePRjgA+AIzlwyio4eNqquqa0cjTdlT1nZdW8lW63O+F/6dzRZ2sHhQ1tR41i+wsvUr56daKnFy2QUqpFPlcSQjiT6j8d1mA9f50EoAeNxcBqQAcwPHrnmH1jp62qa0qrRp/uCsBeTIfaKiMZnToR3rEDsyJlY5aFEEI0IylNsHrQ+BFrUdszAQyP3gvruev3wOvAOHv7EKAr8Kl9aGxZb+BUYHpM2dWGR3cbHr0D1jPZgxbWTbqqBFtzhs3qa02OXTJrVspDEUII0fQ1RuP3WOBmw6N/i7XA7e/tjk63AMMMj74cq/PSGLsHMcD9QK7h0VcA7wLj9KARHWj6IlaCXgYsAO7Xg4aR6ptQRJuIa1ZoL4BcuXlzqkMRQgjRDKR8mI4eNFYBp9WwfTPw01qO2YNVM62pLIxdu21Mqp4mYldhIQAVm7c0UkRCCCGaspbbfSvJol1VaurkBJDZvTuuwkJKZs2ssVwIIUTrIgnWIVfVDD21lOfk0OaE4wn/uE2GZQghhJAE61S0iThSR+505ReAacqsTkIIISTBOqViGolrk9XLmgujLJjSWRuFEEI0A5JgnaqnkxNA7rGDAPjx0ccwy8sbISghhBBNlSRYh+rrRQyQd/wQ2l10EXu/+oo9X85vnMCEEEI0SZJgHXLVMVVilFKKwjOtkUfFU6c0SlxCCCGaJkmwDkWfwNbVyQkg74QTyOrbl10fzUl1SEIIIZowSbAOOWkiBlBuN9mH9cfctw+z8uCl7YQQQrQOkmAd2j9VYv1jXN1t2wJQsWFDSmMSQgjRdEmCdUjVP0qnSnb//gDsW7QoZfEIIYRo2iTBxsnJHE0Fp52Gq6CATX+/mwqZ/F8IIVolSbAORXsRRxxMg5jZowddbryRyO7d7Pns81SHJoQQogmSBOuQ005OUXlDBgOw/dlnUxSREEKIpkwSrENxPIIFILtPH9qcdBLloVCKIhJCCNGUSYJ1SFWtpuN8pRx3+/aYFRVEyspSFZYQQogmShKsQ/HWYAEyunQBoMwwkh6PEEKIpk0SrEPxDNOJyj36aABKZs5KfkBCCCGaNEmwDqk4ehFH5Z8+gpwBA9j+/POULl2aqtCEEEI0QZJg46CU817EAK6sLDr89nIASmbOTE1QQgghmiRJsHFQOJsqMVb+iBG4O3Zk5xtvpiYoIYQQTZIk2DgopeKqwQK48/PJG3Qs4eLiuHogCyGEaN4kwcZBUf9ydTVxtcmHSITw9u1Jj0kIIUTTJAk2Dm2yM9hdVhH3cVl9+gBQulSG6wghRGshCTYO3drmsKm4NO7j2px4AgA/3H4b4ZKSZIclhBCiCZIEG4eubXP4obg07mepuUcfTYcrrqBy4w+yhJ0QQrQSkmDj0LUwh7LKCDv3xt9MnHvUkQBE9uxJdlhCCCGaoIx0B9CctMm2vq59FWHax3msu2NHAHbN/pDCs89OcmRCCNG6ab7A7cAdwFEhv3dJtbLTgBnAspjNQ0N+775UxiQJNg5ZGVaFv7wyEvexeYMH4+7UibJly+rfWQghhGOaLzAIOBFYW8duS0N+7+BGCglohARrePQQUGq/AO7Vg8arhkfvArwA9AXKgLF60PjUPiYPeAYYAkQAnx40pthlLuDfwDlYMwNP0IPGY6m+D4Ast5VgyxJIsMrlIvOQQ6jcIUN1hBAiWTRfIBt4FPgV8FGawzlAYz2DvVAPGgPt16v2Nj8wTw8a/YHfApMMjx5N+DcCZXrQ6AecCTxmePRoq+ylwADgMOB44GbDo3sa4yYaUoMFyOjalcqNP1D6vdRihRDCAZdSan3Ma3wN+9wFvBTye1fXc67DNV/
gG80XWKD5AtekINaDpLOT02isvzrQg8YCYDNwsl12UUzZamAucF5M2eN60AjrQWM78BpwcWMEnB1NsOFwQscXnnUWAGXfB5MWkxBCtGAR0zSLYl4TYgs1X2AoVktnfa2Y3wBFIb93EPBzYKzmC4xOTcj7NVaCnWR49G8Nj/604dE7Gx69I+DSg8bWmH1CQE/7fU9gTQJlB1BKjY/966ehUxVGE2wiTcQAGZ07AVC5TZqJhRAiCU4FPMBqzRcIAUXAu5ovcEBP0pDfWxLye4vt9+uBl4HhqQ6uMTo5naIHjbWGR88E7gaeB8Zw8MqqqtpnM8Gy/TtZf+1U/cXjdrsblGEb2kSc2a2bdfyqVQ0JQwghBBDye/1YjxsBsJPsuTX0Iu4GbA75vRHNFygAzsXq55NSKa/B6kFjrf2zAngIGK4HjW0AhkfvHLNrL/b3AFsLaAmUpVSDE2zPnqjsbPZ+9RXh3TIeVgghUkXzBZ7WfIFR9scLgG81X2AxMA94H3g21TGktAZrePQ2QKYeNHbamy4BFtrvXwfGAXcYHn0I0BX4tFrZ5YZH743VDDA2puxqw6NPAdpiPZM9K5X3EZXldgNQHk4swSql6PSHsWx96N/seOlFOo0dW/9BQgghHAn5vVrM+9/FvH8EeKSx40l1DfYQ4CPDo//P8OjfYiXKy+yyW4BhhkdfDjwHjNGDRqVddj+Qa3j0FcC7wDi7QxPAi8D3WAOGFwD360GjUWbRz8m0vq7SisQSLED7S8cAsOvDjzAjiZ9HCCFE05bSGqweNFYBx9ZSthn4aS1le7BqpjWVhbFqt40uJ9Oqwe6rSKwXMYA7vw35I3/C7g9mU7Z8OTmHH56s8IQQQjQhMhdxHKIJtmRf/HMRx2pz4lAAwjuLGxyTEEKIpkkSbBzy7bmIF4QaNszGXZAPQPmaUENDEkII0URJgo1DUftcADq2yW7QebLtZuHy0Jp69hRCCNFcSYKNg7JH3JoHDeGNT1afPmR07syOSZMoW748CZEJIYRoaiTBxkFRlWEbxJWVRdc778QsK2PXh01qbmohhBBJUm8vYntlm/pE9KBRWv9uzZuqdc6o+OUeOxCVnc32l16k45VXoDJk5UAhhGhJnNRgdwO77J/VX9HtK1MVYFPUwAosABnt21Po9RLe+iPhYulNLIQQLY2TatNiPWjUOJY1yvDoC+sqFzXL6NgBgHBxMRkdO6Y5GiGEEMnkpAb7pyTt02I0dFWeqIzO1lTMZd9/n5TzCSGEaDrqTbB60Pg0Gfu0BCo5fZyq5Bx1FAAl776XtKQthBCiaXDSySkXuBzYgbW4+T+BM7HmA75WDxobUhlgU6JqXxkvIbnHHEP+iBHsmjWLfZddRt6gOlvihRBCNCNOmoifAs4Bfg+8B7QDbgZWA4+nLLImLFmVTeVy0e7CCwDY9e67yTmpEEKIJsFJgh2kB42fYSXZwcDv9aAxUw8aNwG9UxpdE5PsJmKANiedREbXrhS//XYSzyqEECLdnCTYMgB7nOtqPWjErrFWnpKomqjkNhBbXDk55AwYQHjXLnkOK4QQLYiTYTrZhkfXsfJL7HuAnJRF1oQlOxG6CwqgooLwtm1kdOqU1HMLIYRwTvMFtjjYbVPI7z26vp2cJNg8YEbM5xm17djSqWRO5RQju38/AEq/+478U09NyTWEEEI4shXrkWhtFDDdyYnqTbB60NCcxdR6JLsht83w4fDQv9l05130mTUTV1ZWkq8ghBDCoTtDfm+dS51pvsDdTk7keLJ/w6OfWcO2sU6Pbwmq6q9JzrA5hx9Oh8suo2LjRsqWLk3uyYUQQjgW8ntfS8Y+EN9qOvcbHv2o6AfDo48Brojj+GYvRS3EALQZeiIA2yY+m7qLCCGEcETzBe7SfIF2mi+gNF8goPkCP2q+wAXxnCOeBHsxMNnw6N0Nj/4L4Ebg7Hgu1lI0dD3YmuQPH05G926Ur5FF2IUQogk4L+T37gRGApXAScBf4zm
B4wSrB42lwJ+xJpv4O3CmHjS2xXOx5i5VnZyiMtq1p3KLkw5sQgghUiw6JPVU4PWQ3xv3pPFOpkr8Z7VNlcByYLzh0dGDxs3xXrS5S9Vw1YyuXSldupRwcTHutm1TcxEhhBBO7NF8AR9W6+1Jmi/gAuLqgeqkBrun2msqsCTmc6uTqgSbe7Q1rGrnlKmpuYAQQginLge6AjeH/N7NQB9gUjwnUK1p9iC3222Gw+EGnaP3XwKcOaArj485LklR7RfeuZPVoy+icssWDv/ma5QrnkfkQgjRsiilwqZpOpmvISU0X8ANHBrye0OJHF/vv+CGR6+3p7CTfVqSVHRyAnC3a0f+Kadglpay+6OPUnINIYQQ9dN8geHAGmCu/XmI5gu8GM85nPxlcKPh0b+g7ql4rwMmxnPh5kqRuiZigA6X/podL71ESWAGBT/5SeouJIQQoi7/xOrg9AZAyO9doPkCg+I5QSJTJdZkazwXbc5S3ZM4S9NQOTmEd+9K6XWEEELUKSPk967UfIHYbXEtcCNTJSYg1U+t3W3bUr5iZYqvIoQQog6lmi+Qj/1PvuYLHAGUxnMC6UUTp9TWXy3Z/fpRsWULlTt2NMLVhBBC1ODvwLtAd80XeA6YDfxfPCdotN5Zhke/HbgDOEoPGksMj94FeAHoi7Xm7Fg9aHxq75sHPAMMwRrs69ODxhS7zAX8G2u1AxOYoAeNxxrrPiC1z2ABCs85hz2ffca6q36P9uorKLc7tRcUQghxgJDf+57mCywHzsKqW90d8ntXxHOORkmwhkcfBJwIrI3Z7Afm6UHjLMOjDwHeMDx6Xz1oVGJNw1imB41+hkfvDXxhePSP9KCxA7gUGAAcBrQFvjE8+od60Ag2xr1Yj2BTm2HbXfAL9s7/kuK3plPxww9kFRWl9HpCCCEOFvJ7VwP/TfT4uBOs4dEz7CTodP9s4FHgV0Ds2JPRQG8APWgsMDz6ZuBkYA5wEdYgX/Sgsdrw6HOB84Dn7LLH9aARBrYbHv01rJk27oj3XhKhGqWRGDJ79gQgsnt3o1xPCCEEaL7AAuqoRYX83uOdnstxgjU8+hFYs1h0BA41PPpxwGg9aNxSz6F3AS/ZiTJ6ro6ASw8asb2PQ0BP+31PrPFHTssG13RhpdR4YHzM53pCdaYx5ubI6NQZgF0fzCbH40n9BYUQQoDVgpoU8XRyegT4I/Cj/fkbwFvXAYZHH4r1HLWmZ6TV01T17GcmWLZ/J9OcYJpmUfSVlATbOBVYCr3WV1u2Iq4mfyGEEA0Q8ns/Dvm9HwNfAnNjPn9ib3MsngRbEO2EBKAHDROoqOeYUwEPsNrw6CGgCKtX1vEAhkfvHLNvL/Y/o10LaAmUNYrGmFzSnd8GlZdHZJeMhxVCiDT4ECiM+VwAfBDPCeJ5BltpePRM7PxiePQi9i/nUyM9aPixOjNhHxMCzrV7Eb8OjAPusDs5dQWiCTxadrndyelUYGxM2dWGR5+C1cnpIqxeXo3CmsmpceZvzuzShb1ffSWr6wghRD00X6BqpErI711SQ/mVgA+rYjkbuCbk99bVnygv5PcWRz+E/N5izRdoE09M8TYRTwU6GR79Dqz5Ge+P52LV3AIMMzz6cqzOS2NiOk/dD+QaHn0FVo13nB40tttlLwLfA8uABcD9etAwGhBHXFI8kdMBCs4+C7OsjPJ16xvvokII0czYUxhWH6kSW94ba1zryUA/rArdlfWc1hWbUDVfoADIjCcuxzVYPWi8ZHj0VVi9efOA3+hB45N4LhY7K5QeNDYDP61lvz1YNdOaysJYtdu0aaz1hzI6dQIgLBNOCCFEjTRfoLaRKrEuBKbay86h+QKPAzcDT9Rx6knAe5ovEB2m8wfg+Xhic1yDNTz6cKxxq7foQeNmPWh8Yo9vbVUUqlF6EQNkdLYeUZevkmkThRCtkksptT7mNb6Gfe4CXrLHrNamrpEpNQr5vfcBTwKj7Nd/Q35vXK228TyD/Qh4z/DoF+p
BY6+97WmgVSXZxmwizj3iCAB2f/wx7X/9a1RG2pZFFEKIdIiYplnrTDuaLxAdqeJzcC5Ho09izt0u5Pc+T5y11ljxPIP9FqsT0lzDox9ib2vEdNN0NFYTcWaPHhSO+hl7Pv+CXR/MbqSrCiFEs1E1UkXzBULYI1U0X+DsavslMvpkueYLPKX5AkcnGlw8VSJTDxr/MDz6Wqwkez6Nl2uajMb+i6LDmDGUTH+bXe+9S+FZZzby1YUQoukK+b0HjFSxk+y5NfQifhP4VPMF7gK2YI1KeaWe0/fD6gj1puYLbAIeBt4M+b1hp/HFU4NVYHV2wnrYOwPoEcfxLUZjDdMByBkwgOz+/SmZOYtIaVwrJQkhRKul+QJPa77AKICQ37sKuB34DFiJlWSfqev4kN9bHPJ7J4T83v5YSfxfwFrNF/ir0+E68dRgH4m+0YPGh4ZH/xkx0xC2FqlecP2g67nd5A0ZQtny5UR278aVk9Oo1xdCiOYi5PdqMe9/V63sKeCpeM5nD825HLgG+M4+/ifALGB4fcfHM0znmWqflwBXxBFri5COh86uggIAKjZurBq6I4QQInXsoTznYTUvnx/ye7+3i6ZovoCjuRfqTbCGR39RDxpjDI9e4woDetBwvLJAS9GILcQAZPftA0DZ8uXkHp3w83YhhBDOrQA8sbM5xTjdyQmc1GAfsn8mbYWBZi0NVdi8445DZWWx5YEJ5J9+Ohnt2zd+EEII0bp8TMx0wJovUAgcFvJ7vwr5vT84OUG9CVYPGl/bPz+ObjM8ejs9aOyMO9wWwmzkztOZPXrQ+dpr2XL//ez9cr70JhZCiNR7AmuMbdRee9txTk9Qby9iw6NfZ3h03X7vMjz621gLnW+1l6NrVazJ/hv/unnHWy3x2597rvEvLoQQrY8rdkiOvTBAXLP9OBmm8zusbs0Av8QaG9QNq2fVffFcrCVo7F7EUblHHUnuscdStrqu2cCEEEIkSbnmC/SNftB8gX7Uv0TrAZxk40o9aJTb738CvGhP1B8wPPrd8VyspUhHDRbA3aEDkYULMU0zbYleCCFaiTuxJqcI2J/Ppv4VeA7gpAabYXj06L/mQ4HPY8riWrqnJVCq8Z/BRmV0tobolK9alZbrCyFEaxHyewPAKcA39uuUkN87K55zOKnBzgZeNjz6JqwFzj8FMDx6V6AsrohbgHTWG3OPOoqdr7zKrtkfkt23b/0HCCGESFjI710OLE/0eCc12BuA+fb7s2IWRe8PTEj0ws1ZupqIC848k8yiIrY9/TSRslb3t40QQqSc5gtMS8Y+4GyYTiU1JNJ4F1tvKdL57NOdn0+HMZey+V4/u97/gLbnetMWixBCtFBDNV/gn/Xsc4STE8Uz2b+wpasGC1A4ahRkZrJj0qT0BSGEEC3XY8Ceel6POzmRrOAdJ5eC+aHtLN+8C5crHbXZTHaOPJfIzGlUbt9ORocOaYhBCCFappDfe2eyzqUac+m1dHO73WY47Hgpvxqd+/AnLNlQkqSIEnfR97O5as939Jk2FVdeXrrDEUKIpFNKhU3TbLYVQccJ1vDoI4BB9sdv9KDxUcqiSpFkJNhNxaW8umAdlZFI/TungGnCIx+t4Az3Dsa/eQ+F55xDjwkPpCUWIYRIpRafYA2PXggEAA34GmukyiBgDXCOHjTSX51zKBkJtinod+sMTuvXgVteupXKjT/Q//PPpKlYCNHiNPcE66ST0z+BhUAfPWicrweN84C+9rZ/pTI4UbOcTDdlpqLtz0YBsO2JJ9IckRBCtCyaL3C15gs06PmbkwQ7ErhODxpVczDaUydejzV1omhkOZlu9pWH6TDmUlyFhWx//gV2zZ6d7rCEEKIlORVYrfkCD9rzEMfNSYKt0IPGQQ8c7fGx5TXsL1IsN8vFvoowGZ060XPiRAC2PPhgmqMSQoiWI+T3/go4BtgJfKT5AjM0X+CceM7hJMHuMjz60dU3Gh79GKzxQKKR5WS4Ka2wniXnHnk
EOUccQfmKlZQtT3hGLyGEENWE/N5N9rCdXwNHAi9pvkBQ8wUctd46eXh8F/tXzpkHmMAw4G/AHxILWzREXnYGq7furlpVp91Fo9l02+2sueIK+r37rgzbEUKIBtJ8gRzgV8A4oBS4CXgDa8H117A6/tbJyVSJ7xgevRL4K/unTPwauEoPGjMTilw0SI92OSxet5OKsElWhqL96NHsW7iI4qlT2f7CC3QaOzbdIQohRHMXAt4Hxob83gUx2+drvsD7Tk6Q8okmDI/+HtAViAC7gD/pQWOR4dG7AC9g9UguA8bqQSO6Uk8e8AwwxD7OpweNKXaZC/g3cA5WbXqCHjQecxJLSxmmc/2ri5i6cAPBv59FTqYbgModO1g+dBi5AweivfJymiMUQoiGS+cwHc0X6Bbye39oyDnqfQZrePTHYt6fl8A1RutB42g9aAwEHgAm2tv9wDw9aPQHfgtMMjx69Iu8ESjTg0Y/4EzgMcOjt7fLLgUGAIcBxwM3Gx7dk0BczVZ0vYHYv40y2rfHlZ9PeSjEvv/9Lz2BCSFEyzFW8wU6Rj9ovkAnzRe4PZ4TOOnkdGLM+7hODqAHjZ0xH9ti1UgBRgOP2vssADYDJ9tlF8WUrQbmAufFlD2uB42wHjS2Y7WFXxxvXM2Zy86w4WqtD11uuonwnj388H+3pSMsIYRoSc4L+b3boh9Cfu+PwPnxnMBJ1VvV8t4xw6O/AIywP55lePSOgEsPGltjdgsBPe33PbFminJaNrim6yqlxgPjYz4nEn6T47bvI1Itwba/aDS73nuPPZ99Run335Nz+OHpCE8IIVqCmhJGZjwncJJgsw2PrtsXi30PgB40ltZ3Aj1oXAZgePTfAPcDY7Cen8aqfjNmgmX7dzLNCcSsZet2u1vEygYuu93BrGE65EKvlz2ffUbxlKnk/MXXuIEJIUTLsUzzBcYDD2LlmeuBYDwncNJEnAfMwJqPODfmfQB4J56L6UHjefbXZDE8eueY4l7AWvv9Wg7sAu20rFVQtdRgAQrOGAmZmeycMqWxwxJCiJbkWuBcYB/WnA9nAX+K5wROhuloiUQGVQsF5OtBY6P9+efANmA78DrW+KI7DI8+BKun8af2odGyyw2P3htryqqxMWVXGx59CtYz3YuwbrzViC5DW/0ZLIC7oICC005j1/vvs+eLL2gzdGgjRyeEEM1fyO/dCJyu+QJt7M9xT6yU6u7PbYE3DY+ei9W5aStwrh40TMOj3wK8aHj05VhTLo6xp18Eqxl5ouHRV9jHjbM7NAG8iDV8Z1l0Xz1oGCm+jyaltmewUW1/fj673n+fdWP/QL8P3iejc+ca9xNCCFE7zRfoBvQGMjRfAICQ3zvX6fFOlqvbysHPS8Fqkzb1oNHFcbRp1lLGwd4x/Tue+zzEl7f+hEMKc2rc58ennmLrAxNoN3o03e66s5EjFEKIhkvzONi/Ys3etAqIJg4z5Pce7/QcTgKvsYeuSB9XPTVYgI6/+x0/PvIoJTNmcMjf/oorK6uxwhNCiJbgCqCfPTwnIU4S7B49aCR8AZF8Vc9gI7UnWKUUhed6KX5zChtvupmifz/UOMEJIUTLsKkhyRWcJdj3gEEAhkd/Rg8aVzbkgqLhXHaGrW+Wy663386ez79g17vvsmfel7Q58YRGiE4IIVqEdzVf4AFgEtZk/wCE/N56h6ZGORmmEzvO9FjnsYlUcdJEDODKyqLr3/4KQPHUqSmPSwghWpDfAr8A3iTBoalOarAtYnKGliTaRFxHC3GVNsOGAVD81lsUnnsu+cNPrucIIYQQIb+3d0PP4STB9jA8+j9reA+AHjRubmgQIj6FudZsXT/uLqN3pzZ17uvKzeXQp55i3VVXsfHmm+k35yNc2dmNEaYQQjRrmi9wHuAJ+b33ab5Ad6BjyO/91unxTpqIH8OaxWJPtffRl2hknq4FACxet9PR/vnDT6bQ6yW8Ywf7Fi1OYWRCCNEyaL7AHVgTHEX7HZnA4/G
cw8lMTjKIsokZ0L0QgNA253/f5I8YQUkgwIZrr6X31ClkduuWqvCEEKJRab7AQeuOh/zeRdX2OQ1rqt9lMZuHhvzefbWc9nzgOOArgJDf+4PmCxTEE1daBvCKhinMsZqId5dW1rNnzDHecyhbtoxtTz7Jjskv0+WG8fUfJIQQzcPokN+7E0DzBc7HWnd8UA37LQ35vU7ndigN+b3h6AxOiXDSRCyamJxMN1luF7viSLBKKTpc/hsAiqdPT1VoQgjR6KLJ1Ra77nhDrNF8gZMBU/MFXJov8DfA8fNXkBpss1WQkxFXggXI6NCB/JE/YfcHs9m3eDG5xxyTouiEECIpXEqp9TGfJ9jLkB5E8wUOWHe8lvMdrvkC32BNffhsyO99rI5r/xl4HjgS2At8AlwaV/Dx7Cyajra5mezYWx73ce0uvBCAHZMnJzskIYRItohpmkUxrxqTK0DI770s5PceCvwNa8GY6r4BikJ+7yDg58BYzRcYXcf5Nof83rOAdkCnkN97Rsjv3RxP8E4m+19AHWNh9aDheOLjdGspk/0DXPncAuYu38rSu84i0+3876TIvn18f+wgXHl5aK++Qnb//imMUgghEpfoZP+aL7APK5luq2OfvwDdQ35vjWu8ar7A/OoT+9e0rS5OAr/R6clE4+nWLoeKsMnOvRV0LnA+rtWVm0vX229j0513sfbK39F31kxceXkpjFQIIVJH8wUKgXx7/VY0XyB23fHY/boBm0N+b8TuDXwu8Ewdpz4gP2q+gBvIjyc2J8N0Po7nhKJxRKdLNBOYaKv9JZew95uFlLz9NjunTaPDr36V7PCEEKKxtAXe1HyBA9YdD/m9puYLPA1MD/m904ELgD9ovkAlVu57HXi2+sk0X+Am4GagreYLbIkpysOal9ixepuIowyP3gm4HTgGqFqEVJqI0+P2t5bw/BdrmH/rT+hSy5qwdSk1DFb//Bdk6zraKy/L7E5CiCYnHevBar5AW6A98F+siSaiSkJ+7454zhVP4BOBz4AzgRuAq4GF8VxMJI+qqsEmJkfXyR6gU7bUYNNtt9P9Pn/yghNCiGYq5PcWA8XA2Q09VzwJtqceNEYZHv3XetB42/Do7wIzGxqASIydX+tdsq4u2ssvs3z4KRS/9RYdf38V2X37Jic4IYRo5jRfoC/wENVabUN+bxen54hnmE50TEiZ4dE7AJVAURzHiyRSOFuyri6u7Gy6jLdmdNr55pSkxCWEEC3E08BLWFMv/gSYhpVwHYsnwX5vJ9aXgHnAl0gTcdpU1WAbeJ78U4aDUmyfOJGS999vcFxCCNFCtA35va8CEXsFnauBM+I5geMEqweNMXrQ2K4HjX9jLUR7JyDdT9PEzq847aRWm8zu3en5rNWRbtP/3UakrKyBkQkhRItQYf/cpfkCvYBsoFc8J3CcYA2PXjWllB40PtODxjvAI/FcTCSPy151vYH5FYA2J55A4aifEd65U5azE0IIy8eaL9ABK899BawA4prIPZ5OTifWsG1oPBcTybO/Bpuc8+Wfciol099mw4030Gf6dDLat0/OiYUQohkK+b03228na77AJ1jjbbfXcchB6k2whkf/JTAa0AyP/lpMUVtkwfX0qXoGm5wMW+g9h73z57PztdfYePMt9HzqyaScVwghmruQ37sOWKf5AmuBnk6Pc1KDXQYEgOPtn1ElwOx4ghTJE+1FnKwarFKKrrffxu5PP2HPJ59glpejsrKSc3IhhGgZVP277OdkqsTFwGLDowf0oLE14bBEUrmS1Is4lnK7yT/lFHa+8io/3HEn3e65u2pCCyGEEPH9kxvPM9gMw6O/w/719mYDV+tB44d4LiiSI5r3GjIOtiadr72WPV98QfGUKeQceYTMUyyEaFU0X2BAHcVxTdsYzzjYJ4HPgR7263N7m0iDZDcRR2W0b0/PZyYC8ON//5vckwshRNMXqONVGs+J4snGh+pB42cxn/2GR18Uz8VE8uxvuU1yhgW
yinqQO3Ag+xYtomTWuxSedWbSryGEEE1RyO/tnaxzxZNgXYZH76oHjU0AhkfvQj0PfA2PngO8AgwA9gKbgLF60AjZx78A9AXK7O2f2sflYa3TNwRr+SGfHjSm2GUu4N/AOVjZZYIeNB6jlama7D/5+RWATn8Yy/o/X8uG668nu+9bsjC7EELEqd4mYsOjv2y/vR9YaHj0Jw2P/gTwtb2tPk8Ch+tBYyDwDvublf3APD1o9MeaGWqS4dGjCf9GoEwPGv2wVu95zPDo0YGZl2Il7MOwejbfbHh0j4M4WpToXzaRFCXY/FNPpdvdfwfTZMerr9V/gBBCiAM4eQbrAdCDxotY8zD+D1gCnKkHjZfqOlAPGqV60JihB41oGpgH9LHfjwYetfdbAGwGTrbLLoopWw3MBc6LKXtcDxphPWhsB14DLnZwHy2KSvI42JoUjBwJQMnMmQ2eklEIIVobJ03EVf+y6kFjCVZyTdSfgbcNj94RcFUb9hNi/wDensCaOMoG13QxpdR4YHzM58Qjb2JS1ckplis3l/yRP2H3B7PZ++WXtDmxpsm8hBBC1MRJgj3K8OhbatiuAFMPGo7WxjM8+q1Af6wV4nM5uHdO9exnJli2fyfTnABMiH52u90tphpWNQ42xXfU/pe/ZPcHs9n52uuSYIUQIg5OmoiXYXU2qv4abP+sl+HRbwR+AZytB429etDYZm/vHLNbL2Ct/X4toCVQ1mqkahxsdXlDhkBmJiUzZlDy7nspvZYQQrQkTmqwZXrQWFP/bjUzPPp44BJgpB40dsYUvQ6MA+4wPPoQoCvwabWyyw2P3hs4FavmGy272vDoU7DmQ74IOCvR+JqrxmruduXl0WviM6wZcxmbbr+dghGnyRSKQgjhgJMabML/khsevQh4AGgHfGR49EWGR//SLr4FGGZ49OXAc8AYPWhU2mX3A7mGR18BvAuMszs0AbwIfI9Vs14A3K8HDSPRGJu7xuh7lDdkCIU/s5az27toUeovKIQQLYBqTb1D3W63GQ6H0x1GUvx3zkrumxXkrXEnccyh7VJ+vZKZM9lw/XgyunSh76yZuPLyUn5NIUTrppQKm6YZ1/SETUk8UyWKJkSlYLL/uhSefTZtL/gFlVu2sOmeexrpqkII0Xw1278MWrtou/0bX69LeUenKPOKa1n76f8w53xD//U7GdC9LW5Xyxn6JIQQySRNxM3Uu99t4uoXv05rDHeddwSXDdXSGoMQouVq7k3EzTbw1u7MI7ry+KXHsW773ka9bnjXLta/OY2Xugxm5aIgSIIVQogaSYJtxs46smtarrtNb8dLTy7hh7mfU6KZFJ59dlriEEKIpkw6OYm4tdd6ojDZlZnHhuvHs/7P16Y7JCGEaHIkwYq4uVyKdnlZbDliMK7CQna99x4bb7mFcElJukMTQogmQxKsSEhR+zx2VULv114lW9cpfms66676PWZ5ebpDE0KIJkESrEhIXpabyohJlqbR+43XyezZk32LF7P81NMoX78h3eEJIUTaSYIVCcl0u6iojACg3G56T5lCm2HDCO/YwborryS8e0+aIxRCiPSSBCsSkuFWVEQiVZ/d+W049KknAShfs4ZVo36GWVlZ2+FCCNHiSYIVCcl0u6gMHzhJiXK76f/pJ6isLCo3/sDme/1pik4IIdJPEqxISKZbURkxqT4TWEanTvSdOQOAHZMmsW3is5gtZPYsIYSIhyRYkZCC7EwAtu4uO6gss0cPDn3qKVz5+Wz55z/ZLIsDCCFaIUmwIiHHae0BmBPcWmN5/vCT6fPO26AUOya/zJrLfkOk7OBkLIQQLZUkWJGQY+01aNftqH0u5MyuXekzI4C7Uyf2zp/P+mvGyWQUQohWQ+YiFgk5pG0OAJuKS+vcL7t3b/q8PZ1VZ5/Dns8+Y8WI0zls/pcot7sxwhRCtAKaL/Ae0BWIALuAP4X83kU17Hcl4MOqXM4Grgn5vSkb7iA1WJGQguwM8rMzmLlkU73
7ZrRvT/9P5pIzYACRPXtYNuwkSr9f1ghRCiFaidEhv/fokN87EHgAmFh9B80X6A38HTgZ6IeVkK9MZVCSYEVClFKUVoSpjBkLW+f+mZn0evEF8k87jUhxMWt+/WsipXXXfoUQwomQ37sz5mNbrJpsdRcCU0N+7+aQ32sCjwOXpDIuSbAiYcP7d4prf1ebNhQ9+giZvXoS2b2bjTfdhFlRkaLohBAtgEsptT7mNb62HTVf4AXNF1gH3A38poZdegJrYj6H7G0pIwlWJMylFA4rsFWU24328suonBx2vf8B3x9/gnR8EkLUJmKaZlHMa0JtO4b83stCfu+hwN+A+2vZLXbgvkpmoDWRBCsS5nIpItUmmnAio0MH+n88h5yjj8bct481v76Ufd99l4IIhRCtTcjvfR4YofkCHasVrQW0mM+97G0pIwlWJMylIJxAggVwt21Lr+efI//00ylbvpzQBRdStmpVkiMUQrR0mi9QqPkC3WM+/xzYBmyvtuubwM81X+AQzRdQwFjglVTGJglWJMztUpgmB02X6JQrN5dDH3uUTtf8AYDV5/+c7ZMmJTNEIUTL1xaYpvkC32q+wGJgHHBuyO81NV/gac0XGAUQ8ntXAbcDnwErgS3AM6kMTCX6j2Nz5Ha7zbDMi5s04yZ/Q+B/P7DyH+fgdjXscUbx22+z8aabAejmv5e2552HUil/RCKEaMKUUmHTNJvtfA1SgxUJc9sJMJHnsNW1/dnPKHr8v+By8YPvL6y94ooGn1MIIdJJEqxIWLTSGo4kpxWk4LTT6PfB+7jy89n7xTzW/OZyKjZvTsq5hRCisUmCFQlz2Rk2mU8ZMrt3p9ekSWT378/eL78kdNHFVGyqf7YoIYRoaiTBioS57CbiRHsS1ybn8MPoPf0tsj0eKjdtYsVpI9j37bdJvYYQQqRayh8eGx79P8AorDFHR+lBY4m9vQvwAtAXKAPG6kHjU7ssD6t31xCsKa98etCYYpe5gH8D52ANGp6gB43HUn0f4mDJfAZbnVKK3q+9ykbfXyiZMYN1v7uKfp/MxZWVlfRrCSFEKjRGDfYNrMmV11Tb7gfm6UGjP/BbYJLh0aMJ/0agTA8a/YAzgccMj97eLrsUGAAcBhwP3Gx4dE+K70HUwGX/9phxzubklMrKoseEB8ju359wcTEbrrue8M6dqbmYEEIkWcprsHrQmAtgePTqRaOB3vY+CwyPvhkrEc8BLgIut8tWGx59LnAe8Jxd9rgeNMLAdsOjvwZcDNyR2jsR1UWH5pz177l0KcxJ3YVG3kBZn1WY+0pRf32dbF0HpcjNdPHPC46hZ8e81F1bCCESlJbxRYZH7wi49KCxNWZziP0TL9c1KXNNZYNruo49MfT4mM8NiFpU9/NjezB/9XZ2l1aytSTFK+N07Unltm2Y5eWwch1bswsBmLpwA9eO7J/aawshRALSOYC3+oO76tmvrkmZHU3YbE8MXTU5tNvtbj2zajSC43p14L3rT22060X27GHT3fdQPHUquws78MvTb+WzlT9KghVCNElp6UWsB41tAIZH7xyzOXbi5bomZW70CZtF0+Bq04bu9/6DtuefT37JdrLCFexZt57ydevSHZoQQhwkncN0XseaMxLDow/BWl3+0xrKegOnAtNjyq42PLrb8OgdsJ7JvtqIcYs063bvPyj672N0qdjFnm07WXnGT1n1s1Hs/uRTTJkKUwjRRKQ8wRoe/VHDo68HioAPDI++wi66BRhmePTlWJ2XxuhBo9Iuux/Itfd9FxinB43oyggvAt8Dy4AFwP160DBSfR+i6VBKUTBiBG21nhR37Ia7a1fKli9n3VVXsWrUeRS/9VbCCxAIIUSyyGT/otn627RveWneWmb8+WS0jcvZdM8/KLP/1soZMIBOfxxHwemnpzlKIUSiZLJ/IdLknCO7AfDMpyHyBg+mz9Qp9PtwNrnHHUfp0qWsv2YcG/9yK2WrVqc5UiFEayQJVjRbQ3p3AMD
4oaRqW2b37miTXqLnC8/jatuW4qlTWXXOOay/7nr2fv11ukIVQrRCkmBFs5XpdtG7UxvKwwdPJdXm+OPpP/djekx4gMxDD2XXrFms+fWlrLnsN5TMmiXPaIUQKScJVjRr2RkuKmtIsACu7GwKzzmHvu+9S89nJ5J34onsnT+fDdddz7LBQyiZMaORoxVCtCaSYEWzlul2URGuuzaqlKLN0KH0eu5Zek95k/yRPyGyZw8bxt/A+j9fy+65cxspWiFEayIJVjRrGW5FZcT5agM5AwZw6COPoL3yMrnHHMOu995j3e+vZvUvR7Pj9dcx4ziXEELURRKsaNYyXS4q66nB1iR34EC0V1+hT+AdCs46i9Jvv2XT/93GsqHD2P7884SLi1MQrRCiNZFxsKJZ+9VT85i/ejunHd4FAKWsyamtn8r6ab+nqsyavlrF7B/ZvZuKUIjyVatQmCgTcjyHkd2nL67cnAPOhf0+w6W4bGgv+nUpSM/NC9HCNfdxsJJgRbM24b3vefzjVZiYmKa1CoRpmvbP1F//mKK2vPXHk1N/ISFaIUmwzYgk2NbJNM2qZFs9AUcTM9U+h8vK2Pna62yfPJnKLVsxFbS/4grannceGYd0xQRO8n9IRTjC8nvOlqUQhUgBSbDNiCRYkYidU6byw//9H9i/O+1Gj6bDby7jtoV7ePWrdTx+6SDOsmeVEkIkjyTYZkQSrEhUePduiqdOY9tTT1G5ZQsAG0aez+/yT6Z9tosF44eR0bZtmqMUomWRBNuMSIIVDWWaJiXvvMOOyS+zb/Firjn1Ola37c606T7yCvPJ7t+fzEMPJfeoI2kzfDiZPXpI87EQCZIE24xIghXJVLljB3+ZNJ/X11cy0/yC3NAKylasILJnT9U+GV27kjfoWLIPO4wsTSOrVy8yu3fHLbVdIeolCbYZkQQrku3v7yzlmU9Xc/7A7uRmuQGo3LWbyu07qNy6lYptPxIu2YU1GAhMBSYKd2Eh7q5dcbdvj6ugAJWTCy57H7s3tPXerHqP3QkLoF+XAq4f2V9qx6JFa+4JttkGLkRTcNgh+QBMW7SxhtLO0L4ztK/l4FLghzD8sBPYGeeVN+E9qhuHd5UxuEI0VVKDFaKBSkoriERMazILsCa02P/2oIktAKgMU75yBfu+XULFmjWUr1pF+Zo1VG7YgIqEq/bHNMk95hjyTzqJ7N69ye6jsUi145IXFjGsb0ce/dUg2rfJasS7FaLxNPcarCRYIZqQSHk5FevWUb5mLeWrVrJnwQL2fPIpxMyRHEHx91OuZl6Hfly1L8hv22wno2NHMjp3IuOQrmT26EFG506427bFlZOTxrsRomEkwTYjkmBFcxTevYeKdWspW7GS8lCI8jVr2LB2E6P7XMwhe7dTtGtL7Qe7XKjMTFwZGaisLFR2Nioz03plZaEyMlCZmZCRgSszA+XOiD4urtFPB3TlVyf0TP5NClEDSbDNiCRY0VJUhiP84r+fs3LLbsDqDIUZgYhp95IyqxaVN83922pjxmZVZTdrx75QlJrWPocXunG5XCiXwuV2oVwulNtt/VQqZj5odcC80Pvngj54nuiqJvUDjgGXUrTLyyIn8+B1SWq7m9pv8+CC2vatdXstV619f+fnr+3ccW6mtn/Ta9oaf9zOzw1w7U/6c9ghifcTkATbjEiCFa2ZGYkQ3r6d8M6dhHfupGLzZiK7dhEuLrG2FRcT2VVCeMdOyteupfLHHw9oml7YuT+v9R/BpjYdwFoSAdNe/MAE+7MCl/1Tueyfan+Zsn/an62zq6re1Sax57NeFbKCYLM1+aoTGNa3U8LHS4JtRiTBCuGcGYkQLi62knJxCeGSYsLbthMpK8UsL8fct4/I3r1E9uwlsncvZnkZkX2l1vuyMiKlMe/37iVSWgoVFXHHsTszh7CqeWVNFf3nK1qLzs6ymrzdLpQrA+VyQYbbaibPykS53Ci329rf5YKMDFR2JsqdiXIpcLlRbqsmjtu
FUi5UZhYq0z6X/QcEygUul33p/duVy4XKyrb3jWkJsHu+7X974PbYZaCiZSp6TIYblZlt3W9sT7mYRgdX9H3ssC0V7XZ34LYDNsXs79rfjLB/9/0njjmH/SnaeS+mEx/VzpFz5JFkduhQ4387J5p7gm22gScikhfhiMeOSHcYQrQcufbLKdNV1ZRd1XQdidhN2hH7s9XcbUbsOqxZbG0jegzW9mhzuDVAuFpTeMx+sWVV22PLIvv3jSmi0n4f/98EwpY5vweuvHh+QVqWVpVghRBpprBrfwf3pWoyU2bEJtuqZZdin5CaHPjQsXrSrvb5oHMf/KHm3avFUPuJ6lfn7g7OleDxKqt1DyGTJmIhhBBNkjQRCyGEEGmi+QI5wCvAAGAvsAkYG/J7Q9X2Ow2YASyL2Tw05PfuS1VskmCFEEI0d08CM0N+r6n5An+0P/+0hv2WhvzewY0VlCRYIYQQzVbI7y3FqplGzQOuS080B5IEK4QQoqlyKaXWx3yeYJrmhHqO+TPwdi1lh2u+wDdAGHg25Pc+lowga9NsE6zh0fsDzwOdsJYiuVwPGkvTGpQQQohkipimWeR0Z80XuBXoD4ytofgboCjk9xZrvkARMEPzBX4M+b2vJSnWg9Q8ert5eAJ4Ug8ahwH/BJ5JczxCCCHSRPMFbgR+AZwd8nv3Vi8P+b0lIb+32H6/HngZGJ7KmJplgjU8ehdgEPCSvelNoLfh0bW0BSWEECItNF9gPHAJcEbI791Zyz7dNF/AZb8vAM4FFqYyrubaRHwosFEPGpUAetAwDY++FugJhKI7KaXGA+NjD1RKNWQgrAuQmVEt8l1Y5HvYT74Li3wP+zX0u3DXt4Pd3PsAsAr4SPMFAMpCfu8Jmi/wNDA95PdOBy4A/qD5ApVYue914NkGxFavZjnRhOHRjwNe0IPGETHbFgA36EFjbqquq5RaH8/zgJZMvguLfA/7yXdhke9hv9b+XTTLJmJgHVBkePQMAMOjK6xa7dq0RiWEEELYmmWC1YPGFqy280vtTRcAIT1ohNIWlBBCCBGjuT6DBbgaeM7w6LcCJcBvGuGa9Y2/ak3ku7DI97CffBcW+R72a9XfRbN8BiuEEEI0dc2yiVgIIYRo6iTBCiGEECkgCVYIIYRIAUmwDiml+iulPldKLVNKzVdKDUh3TKmilAoppYJKqUX26yJ7exel1Cyl1HKl1BKl1Mkxx+QppV5WSq2wv6NfpO8OEqOU+o9976ZS6siY7Qndt1LKpZR6WCm10i6/prHvKVF1fBdzlFKrYn43ro8pa3HfhVIqRyk1zb6fRfbvgWaXtarfi3q+i1b1e+GYaZrycvACPgQut99fCHyR7phSeK8h4Mgatk8E7rDfDwHWABn259uA5+z3vbEWPW6f7nuJ875PAYqq33+i9w1cBszGmo2mg31eT7rvs4HfxRzg3FqOaXHfBZADnMP+DqF/BN5rjb8X9XwXrer3wulLarAOKKVqnPs4+tdbKzIaeBTANM0FwGYg+lf7RTFlq4G5wHlpiDFhpmnONU1zfQ1Fid73RcDjpmmGTdPcDrwGXJy6O0ieOr6LurS478I0zVLTNGeYdjbAWmu0j/2+Vf1e1PNd1KXFfRdOSYJ15lBgo2malQD2L1h07uOWapJS6lul1NNKqc5KqY6AyzTNrTH7hNj/HfTE+gu+prJmq4H33SK/E+B++3fjVaVU7D+wreG7+DPwtvxeAAevu9qafy9qJAnWueoDhlVaomgcp5imeQxWrX0b1rq7UP93YNZR1pw15L5b2ncyxjRNHTga+AR4p1p5i/0ulFLRtUb/am9qtb8XNXwXrfb3oi6SYJ1ZBxQppTIAlFIteu5j0zTX2j8rgIeA4aZpbgNQSnWO2bUX+7+DtYBWS1mz1cD7bnHfiWma6+yfpmmajwB97NoctODvQilVtdaoaZp7W/PvRfXvAlrv70V9JME6YJpmjXMfm6YZSltQKaKUaqOUahez6RL2r5n
4OjDO3m8I0BX4tIay3sCpwPRGCLkxJHrfrwNXK6XcSqkOWM+bXm3EuJNKKZWhlDok5vMFwOZosqGFfhfKWvbyEuAM0zR3xhS1ut+Lmr6L1vp74Ui6e1k1lxdwOPAFsAz4Cjgi3TGl6D77YCXU/wHfAm8Bml12CPAesBz4Djg15rg2WP9jrLC/owvTfS8J3PujwHqgEqun44qG3DdWz8hHgZX264/pvseGfBf2vX5l/14sxur9eUxL/i6welKbdsyL7NeXrfH3orbvojX+Xjh9yVzEQgghRApIE7EQQgiRApJghRBCiBSQBCuEEEKkgCRYIYQQIgUkwQohhBApIAlWiEZir05zpFLqcqXUYSk4fzul1M3Vtj2tlBqe7GsJIeonCVaIxnc5EHeCtZf2quv/2XbAAQnWNM3fmab5SbzXEkI0nCRYIRrXacBg4D/2upnngDX9nLLWGf5GKTVDKXWovf0OpdSLSqkpWAP7uyml7ldKLbCP/1gp1d8+9+NAO3v7V/bxc5RS59rvD1FKTbUnZF+ilPp9NCi7dn27stY8Xq2U+ltjfSFCtFQZ6Q5AiFZmDtasN/8yTfMdAKXUr7BqtENN0wwrpcYAj7B/Sa8RwCDTmrITpdR9pmneZL+/GHgQOBcYC3xlmubAWq79HyBomubP7SUYv1ZKLTJNc75d3s40zWH2/LorlFLPmqa5Ial3L0QrIglWiPQ7H6tW+7W1jgRuIBxT/k40udp+qpT6E1CA1QpV6PA6I4FjwJpf264V/wSIJthJdtlWpdQqrMWxJcEKkSBJsEKknwLuNk1zYi3lu6t2VKonVk30eNM0VymljgY+jONa1edGjf1cGvM+jPz7IESDyDNYIRpfCdA25vN04Bp7NRGUUplKqWNrObYtUA5sspdN/GO18+ZFl1WswQfA7+1rdAZ+TnzJWQgRB0mwQjS+J4Hbop2cTNN8EXgJmKOUWozVmWlETQeapvkt1hJf32E9z10bU7Ydq5n322gnp2r+DBytlPof8BFwT8zzVyFEkslqOkIIIUQKSA1WCCGESAFJsEIIIUQKSIIVQgghUkASrBBCCJECkmCFEEKIFJAEK4QQQqSAJFghhBAiBSTBCiGEECnw/9u1/1xcs4TpAAAAAElFTkSuQmCC\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "### Visualize results\n", + "mpl.rcParams['figure.dpi'] = 80\n", + "fig, ax1 = plt.subplots()\n", + "\n", + "color = 'tab:red'\n", + "ax1.set_xlabel('Iteration')\n", + "ax1.set_ylabel('Total FIFO Size [kB]', color=color)\n", + "ax1.plot(range(len(log_total_fifo_size)), log_total_fifo_size, color=color)\n", + "ax1.tick_params(axis='y', labelcolor=color)\n", + "ax1.set_ylim(0, max(log_total_fifo_size))\n", + " \n", + "ax2 = ax1.twinx() # instantiate a second axes that shares the same x-axis\n", + "\n", + "color = 'tab:blue'\n", + "ax2.set_ylabel('Latency [cycles]', color=color)\n", + "ax2.plot(range(len(log_total_fifo_size)), log_latency, color=color)\n", + "ax2.tick_params(axis='y', labelcolor=color)\n", + "#ax2.set_ylim(0, max(log_latency))\n", + "\n", + "ax2.axhline(log_min_latency[0], color=\"green\", label=\"Minimum (1st frame) Latency\")\n", + "ax2.legend()\n", + "\n", + "plt.tight_layout()\n", + "plt.savefig('fifo_iterative_graph.png', dpi = 300)\n", + "plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "466f818f", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Iteration: 11\n", + "Reducing depth of FIFO: 48/266\n", + "Numer of minimized FIFOs: 266/266\n", + "Interval: 903174\n", + "Min. 
latency / latency: 2549314/2580781\n", + "Total FIFO Size (kB): 226\n", + "Done (49 seconds)\n" + ] + } + ], + "source": [ + "### Optional second pass for fine-tuning\n", + "(fifo_depths,\n", + " log_total_fifo_size,\n", + " log_interval,\n", + " log_min_latency,\n", + " log_latency) = size_iteratively(fifo_depths, iteration_runtime, reduction_factor = 0.95)" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "2c707459", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "FIFO DEPTH | SIZE\n", + "FIFO 000: 1 | 24\n", + "FIFO 001: 2 | 48\n", + "FIFO 002: 2 | 48\n", + "FIFO 003: 16 | 2048\n", + "FIFO 004: 8 | 64\n", + "FIFO 005: 2 | 16\n", + "FIFO 006: 8 | 64\n", + "FIFO 007: 32 | 256\n", + "FIFO 008: 32 | 128\n", + "FIFO 009: 32 | 128\n", + "FIFO 010: 2 | 8\n", + "FIFO 011: 128 | 8192\n", + "FIFO 012: 1 | 32\n", + "FIFO 013: 1 | 2\n", + "FIFO 014: 16 | 128\n", + "FIFO 015: 256 | 2048\n", + "FIFO 016: 2 | 16\n", + "FIFO 017: 2 | 16\n", + "FIFO 018: 355 | 45440\n", + "FIFO 019: 1 | 4\n", + "FIFO 020: 4 | 256\n", + "FIFO 021: 1 | 8\n", + "FIFO 022: 1 | 10\n", + "FIFO 023: 1 | 8\n", + "FIFO 024: 4096 | 32768\n", + "FIFO 025: 1 | 8\n", + "FIFO 026: 1 | 4\n", + "FIFO 027: 4096 | 32768\n", + "FIFO 028: 1 | 64\n", + "FIFO 029: 256 | 1024\n", + "FIFO 030: 256 | 2048\n", + "FIFO 031: 2 | 16\n", + "FIFO 032: 2 | 16\n", + "FIFO 033: 288 | 36864\n", + "FIFO 034: 1 | 4\n", + "FIFO 035: 1 | 64\n", + "FIFO 036: 1 | 8\n", + "FIFO 037: 1 | 10\n", + "FIFO 038: 4 | 32\n", + "FIFO 039: 4 | 32\n", + "FIFO 040: 4096 | 32768\n", + "FIFO 041: 4096 | 32768\n", + "FIFO 042: 8 | 32\n", + "FIFO 043: 16 | 1024\n", + "FIFO 044: 256 | 1024\n", + "FIFO 045: 256 | 2048\n", + "FIFO 046: 2 | 16\n", + "FIFO 047: 2 | 16\n", + "FIFO 048: 288 | 36864\n", + "FIFO 049: 1 | 4\n", + "FIFO 050: 1 | 128\n", + "FIFO 051: 1 | 8\n", + "FIFO 052: 1 | 10\n", + "FIFO 053: 1 | 8\n", + "FIFO 054: 1 | 4\n", + "FIFO 055: 1 | 4\n", + "FIFO 056: 1 | 4\n", + 
"FIFO 057: 1 | 8\n", + "FIFO 058: 28 | 3584\n", + "FIFO 059: 1 | 4\n", + "FIFO 060: 1 | 8\n", + "FIFO 061: 1 | 8\n", + "FIFO 062: 114 | 14592\n", + "FIFO 063: 1 | 8\n", + "FIFO 064: 2 | 16\n", + "FIFO 065: 1 | 8\n", + "FIFO 066: 243 | 31104\n", + "FIFO 067: 1 | 4\n", + "FIFO 068: 2 | 128\n", + "FIFO 069: 1 | 8\n", + "FIFO 070: 1 | 10\n", + "FIFO 071: 1 | 8\n", + "FIFO 072: 1 | 8\n", + "FIFO 073: 4096 | 32768\n", + "FIFO 074: 4096 | 32768\n", + "FIFO 075: 1 | 4\n", + "FIFO 076: 6 | 384\n", + "FIFO 077: 60 | 240\n", + "FIFO 078: 128 | 1024\n", + "FIFO 079: 2 | 16\n", + "FIFO 080: 2 | 16\n", + "FIFO 081: 394 | 50432\n", + "FIFO 082: 1 | 4\n", + "FIFO 083: 1 | 64\n", + "FIFO 084: 15 | 120\n", + "FIFO 085: 15 | 150\n", + "FIFO 086: 16 | 128\n", + "FIFO 087: 16 | 128\n", + "FIFO 088: 4096 | 32768\n", + "FIFO 089: 4096 | 32768\n", + "FIFO 090: 16 | 64\n", + "FIFO 091: 32 | 2048\n", + "FIFO 092: 64 | 256\n", + "FIFO 093: 128 | 1024\n", + "FIFO 094: 32 | 256\n", + "FIFO 095: 2 | 16\n", + "FIFO 096: 394 | 50432\n", + "FIFO 097: 1 | 4\n", + "FIFO 098: 1 | 64\n", + "FIFO 099: 15 | 120\n", + "FIFO 100: 15 | 150\n", + "FIFO 101: 16 | 128\n", + "FIFO 102: 16 | 128\n", + "FIFO 103: 4096 | 32768\n", + "FIFO 104: 4096 | 32768\n", + "FIFO 105: 16 | 64\n", + "FIFO 106: 32 | 2048\n", + "FIFO 107: 64 | 256\n", + "FIFO 108: 128 | 1024\n", + "FIFO 109: 32 | 256\n", + "FIFO 110: 2 | 16\n", + "FIFO 111: 394 | 50432\n", + "FIFO 112: 1 | 4\n", + "FIFO 113: 1 | 64\n", + "FIFO 114: 1 | 8\n", + "FIFO 115: 8 | 80\n", + "FIFO 116: 8 | 64\n", + "FIFO 117: 8 | 32\n", + "FIFO 118: 1 | 4\n", + "FIFO 119: 8 | 32\n", + "FIFO 120: 1 | 8\n", + "FIFO 121: 16 | 2048\n", + "FIFO 122: 8 | 32\n", + "FIFO 123: 1 | 8\n", + "FIFO 124: 8 | 64\n", + "FIFO 125: 121 | 15488\n", + "FIFO 126: 1 | 8\n", + "FIFO 127: 2 | 16\n", + "FIFO 128: 1 | 8\n", + "FIFO 129: 243 | 31104\n", + "FIFO 130: 2 | 8\n", + "FIFO 131: 8 | 512\n", + "FIFO 132: 1 | 8\n", + "FIFO 133: 8 | 80\n", + "FIFO 134: 8 | 64\n", + "FIFO 135: 8 | 64\n", + 
"FIFO 136: 1024 | 8192\n", + "FIFO 137: 8192 | 65536\n", + "FIFO 138: 8 | 32\n", + "FIFO 139: 16 | 1024\n", + "FIFO 140: 4 | 16\n", + "FIFO 141: 8 | 64\n", + "FIFO 142: 2 | 16\n", + "FIFO 143: 2 | 16\n", + "FIFO 144: 512 | 65536\n", + "FIFO 145: 1 | 4\n", + "FIFO 146: 1 | 64\n", + "FIFO 147: 30 | 240\n", + "FIFO 148: 32 | 320\n", + "FIFO 149: 32 | 256\n", + "FIFO 150: 32 | 256\n", + "FIFO 151: 1024 | 8192\n", + "FIFO 152: 8192 | 65536\n", + "FIFO 153: 32 | 128\n", + "FIFO 154: 32 | 2048\n", + "FIFO 155: 32 | 128\n", + "FIFO 156: 32 | 256\n", + "FIFO 157: 2 | 16\n", + "FIFO 158: 2 | 16\n", + "FIFO 159: 512 | 65536\n", + "FIFO 160: 1 | 4\n", + "FIFO 161: 1 | 64\n", + "FIFO 162: 30 | 240\n", + "FIFO 163: 32 | 320\n", + "FIFO 164: 32 | 256\n", + "FIFO 165: 32 | 256\n", + "FIFO 166: 1024 | 8192\n", + "FIFO 167: 8192 | 65536\n", + "FIFO 168: 32 | 128\n", + "FIFO 169: 32 | 2048\n", + "FIFO 170: 32 | 128\n", + "FIFO 171: 32 | 256\n", + "FIFO 172: 2 | 16\n", + "FIFO 173: 2 | 16\n", + "FIFO 174: 512 | 65536\n", + "FIFO 175: 1 | 4\n", + "FIFO 176: 1 | 64\n", + "FIFO 177: 30 | 240\n", + "FIFO 178: 32 | 320\n", + "FIFO 179: 32 | 256\n", + "FIFO 180: 32 | 256\n", + "FIFO 181: 1024 | 8192\n", + "FIFO 182: 8192 | 65536\n", + "FIFO 183: 32 | 128\n", + "FIFO 184: 32 | 2048\n", + "FIFO 185: 32 | 128\n", + "FIFO 186: 32 | 256\n", + "FIFO 187: 2 | 16\n", + "FIFO 188: 2 | 16\n", + "FIFO 189: 512 | 65536\n", + "FIFO 190: 1 | 4\n", + "FIFO 191: 1 | 64\n", + "FIFO 192: 30 | 240\n", + "FIFO 193: 32 | 320\n", + "FIFO 194: 32 | 256\n", + "FIFO 195: 1024 | 8192\n", + "FIFO 196: 32 | 256\n", + "FIFO 197: 32 | 128\n", + "FIFO 198: 8192 | 65536\n", + "FIFO 199: 32 | 2048\n", + "FIFO 200: 32 | 128\n", + "FIFO 201: 32 | 256\n", + "FIFO 202: 2 | 16\n", + "FIFO 203: 2 | 16\n", + "FIFO 204: 512 | 65536\n", + "FIFO 205: 1 | 4\n", + "FIFO 206: 1 | 64\n", + "FIFO 207: 1 | 8\n", + "FIFO 208: 1 | 10\n", + "FIFO 209: 1 | 8\n", + "FIFO 210: 1 | 10\n", + "FIFO 211: 1 | 4\n", + "FIFO 212: 1 | 4\n", + "FIFO 
213: 1 | 4\n", + "FIFO 214: 1 | 8\n", + "FIFO 215: 8 | 1024\n", + "FIFO 216: 1 | 4\n", + "FIFO 217: 1 | 8\n", + "FIFO 218: 2 | 16\n", + "FIFO 219: 121 | 15488\n", + "FIFO 220: 1 | 8\n", + "FIFO 221: 2 | 16\n", + "FIFO 222: 1 | 8\n", + "FIFO 223: 218 | 27904\n", + "FIFO 224: 4 | 16\n", + "FIFO 225: 8 | 512\n", + "FIFO 226: 3 | 24\n", + "FIFO 227: 4 | 40\n", + "FIFO 228: 8 | 64\n", + "FIFO 229: 8 | 64\n", + "FIFO 230: 3696 | 29568\n", + "FIFO 231: 7782 | 62256\n", + "FIFO 232: 8 | 32\n", + "FIFO 233: 64 | 4096\n", + "FIFO 234: 16 | 64\n", + "FIFO 235: 16 | 128\n", + "FIFO 236: 2 | 16\n", + "FIFO 237: 2 | 16\n", + "FIFO 238: 512 | 65536\n", + "FIFO 239: 4 | 16\n", + "FIFO 240: 8 | 512\n", + "FIFO 241: 3 | 24\n", + "FIFO 242: 4 | 40\n", + "FIFO 243: 8 | 64\n", + "FIFO 244: 8 | 64\n", + "FIFO 245: 3696 | 29568\n", + "FIFO 246: 7782 | 62256\n", + "FIFO 247: 8 | 32\n", + "FIFO 248: 64 | 4096\n", + "FIFO 249: 16 | 64\n", + "FIFO 250: 16 | 128\n", + "FIFO 251: 2 | 16\n", + "FIFO 252: 2 | 16\n", + "FIFO 253: 512 | 65536\n", + "FIFO 254: 4 | 16\n", + "FIFO 255: 8 | 512\n", + "FIFO 256: 2 | 16\n", + "FIFO 257: 2 | 20\n", + "FIFO 258: 2 | 16\n", + "FIFO 259: 2 | 20\n", + "FIFO 260: 4 | 80\n", + "FIFO 261: 2 | 40\n", + "FIFO 262: 1 | 16\n", + "FIFO 263: 1 | 20\n", + "FIFO 264: 1 | 21\n", + "FIFO 265: 1 | 16\n" + ] + } + ], + "source": [ + "### Display resulting FIFO depths\n", + "print(\"FIFO DEPTH | SIZE\")\n", + "for fifo, depth in enumerate(fifo_depths):\n", + " size = depth * fifo_info[\"fifo_widths\"][\"StreamingFIFO_hls_%d\" % fifo]\n", + " print(\"FIFO %03d: \"%(fifo) + (\"%d\"%(depth)).rjust(7) + \" | %d\"%(size))" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "64c444f5", + "metadata": {}, + "outputs": [], + "source": [ + "### Export for use in FINN\n", + "fifo_depth_export = {}\n", + "for fifo, depth in enumerate(fifo_depths):\n", + " fifo_depth_export[\"StreamingFIFO_rtl_%d\" % fifo] = {}\n", + " # Try to account for additional registers 
introduced by virtual FIFO HLS implementation\n", + " fifo_depth_export[\"StreamingFIFO_rtl_%d\" % fifo][\"depth\"] = depth + 4\n", + "\n", + "with open(\"fifo_depth_export.json\", \"w\") as f:\n", + " json.dump(fifo_depth_export, f, indent=2)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.4" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} From b394bba7d4603f149e034c82ef296db93fc575f5 Mon Sep 17 00:00:00 2001 From: Felix Jentzsch Date: Wed, 26 Feb 2025 10:25:07 +0000 Subject: [PATCH 043/125] Initialize DVC --- .dvc/.gitignore | 3 +++ .dvc/config | 9 +++++++++ .dvcignore | 4 ++++ requirements.txt | 3 ++- 4 files changed, 18 insertions(+), 1 deletion(-) create mode 100644 .dvc/.gitignore create mode 100644 .dvc/config create mode 100644 .dvcignore diff --git a/.dvc/.gitignore b/.dvc/.gitignore new file mode 100644 index 0000000000..528f30c71c --- /dev/null +++ b/.dvc/.gitignore @@ -0,0 +1,3 @@ +/config.local +/tmp +/cache diff --git a/.dvc/config b/.dvc/config new file mode 100644 index 0000000000..000da4310e --- /dev/null +++ b/.dvc/config @@ -0,0 +1,9 @@ +[core] + remote = public +['remote "push"'] + url = webdavs://uni-paderborn.sciebo.de/public.php/webdav + user = XKrfO8JuRmm9pBo +['remote "public"'] + url = webdavs://uni-paderborn.sciebo.de/public.php/webdav + user = zkYThpsdAk69ZOb + password = "" diff --git a/.dvcignore b/.dvcignore new file mode 100644 index 0000000000..be35ed42ab --- /dev/null +++ b/.dvcignore @@ -0,0 +1,4 @@ +# Add patterns of files dvc should ignore, which could improve +# the performance. 
Learn more at +# https://dvc.org/doc/user-guide/dvcignore +__pycache__ diff --git a/requirements.txt b/requirements.txt index 1683695576..8233f97a54 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,6 +1,7 @@ bitstring==3.1.7 clize==5.0.1 dataclasses-json==0.5.7 +dvc[webdav]~=3.59.1 gspread==3.6.0 importlib-resources==6.1.0 ipython==8.12.2 @@ -11,7 +12,7 @@ onnxruntime==1.18.1 pre-commit==3.3.2 protobuf==3.20.3 psutil==5.9.4 -pyscaffold==4.4 +pyscaffold==4.6 scipy==1.10.1 setupext-janitor>=1.1.2 sigtools==4.0.1 From d8bc10d9f6a86ab1dd6c5eda1d6ba8f71b28f87c Mon Sep 17 00:00:00 2001 From: Felix Jentzsch Date: Wed, 26 Feb 2025 17:49:25 +0000 Subject: [PATCH 044/125] fix metafi, test dvc --- benchmarking/bench-ci.yml | 1 + benchmarking/cfg/metafi_fifosizing_test.json | 4 ++-- benchmarking/cfg/metafi_test.json | 2 +- benchmarking/collect.py | 22 ++++++++++++++++++++ requirements.txt | 3 ++- 5 files changed, 28 insertions(+), 4 deletions(-) diff --git a/benchmarking/bench-ci.yml b/benchmarking/bench-ci.yml index c3c40d4b0e..c7803e27ec 100644 --- a/benchmarking/bench-ci.yml +++ b/benchmarking/bench-ci.yml @@ -51,6 +51,7 @@ Result Collection: - image_build script: - python benchmarking/collect.py bench_artifacts/tasks_output bench_results.json + - dvc exp push -r origin artifacts: name: "bench_results" when: always diff --git a/benchmarking/cfg/metafi_fifosizing_test.json b/benchmarking/cfg/metafi_fifosizing_test.json index f61ec93217..c61d1265fa 100644 --- a/benchmarking/cfg/metafi_fifosizing_test.json +++ b/benchmarking/cfg/metafi_fifosizing_test.json @@ -20,7 +20,7 @@ { "dut": ["metafi"], "model_path": ["/pc2/groups/hpc-prf-ekiapp/felix/ci_models/metafi/model.onnx"], - "folding_path": ["/pc2/groups/hpc-prf-ekiapp/felix/ci_models/metafi/auto_folding_config _metaFi_f25.json"], + "folding_path": ["/pc2/groups/hpc-prf-ekiapp/felix/ci_models/metafi/auto_folding_config_metaFi_f25.json"], "board": ["RFSoC2x2"], "clock_period_ns": [10], @@ -39,7 +39,7 @@ { "dut": 
["metafi"], "model_path": ["/pc2/groups/hpc-prf-ekiapp/felix/ci_models/metafi/model.onnx"], - "folding_path": ["/pc2/groups/hpc-prf-ekiapp/felix/ci_models/metafi/auto_folding_config _metaFi_f25.json"], + "folding_path": ["/pc2/groups/hpc-prf-ekiapp/felix/ci_models/metafi/auto_folding_config_metaFi_f25.json"], "board": ["RFSoC2x2"], "clock_period_ns": [10], diff --git a/benchmarking/cfg/metafi_test.json b/benchmarking/cfg/metafi_test.json index 0ee1339441..6475f1aadd 100644 --- a/benchmarking/cfg/metafi_test.json +++ b/benchmarking/cfg/metafi_test.json @@ -2,7 +2,7 @@ { "dut": ["metafi"], "model_path": ["/pc2/groups/hpc-prf-ekiapp/felix/ci_models/metafi/model.onnx"], - "folding_path": ["/pc2/groups/hpc-prf-ekiapp/felix/ci_models/metafi/auto_folding_config _metaFi_f25.json"], + "folding_path": ["/pc2/groups/hpc-prf-ekiapp/felix/ci_models/metafi/auto_folding_config_metaFi_f25.json"], "board": ["RFSoC2x2"], "clock_period_ns": [10], diff --git a/benchmarking/collect.py b/benchmarking/collect.py index 3bc9aaf04b..ffe2222f73 100644 --- a/benchmarking/collect.py +++ b/benchmarking/collect.py @@ -3,6 +3,7 @@ import os import sys import time +from dvclive import Live def merge_dicts(a: dict, b: dict): for key in b: @@ -79,6 +80,27 @@ def wait_for_power_measurements(): print("Consolidating synthesis results from all sub-jobs of the array") consolidate_logs(sys.argv[1], sys.argv[2]) + # TEST DVC + # TODO: proper metric collection directly from .jsons in report build dir + combined_log = [] + with open(sys.argv[2], "r") as f: + combined_log = json.load(f) + + for run in combined_log: + with Live(exp_message="Job result collected by GitLab CI", cache_images=True) as live: + metadata = { + "run_id": run["run_id"], + "task_id": run["task_id"], + "status": run["status"], + "total_time": run["total_time"] + } + live.log_params(metadata) + live.log_params(run["params"]) + + if "builder" in run["output"]: + for key in run["output"]["builder"]: + live.log_metric("Resources/" + key, 
run["output"]["builder"][key], plot=False) + # TODO: disabled for now, update accordingly to new runner-based measurement setup # wait_for_power_measurements() # power_log_path = os.path.join("/mnt/pfs/hpc-prf-radioml/felix/jobs/", diff --git a/requirements.txt b/requirements.txt index 8233f97a54..c553f637e6 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,7 +1,8 @@ bitstring==3.1.7 clize==5.0.1 dataclasses-json==0.5.7 -dvc[webdav]~=3.59.1 +dvc[webdav]==3.59.1 +dvclive[image]==3.48.2 gspread==3.6.0 importlib-resources==6.1.0 ipython==8.12.2 From 8324083aa297f736cf16996f97c38eb8ef5709c2 Mon Sep 17 00:00:00 2001 From: Felix Jentzsch Date: Thu, 27 Feb 2025 13:49:05 +0000 Subject: [PATCH 045/125] Fix ResNet-50 streamlining --- benchmarking/dut/resnet50_custom_steps.py | 95 ++++++++++++----------- 1 file changed, 48 insertions(+), 47 deletions(-) diff --git a/benchmarking/dut/resnet50_custom_steps.py b/benchmarking/dut/resnet50_custom_steps.py index ddf8b0d0de..90deae5721 100644 --- a/benchmarking/dut/resnet50_custom_steps.py +++ b/benchmarking/dut/resnet50_custom_steps.py @@ -27,75 +27,65 @@ # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+from qonnx.core.datatype import DataType from qonnx.core.modelwrapper import ModelWrapper - +from qonnx.transformation.batchnorm_to_affine import BatchNormToAffine +from qonnx.transformation.composed import ComposedTransformation +from qonnx.transformation.double_to_single_float import DoubleToSingleFloat from qonnx.transformation.fold_constants import FoldConstants - from qonnx.transformation.general import ( - ConvertSubToAdd, + ApplyConfig, ConvertDivToMul, + ConvertSubToAdd, GiveReadableTensorNames, GiveUniqueNodeNames, - SortGraph, - RemoveUnusedTensors, GiveUniqueParameterTensors, RemoveStaticGraphInputs, - ApplyConfig, + RemoveUnusedTensors, + SortGraph, ) +from qonnx.transformation.infer_data_layouts import InferDataLayouts +from qonnx.transformation.infer_datatypes import InferDataTypes +from qonnx.transformation.infer_shapes import InferShapes +from qonnx.transformation.insert_topk import InsertTopK +from qonnx.transformation.lower_convs_to_matmul import LowerConvsToMatMul +from qonnx.transformation.remove import RemoveIdentityOps +import finn.transformation.fpgadataflow.convert_to_hw_layers as to_hw +from finn.builder.build_dataflow_config import DataflowBuildConfig, ShellFlowType +from finn.transformation.move_reshape import RemoveCNVtoFCFlatten from finn.transformation.streamline.absorb import ( - AbsorbScalarMulAddIntoTopK, - AbsorbAddIntoMultiThreshold, - AbsorbMulIntoMultiThreshold, - FactorOutMulSignMagnitude, - Absorb1BitMulIntoMatMul, Absorb1BitMulIntoConv, + Absorb1BitMulIntoMatMul, + AbsorbAddIntoMultiThreshold, AbsorbConsecutiveTransposes, + AbsorbMulIntoMultiThreshold, + AbsorbScalarMulAddIntoTopK, AbsorbTransposeIntoMultiThreshold, + FactorOutMulSignMagnitude, ) - from finn.transformation.streamline.collapse_repeated import ( CollapseRepeatedAdd, CollapseRepeatedMul, ) +# just for not linear from finn.transformation.streamline.reorder import ( + MoveAddPastConv, MoveAddPastMul, - MoveScalarMulPastMatMul, + MoveLinearPastEltwiseAdd, + 
MoveLinearPastFork, + MoveMaxPoolPastMultiThreshold, MoveScalarAddPastMatMul, - MoveAddPastConv, - MoveScalarMulPastConv, MoveScalarLinearPastInvariants, - MoveMaxPoolPastMultiThreshold, + MoveScalarMulPastConv, + MoveScalarMulPastMatMul, + MoveTransposePastEltwise, + MoveTransposePastFork, + MoveTransposePastJoinAdd, ) - from finn.transformation.streamline.round_thresholds import RoundAndClipThresholds from finn.transformation.streamline.sign_to_thres import ConvertSignToThres -from qonnx.transformation.batchnorm_to_affine import BatchNormToAffine - -# just for not linear -from finn.transformation.streamline.reorder import ( - MoveLinearPastEltwiseAdd, - MoveLinearPastFork, -) - -from qonnx.transformation.double_to_single_float import DoubleToSingleFloat -from qonnx.transformation.remove import RemoveIdentityOps -from qonnx.core.datatype import DataType - -from qonnx.transformation.infer_shapes import InferShapes -from qonnx.transformation.infer_datatypes import InferDataTypes -from qonnx.transformation.infer_data_layouts import InferDataLayouts -from qonnx.transformation.insert_topk import InsertTopK -import finn.transformation.fpgadataflow.convert_to_hw_layers as to_hw -from qonnx.transformation.lower_convs_to_matmul import LowerConvsToMatMul - -from finn.builder.build_dataflow_config import ( - DataflowBuildConfig, - ShellFlowType, -) - -from finn.transformation.move_reshape import RemoveCNVtoFCFlatten def step_resnet50_tidy(model: ModelWrapper, cfg: DataflowBuildConfig): @@ -170,6 +160,19 @@ def step_resnet50_streamline(model: ModelWrapper, cfg: DataflowBuildConfig): model = model.transform(DoubleToSingleFloat()) + # Lower convolutions and streamline resulting transposes + model = model.transform(LowerConvsToMatMul()) + model = model.transform( + ComposedTransformation( + [ + MoveTransposePastJoinAdd(), + MoveTransposePastFork(), + MoveTransposePastEltwise(), + AbsorbConsecutiveTransposes(), + AbsorbTransposeIntoMultiThreshold(), + ] + ) + ) return model @@ 
-181,17 +184,15 @@ def step_resnet50_convert_to_hw(model: ModelWrapper, cfg: DataflowBuildConfig): model = model.transform(SortGraph()) to_hw_transformations = [ - to_hw.InferAddStreamsLayer, - LowerConvsToMatMul, to_hw.InferChannelwiseLinearLayer, to_hw.InferPool, - AbsorbTransposeIntoMultiThreshold, + AbsorbConsecutiveTransposes, RoundAndClipThresholds, to_hw.InferQuantizedMatrixVectorActivation, to_hw.InferThresholdingLayer, - AbsorbConsecutiveTransposes, to_hw.InferConvInpGen, to_hw.InferDuplicateStreamsLayer, + to_hw.InferAddStreamsLayer, to_hw.InferLabelSelectLayer, ] for trn in to_hw_transformations: @@ -249,4 +250,4 @@ def step_resnet50_slr_floorplan(model: ModelWrapper, cfg: DataflowBuildConfig): # comment: apply floorplan to model # model = model.transform(ApplyConfig(floorplan)) # print("SLR floorplanning applied from partitioner") - return model \ No newline at end of file + return model From 66a9c6e6e6e01850577e43d535322dc8a6a10add Mon Sep 17 00:00:00 2001 From: Felix Jentzsch Date: Thu, 27 Feb 2025 13:50:04 +0000 Subject: [PATCH 046/125] Remove transformer debug streamlining code --- benchmarking/bench-ci.yml | 4 ++-- benchmarking/dut/transformer_custom_steps.py | 5 ----- 2 files changed, 2 insertions(+), 7 deletions(-) diff --git a/benchmarking/bench-ci.yml b/benchmarking/bench-ci.yml index c7803e27ec..206d395839 100644 --- a/benchmarking/bench-ci.yml +++ b/benchmarking/bench-ci.yml @@ -50,8 +50,8 @@ Result Collection: tags: - image_build script: - - python benchmarking/collect.py bench_artifacts/tasks_output bench_results.json - - dvc exp push -r origin + - python3.10 benchmarking/collect.py bench_artifacts/tasks_output bench_results.json + - dvc exp push origin artifacts: name: "bench_results" when: always diff --git a/benchmarking/dut/transformer_custom_steps.py b/benchmarking/dut/transformer_custom_steps.py index 1a96117e22..4ff497b892 100644 --- a/benchmarking/dut/transformer_custom_steps.py +++ b/benchmarking/dut/transformer_custom_steps.py @@ 
-276,11 +276,6 @@ def step_streamline(model: ModelWrapper, cfg: DataflowBuildConfig): # Note: Contains some sets of nested exhaustive transformations meant for # particular architectural patterns, e.g., residual topologies. model = model.transform(Streamline()) - # DEBUG for streamlining after moving to MoveLinearPastFork with workaround applied - model = model.transform(MoveMulPastAdd()) - model = model.transform(AbsorbMulIntoMultiThreshold()) - model = model.transform(AbsorbAddIntoMultiThreshold()) - model = model.transform(MoveAddPastMul()) # If configured, run a verification of the transformed model on some # sample inputs if VerificationStepType.STREAMLINED_PYTHON in cfg._resolve_verification_steps(): # noqa From c1696d9f82c6506c586cf3fe09cd1fd0cbba39d2 Mon Sep 17 00:00:00 2001 From: Felix Jentzsch Date: Thu, 27 Feb 2025 14:27:13 +0000 Subject: [PATCH 047/125] Enable live fifosizing option --- benchmarking/bench_base.py | 17 +++++++++++++++-- benchmarking/dut/metafi.py | 2 -- benchmarking/dut/resnet50.py | 3 +-- benchmarking/dut/transformer.py | 5 ----- 4 files changed, 16 insertions(+), 11 deletions(-) diff --git a/benchmarking/bench_base.py b/benchmarking/bench_base.py index 9493a12786..a97054aca9 100644 --- a/benchmarking/bench_base.py +++ b/benchmarking/bench_base.py @@ -24,7 +24,8 @@ from finn.analysis.fpgadataflow.hls_synth_res_estimation import hls_synth_res_estimation from finn.analysis.fpgadataflow.res_estimation import res_estimation from finn.transformation.fpgadataflow.make_zynq_proj import collect_ip_dirs -from finn.util.basic import make_build_dir, pynq_native_port_width, part_map +import finn.builder.build_dataflow_config as build_cfg +from finn.util.basic import make_build_dir, pynq_native_port_width, part_map, alveo_default_platform, alveo_part_map from templates import template_open, template_single_test, template_sim_power, template_switching_simulation_tb, zynq_harness_template from util import summarize_table, summarize_section, 
power_xml_to_dict, prepare_inputs, delete_dir_contents from finn.transformation.fpgadataflow.replace_verilog_relpaths import ( @@ -1065,6 +1066,11 @@ def steps_full_build_flow(self): # TODO: set as much as possible here, e.g. verbose, debug, force_python, vitisopt, shell_flow cfg = self.step_build_setup() cfg.board = self.board + if self.board in alveo_part_map: + cfg.shell_flow_type=build_cfg.ShellFlowType.VITIS_ALVEO + cfg.vitis_platform=alveo_default_platform[self.board] + else: + cfg.shell_flow_type=build_cfg.ShellFlowType.VIVADO_ZYNQ cfg.verbose = False cfg.enable_build_pdb_debug = False cfg.force_python_rtlsim = False @@ -1072,10 +1078,17 @@ def steps_full_build_flow(self): #cfg.default_swg_exception #cfg.large_fifo_mem_style - # "manual or "characterize" or "largefifo_rtlsim" + # "manual or "characterize" or "largefifo_rtlsim" or "live" if "fifo_method" in self.params: if self.params["fifo_method"] == "manual": cfg.auto_fifo_depths = False + elif self.params["fifo_method"] == "live": + cfg.auto_fifo_depths = False + cfg.live_fifo_sizing = True + cfg.enable_instrumentation = True + # Overwrite output products + # TODO: make configurable directly via JSON/YAML cfg + cfg.generate_outputs = [build_cfg.DataflowOutputType.BITFILE] else: cfg.auto_fifo_depths = True cfg.auto_fifo_strategy = self.params["fifo_method"] diff --git a/benchmarking/dut/metafi.py b/benchmarking/dut/metafi.py index 7808f11856..b4bd4246b7 100644 --- a/benchmarking/dut/metafi.py +++ b/benchmarking/dut/metafi.py @@ -49,8 +49,6 @@ def step_build_setup(self): steps=steps, target_fps=None, #23 - shell_flow_type=build_cfg.ShellFlowType.VIVADO_ZYNQ, # TODO: generalize/adapt to new back-end - #vitis_platform=vitis_platform, split_large_fifos=True, # probably needed #TODO: account for this in FIFO reduction test diff --git a/benchmarking/dut/resnet50.py b/benchmarking/dut/resnet50.py index 87c6e04e2e..ec03e44a8b 100644 --- a/benchmarking/dut/resnet50.py +++ b/benchmarking/dut/resnet50.py @@ -39,9 
+39,8 @@ def step_build_setup(self): output_dir = self.build_inputs["build_dir"], synth_clk_period_ns = self.clock_period_ns, steps=resnet50_build_steps, - shell_flow_type=build_cfg.ShellFlowType.VITIS_ALVEO, # TODO: generalize/adapt to new back-end + split_large_fifos=True, - vitis_platform=alveo_default_platform[self.board], # TODO: generalize/adapt to new back-end # enable extra performance optimizations (physopt) vitis_opt_strategy=build_cfg.VitisOptStrategy.PERFORMANCE_BEST, diff --git a/benchmarking/dut/transformer.py b/benchmarking/dut/transformer.py index 014da2e13e..91c73bbffe 100644 --- a/benchmarking/dut/transformer.py +++ b/benchmarking/dut/transformer.py @@ -897,10 +897,6 @@ def step_build_setup(self): with open("folding.yaml", "w") as f: f.write(template_folding_yaml) - if self.board in alveo_part_map: - shell_flow = "vitis_alveo" - else: - shell_flow = "vivado_zynq" # Create a configuration for building the scaled dot-product attention # operator to a hardware accelerator @@ -910,7 +906,6 @@ def step_build_setup(self): output_dir = self.build_inputs["build_dir"], stitched_ip_gen_dcp = False, # only needed for further manual integration synth_clk_period_ns = self.clock_period_ns, - shell_flow_type = shell_flow, folding_config_file = "folding.yaml", specialize_layers_config_file = "specialize_layers.json", standalone_thresholds = True, From 01d5551f1d93a6c97760e3a4335018e314f9de6b Mon Sep 17 00:00:00 2001 From: Felix Jentzsch Date: Thu, 27 Feb 2025 15:37:31 +0000 Subject: [PATCH 048/125] Generate FIFO size report as part of step_set_fifo_depths --- src/finn/builder/build_dataflow_steps.py | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/src/finn/builder/build_dataflow_steps.py b/src/finn/builder/build_dataflow_steps.py index fe0cb68a88..ef90cba0b6 100644 --- a/src/finn/builder/build_dataflow_steps.py +++ b/src/finn/builder/build_dataflow_steps.py @@ -656,6 +656,23 @@ def step_set_fifo_depths(model: ModelWrapper, cfg: 
DataflowBuildConfig): model = model.transform(SplitLargeFIFOs()) model = model.transform(RemoveShallowFIFOs()) + # generate a dedicated report about final FIFO sizes + fifo_info = {} + fifo_info["fifo_depths"] = {} + fifo_info["fifo_sizes"] = {} + total_fifo_size = 0 + for node in model.get_nodes_by_op_type("StreamingFIFO_rtl"): + node_inst = getCustomOp(node) + fifo_info["fifo_depths"][node.name] = node_inst.get_nodeattr("depth") + fifo_info["fifo_sizes"][ + node.name + ] = node_inst.get_instream_width() * node_inst.get_nodeattr("depth") + total_fifo_size += fifo_info["fifo_sizes"][node.name] + fifo_info["total_fifo_size_kB"] = int(total_fifo_size / 8.0 / 1000.0) + + with open(cfg.output_dir + "/report/fifo_sizing.json", "w") as f: + json.dump(fifo_info, f, indent=2) + # after FIFOs are ready to go, call PrepareIP and HLSSynthIP again # this will only run for the new nodes (e.g. FIFOs and DWCs) model = model.transform(PrepareIP(cfg._resolve_fpga_part(), cfg._resolve_hls_clk_period())) From 3598501532ede834cf894439bab9793dc49a853f Mon Sep 17 00:00:00 2001 From: Felix Jentzsch Date: Thu, 27 Feb 2025 17:49:47 +0000 Subject: [PATCH 049/125] Add PYNQ driver for ZYNQ platforms --- src/finn/builder/build_dataflow_steps.py | 10 +- .../driver/driver_instrumentation.py | 143 ++++++++++++++++++ .../fpgadataflow/make_pynq_driver.py | 33 +++- 3 files changed, 183 insertions(+), 3 deletions(-) create mode 100644 src/finn/qnn-data/templates/driver/driver_instrumentation.py diff --git a/src/finn/builder/build_dataflow_steps.py b/src/finn/builder/build_dataflow_steps.py index a4481ed778..96f3bd7c63 100644 --- a/src/finn/builder/build_dataflow_steps.py +++ b/src/finn/builder/build_dataflow_steps.py @@ -90,7 +90,10 @@ from finn.transformation.fpgadataflow.insert_dwc import InsertDWC from finn.transformation.fpgadataflow.insert_fifo import InsertFIFO from finn.transformation.fpgadataflow.insert_tlastmarker import InsertTLastMarker -from 
finn.transformation.fpgadataflow.make_pynq_driver import MakePYNQDriver +from finn.transformation.fpgadataflow.make_pynq_driver import ( + MakePYNQDriverIODMA, + MakePYNQDriverInstrumentation, +) from finn.transformation.fpgadataflow.make_zynq_proj import ZynqBuild from finn.transformation.fpgadataflow.minimize_accumulator_width import ( MinimizeAccumulatorWidth, @@ -782,7 +785,10 @@ def step_make_pynq_driver(model: ModelWrapper, cfg: DataflowBuildConfig): if DataflowOutputType.PYNQ_DRIVER in cfg.generate_outputs: driver_dir = cfg.output_dir + "/driver" - model = model.transform(MakePYNQDriver(cfg._resolve_driver_platform())) + if cfg.enable_instrumentation: + model = model.transform(MakePYNQDriverInstrumentation(cfg._resolve_driver_platform(), cfg.synth_clk_period_ns)) + else: + model = model.transform(MakePYNQDriverIODMA(cfg._resolve_driver_platform())) shutil.copytree(model.get_metadata_prop("pynq_driver_dir"), driver_dir, dirs_exist_ok=True) print("PYNQ Python driver written into " + driver_dir) return model diff --git a/src/finn/qnn-data/templates/driver/driver_instrumentation.py b/src/finn/qnn-data/templates/driver/driver_instrumentation.py new file mode 100644 index 0000000000..fea9446bf5 --- /dev/null +++ b/src/finn/qnn-data/templates/driver/driver_instrumentation.py @@ -0,0 +1,143 @@ +import time +import json +import argparse +import matplotlib as mpl +import matplotlib.pyplot as plt +from IPython.display import clear_output +import numpy as np +from pynq import Overlay +from pynq.ps import Clocks +from pynq.pl_server.device import Device + +### Instrumentation wrapper register map ### +#ap_uint<32> cfg, // [0] - 0:hold, 1:lfsr; [31:16] - LFSR seed +#ap_uint<32> &status, // [0] - timestamp overflow; [1] - timestamp underflow +#ap_uint<32> &latency, +#ap_uint<32> &interval, +#ap_uint<32> &checksum, +#ap_uint<32> &min_latency + +class FINNInstrumentationOverlay(Overlay): + def __init__( + self, + bitfile_name, + platform = "zynq", + fclk_mhz = 100.0, + 
device = None, + download = True, + seed = 1, + ): + super().__init__(bitfile_name, download=download, device=device) + + self.platform = platform + self.fclk_mhz = fclk_mhz + self.seed = seed + + # configure clock (for ZYNQ platforms) + if self.platform == "zynq": + if self.fclk_mhz > 0: + Clocks.fclk0_mhz = self.fclk_mhz + self.fclk_mhz_actual = Clocks.fclk0_mhz + + def instrumentation_read(self, name): + return self.instrumentation_wrap_0.read(offset=self.ip_dict["instrumentation_wrap_0"]["registers"][name]["address_offset"]) + + def instrumentation_write(self, name, value): + return self.instrumentation_wrap_0.write(offset=self.ip_dict["instrumentation_wrap_0"]["registers"][name]["address_offset"], value=value) + + def reset_accelerator(self): + self.axi_gpio_0.write(offset=self.ip_dict["axi_gpio_0"]["registers"]["GPIO_DATA"]["address_offset"], value=0) + + def start_accelerator(self): + lfsr_seed = (self.seed << 16) & 0xffff0000 # upper 16 bits + self.instrumentation_write("cfg", lfsr_seed + 1) # start operation + + def observe_instrumentation(self, debug_print=True): + status_reg = self.instrumentation_read("status") + chksum_reg = self.instrumentation_read("checksum") + min_latency = self.instrumentation_read("min_latency") + latency = self.instrumentation_read("latency") + interval = self.instrumentation_read("interval") + + frame = (chksum_reg >> 24) & 0x000000ff + checksum = chksum_reg & 0x00ffffff + overflow_err = (status_reg & 0x00000001) != 0 + underflow_err = (status_reg & 0x00000002) != 0 + + if debug_print: + print("---INSTRUMENTATION_REPORT---") + if overflow_err or underflow_err: + print("Status ERROR") + print("Overflow error: %s" % overflow_err) + print("Underflow error: %s" % underflow_err) + else: + print("Status OK") + print("Frame number (8-bit): %d" % frame) + print("Checksum: 0x%06x" % checksum) + print("Min Latency (cycles): %d" % min_latency) + print("Latency (cycles): %d" % latency) + print("Interval (cycles): %d" % interval) + 
print("----------------------------") + + return (overflow_err, underflow_err, frame, checksum, min_latency, latency, interval) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description='Profile performance of FINN-generated accelerator using instrumentation wrapper') + parser.add_argument('--runtime', help='Runtime in seconds', type=int, default=10) + parser.add_argument('--frequency', help='FPGA clock frequency in MHz', type=float, default=100.0) + parser.add_argument('--seed', help='LFSR seed for input data generation', type=int, default=1) + parser.add_argument('--device', help='FPGA device to be used', type=int, default=0) + parser.add_argument('--bitfile', help='Name of bitfile', default="finn-accel.bit") + parser.add_argument('--reportfile', help='Name of output .json report file', type=str, default="measured_performance.json") + parser.add_argument('--settingsfile', help='Name of optional input .json settings file', type=str, default="") + # parse arguments + args = parser.parse_args() + runtime = args.runtime + frequency = args.frequency + seed = args.seed + bitfile = args.bitfile + reportfile = args.reportfile + settingsfile = args.settingsfile + devID = args.device + device = Device.devices[devID] + + # overwrite frequency if specified in settings file + if settingsfile != "": + with open(settingsfile, "r") as f: + settings = json.load(f) + if "fclk_mhz" in settings: + frequency = settings["fclk_mhz"] + + # instantiate FINN accelerator driver and pass batchsize and bitfile + print("Programming FPGA..") + accel = FINNInstrumentationOverlay(bitfile_name = bitfile, device = device, fclk_mhz = frequency, seed = seed) + + # start accelerator + print("Running accelerator..") + accel.start_accelerator() + + # let it run for specified runtime + time.sleep(runtime) + + # read measurement from instrumentation + (overflow_err, underflow_err, frame, checksum, min_latency, latency, interval) = accel.observe_instrumentation() + + # write report to 
file + report = { + "error": overflow_err or underflow_err or interval == 0, + "checksum": checksum, + "min_latency_cycles": min_latency, + "latency_cycles": latency, + "interval_cycles": interval, + "frequency_mhz": round(accel.fclk_mhz_actual), + "min_latency_ms": round(min_latency * (1 / (accel.fclk_mhz_actual * 1e6)) * 1e3, 6), + "latency_ms": round(latency * (1 / (accel.fclk_mhz_actual * 1e6)) * 1e3, 6), + "throughput_fps": round(1 / (interval * (1 / (accel.fclk_mhz_actual * 1e6)))), + "min_pipeline_depth": round(min_latency / interval, 2), + "pipeline_depth" : round(latency / interval, 2), + } + with open(reportfile, "w") as f: + json.dump(report, f, indent=2) + + print("Done.") diff --git a/src/finn/transformation/fpgadataflow/make_pynq_driver.py b/src/finn/transformation/fpgadataflow/make_pynq_driver.py index ea9bd2aa26..b935f5eea0 100644 --- a/src/finn/transformation/fpgadataflow/make_pynq_driver.py +++ b/src/finn/transformation/fpgadataflow/make_pynq_driver.py @@ -28,6 +28,7 @@ import numpy as np import os +import json import qonnx import shutil import warnings @@ -62,7 +63,7 @@ def to_external_tensor(init, w_dtype): return ext_weight -class MakePYNQDriver(Transformation): +class MakePYNQDriverIODMA(Transformation): """Create PYNQ Python code to correctly interface the generated accelerator, including data packing/unpacking. 
Should be called after conversion to HLS layers, folding and the creation of @@ -302,4 +303,34 @@ def apply(self, model): else: continue + +class MakePYNQDriverInstrumentation(Transformation): + def __init__(self, platform, clk_period_ns): + super().__init__() + self.platform = platform + self.clk_period_ns = clk_period_ns + + def apply(self, model): + # TODO: support runtime-writable and external weights + # TODO: support Alveo and Versal platforms + + # create a temporary folder for the generated driver + pynq_driver_dir = make_build_dir(prefix="pynq_driver_") + model.set_metadata_prop("pynq_driver_dir", pynq_driver_dir) + + # create (copy) the static instrumentation driver + driver_template = ( + os.environ["FINN_ROOT"] + "/src/finn/qnn-data/templates/driver/driver_instrumentation.py" + ) + driver_py = pynq_driver_dir + "/driver.py" + shutil.copy(driver_template, driver_py) + + # write default settings to driver config file + settings = { + "fclk_mhz": (1.0 / self.clk_period_ns) * 1e3, + } + settingsfile = pynq_driver_dir + "/settings.json" + with open(settingsfile, "w") as f: + json.dump(settings, f, indent=2) + return (model, False) From f32e884b81ba4f916a96a80b8d30b0bf44b8613a Mon Sep 17 00:00:00 2001 From: Felix Jentzsch Date: Thu, 27 Feb 2025 21:09:27 +0000 Subject: [PATCH 050/125] Add non-interactive driver --- src/finn/builder/build_dataflow_steps.py | 2 +- .../templates/driver/driver_fifosizing.py | 320 ++++++++++++++++++ .../fpgadataflow/make_pynq_driver.py | 24 +- 3 files changed, 343 insertions(+), 3 deletions(-) create mode 100644 src/finn/qnn-data/templates/driver/driver_fifosizing.py diff --git a/src/finn/builder/build_dataflow_steps.py b/src/finn/builder/build_dataflow_steps.py index dd50e8880f..2f05886afd 100644 --- a/src/finn/builder/build_dataflow_steps.py +++ b/src/finn/builder/build_dataflow_steps.py @@ -826,7 +826,7 @@ def step_make_pynq_driver(model: ModelWrapper, cfg: DataflowBuildConfig): if DataflowOutputType.PYNQ_DRIVER in 
cfg.generate_outputs: driver_dir = cfg.output_dir + "/driver" if cfg.enable_instrumentation: - model = model.transform(MakePYNQDriverInstrumentation(cfg._resolve_driver_platform(), cfg.synth_clk_period_ns)) + model = model.transform(MakePYNQDriverInstrumentation(cfg._resolve_driver_platform(), cfg.synth_clk_period_ns, cfg.live_fifo_sizing)) else: model = model.transform(MakePYNQDriverIODMA(cfg._resolve_driver_platform())) shutil.copytree(model.get_metadata_prop("pynq_driver_dir"), driver_dir, dirs_exist_ok=True) diff --git a/src/finn/qnn-data/templates/driver/driver_fifosizing.py b/src/finn/qnn-data/templates/driver/driver_fifosizing.py new file mode 100644 index 0000000000..560959991f --- /dev/null +++ b/src/finn/qnn-data/templates/driver/driver_fifosizing.py @@ -0,0 +1,320 @@ +import time +import json +import os +import argparse +import matplotlib as mpl +import matplotlib.pyplot as plt +import numpy as np +from pynq.pl_server.device import Device + +from driver_instrumentation import FINNInstrumentationOverlay + + +class FINNLiveFIFOOverlay(FINNInstrumentationOverlay): + def __init__( + self, + bitfile_name, + platform = "zynq", + fclk_mhz = 100.0, + device = None, + download = True, + seed = 1, + fifo_widths = {}, + ): + super().__init__(bitfile_name, platform = platform, fclk_mhz = fclk_mhz, seed = seed, download = download, device = device) + + self.error = False + self.fifo_widths = fifo_widths + self.num_fifos = len(self.fifo_widths) + # Try to account for additional registers introduced by virtual FIFO HLS implementation + self.fifo_depth_offset = 4 + + # Sanity check + # We expect 3 AXI-Lite peripherals next to the virtual FIFOs: instrumentation_wrap_0, axi_gpio_0 (for reset), zynq_ps + # We don't expect any additional FINN SDPs with AXI-Lite interface, such as runtime-writable weights + if (len(self.ip_dict.keys()) - 3) != self.num_fifos: + self.error = True + + def configure_fifo(self, i, mode, depth = 2): + ### Virtual FIFO register map ### + 
mode_offset = 0x10 + depth_offset = 0x18 + occupancy_offset = 0x20 + occupancy_ctrl_offset = 0x24 + max_occupancy_offset = 0x30 + max_occupancy_ctrl_offset = 0x34 + + ip_name = "StreamingDataflowPartition_%d" % i + getattr(self, ip_name).write(offset=mode_offset, value = mode) + getattr(self, ip_name).write(offset=depth_offset, value = depth) + + def total_fifo_size(self, depths): + # Assuming FIFO SDP/AXI-Lite interfaces are ordered consistently with FIFO IDs + total_size_bits = 0 + for i, depth in enumerate(depths): + total_size_bits += (depth + self.fifo_depth_offset) * self.fifo_widths["StreamingFIFO_hls_%d" % i] + total_size_kB = total_size_bits / 8.0 / 1000.0 + return total_size_kB + + def size_iteratively(self, start_depth, iteration_runtime, reduction_factor = 0.5): + ### Iterative FIFO-sizing function ### + fifo_minimum_reached = [False] * self.num_fifos + + if isinstance(start_depth, list): + # Individual start depth for each FIFO has been supplied + fifo_depths = start_depth + else: + # Initialize all depths to the same start depth + fifo_depths = [start_depth] * self.num_fifos + + # Reset accelerator and configure FIFOs + self.reset_accelerator() + for i in range(0, self.num_fifos): + self.configure_fifo(i, mode = 1, depth = fifo_depths[i]) + + # Run once to determine target interval + self.start_accelerator() + time.sleep(1) + (overflow_err, underflow_err, frame, checksum, min_latency, latency, interval) = self.observe_instrumentation(False) + log_total_fifo_size = [int(self.total_fifo_size(fifo_depths))] + log_interval = [interval] + log_min_latency = [min_latency] + log_latency = [latency] + target_interval = interval + + # Iteratively reduce FIFO depth until all FIFOs are minimized + iteration = 0 + start_time = time.time() + while not all(fifo_minimum_reached): + for fifo_id in range(0, self.num_fifos): + if not fifo_minimum_reached[fifo_id]: + fifo_depth_before = fifo_depths[fifo_id] + fifo_depths[fifo_id] = int(fifo_depths[fifo_id] * 
reduction_factor) + + # Reset accelerator + self.reset_accelerator() + + # Configure all FIFOs + for i in range(0, self.num_fifos): + self.configure_fifo(i, mode = 1, depth = fifo_depths[i]) + + # Start accelerator + self.start_accelerator() + + # Let it run + time.sleep(iteration_runtime) + + # Check if throughput dropped or deadlock occured + (overflow_err, underflow_err, frame, checksum, min_latency, latency, interval) = self.observe_instrumentation(False) + + if interval > target_interval or interval == 0 or overflow_err or underflow_err: + # Revert depth reduction and mark FIFO as minimized + fifo_depths[fifo_id] = fifo_depth_before + fifo_minimum_reached[fifo_id] = True + else: + log_total_fifo_size.append(int(self.total_fifo_size(fifo_depths))) + log_interval.append(interval) + log_min_latency.append(min_latency) + log_latency.append(latency) + + if fifo_depths[fifo_id] == 1: + fifo_minimum_reached[fifo_id] = True + + # Report status + print("Iteration: %d" % iteration) + print("Numer of minimized FIFOs: %d/%d" % (sum(fifo_minimum_reached), self.num_fifos)) + print("Interval: %d" % log_interval[-1]) + print("Min. 
latency / latency: %d/%d" % (log_min_latency[-1], log_latency[-1])) + print("Total FIFO Size (kB): %d" % log_total_fifo_size[-1]) + + iteration += 1 + + end_time = time.time() + duration = int(end_time - start_time) + print("Done (%d seconds)" % duration) + + return fifo_depths, log_total_fifo_size, log_interval, log_min_latency, log_latency, duration + + def determine_start_depth(self, ): + ### Attempt to determine start depth for all FIFOs automatically ### + # If it doesn't find a working setting, start depth must be set manually, potentially on per-FIFO basis + start_depth = 64 + last_interval = 0 + start_depth_found = False + + while not start_depth_found and not self.error: + print("Testing start depth of %d" % start_depth) + self.reset_accelerator() + + # Configure FIFOs + for i in range(0, self.num_fifos): + self.configure_fifo(i, mode = 1, depth = start_depth) + + # Start accelerator and let it run for a long time + self.start_accelerator() + time.sleep(1) + + # Examine performance + (overflow_err, underflow_err, frame, checksum, min_latency, latency, interval) = self.observe_instrumentation() + if interval > 0 and interval == last_interval and not overflow_err and not underflow_err: + # Accelerator runs with stable interval, reset to previous start depth + start_depth_found = True + start_depth = last_start_depth + else: + # Start depth is still too small, increase for next try + last_start_depth = start_depth + start_depth = start_depth * 2 + + last_interval = interval + + if start_depth > 1000000: + print("Couldn't find a working start depth, please set manually") + self.error = True + + # Determine runtime per iteration based on performance, so that stable-state is guaranteed + # Use a simple overestimation for now to be safe + iteration_runtime = max(0.01, (min_latency * 5) * 10 / 1000 / 1000 / 1000) + + print("Determined start depth for all FIFOs: %d" % start_depth) + print("Determined iteration runtime based on performance: %f s" % 
iteration_runtime) + return (start_depth, iteration_runtime) + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description='Profile performance of FINN-generated accelerator using instrumentation wrapper') + parser.add_argument('--runtime', help='Runtime in seconds', type=int, default=10) + parser.add_argument('--frequency', help='FPGA clock frequency in MHz', type=float, default=100.0) + parser.add_argument('--seed', help='LFSR seed for input data generation', type=int, default=1) + parser.add_argument('--device', help='FPGA device to be used', type=int, default=0) + parser.add_argument('--bitfile', help='Name of bitfile', default="finn-accel.bit") + parser.add_argument('--reportfile', help='Name of output .json report file', type=str, default="measured_performance.json") + parser.add_argument('--settingsfile', help='Name of optional input .json settings file', type=str, default="") + # parse arguments + args = parser.parse_args() + runtime = args.runtime + frequency = args.frequency + seed = args.seed + bitfile = args.bitfile + reportfile = args.reportfile + report_dir = os.path.dirname(reportfile) + settingsfile = args.settingsfile + devID = args.device + device = Device.devices[devID] + + # overwrite frequency if specified in settings file + if settingsfile != "": + with open(settingsfile, "r") as f: + settings = json.load(f) + if "fclk_mhz" in settings: + frequency = settings["fclk_mhz"] + + # For live FIFO-sizing, we also expect a fifo_widths.json file exported by FINN listing the width of each FIFO, e.g., + # {'fifo_widths': {'StreamingFIFO_hls_0': 8, 'StreamingFIFO_hls_1': 32, 'StreamingFIFO_hls_2': 24}} + fifo_widths = settings["fifo_widths"] + + + print("Programming FPGA..") + accel = FINNLiveFIFOOverlay(bitfile_name = bitfile, device = device, fclk_mhz = frequency, seed = seed, fifo_widths = fifo_widths) + + (start_depth, iteration_runtime) = accel.determine_start_depth() + + ### First pass + print("Starting first pass..") + pass1_result = 
accel.size_iteratively(start_depth, iteration_runtime) + (fifo_depths, + log_total_fifo_size, + log_interval, + log_min_latency, + log_latency, + duration) = pass1_result + + ### Visualize results and save as "fifo_sizing_graph.png" + fig, ax1 = plt.subplots() + + color = 'tab:red' + ax1.set_xlabel('Iteration') + ax1.set_ylabel('Total FIFO Size [kB]', color=color) + ax1.plot(range(len(log_total_fifo_size)), log_total_fifo_size, color=color) + ax1.tick_params(axis='y', labelcolor=color) + ax1.set_ylim(0, max(log_total_fifo_size)) + + ax2 = ax1.twinx() # instantiate a second axes that shares the same x-axis + + color = 'tab:blue' + ax2.set_ylabel('Latency [cycles]', color=color) + ax2.plot(range(len(log_total_fifo_size)), log_latency, color=color) + ax2.tick_params(axis='y', labelcolor=color) + #ax2.set_ylim(0, max(log_latency)) + + ax2.axhline(log_min_latency[0], color="green", label="Minimum (1st frame) Latency") + ax2.legend() + + plt.tight_layout() + plt.savefig(os.path.join(report_dir, "fifo_sizing_graph.png"), dpi = 300) + + ### Second pass for fine-tuning + print("Starting second pass..") + pass2_result = accel.size_iteratively(fifo_depths, iteration_runtime, reduction_factor = 0.95) + (fifo_depths, + log_total_fifo_size, + log_interval, + log_min_latency, + log_latency, + duration) = pass2_result + + ### Generate fifo_sizing_report.json + fifo_report = { + "error": accel.error, + "fifo_size_total_kB": log_total_fifo_size[-1], + "fifo_depths": {}, + "fifo_sizes": {}, + "pass_1": { + "duration": pass1_result[5], + "log_total_fifo_size": pass1_result[1], + "log_interval": pass1_result[2], + "log_min_latency": pass1_result[3], + "log_latency": pass1_result[4], + }, + "pass_2": { + "duration": pass2_result[5], + "log_total_fifo_size": pass2_result[1], + "log_interval": pass2_result[2], + "log_min_latency": pass2_result[3], + "log_latency": pass2_result[4], + }, + } + for fifo, depth in enumerate(fifo_depths): + size = (depth + accel.fifo_depth_offset) * 
accel.fifo_widths["StreamingFIFO_hls_%d" % fifo] + fifo_report["fifo_depths"][fifo] = depth + accel.fifo_depth_offset + fifo_report["fifo_sizes"][fifo] = size + with open(os.path.join(report_dir, "fifo_sizing_report.json"), "w") as f: + json.dump(fifo_report, f, indent=2) + + ### Generate fifo_depth_export.json to export FIFO depths for use in FINN + fifo_depth_export = {} + for fifo, depth in enumerate(fifo_depths): + fifo_depth_export["StreamingFIFO_rtl_%d" % fifo] = {} + fifo_depth_export["StreamingFIFO_rtl_%d" % fifo]["depth"] = depth + accel.fifo_depth_offset + with open(os.path.join(report_dir, "fifo_depth_export.json"), "w") as f: + json.dump(fifo_depth_export, f, indent=2) + + ### Generate the usual instrumentation performance report based on final state + min_latency = log_min_latency[-1] + latency = log_latency[-1] + interval = log_interval[-1] + report = { + "error": accel.error, + "checksum": 0, + "min_latency_cycles": min_latency, + "latency_cycles": latency, + "interval_cycles": interval, + "frequency_mhz": round(accel.fclk_mhz_actual), + "min_latency_ms": round(min_latency * (1 / (accel.fclk_mhz_actual * 1e6)) * 1e3, 6), + "latency_ms": round(latency * (1 / (accel.fclk_mhz_actual * 1e6)) * 1e3, 6), + "throughput_fps": round(1 / (interval * (1 / (accel.fclk_mhz_actual * 1e6)))), + "min_pipeline_depth": round(min_latency / interval, 2), + "pipeline_depth" : round(latency / interval, 2), + } + with open(reportfile, "w") as f: + json.dump(report, f, indent=2) + + print("Done.") \ No newline at end of file diff --git a/src/finn/transformation/fpgadataflow/make_pynq_driver.py b/src/finn/transformation/fpgadataflow/make_pynq_driver.py index b935f5eea0..93c0e45e6c 100644 --- a/src/finn/transformation/fpgadataflow/make_pynq_driver.py +++ b/src/finn/transformation/fpgadataflow/make_pynq_driver.py @@ -305,10 +305,11 @@ def apply(self, model): class MakePYNQDriverInstrumentation(Transformation): - def __init__(self, platform, clk_period_ns): + def __init__(self, 
platform, clk_period_ns, live_fifo_sizing): super().__init__() self.platform = platform self.clk_period_ns = clk_period_ns + self.live_fifo_sizing = live_fifo_sizing def apply(self, model): # TODO: support runtime-writable and external weights @@ -322,13 +323,32 @@ def apply(self, model): driver_template = ( os.environ["FINN_ROOT"] + "/src/finn/qnn-data/templates/driver/driver_instrumentation.py" ) - driver_py = pynq_driver_dir + "/driver.py" + if self.live_fifo_sizing: + driver_py = pynq_driver_dir + "/driver_instrumentation.py" + else: + driver_py = pynq_driver_dir + "/driver.py" shutil.copy(driver_template, driver_py) + # add-on driver for live fifosizing + if self.live_fifo_sizing: + driver_template = ( + os.environ["FINN_ROOT"] + "/src/finn/qnn-data/templates/driver/driver_fifosizing.py" + ) + driver_py = pynq_driver_dir + "/driver.py" + shutil.copy(driver_template, driver_py) + # write default settings to driver config file settings = { "fclk_mhz": (1.0 / self.clk_period_ns) * 1e3, } + if self.live_fifo_sizing: + # export FIFO widths to the settings file as well + fifo_widths = {} + for node in model.get_nodes_by_op_type("StreamingFIFO_hls"): + node_inst = getCustomOp(node) + fifo_widths[node.name] = node_inst.get_instream_width() + settings["fifo_widths"] = fifo_widths + settingsfile = pynq_driver_dir + "/settings.json" with open(settingsfile, "w") as f: json.dump(settings, f, indent=2) From 0bc66389add4249f00f055772e1c39993197331e Mon Sep 17 00:00:00 2001 From: Felix Jentzsch Date: Thu, 27 Feb 2025 21:13:58 +0000 Subject: [PATCH 051/125] DVC push fix --- benchmarking/bench-ci.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/benchmarking/bench-ci.yml b/benchmarking/bench-ci.yml index 206d395839..f62f2eb35a 100644 --- a/benchmarking/bench-ci.yml +++ b/benchmarking/bench-ci.yml @@ -51,7 +51,7 @@ Result Collection: - image_build script: - python3.10 benchmarking/collect.py bench_artifacts/tasks_output bench_results.json - - dvc exp push 
origin + - dvc exp push git@github.com:eki-project/finn-plus.git artifacts: name: "bench_results" when: always From cd66c9211c1a32159cdfb5da47f4b11d2990ea97 Mon Sep 17 00:00:00 2001 From: Felix Jentzsch Date: Fri, 28 Feb 2025 11:14:27 +0000 Subject: [PATCH 052/125] Refactor and remove old code --- .gitlab-ci.yml | 7 +- benchmarking/bench_base.py | 567 +----------------- benchmarking/cfg/fifosizing_test.json | 23 - benchmarking/cfg/metafi_fifosizing_test.json | 57 -- benchmarking/cfg/metafi_test.json | 4 +- benchmarking/cfg/mvau_test.json | 4 +- .../cfg/resnet50_fifosizing_test.json | 66 -- benchmarking/cfg/resnet50_test.json | 18 +- benchmarking/cfg/synthetic_fifotest.json | 64 ++ benchmarking/cfg/transformer_gpt_all.json | 26 +- benchmarking/cfg/transformer_radioml_all.json | 14 +- benchmarking/cfg/transformer_test.json | 5 +- benchmarking/collect.py | 10 +- benchmarking/dut/metafi.py | 14 - benchmarking/dut/resnet50.py | 13 - benchmarking/dut/synthetic_nonlinear.py | 11 - benchmarking/dut/transformer.py | 42 +- benchmarking/harness/sink/ip/component.xml | 256 -------- .../harness/sink/ip/src/harness_sink.v | 39 -- .../sink/ip/xgui/harness_sink_v1_0.tcl | 25 - benchmarking/harness/vector_xor.v | 32 - src/finn/builder/build_dataflow.py | 5 +- src/finn/builder/build_dataflow_config.py | 6 +- 23 files changed, 142 insertions(+), 1166 deletions(-) delete mode 100644 benchmarking/cfg/fifosizing_test.json delete mode 100644 benchmarking/cfg/metafi_fifosizing_test.json delete mode 100644 benchmarking/cfg/resnet50_fifosizing_test.json create mode 100644 benchmarking/cfg/synthetic_fifotest.json delete mode 100644 benchmarking/harness/sink/ip/component.xml delete mode 100644 benchmarking/harness/sink/ip/src/harness_sink.v delete mode 100644 benchmarking/harness/sink/ip/xgui/harness_sink_v1_0.tcl delete mode 100644 benchmarking/harness/vector_xor.v diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index c19da1d908..a82ad24eeb 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ 
-211,14 +211,9 @@ Bench: PARENT_PIPELINE_ID: $CI_PIPELINE_ID parallel: matrix: - - BENCH_CFG: [mvau_test, resnet50_test, metafi_test, transformer_test, transformer_radioml_all] - -#dev: mvau_test -#fifo: fifosizing_test, metafi_fifosizing_test, resnet50_fifosizing_test -#transformer: transformer_test, transformer_radioml_all + - BENCH_CFG: [mvau_test, resnet50_test, metafi_test, transformer_test, transformer_radioml_all, synthetic_fifotest] #TODO: add selector for none, reduced, full benchmark suite - #TODO: introduce result collect job on parent level for easier visualization/excel interfacing #TODO: more control via (optional) variables #TODO: move power measurement from polling-based script to its own job/runner diff --git a/benchmarking/bench_base.py b/benchmarking/bench_base.py index a97054aca9..636af6bb5e 100644 --- a/benchmarking/bench_base.py +++ b/benchmarking/bench_base.py @@ -42,422 +42,6 @@ import pandas as pd import onnxruntime as ort -class MakeZYNQHarnessProject(Transformation): - """Based on MakeZYNQProject transformation, but integrates IP into test harness instead of DMA shell.""" - - def __init__(self, platform, output_dir, dut_duplication=1, clock_period_ns=10): - super().__init__() - self.platform = platform - self.output_dir = output_dir - self.dut_duplication = dut_duplication - self.clock_period_ns = clock_period_ns - - def apply(self, model): - # create a config file and empty list of xo files - config = [] - idma_idx = 0 - odma_idx = 0 - aximm_idx = 0 - axilite_idx = 0 - global_clk_ns = 0 - - # assume single stitched-ip (previously dataflowpartition) as DUT - # assume single primary input/output - input_tensor = model.graph.input[0] - output_tensor = model.graph.output[0] - input_node_inst = getCustomOp(model.find_consumer(input_tensor.name)) - output_node_inst = getCustomOp(model.find_producer(output_tensor.name)) - instream_width = input_node_inst.get_instream_width_padded() - outstream_width = 
output_node_inst.get_outstream_width_padded() - - # assert node.op_type == "StreamingDataflowPartition", "Invalid link graph" - # sdp_node = getCustomOp(node) - # dataflow_model_filename = sdp_node.get_nodeattr("model") - # kernel_model = ModelWrapper(dataflow_model_filename) - kernel_model = model - - ipstitch_path = kernel_model.get_metadata_prop("vivado_stitch_proj") - if ipstitch_path is None or (not os.path.isdir(ipstitch_path)): - raise Exception("No stitched IPI design found, apply CreateStitchedIP first.") - - vivado_stitch_vlnv = kernel_model.get_metadata_prop("vivado_stitch_vlnv") - if vivado_stitch_vlnv is None: - raise Exception("No vlnv found, apply CreateStitchedIP first.") - - ip_dirs = ["list"] - ip_dirs += collect_ip_dirs(kernel_model, ipstitch_path) - ip_dirs.append("$::env(FINN_ROOT)/benchmarking/harness/sink/ip") - ip_dirs_str = "[%s]" % (" ".join(ip_dirs)) - config.append( - "set_property ip_repo_paths " - "[concat [get_property ip_repo_paths [current_project]] %s] " - "[current_project]" % ip_dirs_str - ) - config.append("update_ip_catalog -rebuild -scan_changes") - config.append( - "import_files -fileset sources_1 -norecurse $::env(FINN_ROOT)/benchmarking/harness/vector_xor.v" - ) - - # get metadata property clk_ns to calculate clock frequency - clk_ns = float(kernel_model.get_metadata_prop("clk_ns")) - if clk_ns > global_clk_ns: - global_clk_ns = clk_ns - - ifnames = eval(kernel_model.get_metadata_prop("vivado_stitch_ifnames")) - - # instantiate DUT, TODO: switch to wrapper verilog file for (multiple-) DUT instantiation - for id in range(self.dut_duplication): - dut_instance_name = "finn_design_%d" % id - config.append( - "create_bd_cell -type ip -vlnv %s %s" % (vivado_stitch_vlnv, dut_instance_name) - ) - # sdp_node.set_nodeattr("instance_name", instance_names[node.name]) - config.append( - "connect_bd_net [get_bd_pins %s/ap_clk] [get_bd_pins axi_interconnect_0/aclk]" - % dut_instance_name - ) - config.append( - "connect_bd_net [get_bd_pins 
%s/ap_rst_n] [get_bd_pins axi_interconnect_0/aresetn]" - % dut_instance_name - ) - - # instantiate input harness - if instream_width > 8192: - print("ERROR: DUT input stream width > 8192") - raise Exception("ERROR: DUT input stream width > 8192") - elif instream_width > 4096: - num_sources = 8 - source_width = roundup_to_integer_multiple(instream_width / 8, 8) - elif instream_width > 2048: - num_sources = 4 - source_width = roundup_to_integer_multiple(instream_width / 4, 8) - elif instream_width > 1024: - num_sources = 2 - source_width = roundup_to_integer_multiple(instream_width / 2, 8) - else: - num_sources = 1 - source_width = instream_width - - if self.dut_duplication > 1: - if num_sources > 1: - print("ERROR: DUT duplication with >1024 stream width not supported!") - raise Exception("ERROR: DUT duplication with >1024 stream width not supported!") - - num_sources = self.dut_duplication # one source per DUT instance - seed = 0xABCD - for id in range(num_sources): - config.append( - "create_bd_cell -type ip -vlnv xilinx.com:ip:axi_traffic_gen:3.0 axi_traffic_gen_%d" - % id - ) - config.append( - "set_property -dict [list \ - CONFIG.C_ATG_MODE {AXI4-Stream} \ - CONFIG.C_ATG_STREAMING_MAX_LEN_BITS {1} \ - CONFIG.C_AXIS_SPARSE_EN {false} \ - CONFIG.C_AXIS_TDATA_WIDTH {%d} \ - CONFIG.C_AXIS_TDEST_WIDTH {0} \ - CONFIG.C_AXIS_TID_WIDTH {0} \ - CONFIG.C_AXIS_TUSER_WIDTH {0} \ - CONFIG.STRM_DATA_SEED {%s} \ - ] [get_bd_cells axi_traffic_gen_%d]" - % (source_width, "0x{:04X}".format(seed), id) - ) - config.append( - "connect_bd_net [get_bd_pins axi_traffic_gen_%d/s_axi_aclk] [get_bd_pins axi_interconnect_0/aclk]" - % id - ) - config.append( - "connect_bd_net [get_bd_pins axi_traffic_gen_%d/s_axi_aresetn] [get_bd_pins axi_interconnect_0/aresetn]" - % id - ) - seed = seed + 99 - - config.append( - "connect_bd_intf_net [get_bd_intf_pins axi_traffic_gen_%d/M_AXIS_MASTER] [get_bd_intf_pins finn_design_%d/s_axis_0]" - % (id, id) - ) - - else: - seed = 0xABCD - for id in 
range(num_sources): - config.append( - "create_bd_cell -type ip -vlnv xilinx.com:ip:axi_traffic_gen:3.0 axi_traffic_gen_%d" - % id - ) - config.append( - "set_property -dict [list \ - CONFIG.C_ATG_MODE {AXI4-Stream} \ - CONFIG.C_ATG_STREAMING_MAX_LEN_BITS {1} \ - CONFIG.C_AXIS_SPARSE_EN {false} \ - CONFIG.C_AXIS_TDATA_WIDTH {%d} \ - CONFIG.C_AXIS_TDEST_WIDTH {0} \ - CONFIG.C_AXIS_TID_WIDTH {0} \ - CONFIG.C_AXIS_TUSER_WIDTH {0} \ - CONFIG.STRM_DATA_SEED {%s} \ - ] [get_bd_cells axi_traffic_gen_%d]" - % (source_width, "0x{:04X}".format(seed), id) - ) - config.append( - "connect_bd_net [get_bd_pins axi_traffic_gen_%d/s_axi_aclk] [get_bd_pins axi_interconnect_0/aclk]" - % id - ) - config.append( - "connect_bd_net [get_bd_pins axi_traffic_gen_%d/s_axi_aresetn] [get_bd_pins axi_interconnect_0/aresetn]" - % id - ) - config.append( - "connect_bd_net [get_bd_pins finn_design_0/s_axis_0_tready] [get_bd_pins axi_traffic_gen_%d/m_axis_1_tready]" - % id - ) - seed = seed + 99 - - if num_sources > 1: - config.append( - "create_bd_cell -type ip -vlnv xilinx.com:ip:xlconcat:2.1 xlconcat_tdata" - ) - config.append( - "set_property CONFIG.NUM_PORTS {%d} [get_bd_cells xlconcat_tdata]" % num_sources - ) - - for id in range(num_sources): - config.append( - "connect_bd_net [get_bd_pins xlconcat_tdata/In%d] [get_bd_pins axi_traffic_gen_%d/m_axis_1_tdata]" - % (id, id) - ) - - config.append( - "connect_bd_net [get_bd_pins finn_design_0/s_axis_0_tdata] [get_bd_pins xlconcat_tdata/dout]" - ) - else: - config.append( - "connect_bd_net [get_bd_pins finn_design_0/s_axis_0_tdata] [get_bd_pins axi_traffic_gen_0/m_axis_1_tdata]" - ) - - # only connect valid from source 0 to DUT - config.append( - "connect_bd_net [get_bd_pins finn_design_0/s_axis_0_tvalid] [get_bd_pins axi_traffic_gen_0/m_axis_1_tvalid]" - ) - - # instantiate output harness - for id in range(self.dut_duplication): - config.append( - "create_bd_cell -type ip -vlnv xilinx.com:user:harness_sink:1.0 sink_%d" % id - ) - config.append( 
- "set_property -dict [list CONFIG.STREAM_WIDTH {%d}] [get_bd_cells sink_%d]" - % (outstream_width, id) - ) - config.append( - "connect_bd_intf_net [get_bd_intf_pins sink_%d/s_axis_0] [get_bd_intf_pins finn_design_%d/m_axis_0]" - % (id, id) - ) - - # GPIO control (TODO: connect interrupt) - config.append("create_bd_cell -type ip -vlnv xilinx.com:ip:axi_gpio:2.0 axi_gpio_0") - config.append( - "set_property -dict [list \ - CONFIG.C_ALL_INPUTS {0} \ - CONFIG.C_GPIO_WIDTH {5} \ - CONFIG.C_INTERRUPT_PRESENT {1} \ - ] [get_bd_cells axi_gpio_0]" - ) - config.append( - "connect_bd_intf_net [get_bd_intf_pins axi_gpio_0/S_AXI] " - "[get_bd_intf_pins axi_interconnect_0/M%02d_AXI]" % (axilite_idx) - ) - config.append("assign_axi_addr_proc axi_gpio_0/S_AXI") - axilite_idx += 1 - config.append("create_bd_cell -type ip -vlnv xilinx.com:ip:xlslice:1.0 xlslice_0") - config.append("create_bd_cell -type ip -vlnv xilinx.com:ip:xlslice:1.0 xlslice_1") - config.append("create_bd_cell -type ip -vlnv xilinx.com:ip:xlslice:1.0 xlslice_2") - config.append( - "set_property -dict [list \ - CONFIG.DIN_FROM {0} \ - CONFIG.DIN_TO {0} \ - CONFIG.DIN_WIDTH {5} \ - ] [get_bd_cells xlslice_0]" - ) - config.append( - "set_property -dict [list \ - CONFIG.DIN_FROM {1} \ - CONFIG.DIN_TO {1} \ - CONFIG.DIN_WIDTH {5} \ - ] [get_bd_cells xlslice_1]" - ) - config.append( - "set_property -dict [list \ - CONFIG.DIN_FROM {2} \ - CONFIG.DIN_TO {2} \ - CONFIG.DIN_WIDTH {5} \ - ] [get_bd_cells xlslice_2]" - ) - config.append("create_bd_cell -type ip -vlnv xilinx.com:ip:xlconcat:2.1 xlconcat_0") - config.append( - "set_property -dict [list CONFIG.IN1_WIDTH.VALUE_SRC USER CONFIG.IN2_WIDTH.VALUE_SRC USER CONFIG.IN0_WIDTH.VALUE_SRC USER] [get_bd_cells xlconcat_0]" - ) - config.append( - "set_property -dict [list \ - CONFIG.IN0_WIDTH {3} \ - CONFIG.NUM_PORTS {3} \ - ] [get_bd_cells xlconcat_0]" - ) - config.append("create_bd_cell -type ip -vlnv xilinx.com:ip:xlconstant:1.1 xlconstant_0") - config.append( - 
"set_property -dict [list \ - CONFIG.CONST_VAL {0} \ - CONFIG.CONST_WIDTH {3} \ - ] [get_bd_cells xlconstant_0]" - ) - config.append( - """ - connect_bd_net [get_bd_pins xlslice_0/Din] [get_bd_pins axi_gpio_0/gpio_io_o] - connect_bd_net [get_bd_pins xlslice_1/Din] [get_bd_pins axi_gpio_0/gpio_io_o] - connect_bd_net [get_bd_pins xlslice_2/Din] [get_bd_pins axi_gpio_0/gpio_io_o] - connect_bd_net [get_bd_pins xlconstant_0/dout] [get_bd_pins xlconcat_0/In0] - connect_bd_net [get_bd_pins axi_gpio_0/gpio_io_i] [get_bd_pins xlconcat_0/dout] - """ - ) - if self.dut_duplication > 1: - config.append("create_bd_cell -type ip -vlnv xilinx.com:ip:xlconcat:2.1 xlconcat_valid") - config.append( - "set_property CONFIG.NUM_PORTS {%d} [get_bd_cells xlconcat_valid]" - % self.dut_duplication - ) - config.append( - "create_bd_cell -type ip -vlnv xilinx.com:ip:xlconcat:2.1 xlconcat_checksum" - ) - config.append( - "set_property CONFIG.NUM_PORTS {%d} [get_bd_cells xlconcat_checksum]" - % self.dut_duplication - ) - - config.append("create_bd_cell -type module -reference vector_xor vector_xor_valid") - config.append( - "set_property CONFIG.WIDTH {%d} [get_bd_cells vector_xor_valid]" - % self.dut_duplication - ) - config.append("create_bd_cell -type module -reference vector_xor vector_xor_checksum") - config.append( - "set_property CONFIG.WIDTH {%d} [get_bd_cells vector_xor_checksum]" - % self.dut_duplication - ) - - config.append( - "connect_bd_net [get_bd_pins vector_xor_valid/in_data] [get_bd_pins xlconcat_valid/dout]" - ) - config.append( - "connect_bd_net [get_bd_pins vector_xor_checksum/in_data] [get_bd_pins xlconcat_checksum/dout]" - ) - config.append( - "connect_bd_net [get_bd_pins vector_xor_valid/out_data] [get_bd_pins xlconcat_0/In1]" - ) - config.append( - "connect_bd_net [get_bd_pins vector_xor_checksum/out_data] [get_bd_pins xlconcat_0/In2]" - ) - for id in range(self.dut_duplication): - config.append( - "connect_bd_net [get_bd_pins sink_%d/valid] [get_bd_pins 
xlconcat_valid/In%d]" - % (id, id) - ) - config.append( - "connect_bd_net [get_bd_pins sink_%d/checksum] [get_bd_pins xlconcat_checksum/In%d]" - % (id, id) - ) - else: - config.append("connect_bd_net [get_bd_pins sink_0/valid] [get_bd_pins xlconcat_0/In1]") - config.append( - "connect_bd_net [get_bd_pins sink_0/checksum] [get_bd_pins xlconcat_0/In2]" - ) - for id in range(self.dut_duplication): - config.append( - "connect_bd_net [get_bd_pins xlslice_2/Dout] [get_bd_pins sink_%d/enable]" % id - ) - for id in range(num_sources): - config.append( - "connect_bd_net [get_bd_pins xlslice_0/Dout] [get_bd_pins axi_traffic_gen_%d/core_ext_start]" - % id - ) - config.append( - "connect_bd_net [get_bd_pins xlslice_1/Dout] [get_bd_pins axi_traffic_gen_%d/core_ext_stop]" - % id - ) - - # create a temporary folder for the project - vivado_pynq_proj_dir = make_build_dir(prefix="vivado_zynq_proj_") - model.set_metadata_prop("vivado_pynq_proj", vivado_pynq_proj_dir) - - fclk_mhz = int(1 / (global_clk_ns * 0.001)) - - # create a TCL recipe for the project - ipcfg = vivado_pynq_proj_dir + "/ip_config.tcl" - config = "\n".join(config) + "\n" - with open(ipcfg, "w") as f: - f.write( - zynq_harness_template - % ( - fclk_mhz, - axilite_idx, - aximm_idx, - self.platform, - part_map[self.platform], - config, - ) - ) - - # create a TCL recipe for the project - synth_project_sh = vivado_pynq_proj_dir + "/synth_project.sh" - working_dir = os.environ["PWD"] - with open(synth_project_sh, "w") as f: - f.write("#!/bin/bash \n") - f.write("cd {}\n".format(vivado_pynq_proj_dir)) - f.write("vivado -mode batch -source %s\n" % ipcfg) - f.write("cd {}\n".format(working_dir)) - - # call the synthesis script - bash_command = ["bash", synth_project_sh] - process_compile = subprocess.Popen(bash_command, stdout=subprocess.PIPE) - process_compile.communicate() - - # collect results - os.makedirs(self.output_dir, exist_ok=True) - - bitfile_name = vivado_pynq_proj_dir + 
"/finn_zynq_link.runs/impl_1/top_wrapper.bit" - if not os.path.isfile(bitfile_name): - raise Exception( - "Synthesis failed, no bitfile found. Check logs under %s" % vivado_pynq_proj_dir - ) - hwh_name = vivado_pynq_proj_dir + "/finn_zynq_link.gen/sources_1/bd/top/hw_handoff/top.hwh" - if not os.path.isfile(hwh_name): - raise Exception( - "Synthesis failed, no hwh file found. Check logs under %s" % vivado_pynq_proj_dir - ) - synth_report_name = vivado_pynq_proj_dir + "/synth_report.xml" - model.set_metadata_prop("vivado_synth_rpt", synth_report_name) - model.set_metadata_prop("bitfile", bitfile_name) - model.set_metadata_prop("hw_handoff", hwh_name) - - shcopy(bitfile_name, self.output_dir) - shcopy(hwh_name, self.output_dir) - shcopy(synth_report_name, self.output_dir) - - post_synth_resources = model.analysis(post_synth_res) - with open(self.output_dir + "/post_synth_resources.json", "w") as f: - json.dump(post_synth_resources, f, indent=2) - - timing_rpt = ("%s/finn_zynq_link.runs/impl_1/top_wrapper_timing_summary_routed.rpt"% vivado_pynq_proj_dir) - shcopy(timing_rpt, self.output_dir + "/post_route_timing.rpt") - return (model, False) - -def step_synth_harness(model: ModelWrapper, cfg: DataflowBuildConfig): - # Build step version of above transformation (used for full builds) - model = model.transform(MakeZYNQHarnessProject( - platform=cfg.board, - output_dir=os.path.join(cfg.output_dir, "harness"), - #dut_duplication=dut_duplication, #TODO: enable for full builds - clock_period_ns=cfg.synth_clk_period_ns - )) - return model def start_test_batch_fast(results_path, project_path, run_target, pairs): # Prepare tcl script @@ -786,14 +370,14 @@ def step_synth_power(self): build_dir = "temp_output_harness_build" # TODO: replace hold harness with new instr wrapper implementation #TODO: if synth fails this could contain stale bitstreams which will be power tested - model = model.transform( - MakeZYNQHarnessProject( - platform=self.board, - output_dir=build_dir, - 
dut_duplication=dut_duplication, - clock_period_ns=self.clock_period_ns - ) - ) + # model = model.transform( + # MakeZYNQHarnessProject( + # platform=self.board, + # output_dir=build_dir, + # dut_duplication=dut_duplication, + # clock_period_ns=self.clock_period_ns + # ) + # ) # COPY bitstreams and other outputs # TODO: integrate better (e.g. as artifact) and remove redundant copy @@ -872,120 +456,6 @@ def step_parse_builder_output(self, build_dir): else: pass #TODO: warn/skip? - ### ANALYZE FIFOs ### - fifo_info = {} - # TODO: skip if not present - model_final = ModelWrapper(build_dir + "/intermediate_models/step_create_stitched_ip.onnx") - - fifo_info["fifo_depths"] = {} - fifo_info["fifo_sizes"] = {} - total_fifo_size = 0 - for node in model_final.get_nodes_by_op_type("StreamingFIFO_rtl"): - node_inst = getCustomOp(node) - fifo_info["fifo_depths"][node.name] = node_inst.get_nodeattr("depth") - fifo_info["fifo_sizes"][node.name] = node_inst.get_instream_width() * node_inst.get_nodeattr("depth") - total_fifo_size += fifo_info["fifo_sizes"][node.name] - fifo_info["total_fifo_size_kB"] = int(total_fifo_size / 8.0 / 1000.0) - - self.output_dict["fifos"] = fifo_info - - def step_fifotest(self, onnx_path, cfg, build_dir): - # requires certain output products (e.g., ESTIMATE_REPORTS, RTLSIM_PERFORMANCE) - # TODO: check them and skip/warn if missing - log = {} - # load performance reports - with open(build_dir + "/report/estimate_network_performance.json") as f: - est_data = json.load(f) - with open(build_dir + "/report/rtlsim_performance.json") as f: - sim_data = json.load(f) - - # check for deadlock - model_final = ModelWrapper(build_dir + "/intermediate_models/step_create_stitched_ip.onnx") - first_node = getCustomOp(model_final.find_consumer(model_final.graph.input[0].name)) - last_node = getCustomOp(model_final.find_producer(model_final.graph.output[0].name)) - input_txns_expected = np.prod(first_node.get_folded_input_shape()[:-1]) * self.params["rtlsim_n"] - 
output_txns_expected = np.prod(last_node.get_folded_output_shape()[:-1]) * self.params["rtlsim_n"] - deadlock = sim_data["N_IN_TXNS"] != input_txns_expected or sim_data["N_OUT_TXNS"] != output_txns_expected - log["deadlock"] = deadlock.tolist() - - # check rtlsim throughput - throughput = sim_data["throughput[images/s]"] - stable_throughput = sim_data["stable_throughput[images/s]"] - estimated_throughput = est_data["estimated_throughput_fps"] - throughput_factor = throughput / estimated_throughput - stable_throughput_factor = stable_throughput / estimated_throughput - - # TODO: Take throughput or stable_throughput? - throughput_pass = throughput_factor > self.params["fifo_throughput_factor_threshold"] - - log["throughput_pass"] = throughput_pass - log["throughput"] = throughput - log["stable_throughput"] = stable_throughput - log["estimated_throughput"] = estimated_throughput - - # reduce individual FIFO sizes by some amount and observe throughput drop or deadlock appear - fifo_reduction_pass = [] - log["fifo_reduction_results"] = {} - model_orig = ModelWrapper(build_dir + "/intermediate_models/step_hw_ipgen.onnx") - for node_orig in model_orig.get_nodes_by_op_type("StreamingFIFO_rtl"): - model = copy.deepcopy(model_orig) - node = model.get_node_from_name(node_orig.name) - node_inst = getCustomOp(node) - - # skip shallow FIFOs - # TODO: do we need to consider rounding-up of FIFO depths for impl_style=vivado? 
- if node_inst.get_nodeattr("depth") <= self.params["fifo_reduction_skip_threshold"]: - log["fifo_reduction_results"][node.name] = "skip" - continue - - # reduce depth of current FIFO and reset generated code - node_inst.set_nodeattr("depth", int(node_inst.get_nodeattr("depth") * self.params["fifo_reduction_factor"])) - node_inst.set_nodeattr("code_gen_dir_ipgen", "") - node_inst.set_nodeattr("ip_path", "") - node_inst.set_nodeattr("ipgen_path", "") - - # save model variation - tmp_output_dir_var = build_dir + "/variations/" + node.name - os.makedirs(tmp_output_dir_var) - model.save(tmp_output_dir_var + "/model.onnx") - - # build again, only re-run necessary steps to save time - cfg.output_dir = tmp_output_dir_var - cfg.steps = ["step_hw_codegen", "step_create_stitched_ip", "step_measure_rtlsim_performance"] - build.build_dataflow_cfg(tmp_output_dir_var + "/model.onnx", cfg) - - # load performance report - with open(tmp_output_dir_var + "/report/rtlsim_performance.json") as f: - sim_data = json.load(f) - - # check for deadlock - model_final = ModelWrapper(tmp_output_dir_var + "/intermediate_models/step_create_stitched_ip.onnx") - first_node = getCustomOp(model_final.find_consumer(model_final.graph.input[0].name)) - last_node = getCustomOp(model_final.find_producer(model_final.graph.output[0].name)) - input_txns_expected = np.prod(first_node.get_folded_input_shape()[:-1]) * self.params["rtlsim_n"] - output_txns_expected = np.prod(last_node.get_folded_output_shape()[:-1]) * self.params["rtlsim_n"] - var_deadlock = sim_data["N_IN_TXNS"] != input_txns_expected or sim_data["N_OUT_TXNS"] != output_txns_expected - - # check rtlsim throughput - var_throughput = sim_data["throughput[images/s]"] - var_stable_throughput = sim_data["stable_throughput[images/s]"] - # TODO: take throughput or stable_throughput? 
- throughput_drop = (throughput - var_throughput) / throughput - - if var_deadlock: - fifo_reduction_pass.append(True) - log["fifo_reduction_results"][node.name] = 1.0 - elif throughput_drop > self.params["fifo_reduction_throughput_drop_threshold"]: - fifo_reduction_pass.append(True) - log["fifo_reduction_results"][node.name] = throughput_drop - else: - fifo_reduction_pass.append(False) - log["fifo_reduction_results"][node.name] = "fail (no drop)" - - if "fifos" not in self.output_dict: - self.output_dict["fifos"] = {} - self.output_dict["fifos"]["fifotest"] = log - def steps_simple_model_flow(self): # Default step sequence for benchmarking a simple model (mostly single operators/custom_ops) do_hls = self.params["do_hls"] if "do_hls" in self.params else False @@ -1023,8 +493,8 @@ def steps_simple_model_flow(self): self.step_synthesis() if do_sim_power: self.step_sim_power() - if do_synth_power: - self.step_synth_power() + #if do_synth_power: + # self.step_synth_power() def steps_full_build_flow(self): # Default step sequence for benchmarking a full FINN builder flow @@ -1062,18 +532,24 @@ def steps_full_build_flow(self): self.build_inputs["floorplan_path"] = self.params["floorplan_path"] ### BUILD SETUP ### - # TODO: select output products here, depending on what shall be tested - # TODO: set as much as possible here, e.g. 
verbose, debug, force_python, vitisopt, shell_flow cfg = self.step_build_setup() + cfg.generate_outputs = self.params["output_products"] + cfg.output_dir = self.build_inputs["build_dir"] + cfg.synth_clk_period_ns = self.clock_period_ns cfg.board = self.board if self.board in alveo_part_map: cfg.shell_flow_type=build_cfg.ShellFlowType.VITIS_ALVEO cfg.vitis_platform=alveo_default_platform[self.board] else: cfg.shell_flow_type=build_cfg.ShellFlowType.VIVADO_ZYNQ + # enable extra performance optimizations (physopt) + cfg.vitis_opt_strategy=build_cfg.VitisOptStrategy.PERFORMANCE_BEST cfg.verbose = False cfg.enable_build_pdb_debug = False + cfg.stitched_ip_gen_dcp = False # only needed for further manual integration cfg.force_python_rtlsim = False + cfg.split_large_fifos = True + cfg.enable_instrumentation = True # no IODMA functional correctness/accuracy test yet #rtlsim_use_vivado_comps # TODO ? #cfg.default_swg_exception #cfg.large_fifo_mem_style @@ -1086,9 +562,6 @@ def steps_full_build_flow(self): cfg.auto_fifo_depths = False cfg.live_fifo_sizing = True cfg.enable_instrumentation = True - # Overwrite output products - # TODO: make configurable directly via JSON/YAML cfg - cfg.generate_outputs = [build_cfg.DataflowOutputType.BITFILE] else: cfg.auto_fifo_depths = True cfg.auto_fifo_strategy = self.params["fifo_method"] @@ -1125,7 +598,3 @@ def steps_full_build_flow(self): ### ANALYSIS ### self.step_parse_builder_output(self.build_inputs["build_dir"]) - - # Only run in-depth FIFO test if selected - if "fifo_throughput_factor_threshold" in self.params: - self.step_fifotest(self.build_inputs["onnx_path"], cfg, self.build_inputs["build_dir"]) diff --git a/benchmarking/cfg/fifosizing_test.json b/benchmarking/cfg/fifosizing_test.json deleted file mode 100644 index cf49aa80a7..0000000000 --- a/benchmarking/cfg/fifosizing_test.json +++ /dev/null @@ -1,23 +0,0 @@ -[ - { - "dut": ["synthetic_nonlinear"], - "dim": [32], - "kernel_size": [5], - "ch": [4], - "simd": [4], - "pe": 
[4], - "parallel_window": [1], - - "lb_num_layers": [1], - "rb_num_layers": [3], - - "fifo_method": ["characterize"], - "fifo_strategy": ["analytical", "rtlsim"], - - "rtlsim_n": [10], - "fifo_throughput_factor_threshold": [0.9], - "fifo_reduction_skip_threshold": [64], - "fifo_reduction_factor": [0.5], - "fifo_reduction_throughput_drop_threshold": [0.01] - } - ] \ No newline at end of file diff --git a/benchmarking/cfg/metafi_fifosizing_test.json b/benchmarking/cfg/metafi_fifosizing_test.json deleted file mode 100644 index c61d1265fa..0000000000 --- a/benchmarking/cfg/metafi_fifosizing_test.json +++ /dev/null @@ -1,57 +0,0 @@ -[ - { - "dut": ["metafi"], - "model_path": ["/pc2/groups/hpc-prf-ekiapp/felix/ci_models/metafi/model.onnx"], - "folding_path": ["/pc2/groups/hpc-prf-ekiapp/felix/ci_models/metafi/metafi_fifosizing_xsi_n2.json"], - - "board": ["RFSoC2x2"], - "clock_period_ns": [10], - - "rtlsim_n": [10], - - "fifo_method": ["manual"], - - "fifo_rtlsim_n": [2], - "fifo_throughput_factor_threshold": [0.9], - "fifo_reduction_skip_threshold": [99999999999], - "fifo_reduction_factor": [0.5], - "fifo_reduction_throughput_drop_threshold": [0.01] - }, - { - "dut": ["metafi"], - "model_path": ["/pc2/groups/hpc-prf-ekiapp/felix/ci_models/metafi/model.onnx"], - "folding_path": ["/pc2/groups/hpc-prf-ekiapp/felix/ci_models/metafi/auto_folding_config_metaFi_f25.json"], - - "board": ["RFSoC2x2"], - "clock_period_ns": [10], - - "rtlsim_n": [5], - - "fifo_method": ["largefifo_rtlsim"], - - "fifo_rtlsim_n": [2, 4, 8], - "fifo_throttle_factor": [0.5, 2], - "fifo_throughput_factor_threshold": [0.9], - "fifo_reduction_skip_threshold": [99999999999], - "fifo_reduction_factor": [0.5], - "fifo_reduction_throughput_drop_threshold": [0.01] - }, - { - "dut": ["metafi"], - "model_path": ["/pc2/groups/hpc-prf-ekiapp/felix/ci_models/metafi/model.onnx"], - "folding_path": ["/pc2/groups/hpc-prf-ekiapp/felix/ci_models/metafi/auto_folding_config_metaFi_f25.json"], - - "board": ["RFSoC2x2"], - 
"clock_period_ns": [10], - - "rtlsim_n": [5], - - "fifo_method": ["characterize"], - "fifo_strategy": ["rtlsim", "analytical"], - - "fifo_throughput_factor_threshold": [0.9], - "fifo_reduction_skip_threshold": [99999999999], - "fifo_reduction_factor": [0.5], - "fifo_reduction_throughput_drop_threshold": [0.01] - } - ] \ No newline at end of file diff --git a/benchmarking/cfg/metafi_test.json b/benchmarking/cfg/metafi_test.json index 6475f1aadd..bc10f857c3 100644 --- a/benchmarking/cfg/metafi_test.json +++ b/benchmarking/cfg/metafi_test.json @@ -7,8 +7,8 @@ "board": ["RFSoC2x2"], "clock_period_ns": [10], - "fifo_method": ["manual"], + "fifo_method": ["live"], - "rtlsim_n": [3] + "output_products": [["bitfile", "pynq_driver", "deployment_package"]] } ] \ No newline at end of file diff --git a/benchmarking/cfg/mvau_test.json b/benchmarking/cfg/mvau_test.json index e9fc3358b5..d4cb2072be 100644 --- a/benchmarking/cfg/mvau_test.json +++ b/benchmarking/cfg/mvau_test.json @@ -25,6 +25,8 @@ "do_sim_power": [true], "do_synth_power": [true], - "dut_duplication": [1] + "dut_duplication": [1], + + "output_products": [["bitfile", "pynq_driver", "deployment_package"]] ## } ] diff --git a/benchmarking/cfg/resnet50_fifosizing_test.json b/benchmarking/cfg/resnet50_fifosizing_test.json deleted file mode 100644 index 075acda981..0000000000 --- a/benchmarking/cfg/resnet50_fifosizing_test.json +++ /dev/null @@ -1,66 +0,0 @@ -[ - { - "dut": ["resnet50"], - - "model_path": ["/pc2/groups/hpc-prf-ekiapp/felix/ci_models/ResNet-50/resnet50_w1a2_exported.onnx"], - "folding_path": ["/pc2/groups/hpc-prf-ekiapp/felix/ci_models/ResNet-50/rn-50_fifosizing_xsi_n2.json"], - "specialize_path": ["/pc2/groups/hpc-prf-ekiapp/felix/ci_models/ResNet-50/U250_specialize_layers.json"], - "floorplan_path": ["/pc2/groups/hpc-prf-ekiapp/felix/ci_models/ResNet-50/floorplan_resnet50.json"], - - "board": ["U250"], - "clock_period_ns": [4], - - "rtlsim_n": [10], - - "fifo_method": ["manual"], - - "fifo_rtlsim_n": 
[2], - "fifo_throughput_factor_threshold": [0.9], - "fifo_reduction_skip_threshold": [99999999999], - "fifo_reduction_factor": [0.5], - "fifo_reduction_throughput_drop_threshold": [0.01] - }, - { - "dut": ["resnet50"], - - "model_path": ["/pc2/groups/hpc-prf-ekiapp/felix/ci_models/ResNet-50/resnet50_w1a2_exported.onnx"], - "folding_path": ["/pc2/groups/hpc-prf-ekiapp/felix/ci_models/ResNet-50/U250_folding_config.json"], - "specialize_path": ["/pc2/groups/hpc-prf-ekiapp/felix/ci_models/ResNet-50/U250_specialize_layers.json"], - "floorplan_path": ["/pc2/groups/hpc-prf-ekiapp/felix/ci_models/ResNet-50/floorplan_resnet50.json"], - - "board": ["U250"], - "clock_period_ns": [4], - - "rtlsim_n": [5], - - "fifo_method": ["largefifo_rtlsim"], - - "fifo_rtlsim_n": [2, 4, 8], - "fifo_throttle_factor": [0.5, 2], - "fifo_throughput_factor_threshold": [0.9], - "fifo_reduction_skip_threshold": [99999999999], - "fifo_reduction_factor": [0.5], - "fifo_reduction_throughput_drop_threshold": [0.01] - }, - { - "dut": ["resnet50"], - - "model_path": ["/pc2/groups/hpc-prf-ekiapp/felix/ci_models/ResNet-50/resnet50_w1a2_exported.onnx"], - "folding_path": ["/pc2/groups/hpc-prf-ekiapp/felix/ci_models/ResNet-50/U250_folding_config.json"], - "specialize_path": ["/pc2/groups/hpc-prf-ekiapp/felix/ci_models/ResNet-50/U250_specialize_layers.json"], - "floorplan_path": ["/pc2/groups/hpc-prf-ekiapp/felix/ci_models/ResNet-50/floorplan_resnet50.json"], - - "board": ["U250"], - "clock_period_ns": [4], - - "rtlsim_n": [5], - - "fifo_method": ["characterize"], - "fifo_strategy": ["rtlsim", "analytical"], - - "fifo_throughput_factor_threshold": [0.9], - "fifo_reduction_skip_threshold": [99999999999], - "fifo_reduction_factor": [0.5], - "fifo_reduction_throughput_drop_threshold": [0.01] - } - ] \ No newline at end of file diff --git a/benchmarking/cfg/resnet50_test.json b/benchmarking/cfg/resnet50_test.json index 4937cb8395..06a96729ab 100644 --- a/benchmarking/cfg/resnet50_test.json +++ 
b/benchmarking/cfg/resnet50_test.json @@ -12,6 +12,22 @@ "fifo_method": ["manual"], - "rtlsim_n": [3] + "rtlsim_n": [5], + "output_products": [["estimate_reports", "rtlsim_performance", "stitched_ip", "out_of_context_synth"]] + }, + { + "dut": ["resnet50"], + + "model_path": ["/pc2/groups/hpc-prf-ekiapp/felix/ci_models/ResNet-50/resnet50_w1a2_exported.onnx"], + "folding_path": ["/pc2/groups/hpc-prf-ekiapp/felix/ci_models/ResNet-50/U250_folding_config.json"], + "specialize_path": ["/pc2/groups/hpc-prf-ekiapp/felix/ci_models/ResNet-50/U250_specialize_layers.json"], + "floorplan_path": ["/pc2/groups/hpc-prf-ekiapp/felix/ci_models/ResNet-50/floorplan_resnet50.json"], + + "board": ["RFSoC2x2"], + "clock_period_ns": [10], + + "fifo_method": ["live"], + + "output_products": [["bitfile", "pynq_driver", "deployment_package"]] } ] \ No newline at end of file diff --git a/benchmarking/cfg/synthetic_fifotest.json b/benchmarking/cfg/synthetic_fifotest.json new file mode 100644 index 0000000000..1b40feb9e8 --- /dev/null +++ b/benchmarking/cfg/synthetic_fifotest.json @@ -0,0 +1,64 @@ +[ + { + "dut": ["synthetic_nonlinear"], + "dim": [32], + "kernel_size": [5], + "ch": [4], + "simd": [4], + "pe": [4], + "parallel_window": [1], + + "lb_num_layers": [1], + "rb_num_layers": [3], + + "board": ["RFSoC2x2"], + "clock_period_ns": [10], + + "rtlsim_n": [5], + + "fifo_method": ["live"], + "output_products": [["bitfile", "pynq_driver", "deployment_package"]] + }, + { + "dut": ["synthetic_nonlinear"], + "dim": [32], + "kernel_size": [5], + "ch": [4], + "simd": [4], + "pe": [4], + "parallel_window": [1], + + "lb_num_layers": [1], + "rb_num_layers": [3], + + "board": ["RFSoC2x2"], + "clock_period_ns": [10], + + "rtlsim_n": [5], + + "fifo_method": ["characterize"], + "fifo_strategy": ["analytical", "rtlsim"], + "output_products": [["rtlsim_performance", "bitfile", "pynq_driver", "deployment_package"]] + }, + { + "dut": ["synthetic_nonlinear"], + "dim": [32], + "kernel_size": [5], + "ch": [4], + 
"simd": [4], + "pe": [4], + "parallel_window": [1], + + "lb_num_layers": [1], + "rb_num_layers": [3], + + "board": ["RFSoC2x2"], + "clock_period_ns": [10], + + "rtlsim_n": [5], + + "fifo_method": ["largefifo_rtlsim"], + "fifo_rtlsim_n": [2], + "output_products": [["rtlsim_performance", "bitfile", "pynq_driver", "deployment_package"]] + } +] \ No newline at end of file diff --git a/benchmarking/cfg/transformer_gpt_all.json b/benchmarking/cfg/transformer_gpt_all.json index 4b1ee011c1..b0b70fb0aa 100644 --- a/benchmarking/cfg/transformer_gpt_all.json +++ b/benchmarking/cfg/transformer_gpt_all.json @@ -2,25 +2,11 @@ { "dut": ["transformer"], "seed": [12], - "model_dir": ["/pc2/groups/hpc-prf-ekiapp/felix/ci_models/gpt_a"], - "dut_duplication": [1] - }, - { - "dut": ["transformer"], - "seed": [12], - "model_dir": ["/pc2/groups/hpc-prf-ekiapp/felix/ci_models/gpt_b"], - "dut_duplication": [1] - }, - { - "dut": ["transformer"], - "seed": [12], - "model_dir": ["/pc2/groups/hpc-prf-ekiapp/felix/ci_models/gpt_c"], - "dut_duplication": [1] - }, - { - "dut": ["transformer"], - "seed": [12], - "model_dir": ["/pc2/groups/hpc-prf-ekiapp/felix/ci_models/gpt_d"], - "dut_duplication": [1] + "model_dir": ["/pc2/groups/hpc-prf-ekiapp/felix/ci_models/gpt_a", "/pc2/groups/hpc-prf-ekiapp/felix/ci_models/gpt_b", "/pc2/groups/hpc-prf-ekiapp/felix/ci_models/gpt_c", "/pc2/groups/hpc-prf-ekiapp/felix/ci_models/gpt_d"], + + "board": ["U280"], + "clock_period_ns": [10], + + "output_products": [["estimate_reports", "stitched_ip", "out_of_context_synth"]] } ] diff --git a/benchmarking/cfg/transformer_radioml_all.json b/benchmarking/cfg/transformer_radioml_all.json index f2c8733c20..5eeea031b2 100644 --- a/benchmarking/cfg/transformer_radioml_all.json +++ b/benchmarking/cfg/transformer_radioml_all.json @@ -3,12 +3,20 @@ "dut": ["transformer"], "seed": [12], "model_dir": ["/pc2/groups/hpc-prf-ekiapp/felix/ci_models/radioml_0"], - "dut_duplication": [1] + + "board": ["RFSoC2x2"], + "clock_period_ns": 
[10], + + "output_products": [["estimate_reports", "bitfile", "pynq_driver", "deployment_package"]] }, { "dut": ["transformer"], "seed": [12], "model_dir": ["/pc2/groups/hpc-prf-ekiapp/felix/ci_models/radioml_convformer"], - "dut_duplication": [1] + + "board": ["RFSoC2x2"], + "clock_period_ns": [10], + + "output_products": [["estimate_reports", "bitfile", "pynq_driver", "deployment_package"]] } -] +] \ No newline at end of file diff --git a/benchmarking/cfg/transformer_test.json b/benchmarking/cfg/transformer_test.json index a740a447b6..e0fcbc160d 100644 --- a/benchmarking/cfg/transformer_test.json +++ b/benchmarking/cfg/transformer_test.json @@ -16,6 +16,9 @@ "model_mask": ["none"], "model_positional_encoding": ["binary"], - "dut_duplication": [1] + "board": ["RFSoC2x2"], + "clock_period_ns": [10], + + "output_products": [["estimate_reports", "bitfile", "pynq_driver", "deployment_package"]] } ] diff --git a/benchmarking/collect.py b/benchmarking/collect.py index ffe2222f73..7ba7dc4cb0 100644 --- a/benchmarking/collect.py +++ b/benchmarking/collect.py @@ -89,10 +89,12 @@ def wait_for_power_measurements(): for run in combined_log: with Live(exp_message="Job result collected by GitLab CI", cache_images=True) as live: metadata = { - "run_id": run["run_id"], - "task_id": run["task_id"], - "status": run["status"], - "total_time": run["total_time"] + "metadata": { + "run_id": run["run_id"], + "task_id": run["task_id"], + "status": run["status"], + "total_time": run["total_time"], + } } live.log_params(metadata) live.log_params(run["params"]) diff --git a/benchmarking/dut/metafi.py b/benchmarking/dut/metafi.py index b4bd4246b7..4c9dec2521 100644 --- a/benchmarking/dut/metafi.py +++ b/benchmarking/dut/metafi.py @@ -44,14 +44,8 @@ def step_build_setup(self): ] cfg = build_cfg.DataflowBuildConfig( - output_dir = self.build_inputs["build_dir"], - synth_clk_period_ns = self.clock_period_ns, steps=steps, - target_fps=None, #23 - - split_large_fifos=True, # probably needed 
#TODO: account for this in FIFO reduction test - # folding_config_file=folding_config_file, # folding_config_file="/home/rz/project/finn-examples/build/vgg10-radioml/folding_config/auto_folding_config.json", # specialize_layers_config_file = "output_%s_%s" % (model_name, release_platform_name) + "/template_specialize_layers_config.json", @@ -59,14 +53,6 @@ def step_build_setup(self): #large_fifo_mem_style=build_cfg.LargeFIFOMemStyle.AUTO, # standalone_thresholds=True, - # enable extra performance optimizations (physopt) - vitis_opt_strategy=build_cfg.VitisOptStrategy.PERFORMANCE_BEST, - generate_outputs=[ - build_cfg.DataflowOutputType.ESTIMATE_REPORTS, - build_cfg.DataflowOutputType.STITCHED_IP, - build_cfg.DataflowOutputType.RTLSIM_PERFORMANCE, - build_cfg.DataflowOutputType.OOC_SYNTH, # not required for FIFO test, include for general testing - ], ) # where is this used and why? diff --git a/benchmarking/dut/resnet50.py b/benchmarking/dut/resnet50.py index ec03e44a8b..bf5aed8ab4 100644 --- a/benchmarking/dut/resnet50.py +++ b/benchmarking/dut/resnet50.py @@ -36,20 +36,7 @@ def step_build_setup(self): ] cfg = build_cfg.DataflowBuildConfig( - output_dir = self.build_inputs["build_dir"], - synth_clk_period_ns = self.clock_period_ns, steps=resnet50_build_steps, - - split_large_fifos=True, - - # enable extra performance optimizations (physopt) - vitis_opt_strategy=build_cfg.VitisOptStrategy.PERFORMANCE_BEST, - generate_outputs=[ - build_cfg.DataflowOutputType.ESTIMATE_REPORTS, - build_cfg.DataflowOutputType.STITCHED_IP, - build_cfg.DataflowOutputType.RTLSIM_PERFORMANCE, - build_cfg.DataflowOutputType.OOC_SYNTH, # not required for FIFO test, include for general testing - ], ) return cfg \ No newline at end of file diff --git a/benchmarking/dut/synthetic_nonlinear.py b/benchmarking/dut/synthetic_nonlinear.py index 19ba3a6ce0..4eb59ef7b2 100644 --- a/benchmarking/dut/synthetic_nonlinear.py +++ b/benchmarking/dut/synthetic_nonlinear.py @@ -289,19 +289,8 @@ def 
step_build_setup(self): # create build config for synthetic test models cfg = build_cfg.DataflowBuildConfig( - output_dir = self.build_inputs["build_dir"], - synth_clk_period_ns = self.clock_period_ns, - - split_large_fifos=False, # manual folding target_fps=None, - - shell_flow_type=build_cfg.ShellFlowType.VIVADO_ZYNQ, # TODO: generalize/adapt to new back-end - generate_outputs=[ - build_cfg.DataflowOutputType.ESTIMATE_REPORTS, - build_cfg.DataflowOutputType.STITCHED_IP, - build_cfg.DataflowOutputType.RTLSIM_PERFORMANCE, - ], ) return cfg diff --git a/benchmarking/dut/transformer.py b/benchmarking/dut/transformer.py index 91c73bbffe..87522ad2e5 100644 --- a/benchmarking/dut/transformer.py +++ b/benchmarking/dut/transformer.py @@ -901,27 +901,11 @@ def step_build_setup(self): # Create a configuration for building the scaled dot-product attention # operator to a hardware accelerator cfg = build_cfg.DataflowBuildConfig( - # Unpack the build configuration parameters - #**params["build"]["finn"], - output_dir = self.build_inputs["build_dir"], - stitched_ip_gen_dcp = False, # only needed for further manual integration - synth_clk_period_ns = self.clock_period_ns, folding_config_file = "folding.yaml", specialize_layers_config_file = "specialize_layers.json", standalone_thresholds = True, max_multithreshold_bit_width = 16, mvau_wwidth_max = 2048, - split_large_fifos = True, - - generate_outputs=[ - build_cfg.DataflowOutputType.ESTIMATE_REPORTS, - build_cfg.DataflowOutputType.STITCHED_IP, # required for HarnessBuild, OOC_SYNTH, and RTLSIM - #build_cfg.DataflowOutputType.PYNQ_DRIVER, #TODO: currently broken (assert i_consumer.op_type == "StreamingDataflowPartition"), might be useful for functional verification on hw later - #build_cfg.DataflowOutputType.OOC_SYNTH, # requires stitched-ip, not needed because ZynqBuild/HarnessBuild is performed - #build_cfg.DataflowOutputType.BITFILE, # does not require stitched-ip, not needed because HarnessBuild is performed - 
#build_cfg.DataflowOutputType.RTLSIM_PERFORMANCE, # not possible due to float components TODO: try with pyXSI - #build_cfg.DataflowOutputType.DEPLOYMENT_PACKAGE # not needed, just a copy operation - ], verify_steps=[ # Verify the model after converting to the FINN onnx dialect @@ -1006,30 +990,12 @@ def step_build_setup(self): # Only for debugging for now, does not work if "vivado" style # StreamingFIFOs are used # node_by_node_rtlsim, - - #test_step_insert_tlastmarker, # required for instrumentation_wrapper - "step_create_stitched_ip", - # "step_measure_rtlsim_performance", # not possible due to float components - - step_synth_harness, #TODO: replace with instr wrapper (or port it into this step) - - #"step_out_of_context_synthesis", # for synthesis results (e.g. utilization) - - # normal deployment TODO: replace with instr wrapper (or port it into this step as an option) - #"step_synthesize_bitfile", - #"step_make_pynq_driver", - #"step_deployment_package", - - #test_step_gen_vitis_xo, # preparation step for original instr wrapper integration - #test_step_gen_instrumentation_wrapper, # preparation step for original instr wrapper integration - - #test_step_gen_instrwrap_sim, # preparation step for simulation of original instr wrapper integration - #test_step_run_instrwrap_sim, # simulation with instr wrapper, disabled for now due to extreme runtime - - #test_step_export_xo, # preparation step for original instr wrapper integration - #test_step_build_platform # synthesis with instr wrapper + "step_out_of_context_synthesis", # for synthesis results (e.g. 
utilization) + "step_synthesize_bitfile", + "step_make_pynq_driver", + "step_deployment_package", ] ) diff --git a/benchmarking/harness/sink/ip/component.xml b/benchmarking/harness/sink/ip/component.xml deleted file mode 100644 index cb20a9abad..0000000000 --- a/benchmarking/harness/sink/ip/component.xml +++ /dev/null @@ -1,256 +0,0 @@ - - - xilinx.com - user - harness_sink - 1.0 - - - s_axis_0 - - - - - - - TDATA - - - s_axis_0_tdata - - - - - TVALID - - - s_axis_0_tvalid - - - - - TREADY - - - s_axis_0_tready - - - - - - - - - xilinx_anylanguagesynthesis - Synthesis - :vivado.xilinx.com:synthesis - Verilog - harness_sink - - xilinx_anylanguagesynthesis_view_fileset - - - - viewChecksum - 18b9f9a4 - - - - - xilinx_anylanguagebehavioralsimulation - Simulation - :vivado.xilinx.com:simulation - Verilog - harness_sink - - xilinx_anylanguagebehavioralsimulation_view_fileset - - - - viewChecksum - 18b9f9a4 - - - - - xilinx_xpgui - UI Layout - :vivado.xilinx.com:xgui.ui - - xilinx_xpgui_view_fileset - - - - viewChecksum - 6955aee3 - - - - - - - enable - - in - - - std_logic - xilinx_anylanguagesynthesis - xilinx_anylanguagebehavioralsimulation - - - - - - valid - - out - - - std_logic - xilinx_anylanguagesynthesis - xilinx_anylanguagebehavioralsimulation - - - - - - checksum - - out - - - std_logic - xilinx_anylanguagesynthesis - xilinx_anylanguagebehavioralsimulation - - - - - - s_axis_0_tdata - - in - - 7 - 0 - - - - std_logic_vector - xilinx_anylanguagesynthesis - xilinx_anylanguagebehavioralsimulation - - - - 0 - - - - - s_axis_0_tvalid - - in - - - std_logic - xilinx_anylanguagesynthesis - xilinx_anylanguagebehavioralsimulation - - - - - - s_axis_0_tready - - out - - - std_logic - xilinx_anylanguagesynthesis - xilinx_anylanguagebehavioralsimulation - - - - - - - - STREAM_WIDTH - Stream Width - 8 - - - - - - xilinx_anylanguagesynthesis_view_fileset - - src/harness_sink.v - verilogSource - CHECKSUM_18b9f9a4 - IMPORTED_FILE - - - - 
xilinx_anylanguagebehavioralsimulation_view_fileset - - src/harness_sink.v - verilogSource - IMPORTED_FILE - - - - xilinx_xpgui_view_fileset - - xgui/harness_sink_v1_0.tcl - tclSource - CHECKSUM_6955aee3 - XGUI_VERSION_2 - - - - harness_sink_v1_0 - - - STREAM_WIDTH - Stream Width - 8 - - - Component_Name - harness_sink_v1_0 - - - - - - zynq - qzynq - azynq - zynquplus - - - /UserIP - - harness_sink_v1_0 - level_0 - package_project - 2 - 2023-08-22T13:34:35Z - - - 2022.2 - - - - - - - - - - - - - diff --git a/benchmarking/harness/sink/ip/src/harness_sink.v b/benchmarking/harness/sink/ip/src/harness_sink.v deleted file mode 100644 index e6b95e7797..0000000000 --- a/benchmarking/harness/sink/ip/src/harness_sink.v +++ /dev/null @@ -1,39 +0,0 @@ -`timescale 1ns / 1ps -////////////////////////////////////////////////////////////////////////////////// -// Company: -// Engineer: -// -// Create Date: 08/22/2023 02:19:08 PM -// Design Name: -// Module Name: harness_sink -// Project Name: -// Target Devices: -// Tool Versions: -// Description: -// -// Dependencies: -// -// Revision: -// Revision 0.01 - File Created -// Additional Comments: -// -////////////////////////////////////////////////////////////////////////////////// - - -module harness_sink #( - parameter STREAM_WIDTH=8 -)( - input enable, - output valid, - output checksum, - input [STREAM_WIDTH-1:0] s_axis_0_tdata, - input s_axis_0_tvalid, - output s_axis_0_tready -); - -assign s_axis_0_tready = enable; - -assign valid = s_axis_0_tvalid; -assign checksum = ^s_axis_0_tdata; - -endmodule diff --git a/benchmarking/harness/sink/ip/xgui/harness_sink_v1_0.tcl b/benchmarking/harness/sink/ip/xgui/harness_sink_v1_0.tcl deleted file mode 100644 index eb752d53a5..0000000000 --- a/benchmarking/harness/sink/ip/xgui/harness_sink_v1_0.tcl +++ /dev/null @@ -1,25 +0,0 @@ -# Definitional proc to organize widgets for parameters. 
-proc init_gui { IPINST } { - ipgui::add_param $IPINST -name "Component_Name" - #Adding Page - set Page_0 [ipgui::add_page $IPINST -name "Page 0"] - ipgui::add_param $IPINST -name "STREAM_WIDTH" -parent ${Page_0} - - -} - -proc update_PARAM_VALUE.STREAM_WIDTH { PARAM_VALUE.STREAM_WIDTH } { - # Procedure called to update STREAM_WIDTH when any of the dependent parameters in the arguments change -} - -proc validate_PARAM_VALUE.STREAM_WIDTH { PARAM_VALUE.STREAM_WIDTH } { - # Procedure called to validate STREAM_WIDTH - return true -} - - -proc update_MODELPARAM_VALUE.STREAM_WIDTH { MODELPARAM_VALUE.STREAM_WIDTH PARAM_VALUE.STREAM_WIDTH } { - # Procedure called to set VHDL generic/Verilog parameter value(s) based on TCL parameter value - set_property value [get_property value ${PARAM_VALUE.STREAM_WIDTH}] ${MODELPARAM_VALUE.STREAM_WIDTH} -} - diff --git a/benchmarking/harness/vector_xor.v b/benchmarking/harness/vector_xor.v deleted file mode 100644 index 3361860ab8..0000000000 --- a/benchmarking/harness/vector_xor.v +++ /dev/null @@ -1,32 +0,0 @@ -`timescale 1ns / 1ps -////////////////////////////////////////////////////////////////////////////////// -// Company: -// Engineer: -// -// Create Date: 08/22/2023 02:19:08 PM -// Design Name: -// Module Name: harness_sink -// Project Name: -// Target Devices: -// Tool Versions: -// Description: -// -// Dependencies: -// -// Revision: -// Revision 0.01 - File Created -// Additional Comments: -// -////////////////////////////////////////////////////////////////////////////////// - - -module vector_xor #( - parameter WIDTH=8 -)( - input [WIDTH-1:0] in_data, - output out_data -); - -assign out_data = ^in_data; - -endmodule diff --git a/src/finn/builder/build_dataflow.py b/src/finn/builder/build_dataflow.py index 284cd2baa3..baada9d1d2 100644 --- a/src/finn/builder/build_dataflow.py +++ b/src/finn/builder/build_dataflow.py @@ -160,7 +160,7 @@ def build_dataflow_cfg(model_filename, cfg: DataflowBuildConfig): # restore stdout/stderr 
sys.stdout = stdout_orig sys.stderr = stderr_orig - time_per_step[step_name] = step_end - step_start + time_per_step[step_name] = round(step_end - step_start) chkpt_name = "%s.onnx" % (step_name) if cfg.save_intermediate_models: intermediate_model_dir = cfg.output_dir + "/intermediate_models" @@ -183,7 +183,8 @@ def build_dataflow_cfg(model_filename, cfg: DataflowBuildConfig): print("Build failed") return -1 - with open(cfg.output_dir + "/time_per_step.json", "w") as f: + time_per_step["total_build_time"] = sum(time_per_step.values()) + with open(cfg.output_dir + "/report/time_per_step.json", "w") as f: json.dump(time_per_step, f, indent=2) print("Completed successfully") return 0 diff --git a/src/finn/builder/build_dataflow_config.py b/src/finn/builder/build_dataflow_config.py index 448c6e5c4e..a3db23a714 100644 --- a/src/finn/builder/build_dataflow_config.py +++ b/src/finn/builder/build_dataflow_config.py @@ -161,16 +161,16 @@ class DataflowBuildConfig: """ #: Directory where the final build outputs will be written into - output_dir: str + output_dir: Optional[str] = None #: Target clock frequency (in nanoseconds) for Vivado synthesis. #: e.g. synth_clk_period_ns=5.0 will target a 200 MHz clock. #: If hls_clk_period_ns is not specified it will default to this value. - synth_clk_period_ns: float + synth_clk_period_ns: Optional[float] = None #: Which output(s) to generate from the build flow. See documentation of #: DataflowOutputType for available options. - generate_outputs: List[DataflowOutputType] + generate_outputs: Optional[List[DataflowOutputType]] = None #: (Optional) Path to configuration JSON file in which user can specify #: a preferred implementation style (HLS or RTL) for each node. 
From 6e2c379c095723489e21ba6c82967fc10284eb6a Mon Sep 17 00:00:00 2001 From: Felix Jentzsch Date: Fri, 28 Feb 2025 13:21:42 +0000 Subject: [PATCH 053/125] Refactor microbenchmarks to use normal builder flow --- benchmarking/bench.py | 8 +- benchmarking/bench_base.py | 372 +++++++++-------------- benchmarking/cfg/mvau_test.json | 2 +- benchmarking/cfg/synthetic_fifotest.json | 4 +- benchmarking/collect.py | 8 + benchmarking/dut/mvau.py | 47 ++- benchmarking/dut/transformer.py | 8 +- 7 files changed, 194 insertions(+), 255 deletions(-) diff --git a/benchmarking/bench.py b/benchmarking/bench.py index 686c97ddc2..485c64bb76 100644 --- a/benchmarking/bench.py +++ b/benchmarking/bench.py @@ -10,19 +10,15 @@ from dut.resnet50 import bench_resnet50 from dut.metafi import bench_metafi from dut.synthetic_nonlinear import bench_synthetic_nonlinear +from dut.transformer import bench_transformer dut = dict() dut["mvau"] = bench_mvau dut["resnet50"] = bench_resnet50 dut["metafi"] = bench_metafi dut["synthetic_nonlinear"] = bench_synthetic_nonlinear +dut["transformer"] = bench_transformer -# TODO: remove guard once transformer support has been fully merged -try: - from dut.transformer import bench_transformer - dut["transformer"] = bench_transformer -except ImportError: - pass def main(config_name): exit_code = 0 diff --git a/benchmarking/bench_base.py b/benchmarking/bench_base.py index 636af6bb5e..edc2e67d4d 100644 --- a/benchmarking/bench_base.py +++ b/benchmarking/bench_base.py @@ -126,6 +126,7 @@ def __init__(self, params, task_id, run_id, artifacts_dir, save_dir, debug=True) self.debug = debug #TODO: setup a logger so output can go to console (with task id prefix) and log simultaneously + #TODO: coordinate with new builder loggin setup # General configuration # TODO: do not allow multiple targets in a single bench job due to measurement? 
@@ -199,204 +200,153 @@ def save_local_artifacts_collection(self): # this should be called upon successful or failed completion of a run for (name, source_path) in self.local_artifacts_collection: self.save_local_artifact(name, source_path) - - # only used in simple flow (TODO: unify) - def step_make_model(self): - pass - # only used in full build flow + # must be defined by subclass def step_export_onnx(self): pass - # only used in full build flow + # must be defined by subclass def step_build_setup(self): pass - # defaults to full build flow - # may be overwritten by subclass (e.g., to call simple flow instead) + # defaults to normal build flow, may be overwritten by subclass def run(self): self.steps_full_build_flow() - def step_finn_estimate(self): - # Gather FINN estimates - print("Gathering FINN estimates") - - model = self.model_initial - finn_resources_model = res_estimation(model, fpgapart=self.part) - finn_cycles_model = model.analysis(exp_cycles_per_layer) - if self.target_node: - node = model.get_nodes_by_op_type(self.target_node)[0] - finn_resources = finn_resources_model[node.name] - finn_cycles = finn_cycles_model[node.name] - else: - finn_resources = finn_resources_model # TODO: aggregate? - finn_cycles = 0 # TODO: aggregate or drop - finn_estimates = finn_resources - finn_estimates["CYCLES"] = finn_cycles - self.output_dict["finn_estimates"] = finn_estimates - - def step_hls(self): - # Perform Vitis HLS synthesis for HLS resource/performance reports - start_time = time.time() - print("Performing Vitis HLS synthesis") - model = self.model_initial - model = model.transform(PrepareIP(self.part, self.clock_period_ns)) - model = model.transform(HLSSynthIP()) - - hls_resources_model = model.analysis(hls_synth_res_estimation) - if self.target_node: - node = model.get_nodes_by_op_type(self.target_node)[0] - hls_resources = hls_resources_model[node.name] - else: - hls_resources = hls_resources_model # TODO: aggregate? 
- self.output_dict["hls_estimates"] = hls_resources - self.output_dict["hls_time"] = int(time.time() - start_time) - - self.model_step_hls = copy.deepcopy(model) - - def step_rtlsim(self): - # Perform RTL simulation for performance measurement - start_time = time.time() - print("Performing Verilator RTL simulation (n=1)") - # Prepare - model = self.model_step_hls - model = model.transform(SetExecMode("rtlsim")) - model = model.transform(PrepareRTLSim()) - # Generate input data - input_tensor = model.graph.input[0] - input_shape = model.get_tensor_shape(input_tensor.name) - input_dtype = model.get_tensor_datatype(input_tensor.name) - x = gen_finn_dt_tensor(input_dtype, input_shape) - input_dict = prepare_inputs(x, input_dtype, None) # TODO: fix Bipolar conversion case - # Run - oxe.execute_onnx(model, input_dict)["outp"] # do not check output for correctness TODO: add functional verification throughout benchmarking steps - # Log result - node = model.get_nodes_by_op_type("MVAU_hls")[0] - inst = getCustomOp(node) - rtlsim_cycles = inst.get_nodeattr("cycles_rtlsim") - self.output_dict["rtlsim_cycles"] = rtlsim_cycles - self.output_dict["rtlsim_time"] = int(time.time() - start_time) - - def step_synthesis(self): - # Perform Vivado synthesis for accurate resource/timing and inaccurate power reports - # TODO: avoid duplicate synthesis by using shell build also for post_synth_resources and power sim? - # TODO: check OMX synth strategy again! 
- start_time = time.time() - print("Performing Vivado (stitched-ip, out-of-context) synthesis") - model = self.model_step_hls - model = model.transform(ReplaceVerilogRelPaths()) - model = model.transform(CreateStitchedIP(self.part, self.clock_period_ns)) - model = model.transform(SynthOutOfContext(part=self.part, clk_period_ns=self.clock_period_ns)) - ooc_synth_results = eval(model.get_metadata_prop("res_total_ooc_synth")) - - start_test_batch_fast( - results_path=self.artifacts_dir_power, - project_path=os.path.join( - ooc_synth_results["vivado_proj_folder"], "vivadocompile", "vivadocompile.xpr" - ), - run_target="impl_1", - pairs=[(25, 0.5), (50, 0.5), (75, 0.5)], - ) - - # Log most important power results directly (refer to detailed logs for more) - for reportname in ["25_0.5", "50_0.5", "75_0.5"]: - with open(os.path.join(self.artifacts_dir_power, "%s.json" % reportname), "r") as f: - report = json.load(f) - power = float(report["Summary"]["tables"][0]["Total On-Chip Power (W)"][0]) - power_dyn = float(report["Summary"]["tables"][0]["Dynamic (W)"][0]) - ooc_synth_results["power_%s" % reportname] = power - ooc_synth_results["power_dyn_%s" % reportname] = power_dyn - - self.output_dict["ooc_synth"] = ooc_synth_results - self.output_dict["ooc_synth_time"] = int(time.time() - start_time) - - # Save model for logging purposes - model.save(os.path.join(self.artifacts_dir_models, "model_%d_synthesis.onnx" % (self.run_id))) - self.model_step_synthesis = copy.deepcopy(model) - - def step_sim_power(self): - # Perform Vivado simulation for accurate power report - start_time = time.time() - if "ooc_synth" not in self.output_dict: - print("ERROR: step_sim_power requires step_synthesis") - print("Performing Vivado simulation for power report") - if "rtlsim_cycles" in self.output_dict: - sim_duration_ns = self.output_dict["rtlsim_cycles"] * 3 * self.clock_period_ns - else: - sim_duration_ns = self.output_dict["finn_estimates"]["CYCLES"] * 3 * self.clock_period_ns - - model = 
self.model_step_synthesis - input_tensor = model.graph.input[0] - output_tensor = model.graph.output[0] - input_node_inst = getCustomOp(model.find_consumer(input_tensor.name)) - output_node_inst = getCustomOp(model.find_producer(output_tensor.name)) - sim_power_report( - results_path=self.artifacts_dir_power, - project_path=os.path.join( - self.output_dict["ooc_synth"]["vivado_proj_folder"], "vivadocompile", "vivadocompile.xpr" - ), - in_width=input_node_inst.get_instream_width(), - out_width=output_node_inst.get_outstream_width(), - dtype_width=model.get_tensor_datatype(input_tensor.name).bitwidth(), - sim_duration_ns=sim_duration_ns, - ) - - # Log most important power results directly (refer to detailed logs for more) - for reportname in ["sim"]: - with open(os.path.join(self.artifacts_dir_power, "%s.json" % reportname), "r") as f: - report = json.load(f) - power = float(report["Summary"]["tables"][0]["Total On-Chip Power (W)"][0]) - power_dyn = float(report["Summary"]["tables"][0]["Dynamic (W)"][0]) - self.output_dict["power_%s" % reportname] = power - self.output_dict["power_dyn%s" % reportname] = power_dyn - - self.output_dict["sim_power_time"] = int(time.time() - start_time) - - def step_synth_power(self): - # Perform Vivado synthesis for on-hardware power measurement - start_time = time.time() - if self.model_step_hls is None: - print("ERROR: step_synth_power requires step_hls") - print("Performing Vivado synthesis with test harness integration for power measurement") - - if "dut_duplication" in self.params: - dut_duplication = self.params["dut_duplication"] - else: - dut_duplication = 1 - - model = self.model_step_hls.transform(ReplaceVerilogRelPaths()) - model = model.transform(CreateStitchedIP(self.part, self.clock_period_ns)) - - build_dir = "temp_output_harness_build" - # TODO: replace hold harness with new instr wrapper implementation - #TODO: if synth fails this could contain stale bitstreams which will be power tested - # model = model.transform( - # 
MakeZYNQHarnessProject( - # platform=self.board, - # output_dir=build_dir, - # dut_duplication=dut_duplication, - # clock_period_ns=self.clock_period_ns - # ) - # ) - - # COPY bitstreams and other outputs - # TODO: integrate better (e.g. as artifact) and remove redundant copy - # TODO: make this more configurable or switch to job/artifact based power measurement - shcopy(os.path.join(build_dir, "top_wrapper.bit"), - os.path.join(self.save_dir_bitstreams, "run_%d.bit" % self.run_id)) - shcopy(os.path.join(build_dir, "top.hwh"), - os.path.join(self.save_dir_bitstreams, "run_%d.hwh" % self.run_id)) - shcopy(os.path.join(build_dir, "synth_report.xml"), - os.path.join(self.save_dir_bitstreams, "run_%d.xml" % self.run_id)) - clock_period_mhz = int(1.0 / self.clock_period_ns * 1000.0) - measurement_settings = {"freq_mhz": clock_period_mhz} - with open(os.path.join(self.save_dir_bitstreams, "run_%d_settings.json"%self.run_id), "w") as f: - json.dump(measurement_settings, f, indent=2) - - self.output_dict["synth_power_time"] = int(time.time() - start_time) - - # Save model for logging purposes - model.save(os.path.join(self.artifacts_dir_models, "model_%d_synth_power.onnx" % (self.run_id))) + # def step_finn_estimate(self): + # # Gather FINN estimates + # print("Gathering FINN estimates") + + # model = self.model_initial + # finn_resources_model = res_estimation(model, fpgapart=self.part) + # finn_cycles_model = model.analysis(exp_cycles_per_layer) + # if self.target_node: + # node = model.get_nodes_by_op_type(self.target_node)[0] + # finn_resources = finn_resources_model[node.name] + # finn_cycles = finn_cycles_model[node.name] + # else: + # finn_resources = finn_resources_model # TODO: aggregate? 
+ # finn_cycles = 0 # TODO: aggregate or drop + # finn_estimates = finn_resources + # finn_estimates["CYCLES"] = finn_cycles + # self.output_dict["finn_estimates"] = finn_estimates + + # def step_hls(self): + # # Perform Vitis HLS synthesis for HLS resource/performance reports + # start_time = time.time() + # print("Performing Vitis HLS synthesis") + # model = self.model_initial + # model = model.transform(PrepareIP(self.part, self.clock_period_ns)) + # model = model.transform(HLSSynthIP()) + + # hls_resources_model = model.analysis(hls_synth_res_estimation) + # if self.target_node: + # node = model.get_nodes_by_op_type(self.target_node)[0] + # hls_resources = hls_resources_model[node.name] + # else: + # hls_resources = hls_resources_model # TODO: aggregate? + # self.output_dict["hls_estimates"] = hls_resources + # self.output_dict["hls_time"] = int(time.time() - start_time) + + # self.model_step_hls = copy.deepcopy(model) + + # def step_rtlsim(self): + # # Perform RTL simulation for performance measurement + # start_time = time.time() + # print("Performing Verilator RTL simulation (n=1)") + # # Prepare + # model = self.model_step_hls + # model = model.transform(SetExecMode("rtlsim")) + # model = model.transform(PrepareRTLSim()) + # # Generate input data + # input_tensor = model.graph.input[0] + # input_shape = model.get_tensor_shape(input_tensor.name) + # input_dtype = model.get_tensor_datatype(input_tensor.name) + # x = gen_finn_dt_tensor(input_dtype, input_shape) + # input_dict = prepare_inputs(x, input_dtype, None) # TODO: fix Bipolar conversion case + # # Run + # oxe.execute_onnx(model, input_dict)["outp"] # do not check output for correctness TODO: add functional verification throughout benchmarking steps + # # Log result + # node = model.get_nodes_by_op_type("MVAU_hls")[0] + # inst = getCustomOp(node) + # rtlsim_cycles = inst.get_nodeattr("cycles_rtlsim") + # self.output_dict["rtlsim_cycles"] = rtlsim_cycles + # self.output_dict["rtlsim_time"] = 
int(time.time() - start_time) + +# TODO: re-introduce simple Vivado power estimation as new builder step + # def step_synthesis(self): + # # Perform Vivado synthesis for accurate resource/timing and inaccurate power reports + # start_time = time.time() + # print("Performing Vivado (stitched-ip, out-of-context) synthesis") + # model = self.model_step_hls + # model = model.transform(ReplaceVerilogRelPaths()) + # model = model.transform(CreateStitchedIP(self.part, self.clock_period_ns)) + # model = model.transform(SynthOutOfContext(part=self.part, clk_period_ns=self.clock_period_ns)) + # ooc_synth_results = eval(model.get_metadata_prop("res_total_ooc_synth")) + + # start_test_batch_fast( + # results_path=self.artifacts_dir_power, + # project_path=os.path.join( + # ooc_synth_results["vivado_proj_folder"], "vivadocompile", "vivadocompile.xpr" + # ), + # run_target="impl_1", + # pairs=[(25, 0.5), (50, 0.5), (75, 0.5)], + # ) + + # # Log most important power results directly (refer to detailed logs for more) + # for reportname in ["25_0.5", "50_0.5", "75_0.5"]: + # with open(os.path.join(self.artifacts_dir_power, "%s.json" % reportname), "r") as f: + # report = json.load(f) + # power = float(report["Summary"]["tables"][0]["Total On-Chip Power (W)"][0]) + # power_dyn = float(report["Summary"]["tables"][0]["Dynamic (W)"][0]) + # ooc_synth_results["power_%s" % reportname] = power + # ooc_synth_results["power_dyn_%s" % reportname] = power_dyn + + # self.output_dict["ooc_synth"] = ooc_synth_results + # self.output_dict["ooc_synth_time"] = int(time.time() - start_time) + + # # Save model for logging purposes + # model.save(os.path.join(self.artifacts_dir_models, "model_%d_synthesis.onnx" % (self.run_id))) + # self.model_step_synthesis = copy.deepcopy(model) + +# TODO: re-introduce sim-based Vivado power estimation as new builder step + # def step_sim_power(self): + # # Perform Vivado simulation for accurate power report + # start_time = time.time() + # if "ooc_synth" not in 
self.output_dict: + # print("ERROR: step_sim_power requires step_synthesis") + # print("Performing Vivado simulation for power report") + # if "rtlsim_cycles" in self.output_dict: + # sim_duration_ns = self.output_dict["rtlsim_cycles"] * 3 * self.clock_period_ns + # else: + # sim_duration_ns = self.output_dict["finn_estimates"]["CYCLES"] * 3 * self.clock_period_ns + + # model = self.model_step_synthesis + # input_tensor = model.graph.input[0] + # output_tensor = model.graph.output[0] + # input_node_inst = getCustomOp(model.find_consumer(input_tensor.name)) + # output_node_inst = getCustomOp(model.find_producer(output_tensor.name)) + # sim_power_report( + # results_path=self.artifacts_dir_power, + # project_path=os.path.join( + # self.output_dict["ooc_synth"]["vivado_proj_folder"], "vivadocompile", "vivadocompile.xpr" + # ), + # in_width=input_node_inst.get_instream_width(), + # out_width=output_node_inst.get_outstream_width(), + # dtype_width=model.get_tensor_datatype(input_tensor.name).bitwidth(), + # sim_duration_ns=sim_duration_ns, + # ) + + # # Log most important power results directly (refer to detailed logs for more) + # for reportname in ["sim"]: + # with open(os.path.join(self.artifacts_dir_power, "%s.json" % reportname), "r") as f: + # report = json.load(f) + # power = float(report["Summary"]["tables"][0]["Total On-Chip Power (W)"][0]) + # power_dyn = float(report["Summary"]["tables"][0]["Dynamic (W)"][0]) + # self.output_dict["power_%s" % reportname] = power + # self.output_dict["power_dyn%s" % reportname] = power_dyn + + # self.output_dict["sim_power_time"] = int(time.time() - start_time) def step_parse_builder_output(self, build_dir): # Used to parse selected reports/logs into the output json dict for DUTs that use a full FINN builder flow @@ -456,46 +406,6 @@ def step_parse_builder_output(self, build_dir): else: pass #TODO: warn/skip? 
- def steps_simple_model_flow(self): - # Default step sequence for benchmarking a simple model (mostly single operators/custom_ops) - do_hls = self.params["do_hls"] if "do_hls" in self.params else False - do_rtlsim = self.params["do_rtlsim"] if "do_rtlsim" in self.params else False - do_synthesis = self.params["do_synthesis"] if "do_synthesis" in self.params else False - do_sim_power = self.params["do_sim_power"] if "do_sim_power" in self.params else False - do_synth_power = self.params["do_synth_power"] if "do_synth_power" in self.params else False - - # Perform steps - make_model_result = self.step_make_model() - if make_model_result is None: - return - else: - model, dut_info = make_model_result - - # Save model for logging purposes - # TODO: benchmarking infrastructure could be integrated deeper into ONNX IR and FINN custom_op/transformation infrastructure - # E.g. parameters and paths could be stored as onnx attributes and benchmarking steps as generic or specialized custom_op transformations - model.save(os.path.join(self.artifacts_dir_models, "model_%d_initial.onnx" % (self.run_id))) - - # Save model for use in other steps - self.model_initial = model - - # Log dict reported by DUT-specific scripts to overall result dict - # E.g. 
this could contain SIMD/PE derived from folding factors or weight distribution information - self.output_dict["info"] = dut_info - - self.step_finn_estimate() - - if do_hls: - self.step_hls() - if do_rtlsim: - self.step_rtlsim() - if do_synthesis: - self.step_synthesis() - if do_sim_power: - self.step_sim_power() - #if do_synth_power: - # self.step_synth_power() - def steps_full_build_flow(self): # Default step sequence for benchmarking a full FINN builder flow @@ -510,6 +420,7 @@ def steps_full_build_flow(self): self.local_artifacts_collection.append(("build_output", self.build_inputs["build_dir"])) ### MODEL CREATION/IMPORT ### + # TODO: track fixed input onnx models with DVC if "model_dir" in self.params: # input ONNX model and verification input/output pairs are provided model_dir = self.params["model_dir"] @@ -521,7 +432,9 @@ def steps_full_build_flow(self): else: # input ONNX model (+ optional I/O pair for verification) will be generated self.build_inputs["onnx_path"] = os.path.join(tmp_buildflow_dir, "model_export.onnx") - self.step_export_onnx(self.build_inputs["onnx_path"]) + if self.step_export_onnx(self.build_inputs["onnx_path"]) == "skipped": + # microbenchmarks might skip because no valid model can be generated for given params + return self.save_local_artifact("model_step_export", self.build_inputs["onnx_path"]) if "folding_path" in self.params: @@ -543,6 +456,7 @@ def steps_full_build_flow(self): else: cfg.shell_flow_type=build_cfg.ShellFlowType.VIVADO_ZYNQ # enable extra performance optimizations (physopt) + # TODO: check OMX synth strategy again! 
cfg.vitis_opt_strategy=build_cfg.VitisOptStrategy.PERFORMANCE_BEST cfg.verbose = False cfg.enable_build_pdb_debug = False diff --git a/benchmarking/cfg/mvau_test.json b/benchmarking/cfg/mvau_test.json index d4cb2072be..07fd52cc2f 100644 --- a/benchmarking/cfg/mvau_test.json +++ b/benchmarking/cfg/mvau_test.json @@ -27,6 +27,6 @@ "dut_duplication": [1], - "output_products": [["bitfile", "pynq_driver", "deployment_package"]] ## + "output_products": [["estimate_reports", "stitched_ip", "rtlsim_performance", "out_of_context_synth", "bitfile", "pynq_driver", "deployment_package"]] } ] diff --git a/benchmarking/cfg/synthetic_fifotest.json b/benchmarking/cfg/synthetic_fifotest.json index 1b40feb9e8..dfc63c6240 100644 --- a/benchmarking/cfg/synthetic_fifotest.json +++ b/benchmarking/cfg/synthetic_fifotest.json @@ -38,7 +38,7 @@ "fifo_method": ["characterize"], "fifo_strategy": ["analytical", "rtlsim"], - "output_products": [["rtlsim_performance", "bitfile", "pynq_driver", "deployment_package"]] + "output_products": [["stitched_ip", "rtlsim_performance", "bitfile", "pynq_driver", "deployment_package"]] }, { "dut": ["synthetic_nonlinear"], @@ -59,6 +59,6 @@ "fifo_method": ["largefifo_rtlsim"], "fifo_rtlsim_n": [2], - "output_products": [["rtlsim_performance", "bitfile", "pynq_driver", "deployment_package"]] + "output_products": [["stitched_ip", "rtlsim_performance", "bitfile", "pynq_driver", "deployment_package"]] } ] \ No newline at end of file diff --git a/benchmarking/collect.py b/benchmarking/collect.py index 7ba7dc4cb0..fa582d399a 100644 --- a/benchmarking/collect.py +++ b/benchmarking/collect.py @@ -88,6 +88,7 @@ def wait_for_power_measurements(): for run in combined_log: with Live(exp_message="Job result collected by GitLab CI", cache_images=True) as live: + #TODO: add pipeline info to metadata (or as metric or other annotation?) 
metadata = { "metadata": { "run_id": run["run_id"], @@ -99,6 +100,13 @@ def wait_for_power_measurements(): live.log_params(metadata) live.log_params(run["params"]) + # TODO: for microbenchmarks, only summarize results for target node (or surrounding SDP?) (see old step_finn_estimate) + + # OOC synth resource report (step_out_of_context_synthesis) + + # shell synth resource report (step_synthesize_bitfile) + + if "builder" in run["output"]: for key in run["output"]["builder"]: live.log_metric("Resources/" + key, run["output"]["builder"][key], plot=False) diff --git a/benchmarking/dut/mvau.py b/benchmarking/dut/mvau.py index a41eec694b..f62c6b59a7 100644 --- a/benchmarking/dut/mvau.py +++ b/benchmarking/dut/mvau.py @@ -1,6 +1,7 @@ import math import numpy as np +import json from onnx import TensorProto, helper from qonnx.core.datatype import DataType from qonnx.core.modelwrapper import ModelWrapper @@ -19,6 +20,8 @@ from finn.transformation.fpgadataflow.minimize_weight_bit_width import ( MinimizeWeightBitWidth, ) +import finn.builder.build_dataflow_config as build_cfg + from bench_base import bench class bench_mvau(bench): @@ -123,7 +126,7 @@ def _make_single_mvau_model( return model - def step_make_model(self): + def step_export_onnx(self, onnx_export_path): # Read params idt = self.params["idt"] wdt = self.params["wdt"] @@ -157,10 +160,10 @@ def step_make_model(self): pe = mh // nf if mw % simd != 0 or mh % pe != 0: print("Invalid simd/pe configuration, skipping") - return + return "skipped" if m > 1 and (simd != mw or pe != mh): print("M > 1 not possible for non-max simd/pe, skipping") - return + return "skipped" output_dict["simd"] = simd output_dict["pe"] = pe @@ -178,11 +181,11 @@ def step_make_model(self): if "sparsity_amount" in self.params: if self.params["sparsity_amount"] > 0: print("sparsity amount > 0 not applicable for none sparsity, skipping") - return + return "skipped" else: if self.params["sparsity_amount"] == 0: print("sparsity amount = 0 not 
applicable for selected sparsity, skipping") - return + return "skipped" if sparsity_type == "unstructured": idx = np.random.choice( mw * mh, size=int(self.params["sparsity_amount"] * mw * mh), replace=False @@ -207,7 +210,7 @@ def step_make_model(self): ) else: print("regular sparsity only applicable for amount 0.25/0.5/0.75, skipping") - return + return "skipped" W[idx_mw, :] = 0.0 elif sparsity_type == "cols_regular": if self.params["sparsity_amount"] == 0.25: @@ -220,7 +223,7 @@ def step_make_model(self): ) else: print("regular sparsity only applicable for amount 0.25/0.5/0.75, skipping") - return + return "skipped" W[:, idx_mh] = 0.0 else: @@ -289,7 +292,31 @@ def step_make_model(self): inst = getCustomOp(node) self.target_node = "MVAU_hls" # display results of analysis passes only for the first occurence of this op type - return model, output_dict - def run(self): - self.steps_simple_model_flow() + # log additional info about the generated model (e.g. SIMD/PE or sparsity) + with open(self.build_inputs["build_dir"] + "/report/dut_info.json", "w") as f: + json.dump(output_dict, f, indent=2) + + # TODO: also generate golden I/O pair for further verification steps + model.save(onnx_export_path) + + def step_build_setup(self): + # create build config for synthetic microbenchmark models + cfg = build_cfg.DataflowBuildConfig( + # manual folding + target_fps=None, + steps=[ + "step_create_dataflow_partition", + "step_minimize_bit_width", + "step_generate_estimate_reports", + "step_hw_codegen", + "step_hw_ipgen", + "step_create_stitched_ip", + "step_measure_rtlsim_performance", + "step_out_of_context_synthesis", + "step_synthesize_bitfile", + "step_make_pynq_driver", + "step_deployment_package", + ] + ) + return cfg diff --git a/benchmarking/dut/transformer.py b/benchmarking/dut/transformer.py index 87522ad2e5..2beca913c7 100644 --- a/benchmarking/dut/transformer.py +++ b/benchmarking/dut/transformer.py @@ -12,22 +12,16 @@ QuantLinear, QuantReLU ) -import os -from 
qonnx.core.modelwrapper import ModelWrapper # Progressbar from tqdm import trange import numpy as np from brevitas.export import export_qonnx import random import json -import subprocess -from util import summarize_table, summarize_section, power_xml_to_dict, prepare_inputs, delete_dir_contents # FINN dataflow builder -import finn.builder.build_dataflow as build import finn.builder.build_dataflow_config as build_cfg from finn.builder.build_dataflow_config import AutoFIFOSizingMethod -from bench_base import bench, step_synth_harness -from finn.util.basic import alveo_part_map +from bench_base import bench # Range information structure for seeding the range analysis for converting # quantized activations to MultiThreshold From c19289b060330047f6b1cd399f0f56b4bf49ccb1 Mon Sep 17 00:00:00 2001 From: Felix Jentzsch Date: Fri, 28 Feb 2025 15:54:59 +0000 Subject: [PATCH 054/125] Refactor artifact handling and upload of key metrics --- benchmarking/bench-ci.yml | 3 + benchmarking/bench.py | 7 +- benchmarking/bench_base.py | 113 ++++++----------- benchmarking/collect.py | 147 ++++++++++++++++++++--- src/finn/builder/build_dataflow_steps.py | 3 +- 5 files changed, 178 insertions(+), 95 deletions(-) diff --git a/benchmarking/bench-ci.yml b/benchmarking/bench-ci.yml index f62f2eb35a..c6d2c6bc91 100644 --- a/benchmarking/bench-ci.yml +++ b/benchmarking/bench-ci.yml @@ -49,6 +49,9 @@ Result Collection: stage: collect tags: - image_build + rules: + # Also run on failure of previous tasks to collect partial results + - when: always script: - python3.10 benchmarking/collect.py bench_artifacts/tasks_output bench_results.json - dvc exp push git@github.com:eki-project/finn-plus.git diff --git a/benchmarking/bench.py b/benchmarking/bench.py index 485c64bb76..fb890332b9 100644 --- a/benchmarking/bench.py +++ b/benchmarking/bench.py @@ -161,11 +161,14 @@ def get_default_session_options_new(): log_dict["total_time"] = int(time.time() - start_time) log_dict["output"] = 
bench_object.output_dict log.append(log_dict) + # TODO: save this meta data into run-level reports dir insted of task*.json # overwrite output log file every time to allow early abort with open(log_path, "w") as f: json.dump(log, f, indent=2) - - # save local artifacts of this run (e.g., detailed debug info) + + # save GitLab artifacts of this run (e.g., reports and deployment package) + bench_object.save_artifacts_collection() + # save local artifacts of this run (e.g., full build dir, detailed debug info) bench_object.save_local_artifacts_collection() print("Stopping job") return exit_code diff --git a/benchmarking/bench_base.py b/benchmarking/bench_base.py index edc2e67d4d..eef9edd721 100644 --- a/benchmarking/bench_base.py +++ b/benchmarking/bench_base.py @@ -6,6 +6,7 @@ import time import traceback import glob +import shutil import numpy as np from shutil import copy as shcopy from shutil import copytree @@ -134,6 +135,7 @@ def __init__(self, params, task_id, run_id, artifacts_dir, save_dir, debug=True) self.board = params["board"] else: self.board = "RFSoC2x2" + self.params["board"] = self.board if "part" in params: self.part = params["part"] @@ -146,60 +148,53 @@ def __init__(self, params, task_id, run_id, artifacts_dir, save_dir, debug=True) self.clock_period_ns = params["clock_period_ns"] else: self.clock_period_ns = 10 + self.params["clock_period_ns"] = self.clock_period_ns # Clear FINN tmp build dir before every run (to avoid excessive ramdisk usage and duplicate debug artifacts) print("Clearing FINN BUILD DIR ahead of run") delete_dir_contents(os.environ["FINN_BUILD_DIR"]) - # Initialize output directories (might exist from other runs of the same job) - self.artifacts_dir_models = os.path.join(self.artifacts_dir, "models") - os.makedirs(self.artifacts_dir_models, exist_ok=True) - self.artifacts_dir_power = os.path.join(self.artifacts_dir, "power_vivado", "run_%d" % (self.run_id)) - os.makedirs(self.artifacts_dir_power, exist_ok=True) - - 
self.save_dir_bitstreams = os.path.join(self.save_dir, "bitstreams") - os.makedirs(self.save_dir_bitstreams, exist_ok=True) - - # Intermediate models saved between steps - # TODO: create setter functions for intermediate models or other artifacts that log them to gitlab artifacts or local dir automatically - self.model_initial = None - self.model_step_hls = None - self.model_step_synthesis = None - # Initialize dictionary to collect all benchmark results + # TODO: remove completely or only use for meta data, actual results go into run-specific .json files within /report self.output_dict = {} # Inputs (e.g., ONNX model, golden I/O pair, folding config, etc.) for custom FINN build flow self.build_inputs = {} - # Collect tuples of (name, source path) to save as local artifacts upon run completion or fail by exception + # Collect tuples of (name, source path, archive?) to save as pipeline artifacts upon run completion or fail by exception + self.artifacts_collection = [] + + # Collect tuples of (name, source path, archive?) 
to save as local artifacts upon run completion or fail by exception self.local_artifacts_collection = [] if self.debug: # Save entire FINN build dir and working dir # TODO: add option to only save upon exception (in FINN builder or benchmarking infrastructure) - self.local_artifacts_collection.append(("finn_tmp", os.environ["FINN_BUILD_DIR"])) - self.local_artifacts_collection.append(("finn_cwd", os.environ["FINN_ROOT"])) + self.local_artifacts_collection.append(("debug_finn_tmp", os.environ["FINN_BUILD_DIR"], False)) + self.local_artifacts_collection.append(("debug_finn_cwd", os.environ["FINN_ROOT"], False)) - def save_artifact(self, name, source_path): - target_path = os.path.join(self.artifacts_dir, name, "run_%d" % (self.run_id)) - os.makedirs(target_path, exist_ok=True) + def save_artifact(self, target_path, source_path, archive=False): if os.path.isdir(source_path): - copytree(source_path, target_path, dirs_exist_ok=True) - else: + if archive: + os.makedirs(os.path.dirname(target_path), exist_ok=True) + shutil.make_archive(target_path, "zip", source_path) + else: + os.makedirs(target_path, exist_ok=True) + copytree(source_path, target_path, dirs_exist_ok=True) + elif os.path.isfile(source_path): + os.makedirs(target_path, exist_ok=True) shcopy(source_path, target_path) - def save_local_artifact(self, name, source_path): - target_path = os.path.join(self.save_dir, name, "run_%d" % (self.run_id)) - os.makedirs(target_path, exist_ok=True) - if os.path.isdir(source_path): - copytree(source_path, target_path, dirs_exist_ok=True) - else: - shcopy(source_path, target_path) + def save_artifacts_collection(self): + # this should be called upon successful or failed completion of a run + for (name, source_path, archive) in self.artifacts_collection: + target_path = os.path.join(self.artifacts_dir, "runs_output", "run_%d" % (self.run_id), name) + self.save_artifact(target_path, source_path, archive) def save_local_artifacts_collection(self): # this should be called upon 
successful or failed completion of a run - for (name, source_path) in self.local_artifacts_collection: - self.save_local_artifact(name, source_path) + for (name, source_path, archive) in self.local_artifacts_collection: + target_path = os.path.join(self.save_dir, name, "run_%d" % (self.run_id)) + self.save_artifact(target_path, source_path, archive) # must be defined by subclass def step_export_onnx(self): @@ -349,26 +344,7 @@ def run(self): # self.output_dict["sim_power_time"] = int(time.time() - start_time) def step_parse_builder_output(self, build_dir): - # Used to parse selected reports/logs into the output json dict for DUTs that use a full FINN builder flow - - ### SAVE BITSTREAMS ### - if (os.path.exists(os.path.join(build_dir, "harness"))): - # TODO: integrate better (e.g. as artifact) and remove redundant copy - # TODO: make this more configurable or switch to job/artifact based power measurement - # TODO: make compatible to new instr wrapper (or however we generate these outputs) - shcopy(os.path.join(build_dir, "harness/top_wrapper.bit"), - os.path.join(self.save_dir_bitstreams, "run_%d.bit" % self.run_id)) - shcopy(os.path.join(build_dir, "harness/top.hwh"), - os.path.join(self.save_dir_bitstreams, "run_%d.hwh" % self.run_id)) - shcopy(os.path.join(build_dir, "harness/synth_report.xml"), - os.path.join(self.save_dir_bitstreams, "run_%d.xml" % self.run_id)) - clock_period_mhz = int(1.0 / self.clock_period_ns * 1000.0) - measurement_settings = {"freq_mhz": clock_period_mhz} - with open(os.path.join(self.save_dir_bitstreams, "run_%d_settings.json"%self.run_id), "w") as f: - json.dump(measurement_settings, f, indent=2) - else: - pass #TODO: warn/skip? 
- + # TODO: output as .json or even add as new build step ### CHECK FOR VERIFICATION STEP SUCCESS ### if (os.path.exists(os.path.join(build_dir, "verification_output"))): # Collect all verification output filenames @@ -381,30 +357,7 @@ def step_parse_builder_output(self, build_dir): # Construct a dictionary reporting the verification status as string self.output_dict["builder_verification"] = {"verification": {True: "success", False: "fail"}[status]} - # TODO: mark job as failed if verification fails - else: - pass #TODO: warn/skip? - - ### PARSE SYNTH RESOURCE REPORT ### - if (os.path.exists(os.path.join(build_dir, "harness/post_synth_resources.json"))): - report_path = os.path.join(build_dir, "harness/post_synth_resources.json") - # TODO: check multiple possible sources for this log (e.g. if OOC synth or Zynbuild was run) - report_filter = "(top)" - # Open the report file - with open(report_path) as file: - # Load the JSON formatted report - report = pd.read_json(file, orient="index") - # Filter the reported rows according to some regex filter rule - report = report.filter(regex=report_filter, axis="rows") - # Generate a summary of the total resources - summary = report.sum() - - #TODO: parse finn estimates, hls estimates, step times, rtlsim performance(rtlsim n=1, n=100) - #TODO: optional simulation of instr wrapper instead of running on hw - - self.output_dict["builder"] = summary.to_dict() - else: - pass #TODO: warn/skip? + # TODO: mark job as failed if verification fails? 
def steps_full_build_flow(self): # Default step sequence for benchmarking a full FINN builder flow @@ -417,7 +370,13 @@ def steps_full_build_flow(self): delete_dir_contents(tmp_buildflow_dir) self.build_inputs["build_dir"] = os.path.join(tmp_buildflow_dir, "build_output") os.makedirs(self.build_inputs["build_dir"], exist_ok=True) - self.local_artifacts_collection.append(("build_output", self.build_inputs["build_dir"])) + + # Save full build dir as local artifact + self.local_artifacts_collection.append(("build_output", self.build_inputs["build_dir"], False)) + # Save reports and deployment package as pipeline artifacts + self.artifacts_collection.append(("reports", os.path.join(self.build_inputs["build_dir"], "report"), False)) + self.artifacts_collection.append(("reports", os.path.join(self.build_inputs["build_dir"], "build_dataflow.log"), False)) + self.artifacts_collection.append(("deploy", os.path.join(self.build_inputs["build_dir"], "deploy"), True)) ### MODEL CREATION/IMPORT ### # TODO: track fixed input onnx models with DVC diff --git a/benchmarking/collect.py b/benchmarking/collect.py index fa582d399a..27a298acea 100644 --- a/benchmarking/collect.py +++ b/benchmarking/collect.py @@ -76,19 +76,45 @@ def wait_for_power_measurements(): time.sleep(60) print("Power measurement complete") +def open_json_report(id, report_name): + path = os.path.join("bench_artifacts", "runs_output", "run_%d" % (id), "reports", report_name) + if os.path.isfile(path): + with open(path, "r") as f: + report = json.load(f) + return report + else: + return None + +def log_metrics_from_report(id, live, report_name, keys, prefix=""): + report = open_json_report(id, report_name) + if report: + for key in keys: + if key in report: + live.log_metric(prefix + key, report[key], plot=False) + +def log_nested_metrics_from_report(id, live, report_name, key_top, keys, prefix=""): + report = open_json_report(id, report_name) + if report: + if key_top in report: + for key in keys: + if key in 
report[key_top]: + live.log_metric(prefix + key, report[key_top][key], plot=False) + if __name__ == "__main__": print("Consolidating synthesis results from all sub-jobs of the array") consolidate_logs(sys.argv[1], sys.argv[2]) + # TODO: remove task-level .json logs and GitLab artifacts of this job? - # TEST DVC - # TODO: proper metric collection directly from .jsons in report build dir + ### PUSH RESULTS TO DVC ### combined_log = [] with open(sys.argv[2], "r") as f: combined_log = json.load(f) for run in combined_log: + id = run["run_id"] with Live(exp_message="Job result collected by GitLab CI", cache_images=True) as live: - #TODO: add pipeline info to metadata (or as metric or other annotation?) + ### PARAMS ### + #TODO: add pipeline info and FINN configuration (e.g. tool versions) to metadata (or as metric or other annotation?) metadata = { "metadata": { "run_id": run["run_id"], @@ -98,18 +124,109 @@ def wait_for_power_measurements(): } } live.log_params(metadata) - live.log_params(run["params"]) - - # TODO: for microbenchmarks, only summarize results for target node (or surrounding SDP?) (see old step_finn_estimate) - - # OOC synth resource report (step_out_of_context_synthesis) - - # shell synth resource report (step_synthesize_bitfile) - - - if "builder" in run["output"]: - for key in run["output"]["builder"]: - live.log_metric("Resources/" + key, run["output"]["builder"][key], plot=False) + params = { + "params": run["params"] + } + live.log_params(params) + + ### METRICS ### + # TODO: for microbenchmarks, only summarize results for target node (or surrounding SDP?) (see old step_finn_estimate etc.) + # TODO: make all logs consistent at the point of generation (e.g. 
BRAM vs BRAM18 vs BRAM36) + + # estimate_layer_resources.json + log_nested_metrics_from_report(id, live, "estimate_layer_resources.json", "total", [ + "LUT", + "DSP", + "BRAM_18K", + "URAM", + ], prefix="estimate/resources/") + + # estimate_layer_resources_hls.json + log_nested_metrics_from_report(id, live, "estimate_layer_resources_hls.json", "total", [ + "LUT", + "FF", + "DSP", + "DSP48E", + "DSP58E", # TODO: aggregate/unify DSP reporting + "BRAM_18K", + "URAM", + ], prefix="hls_estimate/resources/") + + # estimate_network_performance.json + log_metrics_from_report(id, live, "estimate_network_performance.json", [ + "critical_path_cycles", + "max_cycles", + "max_cycles_node_name", + "estimated_throughput_fps", + "estimated_latency_ns", + ], prefix="estimate/performance/") + + # rtlsim_performance.json + log_metrics_from_report(id, live, "rtlsim_performance.json", [ + "N", + "TIMEOUT", + "latency_cycles", + "cycles", + "fclk[mhz]", + "throughput[images/s]", + "stable_throughput[images/s]", + # add INPUT_DONE, OUTPUT_DONE, number transactions? + ], prefix="rtlsim/performance/") + + # fifo_sizing.json + log_metrics_from_report(id, live, "fifo_sizing.json", ["total_fifo_size_kB"]) + + # ooc_synth_and_timing.json (OOC synth / step_out_of_context_synthesis) + log_metrics_from_report(id, live, "ooc_synth_and_timing.json", [ + "LUT", + "LUTRAM", + "FF", + "DSP", + "BRAM", + "BRAM_18K", + "BRAM_36K", + "URAM", + ], prefix="synth(ooc)/resources/") + log_metrics_from_report(id, live, "ooc_synth_and_timing.json", [ + "WNS", + "fmax_mhz", + # add TNS? what is "delay"? 
+ ], prefix="synth(ooc)/timing/") + + # post_synth_resources.json (shell synth / step_synthesize_bitfile) + log_nested_metrics_from_report(id, live, "post_synth_resources.json", "(top)", [ + "LUT", + "FF", + "SRL", + "DSP", + "BRAM_18K", + "BRAM_36K", + "URAM", + ], prefix="synth/resources/") + + # post synth timing report + # TODO: only exported as post_route_timing.rpt, not .json + + # verification steps + if "output" in run: + if "builder_verification" in run["output"]: + live.log_metric("verification", run["output"]["builder_verification"]["verification"], plot=False) + + # instrumentation measurement + # TODO + + # power measurement + # TODO + + # live fifosizing report + png + # TODO + + # time_per_step.json + log_metrics_from_report(id, live, "time_per_step.json", ["total_build_time"]) + + ### ARTIFACTS ### + # Build reports, as they come from GitLab artifact + live.log_artifact(os.path.join("bench_artifacts", "runs_output", "run_%d" % (id), "reports")) # TODO: disabled for now, update accordingly to new runner-based measurement setup # wait_for_power_measurements() diff --git a/src/finn/builder/build_dataflow_steps.py b/src/finn/builder/build_dataflow_steps.py index 68631346b9..c925a1ac05 100644 --- a/src/finn/builder/build_dataflow_steps.py +++ b/src/finn/builder/build_dataflow_steps.py @@ -530,6 +530,7 @@ def step_hw_ipgen(model: ModelWrapper, cfg: DataflowBuildConfig): report_dir = cfg.output_dir + "/report" os.makedirs(report_dir, exist_ok=True) estimate_layer_resources_hls = model.analysis(hls_synth_res_estimation) + estimate_layer_resources_hls["total"] = aggregate_dict_keys(estimate_layer_resources_hls) with open(report_dir + "/estimate_layer_resources_hls.json", "w") as f: json.dump(estimate_layer_resources_hls, f, indent=2) @@ -651,7 +652,7 @@ def step_set_fifo_depths(model: ModelWrapper, cfg: DataflowBuildConfig): "depth_trigger_uram", "depth_trigger_bram", ] - extract_model_config_to_json(model, cfg.output_dir + "/final_hw_config.json", hw_attrs) 
+ extract_model_config_to_json(model, cfg.output_dir + "/report/final_hw_config.json", hw_attrs) # perform FIFO splitting and shallow FIFO removal only after the final config # json file has been written. otherwise, since these transforms may add/remove From bd3b5de3bd2cad891999ec2532d4caad7a5b98eb Mon Sep 17 00:00:00 2001 From: Felix Jentzsch Date: Fri, 28 Feb 2025 18:01:22 +0000 Subject: [PATCH 055/125] Add basic measurement job --- .gitlab-ci.yml | 7 +++-- benchmarking/bench-ci.yml | 26 +++++++++++++++--- benchmarking/bench_base.py | 5 ++-- benchmarking/measure.py | 55 ++++++++++++++++++++++++++++++++++++++ 4 files changed, 84 insertions(+), 9 deletions(-) create mode 100644 benchmarking/measure.py diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index a82ad24eeb..31e963729b 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -18,6 +18,9 @@ variables: CPU_CORES: description: "Select number of CPU cores and test workers" value: "64" + CPU_CORES_BENCH: + description: "Select number of CPU cores for benchmark runs" + value: "32" PARALLEL_JOBS: description: "Number of parallel Slurm array jobs per Benchmark job" value: "2" @@ -26,7 +29,7 @@ variables: value: "2-0" # [days-hours] SLURM_PARTITION: description: "Slurm partition (e.g., normal, largemem, fpga, gpu)" - value: "largemem" + value: "normal" SLURM_QOS: description: "Optional QoS option (include --qos, e.g., --qos express)" value: "" @@ -154,7 +157,7 @@ FINN Test Suite 2022.2: paths: - deps variables: - SCHEDULER_PARAMETERS: "-A $PROJECT_ACCOUNT -p $SLURM_PARTITION -t $SLURM_TIMEOUT $SLURM_QOS --nodes 1 --ntasks 1 --cpus-per-task $CPU_CORES --exclusive" + SCHEDULER_PARAMETERS: "-A $PROJECT_ACCOUNT -p largemem -t $SLURM_TIMEOUT $SLURM_QOS --nodes 1 --ntasks 1 --cpus-per-task $CPU_CORES --exclusive" PYTEST_PARALLEL: "$CPU_CORES" FINN_XILINX_VERSION: "2022.2" before_script: diff --git a/benchmarking/bench-ci.yml b/benchmarking/bench-ci.yml index c6d2c6bc91..5c2771465a 100644 --- a/benchmarking/bench-ci.yml +++ 
b/benchmarking/bench-ci.yml @@ -20,9 +20,9 @@ FINN Build: - job: Fetch Repos pipeline: $PARENT_PIPELINE_ID variables: - SCHEDULER_PARAMETERS: "-A $PROJECT_ACCOUNT -p $SLURM_PARTITION -t $SLURM_TIMEOUT $SLURM_QOS --nodes 1 --ntasks 1 --cpus-per-task $CPU_CORES --mem 128G --array 0-$( expr $PARALLEL_JOBS - 1 )" - NUM_DEFAULT_WORKERS: "$CPU_CORES" - PYTEST_PARALLEL: "$CPU_CORES" + SCHEDULER_PARAMETERS: "-A $PROJECT_ACCOUNT -p $SLURM_PARTITION -t $SLURM_TIMEOUT $SLURM_QOS --nodes 1 --ntasks 1 --cpus-per-task $CPU_CORES_BENCH --exclusive --array 0-$( expr $PARALLEL_JOBS - 1 )" + NUM_DEFAULT_WORKERS: "$CPU_CORES_BENCH" + PYTEST_PARALLEL: "$CPU_CORES_BENCH" before_script: - cp -dfR .. $PATH_WORKDIR # Copy to working directory (e.g. RAMdisk) - cd $PATH_WORKDIR/finn-plus @@ -42,6 +42,24 @@ FINN Build: paths: - bench_artifacts/ +Measurement: + id_tokens: + CI_JOB_JWT: + aud: https://git.uni-paderborn.de + stage: measure + tags: + - board + rules: + # Also run on failure of previous tasks to measure partial results + - when: always + script: + - python benchmarking/measure.py + artifacts: + name: "bench_artifacts" + when: always + paths: + - bench_artifacts/ + Result Collection: id_tokens: CI_JOB_JWT: @@ -54,7 +72,7 @@ Result Collection: - when: always script: - python3.10 benchmarking/collect.py bench_artifacts/tasks_output bench_results.json - - dvc exp push git@github.com:eki-project/finn-plus.git + - dvc exp push -r push git@github.com:eki-project/finn-plus.git artifacts: name: "bench_results" when: always diff --git a/benchmarking/bench_base.py b/benchmarking/bench_base.py index eef9edd721..b39a8b0dde 100644 --- a/benchmarking/bench_base.py +++ b/benchmarking/bench_base.py @@ -369,7 +369,7 @@ def steps_full_build_flow(self): os.makedirs(tmp_buildflow_dir, exist_ok=True) delete_dir_contents(tmp_buildflow_dir) self.build_inputs["build_dir"] = os.path.join(tmp_buildflow_dir, "build_output") - os.makedirs(self.build_inputs["build_dir"], exist_ok=True) + 
os.makedirs(os.path.join(self.build_inputs["build_dir"], "report"), exist_ok=True) # Save full build dir as local artifact self.local_artifacts_collection.append(("build_output", self.build_inputs["build_dir"], False)) @@ -390,11 +390,10 @@ def steps_full_build_flow(self): self.build_inputs["onnx_path"] = self.params["model_path"] else: # input ONNX model (+ optional I/O pair for verification) will be generated - self.build_inputs["onnx_path"] = os.path.join(tmp_buildflow_dir, "model_export.onnx") + self.build_inputs["onnx_path"] = os.path.join(self.build_inputs["build_dir"], "model_export.onnx") if self.step_export_onnx(self.build_inputs["onnx_path"]) == "skipped": # microbenchmarks might skip because no valid model can be generated for given params return - self.save_local_artifact("model_step_export", self.build_inputs["onnx_path"]) if "folding_path" in self.params: self.build_inputs["folding_path"] = self.params["folding_path"] diff --git a/benchmarking/measure.py b/benchmarking/measure.py new file mode 100644 index 0000000000..6744eacedb --- /dev/null +++ b/benchmarking/measure.py @@ -0,0 +1,55 @@ +import os +import subprocess +import shutil + + +def delete_dir_contents(dir): + for filename in os.listdir(dir): + file_path = os.path.join(dir, filename) + try: + if os.path.isfile(file_path) or os.path.islink(file_path): + os.unlink(file_path) + elif os.path.isdir(file_path): + shutil.rmtree(file_path) + except Exception as e: + print('Failed to delete %s. 
Reason: %s' % (file_path, e)) + +if __name__ == "__main__": + print("Looking for deployment packages in artifacts..") + # Find deployment packages from artifacts + artifacts_dir = os.path.join("bench_artifacts", "runs_output") + for run in os.listdir(artifacts_dir): + run_dir = os.path.join(artifacts_dir, run) + reports_dir = os.path.join(run_dir, "reports") + deploy_archive = os.path.join(run_dir, "deploy.zip") + extract_dir = "measurement" + if os.path.isfile(deploy_archive): + print("Found deployment package in %s, extracting.." % run_dir) + + # Extract to temporary dir + shutil.unpack_archive(deploy_archive, extract_dir) + + # Run driver + print("Running driver..") + subprocess.run([f"python {extract_dir}/driver/driver.py", + f"--bitfile {extract_dir}/bitfile/finn-accel.bit", + f"--settingsfile {extract_dir}/driver/settings.json", + f"--reportfile {extract_dir}/measured_performance.json", + ]) + print("Driver finished.") + + # Copy results back to artifact directory + for report in ["measured_performance.json", + "fifo_sizing_report.json", + "fifo_depth_export.json", + "fifo_sizing_graph.png", + ]: + report_path = os.path.join(extract_dir, report) + if os.path.isfile(report_path): + print("Copying %s to %s" % (report_path, reports_dir)) + shutil.copy(report_path, reports_dir) + + print("Clearing temporary directory..") + # Clear temporary dir + delete_dir_contents(extract_dir) + print("Done.") From 8fa1483a9700ac90291e756106104ad7e1022664 Mon Sep 17 00:00:00 2001 From: Felix Jentzsch Date: Fri, 28 Feb 2025 21:26:04 +0000 Subject: [PATCH 056/125] Fixes to measurement and logging --- benchmarking/bench.py | 4 ++-- benchmarking/bench_base.py | 4 ++-- benchmarking/cfg/mvau_test.json | 6 ------ benchmarking/collect.py | 36 +++++++++++++++++++++++++-------- benchmarking/measure.py | 8 ++++---- 5 files changed, 36 insertions(+), 22 deletions(-) diff --git a/benchmarking/bench.py b/benchmarking/bench.py index fb890332b9..e7f38d0e29 100644 --- a/benchmarking/bench.py 
+++ b/benchmarking/bench.py @@ -145,8 +145,8 @@ def get_default_session_options_new(): start_time = time.time() try: - bench_object.run() - if not bench_object.output_dict: + result = bench_object.run() + if result == "skipped": log_dict["status"] = "skipped" print("Run skipped") else: diff --git a/benchmarking/bench_base.py b/benchmarking/bench_base.py index b39a8b0dde..7634ead091 100644 --- a/benchmarking/bench_base.py +++ b/benchmarking/bench_base.py @@ -206,7 +206,7 @@ def step_build_setup(self): # defaults to normal build flow, may be overwritten by subclass def run(self): - self.steps_full_build_flow() + return self.steps_full_build_flow() # def step_finn_estimate(self): # # Gather FINN estimates @@ -393,7 +393,7 @@ def steps_full_build_flow(self): self.build_inputs["onnx_path"] = os.path.join(self.build_inputs["build_dir"], "model_export.onnx") if self.step_export_onnx(self.build_inputs["onnx_path"]) == "skipped": # microbenchmarks might skip because no valid model can be generated for given params - return + return "skipped" if "folding_path" in self.params: self.build_inputs["folding_path"] = self.params["folding_path"] diff --git a/benchmarking/cfg/mvau_test.json b/benchmarking/cfg/mvau_test.json index 07fd52cc2f..c42b16782c 100644 --- a/benchmarking/cfg/mvau_test.json +++ b/benchmarking/cfg/mvau_test.json @@ -19,12 +19,6 @@ "ram_style": ["distributed"], "ram_style_thr": ["distributed"], - "do_hls": [true], - "do_rtlsim": [true], - "do_synthesis": [true], - "do_sim_power": [true], - "do_synth_power": [true], - "dut_duplication": [1], "output_products": [["estimate_reports", "stitched_ip", "rtlsim_performance", "out_of_context_synth", "bitfile", "pynq_driver", "deployment_package"]] diff --git a/benchmarking/collect.py b/benchmarking/collect.py index 27a298acea..fbc0118d79 100644 --- a/benchmarking/collect.py +++ b/benchmarking/collect.py @@ -85,6 +85,12 @@ def open_json_report(id, report_name): else: return None +def log_all_metrics_from_report(id, live, 
report_name, prefix=""): + report = open_json_report(id, report_name) + if report: + for key in report: + live.log_metric(prefix + key, report[key], plot=False) + def log_metrics_from_report(id, live, report_name, keys, prefix=""): report = open_json_report(id, report_name) if report: @@ -112,7 +118,10 @@ def log_nested_metrics_from_report(id, live, report_name, key_top, keys, prefix= for run in combined_log: id = run["run_id"] - with Live(exp_message="Job result collected by GitLab CI", cache_images=True) as live: + experiment_name = "CI_" + os.environ.get("CI_PIPELINE_ID") + "_" + id + experiment_msg = "[CI] " + os.environ.get("CI_PIPELINE_NAME") + #TODO: cache images once we switch to a cache provider that works with DVC Studio + with Live(exp_name = experiment_name, exp_message=experiment_msg, cache_images=False) as live: ### PARAMS ### #TODO: add pipeline info and FINN configuration (e.g. tool versions) to metadata (or as metric or other annotation?) metadata = { @@ -124,11 +133,15 @@ def log_nested_metrics_from_report(id, live, report_name, key_top, keys, prefix= } } live.log_params(metadata) - params = { - "params": run["params"] - } + params = {"params": run["params"]} live.log_params(params) + # dut_info.json (additional information about DUT generated during model generation) + dut_info_report = open_json_report(id, "dut_info.json") + if dut_info_report: + dut_info = {"dut_info": dut_info_report} + live.log_params(dut_info) + ### METRICS ### # TODO: for microbenchmarks, only summarize results for target node (or surrounding SDP?) (see old step_finn_estimate etc.) # TODO: make all logs consistent at the point of generation (e.g. 
BRAM vs BRAM18 vs BRAM36) @@ -174,7 +187,7 @@ def log_nested_metrics_from_report(id, live, report_name, key_top, keys, prefix= ], prefix="rtlsim/performance/") # fifo_sizing.json - log_metrics_from_report(id, live, "fifo_sizing.json", ["total_fifo_size_kB"]) + log_metrics_from_report(id, live, "fifo_sizing.json", ["total_fifo_size_kB"], prefix="fifosizing/") # ooc_synth_and_timing.json (OOC synth / step_out_of_context_synthesis) log_metrics_from_report(id, live, "ooc_synth_and_timing.json", [ @@ -213,13 +226,20 @@ def log_nested_metrics_from_report(id, live, report_name, key_top, keys, prefix= live.log_metric("verification", run["output"]["builder_verification"]["verification"], plot=False) # instrumentation measurement - # TODO + log_all_metrics_from_report(id, live, "measured_performance.json", prefix="measurement/performance/") # power measurement # TODO - # live fifosizing report + png - # TODO + # live fifosizing report + graph png + log_metrics_from_report(id, live, "fifo_sizing_report.json", [ + "error", + "fifo_size_total_kB", + ], prefix="fifosizing/live/") + + image = os.path.join("bench_artifacts", "runs_output", "run_%d" % (id), "reports", "fifo_sizing_graph.png") + if os.path.isfile(image): + live.log_image("fifosizing_pass_1", image) # time_per_step.json log_metrics_from_report(id, live, "time_per_step.json", ["total_build_time"]) diff --git a/benchmarking/measure.py b/benchmarking/measure.py index 6744eacedb..e0a5da0bfc 100644 --- a/benchmarking/measure.py +++ b/benchmarking/measure.py @@ -31,10 +31,10 @@ def delete_dir_contents(dir): # Run driver print("Running driver..") - subprocess.run([f"python {extract_dir}/driver/driver.py", - f"--bitfile {extract_dir}/bitfile/finn-accel.bit", - f"--settingsfile {extract_dir}/driver/settings.json", - f"--reportfile {extract_dir}/measured_performance.json", + subprocess.run(["python", f"{extract_dir}/driver/driver.py", + "--bitfile", f"{extract_dir}/bitfile/finn-accel.bit", + "--settingsfile", 
f"{extract_dir}/driver/settings.json", + "--reportfile", f"{extract_dir}/measured_performance.json", ]) print("Driver finished.") From 2a7c9c4ffedbdddbfd0cf9e9288cdeb0b31972ac Mon Sep 17 00:00:00 2001 From: Felix Jentzsch Date: Fri, 28 Feb 2025 22:34:19 +0000 Subject: [PATCH 057/125] Minor fixes --- .gitlab-ci.yml | 2 +- benchmarking/collect.py | 2 +- benchmarking/measure.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 31e963729b..decf20fe6c 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -10,7 +10,7 @@ variables: value: "" TEST_SUITE: description: "Select test suite to run" - value: "full" + value: "none" # DEBUG options: - "none" - "quicktest" diff --git a/benchmarking/collect.py b/benchmarking/collect.py index fbc0118d79..7abbd865d2 100644 --- a/benchmarking/collect.py +++ b/benchmarking/collect.py @@ -118,7 +118,7 @@ def log_nested_metrics_from_report(id, live, report_name, key_top, keys, prefix= for run in combined_log: id = run["run_id"] - experiment_name = "CI_" + os.environ.get("CI_PIPELINE_ID") + "_" + id + experiment_name = "CI_" + os.environ.get("CI_PIPELINE_ID") + "_" + str(id) experiment_msg = "[CI] " + os.environ.get("CI_PIPELINE_NAME") #TODO: cache images once we switch to a cache provider that works with DVC Studio with Live(exp_name = experiment_name, exp_message=experiment_msg, cache_images=False) as live: diff --git a/benchmarking/measure.py b/benchmarking/measure.py index e0a5da0bfc..543b48fff9 100644 --- a/benchmarking/measure.py +++ b/benchmarking/measure.py @@ -31,7 +31,7 @@ def delete_dir_contents(dir): # Run driver print("Running driver..") - subprocess.run(["python", f"{extract_dir}/driver/driver.py", + subprocess.run(["sudo", "python", f"{extract_dir}/driver/driver.py", "--bitfile", f"{extract_dir}/bitfile/finn-accel.bit", "--settingsfile", f"{extract_dir}/driver/settings.json", "--reportfile", f"{extract_dir}/measured_performance.json", From 
aa9f4e40acd2307c9c35843e0e136207b3067522 Mon Sep 17 00:00:00 2001 From: Felix Jentzsch Date: Sat, 1 Mar 2025 11:54:10 +0000 Subject: [PATCH 058/125] Fix pynq measurement issues --- benchmarking/bench-ci.yml | 3 ++- benchmarking/measure.py | 2 +- src/finn/qnn-data/templates/driver/driver_fifosizing.py | 2 -- src/finn/qnn-data/templates/driver/driver_instrumentation.py | 4 ---- 4 files changed, 3 insertions(+), 8 deletions(-) diff --git a/benchmarking/bench-ci.yml b/benchmarking/bench-ci.yml index 5c2771465a..695aa8a1a3 100644 --- a/benchmarking/bench-ci.yml +++ b/benchmarking/bench-ci.yml @@ -53,7 +53,8 @@ Measurement: # Also run on failure of previous tasks to measure partial results - when: always script: - - python benchmarking/measure.py + # Run as root and activate the PYNQ venv manually to use PYNQ outside of the typical Jupyter environment + - sudo bash -c "source /etc/profile.d/pynq_venv.sh && export XILINX_XRT=/usr && python benchmarking/measure.py" artifacts: name: "bench_artifacts" when: always diff --git a/benchmarking/measure.py b/benchmarking/measure.py index 543b48fff9..e0a5da0bfc 100644 --- a/benchmarking/measure.py +++ b/benchmarking/measure.py @@ -31,7 +31,7 @@ def delete_dir_contents(dir): # Run driver print("Running driver..") - subprocess.run(["sudo", "python", f"{extract_dir}/driver/driver.py", + subprocess.run(["python", f"{extract_dir}/driver/driver.py", "--bitfile", f"{extract_dir}/bitfile/finn-accel.bit", "--settingsfile", f"{extract_dir}/driver/settings.json", "--reportfile", f"{extract_dir}/measured_performance.json", diff --git a/src/finn/qnn-data/templates/driver/driver_fifosizing.py b/src/finn/qnn-data/templates/driver/driver_fifosizing.py index 560959991f..be1f20156a 100644 --- a/src/finn/qnn-data/templates/driver/driver_fifosizing.py +++ b/src/finn/qnn-data/templates/driver/driver_fifosizing.py @@ -2,9 +2,7 @@ import json import os import argparse -import matplotlib as mpl import matplotlib.pyplot as plt -import numpy as np from 
pynq.pl_server.device import Device from driver_instrumentation import FINNInstrumentationOverlay diff --git a/src/finn/qnn-data/templates/driver/driver_instrumentation.py b/src/finn/qnn-data/templates/driver/driver_instrumentation.py index fea9446bf5..5db2217d45 100644 --- a/src/finn/qnn-data/templates/driver/driver_instrumentation.py +++ b/src/finn/qnn-data/templates/driver/driver_instrumentation.py @@ -1,10 +1,6 @@ import time import json import argparse -import matplotlib as mpl -import matplotlib.pyplot as plt -from IPython.display import clear_output -import numpy as np from pynq import Overlay from pynq.ps import Clocks from pynq.pl_server.device import Device From f7ad385bce0c206af2fab3103e57ecb8a86d2aa3 Mon Sep 17 00:00:00 2001 From: Felix Jentzsch Date: Tue, 4 Mar 2025 18:35:20 +0000 Subject: [PATCH 059/125] Minor infrastructure improvements --- benchmarking/bench-ci.yml | 7 +- benchmarking/bench.py | 20 ++--- benchmarking/bench_base.py | 40 ++++----- benchmarking/collect.py | 103 ++++++++++++++---------- benchmarking/dut/synthetic_nonlinear.py | 3 + src/finn/builder/build_dataflow.py | 12 +++ 6 files changed, 106 insertions(+), 79 deletions(-) diff --git a/benchmarking/bench-ci.yml b/benchmarking/bench-ci.yml index 695aa8a1a3..1c03ecbd02 100644 --- a/benchmarking/bench-ci.yml +++ b/benchmarking/bench-ci.yml @@ -72,10 +72,5 @@ Result Collection: # Also run on failure of previous tasks to collect partial results - when: always script: - - python3.10 benchmarking/collect.py bench_artifacts/tasks_output bench_results.json + - python3.10 benchmarking/collect.py - dvc exp push -r push git@github.com:eki-project/finn-plus.git - artifacts: - name: "bench_results" - when: always - paths: - - bench_results.json diff --git a/benchmarking/bench.py b/benchmarking/bench.py index e7f38d0e29..2dbcdbe87f 100644 --- a/benchmarking/bench.py +++ b/benchmarking/bench.py @@ -56,8 +56,6 @@ def get_default_session_options_new(): artifacts_dir = os.path.join(experiment_dir, 
"bench_artifacts") print("Collecting results in path: %s" % artifacts_dir) - os.makedirs(os.path.join(artifacts_dir, "tasks_output"), exist_ok=True) - log_path = os.path.join(artifacts_dir, "tasks_output", "task_%d.json" % (task_id)) # local save dir for large artifacts (e.g., build output, tmp dir dump for debugging) if job_id == 0: @@ -71,13 +69,13 @@ def get_default_session_options_new(): # Gather benchmarking configs if config_name == "manual": - configs_path, config_select = os.path.split(os.environ.get("MANUAL_CFG_PATH")) + config_path = os.path.join(os.environ.get("LOCAL_CFG_DIR"), os.environ.get("MANUAL_CFG_PATH")) else: configs_path = os.path.join(os.path.dirname(__file__), "cfg") config_select = config_name + ".json" + config_path = os.path.join(configs_path, config_select) # Load config - config_path = os.path.join(configs_path, config_select) print("Loading config %s" % (config_path)) if os.path.exists(config_path): with open(config_path, "r") as f: @@ -118,9 +116,7 @@ def get_default_session_options_new(): # Run benchmark # TODO: integrate this loop (especially status logging) into the bench class - # TODO: log additional info as artifact or directly into info section of json (e.g. 
dut, versions, date) - # TODO: log stdout of individual tasks of the job array into seperate files as artifacts (GitLab web interface is not readable) - log = [] + # TODO: log stdout of individual tasks of the job array into seperate files as artifacts (GitLab web interface is not readable), coordinate with new logging for run, run_id in enumerate(selected_runs): print( "Starting run %d/%d (id %d of %d total runs)" @@ -143,7 +139,6 @@ def get_default_session_options_new(): print("ERROR: no DUT specified") return 1 - start_time = time.time() try: result = bench_object.run() if result == "skipped": @@ -158,13 +153,12 @@ def get_default_session_options_new(): exit_code = 1 # TODO: exception catch all in builder prevents internal failures from being caught here - log_dict["total_time"] = int(time.time() - start_time) log_dict["output"] = bench_object.output_dict - log.append(log_dict) - # TODO: save this meta data into run-level reports dir insted of task*.json - # overwrite output log file every time to allow early abort + + # log metadata of this run to its own report directory + log_path = os.path.join(bench_object.report_dir, "metadata_bench.json") with open(log_path, "w") as f: - json.dump(log, f, indent=2) + json.dump(log_dict, f, indent=2) # save GitLab artifacts of this run (e.g., reports and deployment package) bench_object.save_artifacts_collection() diff --git a/benchmarking/bench_base.py b/benchmarking/bench_base.py index 7634ead091..6a4bd63c51 100644 --- a/benchmarking/bench_base.py +++ b/benchmarking/bench_base.py @@ -42,7 +42,7 @@ from finn.builder.build_dataflow_config import DataflowBuildConfig import pandas as pd import onnxruntime as ort - +#TODO: merge this file into bench.py once most functionality has been moved to builder def start_test_batch_fast(results_path, project_path, run_target, pairs): # Prepare tcl script @@ -170,7 +170,25 @@ def __init__(self, params, task_id, run_id, artifacts_dir, save_dir, debug=True) # Save entire FINN build dir 
and working dir # TODO: add option to only save upon exception (in FINN builder or benchmarking infrastructure) self.local_artifacts_collection.append(("debug_finn_tmp", os.environ["FINN_BUILD_DIR"], False)) - self.local_artifacts_collection.append(("debug_finn_cwd", os.environ["FINN_ROOT"], False)) + #self.local_artifacts_collection.append(("debug_finn_cwd", os.environ["FINN_ROOT"], False)) + + ### SETUP ### + # Use a temporary dir for buildflow-related files (next to FINN_BUILD_DIR) + # Ensure it exists but is empty (clear potential artifacts from previous runs) + tmp_buildflow_dir = os.path.join(os.environ["PATH_WORKDIR"], "buildflow") + os.makedirs(tmp_buildflow_dir, exist_ok=True) + delete_dir_contents(tmp_buildflow_dir) + self.build_inputs["build_dir"] = os.path.join(tmp_buildflow_dir, "build_output") # TODO remove in favor of self.build_dir + self.build_dir = os.path.join(tmp_buildflow_dir, "build_output") + self.report_dir = os.path.join(self.build_dir, "report") + os.makedirs(self.report_dir, exist_ok=True) + + # Save full build dir as local artifact + self.local_artifacts_collection.append(("build_output", self.build_dir, False)) + # Save reports and deployment package as pipeline artifacts + self.artifacts_collection.append(("reports", self.report_dir, False)) + self.artifacts_collection.append(("reports", os.path.join(self.build_dir, "build_dataflow.log"), False)) + self.artifacts_collection.append(("deploy", os.path.join(self.build_dir, "deploy"), True)) def save_artifact(self, target_path, source_path, archive=False): if os.path.isdir(source_path): @@ -362,22 +380,6 @@ def step_parse_builder_output(self, build_dir): def steps_full_build_flow(self): # Default step sequence for benchmarking a full FINN builder flow - ### SETUP ### - # Use a temporary dir for buildflow-related files (next to FINN_BUILD_DIR) - # Ensure it exists but is empty (clear potential artifacts from previous runs) - tmp_buildflow_dir = os.path.join(os.environ["PATH_WORKDIR"], 
"buildflow") - os.makedirs(tmp_buildflow_dir, exist_ok=True) - delete_dir_contents(tmp_buildflow_dir) - self.build_inputs["build_dir"] = os.path.join(tmp_buildflow_dir, "build_output") - os.makedirs(os.path.join(self.build_inputs["build_dir"], "report"), exist_ok=True) - - # Save full build dir as local artifact - self.local_artifacts_collection.append(("build_output", self.build_inputs["build_dir"], False)) - # Save reports and deployment package as pipeline artifacts - self.artifacts_collection.append(("reports", os.path.join(self.build_inputs["build_dir"], "report"), False)) - self.artifacts_collection.append(("reports", os.path.join(self.build_inputs["build_dir"], "build_dataflow.log"), False)) - self.artifacts_collection.append(("deploy", os.path.join(self.build_inputs["build_dir"], "deploy"), True)) - ### MODEL CREATION/IMPORT ### # TODO: track fixed input onnx models with DVC if "model_dir" in self.params: @@ -403,6 +405,8 @@ def steps_full_build_flow(self): self.build_inputs["floorplan_path"] = self.params["floorplan_path"] ### BUILD SETUP ### + # TODO: convert to YAML-based builder config + # TODO: split up into default config, dut-specific config, and run-specific config cfg = self.step_build_setup() cfg.generate_outputs = self.params["output_products"] cfg.output_dir = self.build_inputs["build_dir"] diff --git a/benchmarking/collect.py b/benchmarking/collect.py index 7abbd865d2..7b568563fa 100644 --- a/benchmarking/collect.py +++ b/benchmarking/collect.py @@ -3,8 +3,11 @@ import os import sys import time +import shutil from dvclive import Live +from util import delete_dir_contents + def merge_dicts(a: dict, b: dict): for key in b: if key in a: @@ -76,6 +79,10 @@ def wait_for_power_measurements(): time.sleep(60) print("Power measurement complete") +def log_dvc_metric(live, prefix, name, value): + # sanitize '/' in name because DVC uses it to nest metrics (which we do via prefix) + live.log_metric(prefix + name.replace("/", "-"), value, plot=False) + def 
open_json_report(id, report_name): path = os.path.join("bench_artifacts", "runs_output", "run_%d" % (id), "reports", report_name) if os.path.isfile(path): @@ -89,14 +96,14 @@ def log_all_metrics_from_report(id, live, report_name, prefix=""): report = open_json_report(id, report_name) if report: for key in report: - live.log_metric(prefix + key, report[key], plot=False) + log_dvc_metric(live, prefix, key, report[key]) def log_metrics_from_report(id, live, report_name, keys, prefix=""): report = open_json_report(id, report_name) if report: for key in keys: if key in report: - live.log_metric(prefix + key, report[key], plot=False) + log_dvc_metric(live, prefix, key, report[key]) def log_nested_metrics_from_report(id, live, report_name, key_top, keys, prefix=""): report = open_json_report(id, report_name) @@ -104,39 +111,43 @@ def log_nested_metrics_from_report(id, live, report_name, key_top, keys, prefix= if key_top in report: for key in keys: if key in report[key_top]: - live.log_metric(prefix + key, report[key_top][key], plot=False) + log_dvc_metric(live, prefix, key, report[key_top][key]) if __name__ == "__main__": - print("Consolidating synthesis results from all sub-jobs of the array") - consolidate_logs(sys.argv[1], sys.argv[2]) - # TODO: remove task-level .json logs and GitLab artifacts of this job? 
- - ### PUSH RESULTS TO DVC ### - combined_log = [] - with open(sys.argv[2], "r") as f: - combined_log = json.load(f) - - for run in combined_log: - id = run["run_id"] + # Go through all runs found in the artifacts and log their results to DVC + run_dir_list = os.listdir(os.path.join("bench_artifacts", "runs_output")) + print("Looking for runs in %s" % run_dir_list) + run_ids = [] + for run_dir in run_dir_list: + if run_dir.startswith("run_"): + run_id = int(run_dir[4:]) + run_ids.append(run_id) + run_ids.sort() + print("Found %d runs" % len(run_ids)) + + for id in run_ids: + print("Processing run %d" % id) experiment_name = "CI_" + os.environ.get("CI_PIPELINE_ID") + "_" + str(id) experiment_msg = "[CI] " + os.environ.get("CI_PIPELINE_NAME") #TODO: cache images once we switch to a cache provider that works with DVC Studio with Live(exp_name = experiment_name, exp_message=experiment_msg, cache_images=False) as live: ### PARAMS ### - #TODO: add pipeline info and FINN configuration (e.g. tool versions) to metadata (or as metric or other annotation?) 
- metadata = { - "metadata": { - "run_id": run["run_id"], - "task_id": run["task_id"], - "status": run["status"], - "total_time": run["total_time"], - } - } - live.log_params(metadata) - params = {"params": run["params"]} + # input parameters logged by benchmarking infrastructure + metadata_bench = open_json_report(id, "metadata_bench.json") + params = {"params": metadata_bench["params"]} live.log_params(params) - # dut_info.json (additional information about DUT generated during model generation) + # optional metadata logged by builder + metadata_builder = open_json_report(id, "metadata_builder.json") + if metadata_builder: + metadata = { + "metadata": { + "tool_version": metadata_builder["tool_version"], + } + } + live.log_params(metadata) + + # optional dut_info.json (additional information about DUT generated during model generation) dut_info_report = open_json_report(id, "dut_info.json") if dut_info_report: dut_info = {"dut_info": dut_info_report} @@ -146,6 +157,21 @@ def log_nested_metrics_from_report(id, live, report_name, key_top, keys, prefix= # TODO: for microbenchmarks, only summarize results for target node (or surrounding SDP?) (see old step_finn_estimate etc.) # TODO: make all logs consistent at the point of generation (e.g. 
BRAM vs BRAM18 vs BRAM36) + # status + status = metadata_bench["status"] + if status == "ok": + # mark as failed if either bench or builder indicates failure + if metadata_builder: + status_builder = metadata_builder["status"] + if status_builder == "failed": + status = "failed" + log_dvc_metric(live, "", "status", status) + + # verification steps + if "output" in metadata_bench: + if "builder_verification" in metadata_bench["output"]: + log_dvc_metric(live, "", "verification", metadata_bench["output"]["builder_verification"]["verification"]) + # estimate_layer_resources.json log_nested_metrics_from_report(id, live, "estimate_layer_resources.json", "total", [ "LUT", @@ -220,11 +246,6 @@ def log_nested_metrics_from_report(id, live, report_name, key_top, keys, prefix= # post synth timing report # TODO: only exported as post_route_timing.rpt, not .json - # verification steps - if "output" in run: - if "builder_verification" in run["output"]: - live.log_metric("verification", run["output"]["builder_verification"]["verification"], plot=False) - # instrumentation measurement log_all_metrics_from_report(id, live, "measured_performance.json", prefix="measurement/performance/") @@ -245,15 +266,13 @@ def log_nested_metrics_from_report(id, live, report_name, key_top, keys, prefix= log_metrics_from_report(id, live, "time_per_step.json", ["total_build_time"]) ### ARTIFACTS ### - # Build reports, as they come from GitLab artifact - live.log_artifact(os.path.join("bench_artifacts", "runs_output", "run_%d" % (id), "reports")) - - # TODO: disabled for now, update accordingly to new runner-based measurement setup - # wait_for_power_measurements() - # power_log_path = os.path.join("/mnt/pfs/hpc-prf-radioml/felix/jobs/", - # "CI_" + os.environ.get("CI_PIPELINE_IID") + "_" + os.environ.get("CI_PIPELINE_NAME"), - # "power_measure.json") - # if os.path.isfile(power_log_path): - # print("Merging power measurement logs with remaining logs") - # merge_logs(sys.argv[2], power_log_path, 
sys.argv[2]) + # Log build reports as they come from GitLab artifacts, + # but copy them to a central dir first so all runs share the same path + run_report_dir = os.path.join("bench_artifacts", "runs_output", "run_%d" % (id), "reports") + dvc_report_dir = "reports" + os.makedirs(dvc_report_dir, exist_ok=True) + delete_dir_contents(dvc_report_dir) + shutil.copytree(run_report_dir, dvc_report_dir, dirs_exist_ok=True) + live.log_artifact(dvc_report_dir) + print("Done") diff --git a/benchmarking/dut/synthetic_nonlinear.py b/benchmarking/dut/synthetic_nonlinear.py index 4eb59ef7b2..759f31838b 100644 --- a/benchmarking/dut/synthetic_nonlinear.py +++ b/benchmarking/dut/synthetic_nonlinear.py @@ -190,6 +190,9 @@ def combine_blocks(lb, rb, ifm_dim, ch, pe): dup_config["PE"] = pe dup_config["NumOutputStreams"] = 2 dup_config["inputDataType"] = lb.get_tensor_datatype(lb_input.name).name + # We always need to set outFIFODepths explictly for DuplicateStreams + # because it has no default value that corresponds automatically to NumOutputStreams + dup_config["outFIFODepths"] = [2] * 2 add_config = {} add_config["domain"] = "finn.custom_op.fpgadataflow.hls" diff --git a/src/finn/builder/build_dataflow.py b/src/finn/builder/build_dataflow.py index baada9d1d2..8602fffa09 100644 --- a/src/finn/builder/build_dataflow.py +++ b/src/finn/builder/build_dataflow.py @@ -181,11 +181,23 @@ def build_dataflow_cfg(model_filename, cfg: DataflowBuildConfig): else: print("enable_build_pdb_debug not set in build config, exiting...") print("Build failed") + metadata = { + "status": "failed", + "tool_version": os.path.basename(os.environ.get("VIVADO_PATH")), + } + with open(cfg.output_dir + "/report/metadata_builder.json", "w") as f: + json.dump(metadata, f, indent=2) return -1 time_per_step["total_build_time"] = sum(time_per_step.values()) with open(cfg.output_dir + "/report/time_per_step.json", "w") as f: json.dump(time_per_step, f, indent=2) + metadata = { + "status": "ok", + "tool_version": 
os.path.basename(os.environ.get("VIVADO_PATH")), + } + with open(cfg.output_dir + "/report/metadata_builder.json", "w") as f: + json.dump(metadata, f, indent=2) print("Completed successfully") return 0 From c73b9c14c04f1a41090f21d4f85991864dbc925a Mon Sep 17 00:00:00 2001 From: Felix Jentzsch Date: Thu, 6 Mar 2025 09:45:49 +0000 Subject: [PATCH 060/125] Separate build & measure artifacts, fixes --- .gitlab-ci.yml | 8 --- benchmarking/bench-ci.yml | 12 ++-- benchmarking/bench.py | 8 ++- benchmarking/collect.py | 97 ++++++--------------------------- benchmarking/dut/transformer.py | 28 +--------- benchmarking/measure.py | 26 ++++----- benchmarking/util.py | 34 ++++++++++++ 7 files changed, 73 insertions(+), 140 deletions(-) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index decf20fe6c..79d772f65d 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -215,11 +215,3 @@ Bench: parallel: matrix: - BENCH_CFG: [mvau_test, resnet50_test, metafi_test, transformer_test, transformer_radioml_all, synthetic_fifotest] - -#TODO: add selector for none, reduced, full benchmark suite -#TODO: introduce result collect job on parent level for easier visualization/excel interfacing -#TODO: more control via (optional) variables -#TODO: move power measurement from polling-based script to its own job/runner -#TODO: ensure a freshly initialized workdir on job/runner level (e.g. created directories seem to stay there) -#TODO: (optionally) save ALL build artifacts/logs/temporary files to artifacts or PFS for debugging (maybe via Jacamar feature of setting individual persistent workdirs?) 
-#TODO: fix clock frequency discrepancies between setting, synth, and driver diff --git a/benchmarking/bench-ci.yml b/benchmarking/bench-ci.yml index 1c03ecbd02..99adf1e0dc 100644 --- a/benchmarking/bench-ci.yml +++ b/benchmarking/bench-ci.yml @@ -1,5 +1,5 @@ stages: - - synth + - build - measure - collect @@ -15,7 +15,7 @@ FINN Build: id_tokens: CI_JOB_JWT: aud: https://git.uni-paderborn.de - stage: synth + stage: build needs: - job: Fetch Repos pipeline: $PARENT_PIPELINE_ID @@ -37,10 +37,10 @@ FINN Build: paths: - deps artifacts: - name: "bench_artifacts" + name: "build_artifacts" when: always paths: - - bench_artifacts/ + - build_artifacts/ Measurement: id_tokens: @@ -56,10 +56,10 @@ Measurement: # Run as root and activate the PYNQ venv manually to use PYNQ outside of the typical Jupyter environment - sudo bash -c "source /etc/profile.d/pynq_venv.sh && export XILINX_XRT=/usr && python benchmarking/measure.py" artifacts: - name: "bench_artifacts" + name: "measurement_artifacts" when: always paths: - - bench_artifacts/ + - measurement_artifacts/ Result Collection: id_tokens: diff --git a/benchmarking/bench.py b/benchmarking/bench.py index 2dbcdbe87f..ea85082fc8 100644 --- a/benchmarking/bench.py +++ b/benchmarking/bench.py @@ -54,7 +54,8 @@ def get_default_session_options_new(): # experiment_dir = os.environ.get("EXPERIMENT_DIR") # original experiment dir (before potential copy to ramdisk) experiment_dir = os.environ.get("CI_PROJECT_DIR") - artifacts_dir = os.path.join(experiment_dir, "bench_artifacts") + artifacts_dir = os.path.join(experiment_dir, "build_artifacts") + os.makedirs(artifacts_dir, exist_ok=True) print("Collecting results in path: %s" % artifacts_dir) # local save dir for large artifacts (e.g., build output, tmp dir dump for debugging) @@ -151,7 +152,6 @@ def get_default_session_options_new(): log_dict["status"] = "failed" print("Run failed: " + traceback.format_exc()) exit_code = 1 - # TODO: exception catch all in builder prevents internal failures 
from being caught here log_dict["output"] = bench_object.output_dict @@ -164,9 +164,11 @@ def get_default_session_options_new(): bench_object.save_artifacts_collection() # save local artifacts of this run (e.g., full build dir, detailed debug info) bench_object.save_local_artifacts_collection() + + #TODO: examine verification result and builder status here to fail pipeline via exit code? + print("Stopping job") return exit_code - #TODO: add additional exit codes (e.g. when some verification within the run failed)? if __name__ == "__main__": exit_code = main(sys.argv[1]) diff --git a/benchmarking/collect.py b/benchmarking/collect.py index 7b568563fa..bcff28104c 100644 --- a/benchmarking/collect.py +++ b/benchmarking/collect.py @@ -1,92 +1,25 @@ -import itertools import json import os -import sys -import time import shutil from dvclive import Live from util import delete_dir_contents -def merge_dicts(a: dict, b: dict): - for key in b: - if key in a: - if isinstance(a[key], dict) and isinstance(b[key], dict): - merge_dicts(a[key], b[key]) - elif a[key] != b[key]: - raise Exception("ERROR: Dict merge conflict") - else: - a[key] = b[key] - return a - -def consolidate_logs(path, output_filepath): - log = [] - i = 0 - while (i < 1024): - if (os.path.isfile(os.path.join(path,"task_%d.json"%(i)))): - with open(os.path.join(path,"task_%d.json"%(i)), "r") as f: - log_task = json.load(f) - log.extend(log_task) - i = i + 1 - - with open(output_filepath, "w") as f: - json.dump(log, f, indent=2) - -def merge_logs(log_a, log_b, log_out): - # merges json log (list of nested dicts) b into a, not vice versa (TODO) - - with open(log_a, "r") as f: - a = json.load(f) - with open(log_b, "r") as f: - b = json.load(f) - - for idx, run_a in enumerate(a): - for run_b in b: - if run_a["run_id"] == run_b["run_id"]: - #a[idx] |= run_b # requires Python >= 3.9 - #a[idx] = {**run_a, **run_b} - a[idx] = merge_dicts(run_a, run_b) - break - - # also sort by run id - out = sorted(a, key=lambda x: 
x["run_id"]) - - with open(log_out, "w") as f: - json.dump(out, f, indent=2) - -def wait_for_power_measurements(): - # TODO: detect when no bitstreams are to be measured (e.g. for fifosizing) and skip - # TODO: make configurable, relative to some env variable due to different mountint points - bitstreams_path = os.path.join("/mnt/pfs/hpc-prf-radioml/felix/jobs/", - "CI_" + os.environ.get("CI_PIPELINE_IID") + "_" + os.environ.get("CI_PIPELINE_NAME"), - "bitstreams") - - power_log_path = os.path.join("/mnt/pfs/hpc-prf-radioml/felix/jobs/", - "CI_" + os.environ.get("CI_PIPELINE_IID") + "_" + os.environ.get("CI_PIPELINE_NAME"), - "power_measure.json") - - # count bitstreams to measure (can't rely on total number of runs since some of them could've failed) - files = os.listdir(bitstreams_path) - bitstream_count = len(list(filter(lambda x : ".bit" in x, files))) - - log = [] - print("Checking if all bitstreams of pipeline have been measured..") - while(len(log) < bitstream_count): - if os.path.isfile(power_log_path): - with open(power_log_path, "r") as f: - log = json.load(f) - print("Found measurements for %d/%d bitstreams"%(len(log),bitstream_count)) - time.sleep(60) - print("Power measurement complete") def log_dvc_metric(live, prefix, name, value): # sanitize '/' in name because DVC uses it to nest metrics (which we do via prefix) live.log_metric(prefix + name.replace("/", "-"), value, plot=False) def open_json_report(id, report_name): - path = os.path.join("bench_artifacts", "runs_output", "run_%d" % (id), "reports", report_name) - if os.path.isfile(path): - with open(path, "r") as f: + # look in both, build & measurement, artifacts + path1 = os.path.join("build_artifacts", "runs_output", "run_%d" % (id), "reports", report_name) + path2 = os.path.join("measurement_artifacts", "runs_output", "run_%d" % (id), "reports", report_name) + if os.path.isfile(path1): + with open(path1, "r") as f: + report = json.load(f) + return report + elif os.path.isfile(path2): + with 
open(path2, "r") as f: report = json.load(f) return report else: @@ -115,7 +48,7 @@ def log_nested_metrics_from_report(id, live, report_name, key_top, keys, prefix= if __name__ == "__main__": # Go through all runs found in the artifacts and log their results to DVC - run_dir_list = os.listdir(os.path.join("bench_artifacts", "runs_output")) + run_dir_list = os.listdir(os.path.join("build_artifacts", "runs_output")) print("Looking for runs in %s" % run_dir_list) run_ids = [] for run_dir in run_dir_list: @@ -258,7 +191,7 @@ def log_nested_metrics_from_report(id, live, report_name, key_top, keys, prefix= "fifo_size_total_kB", ], prefix="fifosizing/live/") - image = os.path.join("bench_artifacts", "runs_output", "run_%d" % (id), "reports", "fifo_sizing_graph.png") + image = os.path.join("measurement_artifacts", "runs_output", "run_%d" % (id), "reports", "fifo_sizing_graph.png") if os.path.isfile(image): live.log_image("fifosizing_pass_1", image) @@ -268,11 +201,15 @@ def log_nested_metrics_from_report(id, live, report_name, key_top, keys, prefix= ### ARTIFACTS ### # Log build reports as they come from GitLab artifacts, # but copy them to a central dir first so all runs share the same path - run_report_dir = os.path.join("bench_artifacts", "runs_output", "run_%d" % (id), "reports") + run_report_dir1 = os.path.join("build_artifacts", "runs_output", "run_%d" % (id), "reports") + run_report_dir2 = os.path.join("measurement_artifacts", "runs_output", "run_%d" % (id), "reports") dvc_report_dir = "reports" os.makedirs(dvc_report_dir, exist_ok=True) delete_dir_contents(dvc_report_dir) - shutil.copytree(run_report_dir, dvc_report_dir, dirs_exist_ok=True) + if os.path.isdir(run_report_dir1): + shutil.copytree(run_report_dir1, dvc_report_dir, dirs_exist_ok=True) + if os.path.isdir(run_report_dir2): + shutil.copytree(run_report_dir2, dvc_report_dir, dirs_exist_ok=True) live.log_artifact(dvc_report_dir) print("Done") diff --git a/benchmarking/dut/transformer.py 
b/benchmarking/dut/transformer.py index 2beca913c7..ea9713edfa 100644 --- a/benchmarking/dut/transformer.py +++ b/benchmarking/dut/transformer.py @@ -43,15 +43,7 @@ node_by_node_rtlsim, # noqa: Maybe unused, only for debugging node_by_node_cppsim, ) -# from performance.platform_build_steps import( -# test_step_gen_vitis_xo, -# test_step_gen_instrumentation_wrapper, -# test_step_gen_instrwrap_sim, -# test_step_insert_tlastmarker, -# test_step_export_xo, -# test_step_build_platform, -# test_step_run_instrwrap_sim -# ) + ### ADAPTED FROM utils.py # Seeds all relevant random number generators to the same seed for @@ -994,21 +986,3 @@ def step_build_setup(self): ) return cfg - - #def run(self): - # self.steps_full_build_flow() - # DEBUG code for live logging of long instr wrapper simulation: - # live_log_dir_path = os.path.join(self.save_dir, "vivado_sim_log", "run_%d" % (self.run_id), "vivado.log") - # os.makedirs(os.path.join(self.save_dir, "vivado_sim_log", "run_%d" % (self.run_id)), exist_ok=True) - # sim_output_dir = build_dir + "/instrwrap_sim" - # # Prepare bash script - # bash_script = os.getcwd() + "/run_vivado_sim.sh" - # with open(bash_script, "w") as script: - # script.write("#!/bin/bash\n") - # script.write("cd %s\n"%(sim_output_dir)) - # script.write("vivado -mode batch -source make_instrwrap_sim_proj.tcl &> %s\n"%(live_log_dir_path)) - # # Run script - # print("Running Vivado simulation of instrumentation wrapper") - # sub_proc = subprocess.Popen(["bash", bash_script]) - # sub_proc.communicate() - ####### diff --git a/benchmarking/measure.py b/benchmarking/measure.py index e0a5da0bfc..3accb734b9 100644 --- a/benchmarking/measure.py +++ b/benchmarking/measure.py @@ -2,29 +2,22 @@ import subprocess import shutil +from util import delete_dir_contents -def delete_dir_contents(dir): - for filename in os.listdir(dir): - file_path = os.path.join(dir, filename) - try: - if os.path.isfile(file_path) or os.path.islink(file_path): - os.unlink(file_path) - elif 
os.path.isdir(file_path): - shutil.rmtree(file_path) - except Exception as e: - print('Failed to delete %s. Reason: %s' % (file_path, e)) if __name__ == "__main__": print("Looking for deployment packages in artifacts..") # Find deployment packages from artifacts - artifacts_dir = os.path.join("bench_artifacts", "runs_output") - for run in os.listdir(artifacts_dir): - run_dir = os.path.join(artifacts_dir, run) - reports_dir = os.path.join(run_dir, "reports") - deploy_archive = os.path.join(run_dir, "deploy.zip") + artifacts_in_dir = os.path.join("build_artifacts", "runs_output") + artifacts_out_dir = os.path.join("measurement_artifacts", "runs_output") + for run in os.listdir(artifacts_in_dir): + run_in_dir = os.path.join(artifacts_in_dir, run) + run_out_dir = os.path.join(artifacts_out_dir, run) + reports_dir = os.path.join(run_out_dir, "reports") + deploy_archive = os.path.join(run_in_dir, "deploy.zip") extract_dir = "measurement" if os.path.isfile(deploy_archive): - print("Found deployment package in %s, extracting.." % run_dir) + print("Found deployment package in %s, extracting.." % run_in_dir) # Extract to temporary dir shutil.unpack_archive(deploy_archive, extract_dir) @@ -47,6 +40,7 @@ def delete_dir_contents(dir): report_path = os.path.join(extract_dir, report) if os.path.isfile(report_path): print("Copying %s to %s" % (report_path, reports_dir)) + os.makedirs(reports_dir, exist_ok=True) shutil.copy(report_path, reports_dir) print("Clearing temporary directory..") diff --git a/benchmarking/util.py b/benchmarking/util.py index 17dec02762..1b4363a707 100644 --- a/benchmarking/util.py +++ b/benchmarking/util.py @@ -1,5 +1,6 @@ # Utility functions for benchmarking import os, shutil +import json from qonnx.core.datatype import DataType import xml.etree.ElementTree as ET @@ -85,3 +86,36 @@ def delete_dir_contents(dir): shutil.rmtree(file_path) except Exception as e: print('Failed to delete %s. 
Reason: %s' % (file_path, e)) + +def merge_dicts(a: dict, b: dict): + for key in b: + if key in a: + if isinstance(a[key], dict) and isinstance(b[key], dict): + merge_dicts(a[key], b[key]) + elif a[key] != b[key]: + raise Exception("ERROR: Dict merge conflict") + else: + a[key] = b[key] + return a + +def merge_logs(log_a, log_b, log_out): + # merges json log (list of nested dicts) b into a, not vice versa (TODO) + + with open(log_a, "r") as f: + a = json.load(f) + with open(log_b, "r") as f: + b = json.load(f) + + for idx, run_a in enumerate(a): + for run_b in b: + if run_a["run_id"] == run_b["run_id"]: + #a[idx] |= run_b # requires Python >= 3.9 + #a[idx] = {**run_a, **run_b} + a[idx] = merge_dicts(run_a, run_b) + break + + # also sort by run id + out = sorted(a, key=lambda x: x["run_id"]) + + with open(log_out, "w") as f: + json.dump(out, f, indent=2) From b70ba9e1481faf426d187395f4072bdea6c0f4c0 Mon Sep 17 00:00:00 2001 From: Felix Jentzsch Date: Thu, 6 Mar 2025 09:47:56 +0000 Subject: [PATCH 061/125] Fix collection job import --- benchmarking/collect.py | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/benchmarking/collect.py b/benchmarking/collect.py index bcff28104c..5cbe5fbf41 100644 --- a/benchmarking/collect.py +++ b/benchmarking/collect.py @@ -3,8 +3,17 @@ import shutil from dvclive import Live -from util import delete_dir_contents +def delete_dir_contents(dir): + for filename in os.listdir(dir): + file_path = os.path.join(dir, filename) + try: + if os.path.isfile(file_path) or os.path.islink(file_path): + os.unlink(file_path) + elif os.path.isdir(file_path): + shutil.rmtree(file_path) + except Exception as e: + print('Failed to delete %s. 
Reason: %s' % (file_path, e)) def log_dvc_metric(live, prefix, name, value): # sanitize '/' in name because DVC uses it to nest metrics (which we do via prefix) From 4853d0b9c7d49404f109d46a75a135188e93de95 Mon Sep 17 00:00:00 2001 From: Felix Jentzsch Date: Thu, 6 Mar 2025 09:54:09 +0000 Subject: [PATCH 062/125] Fix util import --- benchmarking/bench_base.py | 2 +- benchmarking/collect.py | 11 +---------- benchmarking/dut/synthetic_nonlinear.py | 2 +- benchmarking/util.py | 11 ----------- 4 files changed, 3 insertions(+), 23 deletions(-) diff --git a/benchmarking/bench_base.py b/benchmarking/bench_base.py index 6a4bd63c51..61a999750c 100644 --- a/benchmarking/bench_base.py +++ b/benchmarking/bench_base.py @@ -28,7 +28,7 @@ import finn.builder.build_dataflow_config as build_cfg from finn.util.basic import make_build_dir, pynq_native_port_width, part_map, alveo_default_platform, alveo_part_map from templates import template_open, template_single_test, template_sim_power, template_switching_simulation_tb, zynq_harness_template -from util import summarize_table, summarize_section, power_xml_to_dict, prepare_inputs, delete_dir_contents +from util import summarize_table, summarize_section, power_xml_to_dict, delete_dir_contents from finn.transformation.fpgadataflow.replace_verilog_relpaths import ( ReplaceVerilogRelPaths, ) diff --git a/benchmarking/collect.py b/benchmarking/collect.py index 5cbe5fbf41..bcff28104c 100644 --- a/benchmarking/collect.py +++ b/benchmarking/collect.py @@ -3,17 +3,8 @@ import shutil from dvclive import Live +from util import delete_dir_contents -def delete_dir_contents(dir): - for filename in os.listdir(dir): - file_path = os.path.join(dir, filename) - try: - if os.path.isfile(file_path) or os.path.islink(file_path): - os.unlink(file_path) - elif os.path.isdir(file_path): - shutil.rmtree(file_path) - except Exception as e: - print('Failed to delete %s. 
Reason: %s' % (file_path, e)) def log_dvc_metric(live, prefix, name, value): # sanitize '/' in name because DVC uses it to nest metrics (which we do via prefix) diff --git a/benchmarking/dut/synthetic_nonlinear.py b/benchmarking/dut/synthetic_nonlinear.py index 759f31838b..eb91999b2e 100644 --- a/benchmarking/dut/synthetic_nonlinear.py +++ b/benchmarking/dut/synthetic_nonlinear.py @@ -24,7 +24,7 @@ import finn.builder.build_dataflow as build import finn.builder.build_dataflow_config as build_cfg from finn.util.basic import make_build_dir -from util import summarize_table, summarize_section, power_xml_to_dict, prepare_inputs, delete_dir_contents +from util import summarize_table, summarize_section, power_xml_to_dict, delete_dir_contents from finn.util.test import get_trained_network_and_ishape from finn.util.basic import alveo_default_platform diff --git a/benchmarking/util.py b/benchmarking/util.py index 1b4363a707..23ecc0a984 100644 --- a/benchmarking/util.py +++ b/benchmarking/util.py @@ -1,7 +1,6 @@ # Utility functions for benchmarking import os, shutil import json -from qonnx.core.datatype import DataType import xml.etree.ElementTree as ET def _find_rows_and_headers(table): @@ -14,7 +13,6 @@ def _find_rows_and_headers(table): break return (rows, headers) - def summarize_table(table): table_summary = {} table_summary["headers"] = [] @@ -40,7 +38,6 @@ def summarize_table(table): return table_summary - def summarize_section(section): section_summary = {} section_summary["tables"] = [] @@ -57,7 +54,6 @@ def summarize_section(section): return section_summary - def power_xml_to_dict(xml_path): tree = ET.parse(xml_path) root = tree.getroot() @@ -69,13 +65,6 @@ def power_xml_to_dict(xml_path): return result -def prepare_inputs(input_tensor, idt, wdt): - if wdt == DataType["BIPOLAR"] and idt == DataType["BIPOLAR"]: - # convert bipolar to binary - return {"inp": (input_tensor + 1) / 2} - else: - return {"inp": input_tensor} - def delete_dir_contents(dir): for filename in 
os.listdir(dir): file_path = os.path.join(dir, filename) From 0c812bc54fbc4a5df24141a48e1cf646a0c008e2 Mon Sep 17 00:00:00 2001 From: Felix Jentzsch Date: Thu, 6 Mar 2025 10:20:24 +0000 Subject: [PATCH 063/125] Nested interconnects for Zynq-7000, fixes --- .../driver/driver_instrumentation.py | 101 +++++++++++------- .../fpgadataflow/make_pynq_driver.py | 7 +- .../fpgadataflow/make_zynq_proj.py | 16 +-- .../transformation/fpgadataflow/templates.py | 2 + 4 files changed, 80 insertions(+), 46 deletions(-) diff --git a/src/finn/qnn-data/templates/driver/driver_instrumentation.py b/src/finn/qnn-data/templates/driver/driver_instrumentation.py index fea9446bf5..90a0ed5b89 100644 --- a/src/finn/qnn-data/templates/driver/driver_instrumentation.py +++ b/src/finn/qnn-data/templates/driver/driver_instrumentation.py @@ -1,31 +1,28 @@ -import time -import json import argparse -import matplotlib as mpl -import matplotlib.pyplot as plt -from IPython.display import clear_output -import numpy as np +import json +import time from pynq import Overlay -from pynq.ps import Clocks from pynq.pl_server.device import Device +from pynq.ps import Clocks + +# Instrumentation wrapper register map # +# ap_uint<32> cfg, // [0] - 0:hold, 1:lfsr; [31:16] - LFSR seed +# ap_uint<32> &status, // [0] - timestamp overflow; [1] - timestamp underflow +# ap_uint<32> &latency, +# ap_uint<32> &interval, +# ap_uint<32> &checksum, +# ap_uint<32> &min_latency -### Instrumentation wrapper register map ### -#ap_uint<32> cfg, // [0] - 0:hold, 1:lfsr; [31:16] - LFSR seed -#ap_uint<32> &status, // [0] - timestamp overflow; [1] - timestamp underflow -#ap_uint<32> &latency, -#ap_uint<32> &interval, -#ap_uint<32> &checksum, -#ap_uint<32> &min_latency class FINNInstrumentationOverlay(Overlay): def __init__( self, bitfile_name, - platform = "zynq", - fclk_mhz = 100.0, - device = None, - download = True, - seed = 1, + platform="zynq", + fclk_mhz=100.0, + device=None, + download=True, + seed=1, ): 
super().__init__(bitfile_name, download=download, device=device) @@ -40,27 +37,34 @@ def __init__( self.fclk_mhz_actual = Clocks.fclk0_mhz def instrumentation_read(self, name): - return self.instrumentation_wrap_0.read(offset=self.ip_dict["instrumentation_wrap_0"]["registers"][name]["address_offset"]) + return self.instrumentation_wrap_0.read( + offset=self.ip_dict["instrumentation_wrap_0"]["registers"][name]["address_offset"] + ) def instrumentation_write(self, name, value): - return self.instrumentation_wrap_0.write(offset=self.ip_dict["instrumentation_wrap_0"]["registers"][name]["address_offset"], value=value) + return self.instrumentation_wrap_0.write( + offset=self.ip_dict["instrumentation_wrap_0"]["registers"][name]["address_offset"], + value=value, + ) def reset_accelerator(self): - self.axi_gpio_0.write(offset=self.ip_dict["axi_gpio_0"]["registers"]["GPIO_DATA"]["address_offset"], value=0) + self.axi_gpio_0.write( + offset=self.ip_dict["axi_gpio_0"]["registers"]["GPIO_DATA"]["address_offset"], value=0 + ) def start_accelerator(self): - lfsr_seed = (self.seed << 16) & 0xffff0000 # upper 16 bits - self.instrumentation_write("cfg", lfsr_seed + 1) # start operation + lfsr_seed = (self.seed << 16) & 0xFFFF0000 # upper 16 bits + self.instrumentation_write("cfg", lfsr_seed + 1) # start operation def observe_instrumentation(self, debug_print=True): status_reg = self.instrumentation_read("status") chksum_reg = self.instrumentation_read("checksum") min_latency = self.instrumentation_read("min_latency") latency = self.instrumentation_read("latency") - interval = self.instrumentation_read("interval") + interval = self.instrumentation_read("interval") - frame = (chksum_reg >> 24) & 0x000000ff - checksum = chksum_reg & 0x00ffffff + frame = (chksum_reg >> 24) & 0x000000FF + checksum = chksum_reg & 0x00FFFFFF overflow_err = (status_reg & 0x00000001) != 0 underflow_err = (status_reg & 0x00000002) != 0 @@ -83,14 +87,25 @@ def observe_instrumentation(self, debug_print=True): 
if __name__ == "__main__": - parser = argparse.ArgumentParser(description='Profile performance of FINN-generated accelerator using instrumentation wrapper') - parser.add_argument('--runtime', help='Runtime in seconds', type=int, default=10) - parser.add_argument('--frequency', help='FPGA clock frequency in MHz', type=float, default=100.0) - parser.add_argument('--seed', help='LFSR seed for input data generation', type=int, default=1) - parser.add_argument('--device', help='FPGA device to be used', type=int, default=0) - parser.add_argument('--bitfile', help='Name of bitfile', default="finn-accel.bit") - parser.add_argument('--reportfile', help='Name of output .json report file', type=str, default="measured_performance.json") - parser.add_argument('--settingsfile', help='Name of optional input .json settings file', type=str, default="") + parser = argparse.ArgumentParser( + description="Profile FINN-generated accelerator using instrumentation wrapper" + ) + parser.add_argument("--runtime", help="Runtime in seconds", type=int, default=10) + parser.add_argument( + "--frequency", help="FPGA clock frequency in MHz", type=float, default=100.0 + ) + parser.add_argument("--seed", help="LFSR seed for input data generation", type=int, default=1) + parser.add_argument("--device", help="FPGA device to be used", type=int, default=0) + parser.add_argument("--bitfile", help="Name of bitfile", default="finn-accel.bit") + parser.add_argument( + "--reportfile", + help="Name of output .json report file", + type=str, + default="measured_performance.json", + ) + parser.add_argument( + "--settingsfile", help="Name of optional input .json settings file", type=str, default="" + ) # parse arguments args = parser.parse_args() runtime = args.runtime @@ -111,7 +126,9 @@ def observe_instrumentation(self, debug_print=True): # instantiate FINN accelerator driver and pass batchsize and bitfile print("Programming FPGA..") - accel = FINNInstrumentationOverlay(bitfile_name = bitfile, device = 
device, fclk_mhz = frequency, seed = seed) + accel = FINNInstrumentationOverlay( + bitfile_name=bitfile, device=device, fclk_mhz=frequency, seed=seed + ) # start accelerator print("Running accelerator..") @@ -121,7 +138,15 @@ def observe_instrumentation(self, debug_print=True): time.sleep(runtime) # read measurement from instrumentation - (overflow_err, underflow_err, frame, checksum, min_latency, latency, interval) = accel.observe_instrumentation() + ( + overflow_err, + underflow_err, + frame, + checksum, + min_latency, + latency, + interval, + ) = accel.observe_instrumentation() # write report to file report = { @@ -135,7 +160,7 @@ def observe_instrumentation(self, debug_print=True): "latency_ms": round(latency * (1 / (accel.fclk_mhz_actual * 1e6)) * 1e3, 6), "throughput_fps": round(1 / (interval * (1 / (accel.fclk_mhz_actual * 1e6)))), "min_pipeline_depth": round(min_latency / interval, 2), - "pipeline_depth" : round(latency / interval, 2), + "pipeline_depth": round(latency / interval, 2), } with open(reportfile, "w") as f: json.dump(report, f, indent=2) diff --git a/src/finn/transformation/fpgadataflow/make_pynq_driver.py b/src/finn/transformation/fpgadataflow/make_pynq_driver.py index b935f5eea0..c26fa845ed 100644 --- a/src/finn/transformation/fpgadataflow/make_pynq_driver.py +++ b/src/finn/transformation/fpgadataflow/make_pynq_driver.py @@ -26,9 +26,9 @@ # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+import json import numpy as np import os -import json import qonnx import shutil import warnings @@ -303,6 +303,8 @@ def apply(self, model): else: continue + return (model, False) + class MakePYNQDriverInstrumentation(Transformation): def __init__(self, platform, clk_period_ns): @@ -320,7 +322,8 @@ def apply(self, model): # create (copy) the static instrumentation driver driver_template = ( - os.environ["FINN_ROOT"] + "/src/finn/qnn-data/templates/driver/driver_instrumentation.py" + os.environ["FINN_ROOT"] + + "/src/finn/qnn-data/templates/driver/driver_instrumentation.py" ) driver_py = pynq_driver_dir + "/driver.py" shutil.copy(driver_template, driver_py) diff --git a/src/finn/transformation/fpgadataflow/make_zynq_proj.py b/src/finn/transformation/fpgadataflow/make_zynq_proj.py index 846d95a11b..98372b700f 100644 --- a/src/finn/transformation/fpgadataflow/make_zynq_proj.py +++ b/src/finn/transformation/fpgadataflow/make_zynq_proj.py @@ -174,13 +174,16 @@ def apply(self, model): ) # connect to master interconnect config.append( - "connect_bd_intf_net [get_bd_intf_pins axi_interconnect_0/M%02d_AXI] -boundary_type upper [get_bd_intf_pins axi_interconnect_%d/S00_AXI]" + "connect_bd_intf_net [get_bd_intf_pins axi_interconnect_0/M%02d_AXI] " + "-boundary_type upper [get_bd_intf_pins axi_interconnect_%d/S00_AXI]" % (master_axilite_idx, i) ) - # connect clocks/reset TODO: suppport zynq_7000 + # connect clocks/reset config.append( - "apply_bd_automation -rule xilinx.com:bd_rule:clkrst -config { Clk {/zynq_ps/pl_clk0} Freq {} Ref_Clk0 {} Ref_Clk1 {} Ref_Clk2 {}} [get_bd_pins axi_interconnect_%d/ACLK]" - % (i) + "apply_bd_automation -rule xilinx.com:bd_rule:clkrst -config " + "{ Clk {/zynq_ps/$zynq_ps_clkname} Freq {} " + "Ref_Clk0 {} Ref_Clk1 {} Ref_Clk2 {}} " + "[get_bd_pins axi_interconnect_%d/ACLK]" % (i) ) master_axilite_idx += 1 total_axilite_count = max(0, total_axilite_count - 64) @@ -359,10 +362,11 @@ def apply(self, model): config.append("delete_bd_objs 
[get_bd_cells smartconnect_0]") aximm_idx = 1 - # finalize nested interconnect clock/reset TODO: support zynq_7000 + # finalize nested interconnect clock/reset for i in range(1, nested_interconnect_count + 1): config.append( - "apply_bd_automation -rule xilinx.com:bd_rule:clkrst -config { Clk {/zynq_ps/pl_clk0} } [get_bd_pins axi_interconnect_%d/M*_ACLK]" + "apply_bd_automation -rule xilinx.com:bd_rule:clkrst -config " + "{ Clk {/zynq_ps/$zynq_ps_clkname} } [get_bd_pins axi_interconnect_%d/M*_ACLK]" % (i) ) diff --git a/src/finn/transformation/fpgadataflow/templates.py b/src/finn/transformation/fpgadataflow/templates.py index d9040d83f2..6cde5cfa66 100644 --- a/src/finn/transformation/fpgadataflow/templates.py +++ b/src/finn/transformation/fpgadataflow/templates.py @@ -146,6 +146,7 @@ create_bd_design "top" if {$ZYNQ_TYPE == "zynq_us+"} { set zynq_ps_vlnv [get_property VLNV [get_ipdefs "xilinx.com:ip:zynq_ultra_ps_e:*"]] + set zynq_ps_clkname "pl_clk0" create_bd_cell -type ip -vlnv $zynq_ps_vlnv zynq_ps apply_bd_automation -rule xilinx.com:bd_rule:zynq_ultra_ps_e -config {apply_board_preset "1" } [get_bd_cells zynq_ps] #activate one slave port, deactivate the second master port @@ -156,6 +157,7 @@ set_property -dict [list CONFIG.PSU__CRL_APB__PL0_REF_CTRL__FREQMHZ [expr int($FREQ_MHZ)]] [get_bd_cells zynq_ps] } elseif {$ZYNQ_TYPE == "zynq_7000"} { set zynq_ps_vlnv [get_property VLNV [get_ipdefs "xilinx.com:ip:processing_system7:*"]] + set zynq_ps_clkname "FCLK_CLK0" create_bd_cell -type ip -vlnv $zynq_ps_vlnv zynq_ps apply_bd_automation -rule xilinx.com:bd_rule:processing_system7 -config {make_external "FIXED_IO, DDR" apply_board_preset "1" Master "Disable" Slave "Disable" } [get_bd_cells zynq_ps] set_property -dict [list CONFIG.PCW_USE_S_AXI_HP0 {1}] [get_bd_cells zynq_ps] From 4bf21a295ee38d65b33212012c8952f167db03dc Mon Sep 17 00:00:00 2001 From: Felix Jentzsch Date: Thu, 6 Mar 2025 11:05:19 +0000 Subject: [PATCH 064/125] Force disable additional AXI-lite 
interfaces for live FIFO sizing --- src/finn/builder/build_dataflow_steps.py | 31 ++++++++++++++++++++++-- 1 file changed, 29 insertions(+), 2 deletions(-) diff --git a/src/finn/builder/build_dataflow_steps.py b/src/finn/builder/build_dataflow_steps.py index 2f05886afd..5dc971cf33 100644 --- a/src/finn/builder/build_dataflow_steps.py +++ b/src/finn/builder/build_dataflow_steps.py @@ -91,8 +91,8 @@ from finn.transformation.fpgadataflow.insert_fifo import InsertFIFO from finn.transformation.fpgadataflow.insert_tlastmarker import InsertTLastMarker from finn.transformation.fpgadataflow.make_pynq_driver import ( - MakePYNQDriverIODMA, MakePYNQDriverInstrumentation, + MakePYNQDriverIODMA, ) from finn.transformation.fpgadataflow.make_zynq_proj import ZynqBuild from finn.transformation.fpgadataflow.minimize_accumulator_width import ( @@ -555,6 +555,29 @@ def step_set_fifo_depths(model: ModelWrapper, cfg: DataflowBuildConfig): # Experimental live FIFO-sizing, overwrites all other FIFO-related behavior if cfg.live_fifo_sizing: + # Disable runtime-writable weights, external weights, and dynamic mode, + # as we don't support additional AXI-lite interfaces next to the FIFOs + for node in model.graph.node: + if node.domain.startswith("finn.custom_op.fpgadataflow"): + node_inst = getCustomOp(node) + try: + if node_inst.get_nodeattr("runtime_writeable_weights") == 1: + node_inst.set_nodeattr("runtime_writeable_weights", 0) + if node_inst.get_nodeattr("ram_style") == "ultra": + node_inst.set_nodeattr("ram_style", "block") + except AttributeError: + pass + try: + if node_inst.get_nodeattr("mem_mode") == "external": + node_inst.set_nodeattr("mem_mode", "internal_decoupled") + except AttributeError: + pass + try: + if node_inst.get_nodeattr("dynamic_mode") == 1: + node_inst.set_nodeattr("dynamic_mode", 0) + except AttributeError: + pass + # Create all DWCs and FIFOs normally model = model.transform(InsertDWC()) model = model.transform(InsertFIFO(create_shallow_fifos=True)) @@ -826,7 
+849,11 @@ def step_make_pynq_driver(model: ModelWrapper, cfg: DataflowBuildConfig): if DataflowOutputType.PYNQ_DRIVER in cfg.generate_outputs: driver_dir = cfg.output_dir + "/driver" if cfg.enable_instrumentation: - model = model.transform(MakePYNQDriverInstrumentation(cfg._resolve_driver_platform(), cfg.synth_clk_period_ns, cfg.live_fifo_sizing)) + model = model.transform( + MakePYNQDriverInstrumentation( + cfg._resolve_driver_platform(), cfg.synth_clk_period_ns, cfg.live_fifo_sizing + ) + ) else: model = model.transform(MakePYNQDriverIODMA(cfg._resolve_driver_platform())) shutil.copytree(model.get_metadata_prop("pynq_driver_dir"), driver_dir, dirs_exist_ok=True) From 5cc29245fdff75931bfd9f8feee86261f1231f46 Mon Sep 17 00:00:00 2001 From: Felix Jentzsch Date: Thu, 6 Mar 2025 17:11:31 +0000 Subject: [PATCH 065/125] Enable live fifosizing for transformer --- benchmarking/bench_base.py | 2 ++ benchmarking/cfg/synthetic_fifotest.json | 30 ++++++++++++------------ benchmarking/collect.py | 2 +- benchmarking/dut/transformer.py | 26 ++++++++++---------- 4 files changed, 30 insertions(+), 30 deletions(-) diff --git a/benchmarking/bench_base.py b/benchmarking/bench_base.py index 61a999750c..5f828ca4e4 100644 --- a/benchmarking/bench_base.py +++ b/benchmarking/bench_base.py @@ -426,6 +426,8 @@ def steps_full_build_flow(self): cfg.force_python_rtlsim = False cfg.split_large_fifos = True cfg.enable_instrumentation = True # no IODMA functional correctness/accuracy test yet + cfg.save_intermediate_models = True # Save the intermediate model graphs + cfg.verify_save_full_context = True, # Output full context dump for verification steps #rtlsim_use_vivado_comps # TODO ? 
#cfg.default_swg_exception #cfg.large_fifo_mem_style diff --git a/benchmarking/cfg/synthetic_fifotest.json b/benchmarking/cfg/synthetic_fifotest.json index dfc63c6240..7e362200af 100644 --- a/benchmarking/cfg/synthetic_fifotest.json +++ b/benchmarking/cfg/synthetic_fifotest.json @@ -1,15 +1,15 @@ [ { "dut": ["synthetic_nonlinear"], - "dim": [32], + "dim": [64], "kernel_size": [5], - "ch": [4], - "simd": [4], - "pe": [4], + "ch": [8], + "simd": [8], + "pe": [8], "parallel_window": [1], "lb_num_layers": [1], - "rb_num_layers": [3], + "rb_num_layers": [4], "board": ["RFSoC2x2"], "clock_period_ns": [10], @@ -21,15 +21,15 @@ }, { "dut": ["synthetic_nonlinear"], - "dim": [32], + "dim": [64], "kernel_size": [5], - "ch": [4], - "simd": [4], - "pe": [4], + "ch": [8], + "simd": [8], + "pe": [8], "parallel_window": [1], "lb_num_layers": [1], - "rb_num_layers": [3], + "rb_num_layers": [4], "board": ["RFSoC2x2"], "clock_period_ns": [10], @@ -42,15 +42,15 @@ }, { "dut": ["synthetic_nonlinear"], - "dim": [32], + "dim": [64], "kernel_size": [5], - "ch": [4], - "simd": [4], - "pe": [4], + "ch": [8], + "simd": [8], + "pe": [8], "parallel_window": [1], "lb_num_layers": [1], - "rb_num_layers": [3], + "rb_num_layers": [4], "board": ["RFSoC2x2"], "clock_period_ns": [10], diff --git a/benchmarking/collect.py b/benchmarking/collect.py index bcff28104c..45f6073d1b 100644 --- a/benchmarking/collect.py +++ b/benchmarking/collect.py @@ -49,7 +49,7 @@ def log_nested_metrics_from_report(id, live, report_name, key_top, keys, prefix= if __name__ == "__main__": # Go through all runs found in the artifacts and log their results to DVC run_dir_list = os.listdir(os.path.join("build_artifacts", "runs_output")) - print("Looking for runs in %s" % run_dir_list) + print("Looking for runs in build artifacts") run_ids = [] for run_dir in run_dir_list: if run_dir.startswith("run_"): diff --git a/benchmarking/dut/transformer.py b/benchmarking/dut/transformer.py index ea9713edfa..5d0566a476 100644 --- 
a/benchmarking/dut/transformer.py +++ b/benchmarking/dut/transformer.py @@ -910,15 +910,6 @@ def step_build_setup(self): verify_input_npy=self.build_inputs["input_npy_path"], # File with expected test outputs for verification verify_expected_output_npy=self.build_inputs["output_npy_path"], - # Output full context dump for verification steps - verify_save_full_context=True, - # Save the intermediate model graphs - save_intermediate_models=True, - # Avoid RTL simulation for setting the FIFO sizes - auto_fifo_strategy=AutoFIFOSizingMethod.CHARACTERIZE, - # Do not automatically set FIFO sizes as this requires RTL simulation - # not implemented for the attention operator - auto_fifo_depths=False, # Build steps to execute steps=[ # Prepares the QONNX graph to be consumed by FINN: Cleanup, lowering @@ -963,11 +954,6 @@ def step_build_setup(self): "step_generate_estimate_reports", "step_hw_codegen", "step_hw_ipgen", - # Set the attention- and residual-related FIFO depths insert FIFOs - # and apply folding configuration once again - # Note: Implement all FIFOs with a depth at least as deep as the - # sequence length in URAM. 
- set_fifo_depths(seq_len, emb_dim, uram_threshold=seq_len), # Run additional node-by-node verification in RTL simulation of the # model before creating the stitched IP # Note: end-to-end verification of the stitched IP in RTL simulation @@ -985,4 +971,16 @@ def step_build_setup(self): ] ) + # TESTING custom vs live FIFO-sizing + if self.params["fifo_method"] == "live": + # insert default FIFO-sizing step (behind step_generate_estimate_reports) + for i in range(len(cfg.steps)): + if cfg.steps[i] == "step_generate_estimate_reports": + cfg.steps.insert(i+1, "step_set_fifo_depths") + else: + # insert Christoph's custom FIFO-sizing step (behind step_hw_ipgen) + for i in range(len(cfg.steps)): + if cfg.steps[i] == "step_hw_ipgen": + cfg.steps.insert(i+1, set_fifo_depths(seq_len, emb_dim, uram_threshold=seq_len)) + return cfg From 05f72d2d28a11857ed10a411ce2970a5280320a0 Mon Sep 17 00:00:00 2001 From: Felix Jentzsch Date: Fri, 7 Mar 2025 10:46:25 +0000 Subject: [PATCH 066/125] Minor fixes for Transformer flow --- benchmarking/dut/transformer.py | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/benchmarking/dut/transformer.py b/benchmarking/dut/transformer.py index 5d0566a476..819b9b5fa2 100644 --- a/benchmarking/dut/transformer.py +++ b/benchmarking/dut/transformer.py @@ -21,6 +21,7 @@ # FINN dataflow builder import finn.builder.build_dataflow_config as build_cfg from finn.builder.build_dataflow_config import AutoFIFOSizingMethod +from qonnx.core.modelwrapper import ModelWrapper from bench_base import bench # Range information structure for seeding the range analysis for converting @@ -855,10 +856,14 @@ def step_build_setup(self): seq_len, emb_dim = self.params["model_seq_len"], self.params["model_emb_dim"] else: # for real input models - _, seq_len, emb_dim = np.load(self.build_inputs["input_npy_path"]).shape - # TODO: use the following to get dimensions for GPT models? 
- #model = ModelWrapper(self.build_inputs["onnx_path"]) - #_, emb_dim, seq_len = model.get_tensor_shape("/emb_add/input_quant/export_handler/Quant_output_0") + inp_shape = np.load(self.build_inputs["input_npy_path"]).shape + if len(inp_shape) == 3: + # for RadioML Transformers + _, seq_len, emb_dim = inp_shape + else: + # for GPTs (why is this different?) + model = ModelWrapper(self.build_inputs["onnx_path"]) + _, seq_len, emb_dim = model.get_tensor_shape("/emb_add/input_quant/export_handler/Quant_output_0") # Read the input value range information for the dataset from the parameters # Note: Consider calibrating this on the fly from the dataset @@ -972,7 +977,7 @@ def step_build_setup(self): ) # TESTING custom vs live FIFO-sizing - if self.params["fifo_method"] == "live": + if self.params.get("fifo_method") == "live": # insert default FIFO-sizing step (behind step_generate_estimate_reports) for i in range(len(cfg.steps)): if cfg.steps[i] == "step_generate_estimate_reports": From 230ac92471342c0a28e91168fc3b57895b0c8651 Mon Sep 17 00:00:00 2001 From: Felix Jentzsch Date: Fri, 7 Mar 2025 11:52:05 +0000 Subject: [PATCH 067/125] Fix clkname variable expansion --- src/finn/transformation/fpgadataflow/make_zynq_proj.py | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/src/finn/transformation/fpgadataflow/make_zynq_proj.py b/src/finn/transformation/fpgadataflow/make_zynq_proj.py index 98372b700f..c6449468cf 100644 --- a/src/finn/transformation/fpgadataflow/make_zynq_proj.py +++ b/src/finn/transformation/fpgadataflow/make_zynq_proj.py @@ -181,9 +181,7 @@ def apply(self, model): # connect clocks/reset config.append( "apply_bd_automation -rule xilinx.com:bd_rule:clkrst -config " - "{ Clk {/zynq_ps/$zynq_ps_clkname} Freq {} " - "Ref_Clk0 {} Ref_Clk1 {} Ref_Clk2 {}} " - "[get_bd_pins axi_interconnect_%d/ACLK]" % (i) + '"Clk /zynq_ps/$zynq_ps_clkname" [get_bd_pins axi_interconnect_%d/ACLK]' % (i) ) master_axilite_idx += 1 total_axilite_count = max(0, 
total_axilite_count - 64) @@ -366,8 +364,7 @@ def apply(self, model): for i in range(1, nested_interconnect_count + 1): config.append( "apply_bd_automation -rule xilinx.com:bd_rule:clkrst -config " - "{ Clk {/zynq_ps/$zynq_ps_clkname} } [get_bd_pins axi_interconnect_%d/M*_ACLK]" - % (i) + '"Clk /zynq_ps/$zynq_ps_clkname" [get_bd_pins axi_interconnect_%d/M*_ACLK]' % (i) ) # create a temporary folder for the project From 8dcf182129813a81221d1ca764f1045a7a24ac09 Mon Sep 17 00:00:00 2001 From: Felix Jentzsch Date: Fri, 7 Mar 2025 12:56:02 +0000 Subject: [PATCH 068/125] [Driver] Reset PYNQ cache before loading overlay --- src/finn/qnn-data/templates/driver/driver_fifosizing.py | 2 ++ src/finn/qnn-data/templates/driver/driver_instrumentation.py | 2 ++ 2 files changed, 4 insertions(+) diff --git a/src/finn/qnn-data/templates/driver/driver_fifosizing.py b/src/finn/qnn-data/templates/driver/driver_fifosizing.py index be1f20156a..778d74b21e 100644 --- a/src/finn/qnn-data/templates/driver/driver_fifosizing.py +++ b/src/finn/qnn-data/templates/driver/driver_fifosizing.py @@ -3,6 +3,7 @@ import os import argparse import matplotlib.pyplot as plt +from pynq import PL from pynq.pl_server.device import Device from driver_instrumentation import FINNInstrumentationOverlay @@ -211,6 +212,7 @@ def determine_start_depth(self, ): print("Programming FPGA..") + PL.reset() # reset PYNQ cache accel = FINNLiveFIFOOverlay(bitfile_name = bitfile, device = device, fclk_mhz = frequency, seed = seed, fifo_widths = fifo_widths) (start_depth, iteration_runtime) = accel.determine_start_depth() diff --git a/src/finn/qnn-data/templates/driver/driver_instrumentation.py b/src/finn/qnn-data/templates/driver/driver_instrumentation.py index 90a0ed5b89..51c85587cf 100644 --- a/src/finn/qnn-data/templates/driver/driver_instrumentation.py +++ b/src/finn/qnn-data/templates/driver/driver_instrumentation.py @@ -2,6 +2,7 @@ import json import time from pynq import Overlay +from pynq import PL from 
pynq.pl_server.device import Device from pynq.ps import Clocks @@ -126,6 +127,7 @@ def observe_instrumentation(self, debug_print=True): # instantiate FINN accelerator driver and pass batchsize and bitfile print("Programming FPGA..") + PL.reset() # reset PYNQ cache accel = FINNInstrumentationOverlay( bitfile_name=bitfile, device=device, fclk_mhz=frequency, seed=seed ) From 8e4a2095235bfdab0e47dfde8a51019143a8bfd0 Mon Sep 17 00:00:00 2001 From: Felix Jentzsch Date: Fri, 7 Mar 2025 14:34:53 +0000 Subject: [PATCH 069/125] Add VGG-10 and MobileNetV1 --- .gitlab-ci.yml | 2 +- benchmarking/bench.py | 4 + benchmarking/cfg/mobilenetv1_test.json | 32 +++++ benchmarking/cfg/vgg10_test.json | 32 +++++ benchmarking/dut/mobilenetv1.py | 160 +++++++++++++++++++++++++ benchmarking/dut/vgg10.py | 53 ++++++++ 6 files changed, 282 insertions(+), 1 deletion(-) create mode 100644 benchmarking/cfg/mobilenetv1_test.json create mode 100644 benchmarking/cfg/vgg10_test.json create mode 100644 benchmarking/dut/mobilenetv1.py create mode 100644 benchmarking/dut/vgg10.py diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 79d772f65d..074bc98f0c 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -214,4 +214,4 @@ Bench: PARENT_PIPELINE_ID: $CI_PIPELINE_ID parallel: matrix: - - BENCH_CFG: [mvau_test, resnet50_test, metafi_test, transformer_test, transformer_radioml_all, synthetic_fifotest] + - BENCH_CFG: [mvau_test, resnet50_test, metafi_test, transformer_test, transformer_radioml_all, synthetic_fifotest, vgg10_test, mobilenetv1_test] diff --git a/benchmarking/bench.py b/benchmarking/bench.py index ea85082fc8..41cfdbbbf7 100644 --- a/benchmarking/bench.py +++ b/benchmarking/bench.py @@ -11,6 +11,8 @@ from dut.metafi import bench_metafi from dut.synthetic_nonlinear import bench_synthetic_nonlinear from dut.transformer import bench_transformer +from dut.vgg10 import bench_vgg10 +from dut.mobilenetv1 import bench_mobilenetv1 dut = dict() dut["mvau"] = bench_mvau @@ -18,6 +20,8 @@ dut["metafi"] = 
bench_metafi dut["synthetic_nonlinear"] = bench_synthetic_nonlinear dut["transformer"] = bench_transformer +dut["vgg10"] = bench_vgg10 +dut["mobilenetv1"] = bench_mobilenetv1 def main(config_name): diff --git a/benchmarking/cfg/mobilenetv1_test.json b/benchmarking/cfg/mobilenetv1_test.json new file mode 100644 index 0000000000..d080638722 --- /dev/null +++ b/benchmarking/cfg/mobilenetv1_test.json @@ -0,0 +1,32 @@ +[ + { + "dut": ["mobilenetv1"], + + "model_path": ["/pc2/groups/hpc-prf-ekiapp/felix/ci_models/MobileNetV1/mobilenetv1-w4a4_pre_post_tidy_opset-11.onnx"], + "folding_path": ["/pc2/groups/hpc-prf-ekiapp/felix/ci_models/MobileNetV1/ZCU102_folding_config.json"], + "specialize_path": ["/pc2/groups/hpc-prf-ekiapp/felix/ci_models/MobileNetV1/ZCU102_specialize_layers.json"], + + "board": ["RFSoC2x2"], + "clock_period_ns": [10], + + "fifo_method": ["manual"], + + "rtlsim_n": [5], + "output_products": [["estimate_reports", "rtlsim_performance", "stitched_ip", "out_of_context_synth", "bitfile", "pynq_driver", "deployment_package"]] + }, + { + "dut": ["mobilenetv1"], + + "model_path": ["/pc2/groups/hpc-prf-ekiapp/felix/ci_models/MobileNetV1/mobilenetv1-w4a4_pre_post_tidy_opset-11.onnx"], + "folding_path": ["/pc2/groups/hpc-prf-ekiapp/felix/ci_models/MobileNetV1/ZCU102_folding_config.json"], + "specialize_path": ["/pc2/groups/hpc-prf-ekiapp/felix/ci_models/MobileNetV1/ZCU102_specialize_layers.json"], + + "board": ["RFSoC2x2"], + "clock_period_ns": [10], + + "fifo_method": ["live"], + + "rtlsim_n": [5], + "output_products": [["bitfile", "pynq_driver", "deployment_package"]] + } +] \ No newline at end of file diff --git a/benchmarking/cfg/vgg10_test.json b/benchmarking/cfg/vgg10_test.json new file mode 100644 index 0000000000..7a6e1a5deb --- /dev/null +++ b/benchmarking/cfg/vgg10_test.json @@ -0,0 +1,32 @@ +[ + { + "dut": ["vgg10"], + + "model_path": ["/pc2/groups/hpc-prf-ekiapp/felix/ci_models/VGG-10/radioml_w4a4_small_tidy.onnx"], + "folding_path": 
["/pc2/groups/hpc-prf-ekiapp/felix/ci_models/VGG-10/ZCU104_folding_config.json"], + "specialize_path": ["/pc2/groups/hpc-prf-ekiapp/felix/ci_models/VGG-10/ZCU104_specialize_layers.json"], + + "board": ["RFSoC2x2"], + "clock_period_ns": [10], + + "fifo_method": ["largefifo_rtlsim"], + + "rtlsim_n": [5], + "output_products": [["estimate_reports", "rtlsim_performance", "stitched_ip", "out_of_context_synth", "bitfile", "pynq_driver", "deployment_package"]] + }, + { + "dut": ["vgg10"], + + "model_path": ["/pc2/groups/hpc-prf-ekiapp/felix/ci_models/VGG-10/radioml_w4a4_small_tidy.onnx"], + "folding_path": ["/pc2/groups/hpc-prf-ekiapp/felix/ci_models/VGG-10/ZCU104_folding_config.json"], + "specialize_path": ["/pc2/groups/hpc-prf-ekiapp/felix/ci_models/VGG-10/ZCU104_specialize_layers.json"], + + "board": ["RFSoC2x2"], + "clock_period_ns": [10], + + "fifo_method": ["live"], + + "rtlsim_n": [5], + "output_products": [["bitfile", "pynq_driver", "deployment_package"]] + } +] \ No newline at end of file diff --git a/benchmarking/dut/mobilenetv1.py b/benchmarking/dut/mobilenetv1.py new file mode 100644 index 0000000000..56b13a6095 --- /dev/null +++ b/benchmarking/dut/mobilenetv1.py @@ -0,0 +1,160 @@ +from qonnx.core.modelwrapper import ModelWrapper +from finn.builder.build_dataflow_config import ( + DataflowBuildConfig, + ShellFlowType, + VerificationStepType, +) +from finn.builder.build_dataflow_steps import verify_step +from finn.transformation.streamline import Streamline +from qonnx.transformation.double_to_single_float import DoubleToSingleFloat +import finn.transformation.streamline.absorb as absorb +import finn.transformation.streamline.reorder as reorder +from qonnx.transformation.infer_data_layouts import InferDataLayouts +from finn.transformation.streamline.collapse_repeated import CollapseRepeatedMul +from qonnx.transformation.remove import RemoveIdentityOps +from finn.transformation.streamline.round_thresholds import RoundAndClipThresholds +from 
qonnx.transformation.lower_convs_to_matmul import LowerConvsToMatMul +from qonnx.transformation.general import ( + GiveReadableTensorNames, + GiveUniqueNodeNames, + ApplyConfig, +) +import finn.transformation.fpgadataflow.convert_to_hw_layers as to_hw +from qonnx.transformation.infer_shapes import InferShapes +from qonnx.transformation.change_datalayout import ChangeDataLayoutQuantAvgPool2d +from qonnx.transformation.infer_datatypes import InferDataTypes + +from bench_base import bench + + +def step_mobilenet_streamline(model: ModelWrapper, cfg: DataflowBuildConfig): + model = model.transform(Streamline()) + additional_streamline_transformations = [ + DoubleToSingleFloat(), + reorder.MoveMulPastDWConv(), + absorb.AbsorbMulIntoMultiThreshold(), + ChangeDataLayoutQuantAvgPool2d(), + InferDataLayouts(), + reorder.MoveTransposePastScalarMul(), + absorb.AbsorbTransposeIntoFlatten(), + reorder.MoveFlattenPastAffine(), + reorder.MoveFlattenPastTopK(), + reorder.MoveScalarMulPastMatMul(), + CollapseRepeatedMul(), + RemoveIdentityOps(), + RoundAndClipThresholds(), + ] + for trn in additional_streamline_transformations: + model = model.transform(trn) + model = model.transform(GiveUniqueNodeNames()) + model = model.transform(GiveReadableTensorNames()) + model = model.transform(InferDataTypes()) + + if VerificationStepType.STREAMLINED_PYTHON in cfg._resolve_verification_steps(): + verify_step(model, cfg, "streamlined_python", need_parent=False) + + return model + +def step_mobilenet_lower_convs(model: ModelWrapper, cfg: DataflowBuildConfig): + model = model.transform(LowerConvsToMatMul()) + model = model.transform(absorb.AbsorbTransposeIntoMultiThreshold()) + model = model.transform(absorb.AbsorbConsecutiveTransposes()) + model = model.transform(GiveUniqueNodeNames()) + model = model.transform(GiveReadableTensorNames()) + model = model.transform(InferDataTypes()) + model = model.transform(RoundAndClipThresholds()) + model = model.transform(InferDataLayouts()) + return model + 
+def step_mobilenet_convert_to_hw_layers(model: ModelWrapper, cfg: DataflowBuildConfig): + model = model.transform(to_hw.InferPool()) + model = model.transform(to_hw.InferConvInpGen()) + model = model.transform(to_hw.InferVectorVectorActivation()) + model = model.transform(to_hw.InferQuantizedMatrixVectorActivation()) + model = model.transform(to_hw.InferChannelwiseLinearLayer()) + model = model.transform(to_hw.InferLabelSelectLayer()) + model = model.transform(InferShapes()) + model = model.transform(GiveUniqueNodeNames()) + model = model.transform(GiveReadableTensorNames()) + return model + +def step_mobilenet_slr_floorplan(model: ModelWrapper, cfg: DataflowBuildConfig): + if cfg.shell_flow_type == ShellFlowType.VITIS_ALVEO: + try: + from finnexperimental.analysis.partitioning import partition + + # apply partitioning of the model, restricting the first and last layers + # to SLR0 + default_slr = 0 + abs_anchors = [(0, [default_slr]), (-1, [default_slr])] + floorplan = partition( + model, + cfg.synth_clk_period_ns, + cfg.board, + abs_anchors=abs_anchors, + multivariant=False, + )[0] + # apply floorplan to model + model = model.transform(ApplyConfig(floorplan)) + print("SLR floorplanning applied") + except Exception: + print("No SLR floorplanning applied") + return model + +def step_mobilenet_convert_to_hw_layers_separate_th(model: ModelWrapper, cfg: DataflowBuildConfig): + model = model.transform(to_hw.InferPool()) + model = model.transform(to_hw.InferConvInpGen()) + model = model.transform(to_hw.InferThresholdingLayer()) + model = model.transform(to_hw.InferVectorVectorActivation()) + model = model.transform(to_hw.InferQuantizedMatrixVectorActivation()) + model = model.transform(to_hw.InferChannelwiseLinearLayer()) + model = model.transform(to_hw.InferLabelSelectLayer()) + model = model.transform(InferShapes()) + model = model.transform(GiveUniqueNodeNames()) + model = model.transform(GiveReadableTensorNames()) + return model + +class bench_mobilenetv1(bench): + 
def step_build_setup(self): + # create build config for MobileNetV1 (based on finn-examples) + mobilenet_build_steps = [ + step_mobilenet_streamline, + step_mobilenet_lower_convs, + step_mobilenet_convert_to_hw_layers_separate_th, + "step_create_dataflow_partition", + "step_specialize_layers", + "step_apply_folding_config", + "step_minimize_bit_width", + "step_generate_estimate_reports", + "step_hw_codegen", + "step_hw_ipgen", + "step_set_fifo_depths", + "step_create_stitched_ip", + "step_synthesize_bitfile", + "step_make_pynq_driver", + "step_deployment_package", + ] + # mobilenet_build_steps_alveo = [ + # step_mobilenet_streamline, + # step_mobilenet_lower_convs, + # step_mobilenet_convert_to_hw_layers, + # "step_create_dataflow_partition", + # "step_specialize_layers", + # "step_apply_folding_config", + # "step_minimize_bit_width", + # "step_generate_estimate_reports", + # "step_hw_codegen", + # "step_hw_ipgen", + # "step_set_fifo_depths", + # "step_create_stitched_ip", + # step_mobilenet_slr_floorplan, + # "step_synthesize_bitfile", + # "step_make_pynq_driver", + # "step_deployment_package", + # ] + + cfg = DataflowBuildConfig( + steps=mobilenet_build_steps, + ) + + return cfg diff --git a/benchmarking/dut/vgg10.py b/benchmarking/dut/vgg10.py new file mode 100644 index 0000000000..e64a58fb2f --- /dev/null +++ b/benchmarking/dut/vgg10.py @@ -0,0 +1,53 @@ +from qonnx.core.modelwrapper import ModelWrapper +from finn.builder.build_dataflow_config import DataflowBuildConfig +from qonnx.transformation.change_3d_tensors_to_4d import Change3DTo4DTensors +from qonnx.transformation.general import GiveUniqueNodeNames +import finn.transformation.fpgadataflow.convert_to_hw_layers as to_hw +import finn.transformation.streamline.absorb as absorb + +from bench_base import bench + + +def step_pre_streamline(model: ModelWrapper, cfg: DataflowBuildConfig): + model = model.transform(Change3DTo4DTensors()) + model = model.transform(absorb.AbsorbScalarMulAddIntoTopK()) + return 
model + +def step_convert_final_layers(model: ModelWrapper, cfg: DataflowBuildConfig): + model = model.transform(to_hw.InferChannelwiseLinearLayer()) + model = model.transform(to_hw.InferLabelSelectLayer()) + model = model.transform(GiveUniqueNodeNames()) + return model + +class bench_vgg10(bench): + def step_build_setup(self): + # create build config for VGG-10 (based on finn-examples) + vgg10_build_steps = [ + "step_tidy_up", + step_pre_streamline, + "step_streamline", + "step_convert_to_hw", + step_convert_final_layers, + "step_create_dataflow_partition", + "step_specialize_layers", + "step_target_fps_parallelization", + "step_apply_folding_config", + "step_minimize_bit_width", + "step_generate_estimate_reports", + "step_set_fifo_depths", + "step_hw_codegen", + "step_hw_ipgen", + "step_create_stitched_ip", + "step_measure_rtlsim_performance", + "step_out_of_context_synthesis", + "step_synthesize_bitfile", + "step_make_pynq_driver", + "step_deployment_package", + ] + + cfg = DataflowBuildConfig( + steps=vgg10_build_steps, + standalone_thresholds=True, + ) + + return cfg From 68c41b8d5366f4809b07f11ab53f843e50afafcc Mon Sep 17 00:00:00 2001 From: Felix Jentzsch Date: Fri, 7 Mar 2025 17:17:33 +0000 Subject: [PATCH 070/125] Fix variable named range --- benchmarking/dut/transformer.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/benchmarking/dut/transformer.py b/benchmarking/dut/transformer.py index 819b9b5fa2..ec737ce6b8 100644 --- a/benchmarking/dut/transformer.py +++ b/benchmarking/dut/transformer.py @@ -867,8 +867,8 @@ def step_build_setup(self): # Read the input value range information for the dataset from the parameters # Note: Consider calibrating this on the fly from the dataset - range = [ -100, +100 ] # params["build"]["range"] # TODO: make configurable? - input_range = tuple(np.array([range]).T) + value_range = [ -100, +100 ] # params["build"]["range"] # TODO: make configurable? 
+ input_range = tuple(np.array([value_range]).T) # Construct the seed range information of the input tensor range_info = RangeInfo(shape=(1, seq_len, emb_dim), range=input_range) From 9884beff238d6a2a7d52f449b2600797fecf329a Mon Sep 17 00:00:00 2001 From: Felix Jentzsch Date: Sat, 8 Mar 2025 17:39:30 +0000 Subject: [PATCH 071/125] Reduce benchmark parallelism, force push exp --- .gitlab-ci.yml | 2 +- benchmarking/bench-ci.yml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 074bc98f0c..6b8e8369b8 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -20,7 +20,7 @@ variables: value: "64" CPU_CORES_BENCH: description: "Select number of CPU cores for benchmark runs" - value: "32" + value: "8" PARALLEL_JOBS: description: "Number of parallel Slurm array jobs per Benchmark job" value: "2" diff --git a/benchmarking/bench-ci.yml b/benchmarking/bench-ci.yml index 99adf1e0dc..73b91508d7 100644 --- a/benchmarking/bench-ci.yml +++ b/benchmarking/bench-ci.yml @@ -73,4 +73,4 @@ Result Collection: - when: always script: - python3.10 benchmarking/collect.py - - dvc exp push -r push git@github.com:eki-project/finn-plus.git + - dvc exp push -f -j 4 -r push git@github.com:eki-project/finn-plus.git From ef7b8cf5bb124b440129a4197a9a83b064d99397 Mon Sep 17 00:00:00 2001 From: Felix Jentzsch Date: Sat, 8 Mar 2025 18:10:59 +0000 Subject: [PATCH 072/125] Fix FIFO width export for driver --- .../templates/driver/driver_fifosizing.py | 222 ++++++++++++------ .../fpgadataflow/make_pynq_driver.py | 13 +- 2 files changed, 155 insertions(+), 80 deletions(-) diff --git a/src/finn/qnn-data/templates/driver/driver_fifosizing.py b/src/finn/qnn-data/templates/driver/driver_fifosizing.py index 560959991f..5aa116ebac 100644 --- a/src/finn/qnn-data/templates/driver/driver_fifosizing.py +++ b/src/finn/qnn-data/templates/driver/driver_fifosizing.py @@ -1,27 +1,32 @@ -import time -import json -import os import argparse -import matplotlib as mpl 
+import json import matplotlib.pyplot as plt -import numpy as np -from pynq.pl_server.device import Device - +import os +import sys +import time from driver_instrumentation import FINNInstrumentationOverlay +from pynq.pl_server.device import Device class FINNLiveFIFOOverlay(FINNInstrumentationOverlay): def __init__( self, bitfile_name, - platform = "zynq", - fclk_mhz = 100.0, - device = None, - download = True, - seed = 1, - fifo_widths = {}, + platform="zynq", + fclk_mhz=100.0, + device=None, + download=True, + seed=1, + fifo_widths=dict(), ): - super().__init__(bitfile_name, platform = platform, fclk_mhz = fclk_mhz, seed = seed, download = download, device = device) + super().__init__( + bitfile_name, + platform=platform, + fclk_mhz=fclk_mhz, + seed=seed, + download=download, + device=device, + ) self.error = False self.fifo_widths = fifo_widths @@ -33,9 +38,13 @@ def __init__( # We expect 3 AXI-Lite peripherals next to the virtual FIFOs: instrumentation_wrap_0, axi_gpio_0 (for reset), zynq_ps # We don't expect any additional FINN SDPs with AXI-Lite interface, such as runtime-writable weights if (len(self.ip_dict.keys()) - 3) != self.num_fifos: + print( + "Error: Number of expected FIFOs (%d) doesn't match number of AXI-Lite interfaces (%d)" + % (self.num_fifos, len(self.ip_dict.keys()) - 3) + ) self.error = True - def configure_fifo(self, i, mode, depth = 2): + def configure_fifo(self, i, mode, depth=2): ### Virtual FIFO register map ### mode_offset = 0x10 depth_offset = 0x18 @@ -45,43 +54,51 @@ def configure_fifo(self, i, mode, depth = 2): max_occupancy_ctrl_offset = 0x34 ip_name = "StreamingDataflowPartition_%d" % i - getattr(self, ip_name).write(offset=mode_offset, value = mode) - getattr(self, ip_name).write(offset=depth_offset, value = depth) + getattr(self, ip_name).write(offset=mode_offset, value=mode) + getattr(self, ip_name).write(offset=depth_offset, value=depth) def total_fifo_size(self, depths): # Assuming FIFO SDP/AXI-Lite interfaces are ordered 
consistently with FIFO IDs total_size_bits = 0 for i, depth in enumerate(depths): - total_size_bits += (depth + self.fifo_depth_offset) * self.fifo_widths["StreamingFIFO_hls_%d" % i] + total_size_bits += (depth + self.fifo_depth_offset) * self.fifo_widths[i] total_size_kB = total_size_bits / 8.0 / 1000.0 return total_size_kB - - def size_iteratively(self, start_depth, iteration_runtime, reduction_factor = 0.5): + + def size_iteratively(self, start_depth, iteration_runtime, reduction_factor=0.5): ### Iterative FIFO-sizing function ### fifo_minimum_reached = [False] * self.num_fifos - + if isinstance(start_depth, list): # Individual start depth for each FIFO has been supplied fifo_depths = start_depth else: # Initialize all depths to the same start depth fifo_depths = [start_depth] * self.num_fifos - + # Reset accelerator and configure FIFOs self.reset_accelerator() for i in range(0, self.num_fifos): - self.configure_fifo(i, mode = 1, depth = fifo_depths[i]) + self.configure_fifo(i, mode=1, depth=fifo_depths[i]) # Run once to determine target interval self.start_accelerator() time.sleep(1) - (overflow_err, underflow_err, frame, checksum, min_latency, latency, interval) = self.observe_instrumentation(False) + ( + overflow_err, + underflow_err, + frame, + checksum, + min_latency, + latency, + interval, + ) = self.observe_instrumentation(False) log_total_fifo_size = [int(self.total_fifo_size(fifo_depths))] log_interval = [interval] log_min_latency = [min_latency] log_latency = [latency] target_interval = interval - + # Iteratively reduce FIFO depth until all FIFOs are minimized iteration = 0 start_time = time.time() @@ -96,7 +113,7 @@ def size_iteratively(self, start_depth, iteration_runtime, reduction_factor = 0. 
# Configure all FIFOs for i in range(0, self.num_fifos): - self.configure_fifo(i, mode = 1, depth = fifo_depths[i]) + self.configure_fifo(i, mode=1, depth=fifo_depths[i]) # Start accelerator self.start_accelerator() @@ -104,8 +121,16 @@ def size_iteratively(self, start_depth, iteration_runtime, reduction_factor = 0. # Let it run time.sleep(iteration_runtime) - # Check if throughput dropped or deadlock occured - (overflow_err, underflow_err, frame, checksum, min_latency, latency, interval) = self.observe_instrumentation(False) + # Check if throughput dropped or deadlock occured + ( + overflow_err, + underflow_err, + frame, + checksum, + min_latency, + latency, + interval, + ) = self.observe_instrumentation(False) if interval > target_interval or interval == 0 or overflow_err or underflow_err: # Revert depth reduction and mark FIFO as minimized @@ -115,7 +140,7 @@ def size_iteratively(self, start_depth, iteration_runtime, reduction_factor = 0. log_total_fifo_size.append(int(self.total_fifo_size(fifo_depths))) log_interval.append(interval) log_min_latency.append(min_latency) - log_latency.append(latency) + log_latency.append(latency) if fifo_depths[fifo_id] == 1: fifo_minimum_reached[fifo_id] = True @@ -133,9 +158,18 @@ def size_iteratively(self, start_depth, iteration_runtime, reduction_factor = 0. 
duration = int(end_time - start_time) print("Done (%d seconds)" % duration) - return fifo_depths, log_total_fifo_size, log_interval, log_min_latency, log_latency, duration + return ( + fifo_depths, + log_total_fifo_size, + log_interval, + log_min_latency, + log_latency, + duration, + ) - def determine_start_depth(self, ): + def determine_start_depth( + self, + ): ### Attempt to determine start depth for all FIFOs automatically ### # If it doesn't find a working setting, start depth must be set manually, potentially on per-FIFO basis start_depth = 64 @@ -148,15 +182,28 @@ def determine_start_depth(self, ): # Configure FIFOs for i in range(0, self.num_fifos): - self.configure_fifo(i, mode = 1, depth = start_depth) - + self.configure_fifo(i, mode=1, depth=start_depth) + # Start accelerator and let it run for a long time self.start_accelerator() time.sleep(1) - + # Examine performance - (overflow_err, underflow_err, frame, checksum, min_latency, latency, interval) = self.observe_instrumentation() - if interval > 0 and interval == last_interval and not overflow_err and not underflow_err: + ( + overflow_err, + underflow_err, + frame, + checksum, + min_latency, + latency, + interval, + ) = self.observe_instrumentation() + if ( + interval > 0 + and interval == last_interval + and not overflow_err + and not underflow_err + ): # Accelerator runs with stable interval, reset to previous start depth start_depth_found = True start_depth = last_start_depth @@ -164,13 +211,13 @@ def determine_start_depth(self, ): # Start depth is still too small, increase for next try last_start_depth = start_depth start_depth = start_depth * 2 - + last_interval = interval if start_depth > 1000000: print("Couldn't find a working start depth, please set manually") self.error = True - + # Determine runtime per iteration based on performance, so that stable-state is guaranteed # Use a simple overestimation for now to be safe iteration_runtime = max(0.01, (min_latency * 5) * 10 / 1000 / 1000 / 1000) 
@@ -179,15 +226,27 @@ def determine_start_depth(self, ): print("Determined iteration runtime based on performance: %f s" % iteration_runtime) return (start_depth, iteration_runtime) + if __name__ == "__main__": - parser = argparse.ArgumentParser(description='Profile performance of FINN-generated accelerator using instrumentation wrapper') - parser.add_argument('--runtime', help='Runtime in seconds', type=int, default=10) - parser.add_argument('--frequency', help='FPGA clock frequency in MHz', type=float, default=100.0) - parser.add_argument('--seed', help='LFSR seed for input data generation', type=int, default=1) - parser.add_argument('--device', help='FPGA device to be used', type=int, default=0) - parser.add_argument('--bitfile', help='Name of bitfile', default="finn-accel.bit") - parser.add_argument('--reportfile', help='Name of output .json report file', type=str, default="measured_performance.json") - parser.add_argument('--settingsfile', help='Name of optional input .json settings file', type=str, default="") + parser = argparse.ArgumentParser( + description="Profile performance of FINN-generated accelerator using instrumentation wrapper" + ) + parser.add_argument("--runtime", help="Runtime in seconds", type=int, default=10) + parser.add_argument( + "--frequency", help="FPGA clock frequency in MHz", type=float, default=100.0 + ) + parser.add_argument("--seed", help="LFSR seed for input data generation", type=int, default=1) + parser.add_argument("--device", help="FPGA device to be used", type=int, default=0) + parser.add_argument("--bitfile", help="Name of bitfile", default="finn-accel.bit") + parser.add_argument( + "--reportfile", + help="Name of output .json report file", + type=str, + default="measured_performance.json", + ) + parser.add_argument( + "--settingsfile", help="Name of optional input .json settings file", type=str, default="" + ) # parse arguments args = parser.parse_args() runtime = args.runtime @@ -208,58 +267,67 @@ def 
determine_start_depth(self, ): frequency = settings["fclk_mhz"] # For live FIFO-sizing, we also expect a fifo_widths.json file exported by FINN listing the width of each FIFO, e.g., - # {'fifo_widths': {'StreamingFIFO_hls_0': 8, 'StreamingFIFO_hls_1': 32, 'StreamingFIFO_hls_2': 24}} + # {'fifo_widths': {0: 8, 1: 32, 2: 24}} fifo_widths = settings["fifo_widths"] - print("Programming FPGA..") - accel = FINNLiveFIFOOverlay(bitfile_name = bitfile, device = device, fclk_mhz = frequency, seed = seed, fifo_widths = fifo_widths) - + accel = FINNLiveFIFOOverlay( + bitfile_name=bitfile, device=device, fclk_mhz=frequency, seed=seed, fifo_widths=fifo_widths + ) + if accel.error: + print("Error: Accelerator initialization failed.") + sys.exit(1) + + print("Determining start depth..") (start_depth, iteration_runtime) = accel.determine_start_depth() ### First pass print("Starting first pass..") pass1_result = accel.size_iteratively(start_depth, iteration_runtime) - (fifo_depths, - log_total_fifo_size, - log_interval, - log_min_latency, - log_latency, - duration) = pass1_result + ( + fifo_depths, + log_total_fifo_size, + log_interval, + log_min_latency, + log_latency, + duration, + ) = pass1_result ### Visualize results and save as "fifo_sizing_graph.png" fig, ax1 = plt.subplots() - color = 'tab:red' - ax1.set_xlabel('Iteration') - ax1.set_ylabel('Total FIFO Size [kB]', color=color) + color = "tab:red" + ax1.set_xlabel("Iteration") + ax1.set_ylabel("Total FIFO Size [kB]", color=color) ax1.plot(range(len(log_total_fifo_size)), log_total_fifo_size, color=color) - ax1.tick_params(axis='y', labelcolor=color) + ax1.tick_params(axis="y", labelcolor=color) ax1.set_ylim(0, max(log_total_fifo_size)) - - ax2 = ax1.twinx() # instantiate a second axes that shares the same x-axis - color = 'tab:blue' - ax2.set_ylabel('Latency [cycles]', color=color) + ax2 = ax1.twinx() # instantiate a second axes that shares the same x-axis + + color = "tab:blue" + ax2.set_ylabel("Latency [cycles]", 
color=color) ax2.plot(range(len(log_total_fifo_size)), log_latency, color=color) - ax2.tick_params(axis='y', labelcolor=color) - #ax2.set_ylim(0, max(log_latency)) + ax2.tick_params(axis="y", labelcolor=color) + # ax2.set_ylim(0, max(log_latency)) ax2.axhline(log_min_latency[0], color="green", label="Minimum (1st frame) Latency") ax2.legend() plt.tight_layout() - plt.savefig(os.path.join(report_dir, "fifo_sizing_graph.png"), dpi = 300) + plt.savefig(os.path.join(report_dir, "fifo_sizing_graph.png"), dpi=300) ### Second pass for fine-tuning print("Starting second pass..") - pass2_result = accel.size_iteratively(fifo_depths, iteration_runtime, reduction_factor = 0.95) - (fifo_depths, - log_total_fifo_size, - log_interval, - log_min_latency, - log_latency, - duration) = pass2_result + pass2_result = accel.size_iteratively(fifo_depths, iteration_runtime, reduction_factor=0.95) + ( + fifo_depths, + log_total_fifo_size, + log_interval, + log_min_latency, + log_latency, + duration, + ) = pass2_result ### Generate fifo_sizing_report.json fifo_report = { @@ -283,7 +351,7 @@ def determine_start_depth(self, ): }, } for fifo, depth in enumerate(fifo_depths): - size = (depth + accel.fifo_depth_offset) * accel.fifo_widths["StreamingFIFO_hls_%d" % fifo] + size = (depth + accel.fifo_depth_offset) * accel.fifo_widths[fifo] fifo_report["fifo_depths"][fifo] = depth + accel.fifo_depth_offset fifo_report["fifo_sizes"][fifo] = size with open(os.path.join(report_dir, "fifo_sizing_report.json"), "w") as f: @@ -312,9 +380,9 @@ def determine_start_depth(self, ): "latency_ms": round(latency * (1 / (accel.fclk_mhz_actual * 1e6)) * 1e3, 6), "throughput_fps": round(1 / (interval * (1 / (accel.fclk_mhz_actual * 1e6)))), "min_pipeline_depth": round(min_latency / interval, 2), - "pipeline_depth" : round(latency / interval, 2), + "pipeline_depth": round(latency / interval, 2), } with open(reportfile, "w") as f: json.dump(report, f, indent=2) - print("Done.") \ No newline at end of file + 
print("Done.") diff --git a/src/finn/transformation/fpgadataflow/make_pynq_driver.py b/src/finn/transformation/fpgadataflow/make_pynq_driver.py index 9ccc0e08f8..c18adb8d14 100644 --- a/src/finn/transformation/fpgadataflow/make_pynq_driver.py +++ b/src/finn/transformation/fpgadataflow/make_pynq_driver.py @@ -346,10 +346,17 @@ def apply(self, model): } if self.live_fifo_sizing: # export FIFO widths to the settings file as well + # at this stage, the FIFOs are already wrapped in StreamingDataflowPartitions fifo_widths = {} - for node in model.get_nodes_by_op_type("StreamingFIFO_hls"): - node_inst = getCustomOp(node) - fifo_widths[node.name] = node_inst.get_instream_width() + for sdp_node in model.get_nodes_by_op_type("StreamingDataflowPartition"): + sdp_node_inst = getCustomOp(sdp_node) + sdp_id = sdp_node_inst.get_nodeattr("partition_id") + dataflow_model_filename = sdp_node_inst.get_nodeattr("model") + kernel_model = ModelWrapper(dataflow_model_filename) + for node in kernel_model.graph.node: + if node.op_type.startswith("StreamingFIFO"): + node_inst = getCustomOp(node) + fifo_widths[sdp_id] = node_inst.get_instream_width() settings["fifo_widths"] = fifo_widths settingsfile = pynq_driver_dir + "/settings.json" From 4cfc32adb6867e757f8a7f77a9b0d57e06f48b20 Mon Sep 17 00:00:00 2001 From: Felix Jentzsch Date: Sat, 8 Mar 2025 18:26:33 +0000 Subject: [PATCH 073/125] Transformer: disable cppsim for virtual fifosizing --- benchmarking/dut/transformer.py | 7 +++++-- benchmarking/dut/transformer_custom_steps.py | 22 +++++++++++++++++++- 2 files changed, 26 insertions(+), 3 deletions(-) diff --git a/benchmarking/dut/transformer.py b/benchmarking/dut/transformer.py index ec737ce6b8..1798ea1410 100644 --- a/benchmarking/dut/transformer.py +++ b/benchmarking/dut/transformer.py @@ -908,7 +908,7 @@ def step_build_setup(self): # converting to HLS build_cfg.VerificationStepType.TIDY_UP_PYTHON, # Verify the model after generating C++ HLS and applying folding - 
build_cfg.VerificationStepType.FOLDED_HLS_CPPSIM, + #build_cfg.VerificationStepType.FOLDED_HLS_CPPSIM, #only inserted if live FIFO-sizing is off # No RTL Simulation support for now ], # File with test inputs for verification @@ -963,7 +963,7 @@ def step_build_setup(self): # model before creating the stitched IP # Note: end-to-end verification of the stitched IP in RTL simulation # is still not possible due to missing float IPs - node_by_node_cppsim, + #node_by_node_cppsim, #only inserted if live FIFO-sizing is off # Only for debugging for now, does not work if "vivado" style # StreamingFIFOs are used # node_by_node_rtlsim, @@ -987,5 +987,8 @@ def step_build_setup(self): for i in range(len(cfg.steps)): if cfg.steps[i] == "step_hw_ipgen": cfg.steps.insert(i+1, set_fifo_depths(seq_len, emb_dim, uram_threshold=seq_len)) + # also enable cppsim, which doesn't work with virtual FIFOs + cfg.steps.insert(i+2, node_by_node_cppsim) + cfg.verify_steps.append(build_cfg.VerificationStepType.FOLDED_HLS_CPPSIM) return cfg diff --git a/benchmarking/dut/transformer_custom_steps.py b/benchmarking/dut/transformer_custom_steps.py index 4ff497b892..5b0d39c756 100644 --- a/benchmarking/dut/transformer_custom_steps.py +++ b/benchmarking/dut/transformer_custom_steps.py @@ -11,6 +11,8 @@ # YAML for loading experiment configurations import yaml +import json + # QONNX quantization data types from qonnx.core.datatype import DataType @@ -616,7 +618,8 @@ def step_set_fifo_depths(model: ModelWrapper, cfg: DataflowBuildConfig): del config[node.name] # Create/Open a YAML file to store the configuration for later reuse - with open(cfg.output_dir + "/final_hw_config.yaml", "w") as file: + # TODO: make consistent with .json report in default step + with open(cfg.output_dir + "/report/final_hw_config.yaml", "w") as file: # Store the configuration dictionary as YAML code yaml.safe_dump(config, file) @@ -628,6 +631,23 @@ def step_set_fifo_depths(model: ModelWrapper, cfg: DataflowBuildConfig): model = 
model.transform(SplitLargeFIFOs()) model = model.transform(RemoveShallowFIFOs()) + # generate a dedicated report about final FIFO sizes + fifo_info = {} + fifo_info["fifo_depths"] = {} + fifo_info["fifo_sizes"] = {} + total_fifo_size = 0 + for node in model.get_nodes_by_op_type("StreamingFIFO_rtl"): + node_inst = getCustomOp(node) + fifo_info["fifo_depths"][node.name] = node_inst.get_nodeattr("depth") + fifo_info["fifo_sizes"][ + node.name + ] = node_inst.get_instream_width() * node_inst.get_nodeattr("depth") + total_fifo_size += fifo_info["fifo_sizes"][node.name] + fifo_info["total_fifo_size_kB"] = int(total_fifo_size / 8.0 / 1000.0) + + with open(cfg.output_dir + "/report/fifo_sizing.json", "w") as f: + json.dump(fifo_info, f, indent=2) + # After FIFOs are ready to go, call PrepareIP and HLSSynthIP again # this will only run for the new nodes (e.g. FIFOs and DWCs) model = model.transform( From b0fb5f258c984f1f30aea20cefbed1f01b5a27e1 Mon Sep 17 00:00:00 2001 From: Felix Jentzsch Date: Sun, 9 Mar 2025 10:29:39 +0000 Subject: [PATCH 074/125] [Driver] Reset PYNQ cache before loading Overlay --- src/finn/qnn-data/templates/driver/driver_instrumentation.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/finn/qnn-data/templates/driver/driver_instrumentation.py b/src/finn/qnn-data/templates/driver/driver_instrumentation.py index 90a0ed5b89..aa5225eab6 100644 --- a/src/finn/qnn-data/templates/driver/driver_instrumentation.py +++ b/src/finn/qnn-data/templates/driver/driver_instrumentation.py @@ -1,7 +1,7 @@ import argparse import json import time -from pynq import Overlay +from pynq import PL, Overlay from pynq.pl_server.device import Device from pynq.ps import Clocks @@ -126,6 +126,7 @@ def observe_instrumentation(self, debug_print=True): # instantiate FINN accelerator driver and pass batchsize and bitfile print("Programming FPGA..") + PL.reset() # reset PYNQ cache accel = FINNInstrumentationOverlay( bitfile_name=bitfile, device=device, 
fclk_mhz=frequency, seed=seed ) From a08e2c4e12be4fa532f6b81ff428142dfaa757cd Mon Sep 17 00:00:00 2001 From: Felix Jentzsch Date: Sun, 9 Mar 2025 10:33:44 +0000 Subject: [PATCH 075/125] [Driver] Reset PYNQ cache, fix json int keys --- src/finn/qnn-data/templates/driver/driver_fifosizing.py | 6 ++++-- src/finn/transformation/fpgadataflow/make_pynq_driver.py | 3 ++- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/src/finn/qnn-data/templates/driver/driver_fifosizing.py b/src/finn/qnn-data/templates/driver/driver_fifosizing.py index 5aa116ebac..fc50314cf3 100644 --- a/src/finn/qnn-data/templates/driver/driver_fifosizing.py +++ b/src/finn/qnn-data/templates/driver/driver_fifosizing.py @@ -5,6 +5,7 @@ import sys import time from driver_instrumentation import FINNInstrumentationOverlay +from pynq import PL from pynq.pl_server.device import Device @@ -61,7 +62,7 @@ def total_fifo_size(self, depths): # Assuming FIFO SDP/AXI-Lite interfaces are ordered consistently with FIFO IDs total_size_bits = 0 for i, depth in enumerate(depths): - total_size_bits += (depth + self.fifo_depth_offset) * self.fifo_widths[i] + total_size_bits += (depth + self.fifo_depth_offset) * self.fifo_widths[str(i)] total_size_kB = total_size_bits / 8.0 / 1000.0 return total_size_kB @@ -271,6 +272,7 @@ def determine_start_depth( fifo_widths = settings["fifo_widths"] print("Programming FPGA..") + PL.reset() # reset PYNQ cache accel = FINNLiveFIFOOverlay( bitfile_name=bitfile, device=device, fclk_mhz=frequency, seed=seed, fifo_widths=fifo_widths ) @@ -351,7 +353,7 @@ def determine_start_depth( }, } for fifo, depth in enumerate(fifo_depths): - size = (depth + accel.fifo_depth_offset) * accel.fifo_widths[fifo] + size = (depth + accel.fifo_depth_offset) * accel.fifo_widths[str(fifo)] fifo_report["fifo_depths"][fifo] = depth + accel.fifo_depth_offset fifo_report["fifo_sizes"][fifo] = size with open(os.path.join(report_dir, "fifo_sizing_report.json"), "w") as f: diff --git 
a/src/finn/transformation/fpgadataflow/make_pynq_driver.py b/src/finn/transformation/fpgadataflow/make_pynq_driver.py index c18adb8d14..e7c947192a 100644 --- a/src/finn/transformation/fpgadataflow/make_pynq_driver.py +++ b/src/finn/transformation/fpgadataflow/make_pynq_driver.py @@ -350,7 +350,8 @@ def apply(self, model): fifo_widths = {} for sdp_node in model.get_nodes_by_op_type("StreamingDataflowPartition"): sdp_node_inst = getCustomOp(sdp_node) - sdp_id = sdp_node_inst.get_nodeattr("partition_id") + # JSON doesn't support int keys + sdp_id = str(sdp_node_inst.get_nodeattr("partition_id")) dataflow_model_filename = sdp_node_inst.get_nodeattr("model") kernel_model = ModelWrapper(dataflow_model_filename) for node in kernel_model.graph.node: From c0fcb10ab19a7ea3f22e132ac898d6fec8355cc6 Mon Sep 17 00:00:00 2001 From: Felix Jentzsch Date: Sun, 9 Mar 2025 11:29:14 +0000 Subject: [PATCH 076/125] Improve error propagation --- benchmarking/bench.py | 14 +++++++++++--- benchmarking/measure.py | 14 +++++++++++--- 2 files changed, 22 insertions(+), 6 deletions(-) diff --git a/benchmarking/bench.py b/benchmarking/bench.py index 41cfdbbbf7..3d0a575057 100644 --- a/benchmarking/bench.py +++ b/benchmarking/bench.py @@ -151,7 +151,7 @@ def get_default_session_options_new(): print("Run skipped") else: log_dict["status"] = "ok" - print("Run completed") + print("Run successfully completed") except Exception: log_dict["status"] = "failed" print("Run failed: " + traceback.format_exc()) @@ -159,6 +159,16 @@ def get_default_session_options_new(): log_dict["output"] = bench_object.output_dict + # examine status reported by builder (which catches all exceptions before they reach us here) + # we could also fail the pipeline if functional verification fails (TODO) + builder_log_path = os.path.join(bench_object.report_dir, "metadata_builder.json") + if os.path.isfile(builder_log_path): + with open(builder_log_path, "r") as f: + builder_log = json.load(f) + if builder_log["status"] == 
"failed": + print("Run failed (builder reported failure)") + exit_code = 1 + # log metadata of this run to its own report directory log_path = os.path.join(bench_object.report_dir, "metadata_bench.json") with open(log_path, "w") as f: @@ -169,8 +179,6 @@ def get_default_session_options_new(): # save local artifacts of this run (e.g., full build dir, detailed debug info) bench_object.save_local_artifacts_collection() - #TODO: examine verification result and builder status here to fail pipeline via exit code? - print("Stopping job") return exit_code diff --git a/benchmarking/measure.py b/benchmarking/measure.py index 3accb734b9..d0e5a64aa8 100644 --- a/benchmarking/measure.py +++ b/benchmarking/measure.py @@ -1,4 +1,5 @@ import os +import sys import subprocess import shutil @@ -6,6 +7,7 @@ if __name__ == "__main__": + exit_code = 0 print("Looking for deployment packages in artifacts..") # Find deployment packages from artifacts artifacts_in_dir = os.path.join("build_artifacts", "runs_output") @@ -24,12 +26,16 @@ # Run driver print("Running driver..") - subprocess.run(["python", f"{extract_dir}/driver/driver.py", + result = subprocess.run(["python", f"{extract_dir}/driver/driver.py", "--bitfile", f"{extract_dir}/bitfile/finn-accel.bit", "--settingsfile", f"{extract_dir}/driver/settings.json", "--reportfile", f"{extract_dir}/measured_performance.json", - ]) - print("Driver finished.") + ]) + if result.returncode != 0: + print("Driver reported error!") + exit_code = 1 + else: + print("Driver finished successfully.") # Copy results back to artifact directory for report in ["measured_performance.json", @@ -47,3 +53,5 @@ # Clear temporary dir delete_dir_contents(extract_dir) print("Done.") + print("Processed all deployment packages.") + sys.exit(exit_code) From f8bf6e7b20c8515f1126dfb493c531b26bb133a6 Mon Sep 17 00:00:00 2001 From: Felix Jentzsch Date: Tue, 11 Mar 2025 12:10:55 +0000 Subject: [PATCH 077/125] Zip debug artifacts --- benchmarking/bench_base.py | 2 +- 1 file 
changed, 1 insertion(+), 1 deletion(-) diff --git a/benchmarking/bench_base.py b/benchmarking/bench_base.py index 5f828ca4e4..9f6689dcd3 100644 --- a/benchmarking/bench_base.py +++ b/benchmarking/bench_base.py @@ -169,7 +169,7 @@ def __init__(self, params, task_id, run_id, artifacts_dir, save_dir, debug=True) if self.debug: # Save entire FINN build dir and working dir # TODO: add option to only save upon exception (in FINN builder or benchmarking infrastructure) - self.local_artifacts_collection.append(("debug_finn_tmp", os.environ["FINN_BUILD_DIR"], False)) + self.local_artifacts_collection.append(("debug_finn_tmp", os.environ["FINN_BUILD_DIR"], True)) #self.local_artifacts_collection.append(("debug_finn_cwd", os.environ["FINN_ROOT"], False)) ### SETUP ### From a63b4ae10b2ddaba82e320708762eb3e1dbe87d2 Mon Sep 17 00:00:00 2001 From: Felix Jentzsch Date: Fri, 14 Mar 2025 16:19:36 +0100 Subject: [PATCH 078/125] Fix MNV1 fifo step order --- benchmarking/dut/mobilenetv1.py | 41 ++++++++++++++++++--------------- 1 file changed, 23 insertions(+), 18 deletions(-) diff --git a/benchmarking/dut/mobilenetv1.py b/benchmarking/dut/mobilenetv1.py index 56b13a6095..06042816cf 100644 --- a/benchmarking/dut/mobilenetv1.py +++ b/benchmarking/dut/mobilenetv1.py @@ -1,4 +1,21 @@ +from bench_base import bench from qonnx.core.modelwrapper import ModelWrapper +from qonnx.transformation.change_datalayout import ChangeDataLayoutQuantAvgPool2d +from qonnx.transformation.double_to_single_float import DoubleToSingleFloat +from qonnx.transformation.general import ( + ApplyConfig, + GiveReadableTensorNames, + GiveUniqueNodeNames, +) +from qonnx.transformation.infer_data_layouts import InferDataLayouts +from qonnx.transformation.infer_datatypes import InferDataTypes +from qonnx.transformation.infer_shapes import InferShapes +from qonnx.transformation.lower_convs_to_matmul import LowerConvsToMatMul +from qonnx.transformation.remove import RemoveIdentityOps + +import 
finn.transformation.fpgadataflow.convert_to_hw_layers as to_hw +import finn.transformation.streamline.absorb as absorb +import finn.transformation.streamline.reorder as reorder from finn.builder.build_dataflow_config import ( DataflowBuildConfig, ShellFlowType, @@ -6,25 +23,8 @@ ) from finn.builder.build_dataflow_steps import verify_step from finn.transformation.streamline import Streamline -from qonnx.transformation.double_to_single_float import DoubleToSingleFloat -import finn.transformation.streamline.absorb as absorb -import finn.transformation.streamline.reorder as reorder -from qonnx.transformation.infer_data_layouts import InferDataLayouts from finn.transformation.streamline.collapse_repeated import CollapseRepeatedMul -from qonnx.transformation.remove import RemoveIdentityOps from finn.transformation.streamline.round_thresholds import RoundAndClipThresholds -from qonnx.transformation.lower_convs_to_matmul import LowerConvsToMatMul -from qonnx.transformation.general import ( - GiveReadableTensorNames, - GiveUniqueNodeNames, - ApplyConfig, -) -import finn.transformation.fpgadataflow.convert_to_hw_layers as to_hw -from qonnx.transformation.infer_shapes import InferShapes -from qonnx.transformation.change_datalayout import ChangeDataLayoutQuantAvgPool2d -from qonnx.transformation.infer_datatypes import InferDataTypes - -from bench_base import bench def step_mobilenet_streamline(model: ModelWrapper, cfg: DataflowBuildConfig): @@ -55,6 +55,7 @@ def step_mobilenet_streamline(model: ModelWrapper, cfg: DataflowBuildConfig): return model + def step_mobilenet_lower_convs(model: ModelWrapper, cfg: DataflowBuildConfig): model = model.transform(LowerConvsToMatMul()) model = model.transform(absorb.AbsorbTransposeIntoMultiThreshold()) @@ -66,6 +67,7 @@ def step_mobilenet_lower_convs(model: ModelWrapper, cfg: DataflowBuildConfig): model = model.transform(InferDataLayouts()) return model + def step_mobilenet_convert_to_hw_layers(model: ModelWrapper, cfg: 
DataflowBuildConfig): model = model.transform(to_hw.InferPool()) model = model.transform(to_hw.InferConvInpGen()) @@ -78,6 +80,7 @@ def step_mobilenet_convert_to_hw_layers(model: ModelWrapper, cfg: DataflowBuildC model = model.transform(GiveReadableTensorNames()) return model + def step_mobilenet_slr_floorplan(model: ModelWrapper, cfg: DataflowBuildConfig): if cfg.shell_flow_type == ShellFlowType.VITIS_ALVEO: try: @@ -101,6 +104,7 @@ def step_mobilenet_slr_floorplan(model: ModelWrapper, cfg: DataflowBuildConfig): print("No SLR floorplanning applied") return model + def step_mobilenet_convert_to_hw_layers_separate_th(model: ModelWrapper, cfg: DataflowBuildConfig): model = model.transform(to_hw.InferPool()) model = model.transform(to_hw.InferConvInpGen()) @@ -114,6 +118,7 @@ def step_mobilenet_convert_to_hw_layers_separate_th(model: ModelWrapper, cfg: Da model = model.transform(GiveReadableTensorNames()) return model + class bench_mobilenetv1(bench): def step_build_setup(self): # create build config for MobileNetV1 (based on finn-examples) @@ -126,9 +131,9 @@ def step_build_setup(self): "step_apply_folding_config", "step_minimize_bit_width", "step_generate_estimate_reports", + "step_set_fifo_depths", "step_hw_codegen", "step_hw_ipgen", - "step_set_fifo_depths", "step_create_stitched_ip", "step_synthesize_bitfile", "step_make_pynq_driver", From cce646dc091a61f4af8984fa05962062d27c45d6 Mon Sep 17 00:00:00 2001 From: Felix Jentzsch Date: Fri, 21 Mar 2025 11:52:18 +0100 Subject: [PATCH 079/125] Allow local test execution --- .gitignore | 5 +++ benchmarking/bench.py | 62 +++++++++++++++++++++++--------------- benchmarking/bench_base.py | 7 +++-- 3 files changed, 46 insertions(+), 28 deletions(-) diff --git a/.gitignore b/.gitignore index be61378730..f40370b443 100644 --- a/.gitignore +++ b/.gitignore @@ -96,3 +96,8 @@ MANIFEST # downloaded dep repos /deps/ + +bench_input +bench_output +bench_save +bench_work diff --git a/benchmarking/bench.py b/benchmarking/bench.py index 
3d0a575057..54788ac6a5 100644 --- a/benchmarking/bench.py +++ b/benchmarking/bench.py @@ -5,6 +5,9 @@ import time import traceback import onnxruntime as ort +import importlib + +from util import delete_dir_contents from dut.mvau import bench_mvau from dut.resnet50 import bench_resnet50 @@ -36,11 +39,36 @@ def get_default_session_options_new(): return _default_session_options ort.capi._pybind_state.get_default_session_options = get_default_session_options_new - # Gather job array info - job_id = int(os.environ["SLURM_JOB_ID"]) - #TODO: allow portable execution on any platform by making as many env vars as possible optional - print("Job launched with ID: %d" % (job_id)) try: + # Launched via SLURM, expect additional CI env vars + job_id = int(os.environ["SLURM_JOB_ID"]) + # experiment_dir = os.environ.get("EXPERIMENT_DIR") # original experiment dir (before potential copy to ramdisk) + experiment_dir = os.environ.get("CI_PROJECT_DIR") + save_dir = os.path.join(os.environ.get("LOCAL_ARTIFACT_DIR"), + "CI_" + os.environ.get("CI_PIPELINE_ID") + "_" + os.environ.get("CI_PIPELINE_NAME")) + work_dir = os.environ["PATH_WORKDIR"] + + # Gather benchmarking configs + if config_name == "manual": + config_path = os.path.join(os.environ.get("LOCAL_CFG_DIR"), os.environ.get("MANUAL_CFG_PATH")) + else: + configs_path = os.path.join(os.path.dirname(__file__), "cfg") + config_select = config_name + ".json" + config_path = os.path.join(configs_path, config_select) + print("Job launched with SLURM ID: %d" % (job_id)) + except KeyError: + # Launched without SLURM, assume test run on local machine + job_id = 0 + experiment_dir = "bench_output/" + time.strftime("%d_%H_%M") + save_dir = "bench_save/" + time.strftime("%d_%H_%M") + work_dir = "bench_work" + os.makedirs(work_dir, exist_ok=True) + delete_dir_contents(work_dir) + config_path = config_name # expect caller to provide direct path to a single config file + print("Local test job launched without SLURM") + + try: + # Launched as SLURM 
job array array_id = int(os.environ["SLURM_ARRAY_JOB_ID"]) task_id = int(os.environ["SLURM_ARRAY_TASK_ID"]) task_count = int(os.environ["SLURM_ARRAY_TASK_COUNT"]) @@ -49,36 +77,20 @@ def get_default_session_options_new(): % (array_id, task_id, task_count) ) except KeyError: + # Launched as single (SLURM or non-SLURM) job array_id = job_id task_id = 0 task_count = 1 print("Launched as single job") # Prepare result directory - # experiment_dir = os.environ.get("EXPERIMENT_DIR") # original experiment dir (before potential copy to ramdisk) - experiment_dir = os.environ.get("CI_PROJECT_DIR") - artifacts_dir = os.path.join(experiment_dir, "build_artifacts") os.makedirs(artifacts_dir, exist_ok=True) print("Collecting results in path: %s" % artifacts_dir) - - # local save dir for large artifacts (e.g., build output, tmp dir dump for debugging) - if job_id == 0: - #DEBUG mode - save_dir = experiment_dir + "_save" - else: - save_dir = os.path.join(os.environ.get("LOCAL_ARTIFACT_DIR"), - "CI_" + os.environ.get("CI_PIPELINE_ID") + "_" + os.environ.get("CI_PIPELINE_NAME")) - print("Saving additional artifacts in path: %s" % save_dir) - os.makedirs(save_dir, exist_ok=True) - # Gather benchmarking configs - if config_name == "manual": - config_path = os.path.join(os.environ.get("LOCAL_CFG_DIR"), os.environ.get("MANUAL_CFG_PATH")) - else: - configs_path = os.path.join(os.path.dirname(__file__), "cfg") - config_select = config_name + ".json" - config_path = os.path.join(configs_path, config_select) + # Prepare local save dir for large artifacts (e.g., build output, tmp dir dump for debugging) + os.makedirs(save_dir, exist_ok=True) + print("Saving additional artifacts in path: %s" % save_dir) # Load config print("Loading config %s" % (config_path)) @@ -136,7 +148,7 @@ def get_default_session_options_new(): # Create bench object for respective DUT if "dut" in params: if params["dut"] in dut: - bench_object = dut[params["dut"]](params, task_id, run_id, artifacts_dir, save_dir) + 
bench_object = dut[params["dut"]](params, task_id, run_id, work_dir, artifacts_dir, save_dir) else: print("ERROR: unknown DUT specified") return 1 diff --git a/benchmarking/bench_base.py b/benchmarking/bench_base.py index 9f6689dcd3..cc25fc7ff7 100644 --- a/benchmarking/bench_base.py +++ b/benchmarking/bench_base.py @@ -117,11 +117,12 @@ def sim_power_report(results_path, project_path, in_width, out_width, dtype_widt json_file.write(json.dumps(power_report_dict, indent=2)) class bench(): - def __init__(self, params, task_id, run_id, artifacts_dir, save_dir, debug=True): + def __init__(self, params, task_id, run_id, work_dir, artifacts_dir, save_dir, debug=True): super().__init__() self.params = params self.task_id = task_id self.run_id = run_id + self.work_dir = work_dir self.artifacts_dir = artifacts_dir self.save_dir = save_dir self.debug = debug @@ -175,7 +176,7 @@ def __init__(self, params, task_id, run_id, artifacts_dir, save_dir, debug=True) ### SETUP ### # Use a temporary dir for buildflow-related files (next to FINN_BUILD_DIR) # Ensure it exists but is empty (clear potential artifacts from previous runs) - tmp_buildflow_dir = os.path.join(os.environ["PATH_WORKDIR"], "buildflow") + tmp_buildflow_dir = os.path.join(self.work_dir, "buildflow") os.makedirs(tmp_buildflow_dir, exist_ok=True) delete_dir_contents(tmp_buildflow_dir) self.build_inputs["build_dir"] = os.path.join(tmp_buildflow_dir, "build_output") # TODO remove in favor of self.build_dir @@ -422,7 +423,7 @@ def steps_full_build_flow(self): cfg.vitis_opt_strategy=build_cfg.VitisOptStrategy.PERFORMANCE_BEST cfg.verbose = False cfg.enable_build_pdb_debug = False - cfg.stitched_ip_gen_dcp = False # only needed for further manual integration + #cfg.stitched_ip_gen_dcp = False # only needed for further manual integration cfg.force_python_rtlsim = False cfg.split_large_fifos = True cfg.enable_instrumentation = True # no IODMA functional correctness/accuracy test yet From 
2c9925d29bc9e39d0de2dbd02a8221ecd1f786ec Mon Sep 17 00:00:00 2001 From: Felix Jentzsch Date: Mon, 24 Mar 2025 09:17:48 +0100 Subject: [PATCH 080/125] Start search for start depths from 1 --- src/finn/qnn-data/templates/driver/driver_fifosizing.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/finn/qnn-data/templates/driver/driver_fifosizing.py b/src/finn/qnn-data/templates/driver/driver_fifosizing.py index fc50314cf3..ada6979db2 100644 --- a/src/finn/qnn-data/templates/driver/driver_fifosizing.py +++ b/src/finn/qnn-data/templates/driver/driver_fifosizing.py @@ -173,7 +173,7 @@ def determine_start_depth( ): ### Attempt to determine start depth for all FIFOs automatically ### # If it doesn't find a working setting, start depth must be set manually, potentially on per-FIFO basis - start_depth = 64 + start_depth = 1 last_interval = 0 start_depth_found = False From 9f3e7c73dd3d403b1b1fe51156b3c36bc2dd2e61 Mon Sep 17 00:00:00 2001 From: Felix Jentzsch Date: Mon, 24 Mar 2025 17:06:59 +0100 Subject: [PATCH 081/125] Let driver fill live FIFO sizes into complete folding config --- src/finn/builder/build_dataflow_steps.py | 53 ++++++++++++------- .../templates/driver/driver_fifosizing.py | 27 ++++++++-- .../fpgadataflow/make_pynq_driver.py | 9 +++- 3 files changed, 65 insertions(+), 24 deletions(-) diff --git a/src/finn/builder/build_dataflow_steps.py b/src/finn/builder/build_dataflow_steps.py index 5dc971cf33..6f8e1e7007 100644 --- a/src/finn/builder/build_dataflow_steps.py +++ b/src/finn/builder/build_dataflow_steps.py @@ -44,6 +44,7 @@ GiveUniqueNodeNames, RemoveStaticGraphInputs, RemoveUnusedTensors, + SortGraph, ) from qonnx.transformation.infer_data_layouts import InferDataLayouts from qonnx.transformation.infer_datatypes import InferDataTypes @@ -553,8 +554,40 @@ def step_set_fifo_depths(model: ModelWrapper, cfg: DataflowBuildConfig): `GiveUniqueNodeNames`. 
""" + hw_attrs = [ + "PE", + "SIMD", + "parallel_window", + "ram_style", + "depth", + "impl_style", + "resType", + "mem_mode", + "runtime_writeable_weights", + "inFIFODepths", + "outFIFODepths", + "depth_trigger_uram", + "depth_trigger_bram", + ] + # Experimental live FIFO-sizing, overwrites all other FIFO-related behavior if cfg.live_fifo_sizing: + # Create all DWCs and FIFOs normally + model = model.transform(InsertDWC()) + model = model.transform( + InsertFIFO(vivado_ram_style=cfg.large_fifo_mem_style, create_shallow_fifos=True) + ) + + # Clean up model + model = model.transform(SortGraph()) + model = model.transform(GiveUniqueNodeNames()) + model = model.transform(GiveReadableTensorNames()) + + # save original folding config before potentially modifying it + cfg_path = cfg.output_dir + "/report/folding_config_before_lfs.json" + extract_model_config_to_json(model, cfg_path, hw_attrs) + model.set_metadata_prop("folding_config_before_lfs", cfg_path) + # Disable runtime-writable weights, external weights, and dynamic mode, # as we don't support additional AXI-lite interfaces next to the FIFOs for node in model.graph.node: @@ -578,10 +611,6 @@ def step_set_fifo_depths(model: ModelWrapper, cfg: DataflowBuildConfig): except AttributeError: pass - # Create all DWCs and FIFOs normally - model = model.transform(InsertDWC()) - model = model.transform(InsertFIFO(create_shallow_fifos=True)) - # Specialize FIFOs to HLS back-end instead of default RTL back-end for node in model.get_nodes_by_op_type("StreamingFIFO"): node_inst = getCustomOp(node) @@ -594,6 +623,7 @@ def step_set_fifo_depths(model: ModelWrapper, cfg: DataflowBuildConfig): node_inst.set_nodeattr("impl_style", "virtual") # Clean up model + model = model.transform(SortGraph()) model = model.transform(GiveUniqueNodeNames()) model = model.transform(GiveReadableTensorNames()) @@ -659,21 +689,6 @@ def step_set_fifo_depths(model: ModelWrapper, cfg: DataflowBuildConfig): model = 
model.transform(ApplyConfig(cfg.folding_config_file)) # extract the final configuration and save it as json - hw_attrs = [ - "PE", - "SIMD", - "parallel_window", - "ram_style", - "depth", - "impl_style", - "resType", - "mem_mode", - "runtime_writeable_weights", - "inFIFODepths", - "outFIFODepths", - "depth_trigger_uram", - "depth_trigger_bram", - ] extract_model_config_to_json(model, cfg.output_dir + "/final_hw_config.json", hw_attrs) # perform FIFO splitting and shallow FIFO removal only after the final config diff --git a/src/finn/qnn-data/templates/driver/driver_fifosizing.py b/src/finn/qnn-data/templates/driver/driver_fifosizing.py index ada6979db2..1cbc5053cf 100644 --- a/src/finn/qnn-data/templates/driver/driver_fifosizing.py +++ b/src/finn/qnn-data/templates/driver/driver_fifosizing.py @@ -259,6 +259,7 @@ def determine_start_depth( settingsfile = args.settingsfile devID = args.device device = Device.devices[devID] + folding_config_lfs = None # overwrite frequency if specified in settings file if settingsfile != "": @@ -267,10 +268,15 @@ def determine_start_depth( if "fclk_mhz" in settings: frequency = settings["fclk_mhz"] - # For live FIFO-sizing, we also expect a fifo_widths.json file exported by FINN listing the width of each FIFO, e.g., - # {'fifo_widths': {0: 8, 1: 32, 2: 24}} + # For live FIFO-sizing, we also expect the FIFO widths (in bits) exported by FINN, e.g., + # {'fifo_widths': {"0": 8, "1": 32, "2": 24}} fifo_widths = settings["fifo_widths"] + # The settings can also contain the original folding config, + # into which we can insert the live FIFO sizes once we are done + if "folding_config_before_lfs" in settings: + folding_config_lfs = settings["folding_config_before_lfs"] + print("Programming FPGA..") PL.reset() # reset PYNQ cache accel = FINNLiveFIFOOverlay( @@ -362,11 +368,24 @@ def determine_start_depth( ### Generate fifo_depth_export.json to export FIFO depths for use in FINN fifo_depth_export = {} for fifo, depth in enumerate(fifo_depths): 
- fifo_depth_export["StreamingFIFO_rtl_%d" % fifo] = {} - fifo_depth_export["StreamingFIFO_rtl_%d" % fifo]["depth"] = depth + accel.fifo_depth_offset + fifo_name = "StreamingFIFO_rtl_%d" % fifo + fifo_depth_export[fifo_name] = {} + fifo_depth_export[fifo_name]["depth"] = depth + accel.fifo_depth_offset with open(os.path.join(report_dir, "fifo_depth_export.json"), "w") as f: json.dump(fifo_depth_export, f, indent=2) + # Also export directly into original folding config for convenience + if folding_config_lfs: + for key in list(folding_config_lfs.keys()): + if key.startswith("StreamingFIFO"): + fifo_name = "StreamingFIFO_rtl_%d" % int(key.removeprefix("StreamingFIFO_")) + # Rename FIFO from StreamingFIFO_* to StreamingFIFO_rtl_* + folding_config_lfs[fifo_name] = folding_config_lfs.pop(key) + folding_config_lfs[fifo_name]["depth"] = fifo_depth_export[fifo_name]["depth"] + folding_config_lfs[fifo_name]["impl_style"] = "rtl" + with open(os.path.join(report_dir, "folding_config_lfs.json"), "w") as f: + json.dump(folding_config_lfs, f, indent=2) + ### Generate the usual instrumentation performance report based on final state min_latency = log_min_latency[-1] latency = log_latency[-1] diff --git a/src/finn/transformation/fpgadataflow/make_pynq_driver.py b/src/finn/transformation/fpgadataflow/make_pynq_driver.py index e7c947192a..e065641b27 100644 --- a/src/finn/transformation/fpgadataflow/make_pynq_driver.py +++ b/src/finn/transformation/fpgadataflow/make_pynq_driver.py @@ -313,7 +313,7 @@ def __init__(self, platform, clk_period_ns, live_fifo_sizing): self.clk_period_ns = clk_period_ns self.live_fifo_sizing = live_fifo_sizing - def apply(self, model): + def apply(self, model: ModelWrapper): # TODO: support runtime-writable and external weights # TODO: support Alveo and Versal platforms @@ -359,6 +359,13 @@ def apply(self, model): node_inst = getCustomOp(node) fifo_widths[sdp_id] = node_inst.get_instream_width() settings["fifo_widths"] = fifo_widths + # export original 
folding config to settings file, + # so that the driver can generate a final cfg with live fifo sizes applied + folding_path = model.get_metadata_prop("folding_config_before_lfs") + if folding_path: + with open(folding_path, "r") as f: + folding_cfg = json.load(f) + settings["folding_config_before_lfs"] = folding_cfg settingsfile = pynq_driver_dir + "/settings.json" with open(settingsfile, "w") as f: From 7a2ff270f206ddfe4e86d4a122870d816a86f0e1 Mon Sep 17 00:00:00 2001 From: Felix Jentzsch Date: Mon, 24 Mar 2025 18:15:09 +0100 Subject: [PATCH 082/125] Generate follow-up bench cfg for lfs experiments --- benchmarking/bench_base.py | 7 +++++ benchmarking/collect.py | 34 ++++++++++++++++++++++++ src/finn/builder/build_dataflow_steps.py | 2 +- 3 files changed, 42 insertions(+), 1 deletion(-) diff --git a/benchmarking/bench_base.py b/benchmarking/bench_base.py index cc25fc7ff7..1aab18dd28 100644 --- a/benchmarking/bench_base.py +++ b/benchmarking/bench_base.py @@ -441,6 +441,7 @@ def steps_full_build_flow(self): cfg.auto_fifo_depths = False cfg.live_fifo_sizing = True cfg.enable_instrumentation = True + cfg.synth_clk_period_ns = 10 # force conservative 100 MHz clock else: cfg.auto_fifo_depths = True cfg.auto_fifo_strategy = self.params["fifo_method"] @@ -468,6 +469,12 @@ def steps_full_build_flow(self): if "floorplan_path" in self.build_inputs: cfg.floorplan_path = self.build_inputs["floorplan_path"] + if "target_fps" in self.params: + if self.params["target_fps"] == "None": + cfg.target_fps = None + else: + cfg.target_fps = self.params["target_fps"] + # Default of 1M cycles is insufficient for MetaFi (6M) and RN-50 (2.5M) # TODO: make configurable or set on pipeline level? 
os.environ["LIVENESS_THRESHOLD"] = "10000000" diff --git a/benchmarking/collect.py b/benchmarking/collect.py index 45f6073d1b..8a5bce3663 100644 --- a/benchmarking/collect.py +++ b/benchmarking/collect.py @@ -58,6 +58,12 @@ def log_nested_metrics_from_report(id, live, report_name, key_top, keys, prefix= run_ids.sort() print("Found %d runs" % len(run_ids)) + follow_up_bench_cfg = list() + # Prepare (local) output directory where follow-up bench configs will be stored + output_cfg_dir = os.path.join(os.environ.get("LOCAL_CFG_DIR_STORE"), "lfs", "CI_" + os.environ.get("CI_PIPELINE_ID")) + output_folding_dir = os.path.join(output_cfg_dir, "folding") + output_cfg_path = os.path.join(output_cfg_dir, "follow-up.json") + for id in run_ids: print("Processing run %d" % id) experiment_name = "CI_" + os.environ.get("CI_PIPELINE_ID") + "_" + str(id) @@ -212,4 +218,32 @@ def log_nested_metrics_from_report(id, live, report_name, key_top, keys, prefix= shutil.copytree(run_report_dir2, dvc_report_dir, dirs_exist_ok=True) live.log_artifact(dvc_report_dir) + # Prepare benchmarking config for follow-up runs after live FIFO-sizing + folding_config_lfs_path = os.path.join("measurement_artifacts", "runs_output", "run_%d" % (id), "reports", "folding_config_lfs.json") + if os.path.isfile(folding_config_lfs_path): + # Copy folding config produced by live FIFO-sizing + output_folding_path = os.path.join(output_folding_dir, experiment_name + ".json") + os.makedirs(output_folding_dir, exist_ok=True) + shutil.copy(folding_config_lfs_path, output_folding_path) + + # Create benchmarking config + metadata_bench = open_json_report(id, "metadata_bench.json") + configuration = dict() + for key in metadata_bench["params"]: + # wrap in list + configuration[key] = [metadata_bench["params"][key]] + # overwrite FIFO-related params + import_folding_path = os.path.join(os.environ.get("LOCAL_CFG_DIR"), "lfs", "CI_" + os.environ.get("CI_PIPELINE_ID"), "folding", experiment_name + ".json") + 
configuration["fifo_method"] = ["manual"] + configuration["target_fps"] = ["None"] + configuration["folding_path"] = [import_folding_path] + + follow_up_bench_cfg.append(configuration) + + # Save aggregated benchmarking config for follow-up job + if follow_up_bench_cfg: + print("Saving follow-up bench config for lfs: %s" % output_cfg_path) + with open(output_cfg_path, "w") as f: + json.dump(follow_up_bench_cfg, f, indent=2) + print("Done") diff --git a/src/finn/builder/build_dataflow_steps.py b/src/finn/builder/build_dataflow_steps.py index c508a2d505..7ff957af0a 100644 --- a/src/finn/builder/build_dataflow_steps.py +++ b/src/finn/builder/build_dataflow_steps.py @@ -423,7 +423,7 @@ def step_target_fps_parallelization(model: ModelWrapper, cfg: DataflowBuildConfi "depth_trigger_uram", "depth_trigger_bram", ] - extract_model_config_to_json(model, cfg.output_dir + "/auto_folding_config.json", hw_attrs) + extract_model_config_to_json(model, cfg.output_dir + "/report/auto_folding_config.json", hw_attrs) return model From ea808b25fdca1568ed3f1be6c65a49ebbbfd11ec Mon Sep 17 00:00:00 2001 From: Felix Jentzsch Date: Tue, 25 Mar 2025 13:58:05 +0100 Subject: [PATCH 083/125] Fix collection of lfs-generated folding config --- benchmarking/collect.py | 1 + benchmarking/measure.py | 1 + 2 files changed, 2 insertions(+) diff --git a/benchmarking/collect.py b/benchmarking/collect.py index 8a5bce3663..f59f3a3607 100644 --- a/benchmarking/collect.py +++ b/benchmarking/collect.py @@ -224,6 +224,7 @@ def log_nested_metrics_from_report(id, live, report_name, key_top, keys, prefix= # Copy folding config produced by live FIFO-sizing output_folding_path = os.path.join(output_folding_dir, experiment_name + ".json") os.makedirs(output_folding_dir, exist_ok=True) + print("Saving lfs-generated folding config of this run to use in a future follow-up run: %s" % output_folding_path) shutil.copy(folding_config_lfs_path, output_folding_path) # Create benchmarking config diff --git 
a/benchmarking/measure.py b/benchmarking/measure.py index d0e5a64aa8..a79632c168 100644 --- a/benchmarking/measure.py +++ b/benchmarking/measure.py @@ -42,6 +42,7 @@ "fifo_sizing_report.json", "fifo_depth_export.json", "fifo_sizing_graph.png", + "folding_config_lfs.json", ]: report_path = os.path.join(extract_dir, report) if os.path.isfile(report_path): From 15fef09eaf7c31f924c3474a960fa278f898c9fe Mon Sep 17 00:00:00 2001 From: Felix Jentzsch Date: Fri, 28 Mar 2025 15:22:58 +0100 Subject: [PATCH 084/125] Increase virtual FIFO depth offset to 8 --- src/finn/qnn-data/templates/driver/driver_fifosizing.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/finn/qnn-data/templates/driver/driver_fifosizing.py b/src/finn/qnn-data/templates/driver/driver_fifosizing.py index 1cbc5053cf..a87342f79e 100644 --- a/src/finn/qnn-data/templates/driver/driver_fifosizing.py +++ b/src/finn/qnn-data/templates/driver/driver_fifosizing.py @@ -32,8 +32,9 @@ def __init__( self.error = False self.fifo_widths = fifo_widths self.num_fifos = len(self.fifo_widths) - # Try to account for additional registers introduced by virtual FIFO HLS implementation - self.fifo_depth_offset = 4 + # Account for additional FIFO depth and implicit registers introduced by the virtual FIFO HLS implementation that are not present in real FIFOs + # This results in a minimum possible FIFO depth of 1 + 8 = 9, which should be improved in a future virtual FIFO implementation (TODO) + self.fifo_depth_offset = 8 # Sanity check # We expect 3 AXI-Lite peripherals next to the virtual FIFOs: instrumentation_wrap_0, axi_gpio_0 (for reset), zynq_ps From 5e8c888fcb562de22fffdc74ee55340393f36e30 Mon Sep 17 00:00:00 2001 From: Felix Jentzsch Date: Tue, 8 Apr 2025 08:47:40 +0200 Subject: [PATCH 085/125] Allow IODMA wrapper --- benchmarking/bench_base.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/benchmarking/bench_base.py b/benchmarking/bench_base.py index 
1aab18dd28..dc51f690ed 100644 --- a/benchmarking/bench_base.py +++ b/benchmarking/bench_base.py @@ -426,13 +426,18 @@ def steps_full_build_flow(self): #cfg.stitched_ip_gen_dcp = False # only needed for further manual integration cfg.force_python_rtlsim = False cfg.split_large_fifos = True - cfg.enable_instrumentation = True # no IODMA functional correctness/accuracy test yet cfg.save_intermediate_models = True # Save the intermediate model graphs cfg.verify_save_full_context = True, # Output full context dump for verification steps #rtlsim_use_vivado_comps # TODO ? #cfg.default_swg_exception #cfg.large_fifo_mem_style + # Switch between instrumentation or IODMA wrapper (TODO: combine both in one bitstream) + if "enable_instrumentation" in self.params: + cfg.enable_instrumentation = self.params["enable_instrumentation"] + else: + cfg.enable_instrumentation = True + # "manual or "characterize" or "largefifo_rtlsim" or "live" if "fifo_method" in self.params: if self.params["fifo_method"] == "manual": From 2687ae013d0d0fab1e7a4886934c049e18582c7d Mon Sep 17 00:00:00 2001 From: Felix Jentzsch Date: Tue, 8 Apr 2025 09:48:36 +0200 Subject: [PATCH 086/125] Parse DCP resource breakdown --- benchmarking/collect.py | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) diff --git a/benchmarking/collect.py b/benchmarking/collect.py index f59f3a3607..491c29d043 100644 --- a/benchmarking/collect.py +++ b/benchmarking/collect.py @@ -154,6 +154,31 @@ def log_nested_metrics_from_report(id, live, report_name, key_top, keys, prefix= # fifo_sizing.json log_metrics_from_report(id, live, "fifo_sizing.json", ["total_fifo_size_kB"], prefix="fifosizing/") + # stitched IP DCP synth resource report + log_nested_metrics_from_report(id, live, "post_synth_resources_dcp.json", "(top)", [ + "LUT", + "FF", + "SRL", + "DSP", + "BRAM_18K", + "BRAM_36K", + "URAM", + ], prefix="synth(dcp)/resources/") + + # stitched IP DCP synth resource breakdown + # TODO: generalize to all build flows and 
bitfile synth + layer_categories = ["MAC", "Eltwise", "Thresholding", "FIFO", "DWC", "SWG", "Other"] + for category in layer_categories: + log_nested_metrics_from_report(id, live, "res_breakdown_build_output.json", category, [ + "LUT", + "FF", + "SRL", + "DSP", + "BRAM_18K", + "BRAM_36K", + "URAM", + ], prefix="synth(dcp)/resources(breakdown)/" + category + "/") + # ooc_synth_and_timing.json (OOC synth / step_out_of_context_synthesis) log_metrics_from_report(id, live, "ooc_synth_and_timing.json", [ "LUT", From 00ec0f94ddb30a84b0005ec9fb47bdfba0479a5e Mon Sep 17 00:00:00 2001 From: Felix Jentzsch Date: Tue, 8 Apr 2025 09:52:15 +0200 Subject: [PATCH 087/125] Put pipeline and run IDs in DVC exp msg --- benchmarking/collect.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/benchmarking/collect.py b/benchmarking/collect.py index 491c29d043..6fcd3be948 100644 --- a/benchmarking/collect.py +++ b/benchmarking/collect.py @@ -67,7 +67,7 @@ def log_nested_metrics_from_report(id, live, report_name, key_top, keys, prefix= for id in run_ids: print("Processing run %d" % id) experiment_name = "CI_" + os.environ.get("CI_PIPELINE_ID") + "_" + str(id) - experiment_msg = "[CI] " + os.environ.get("CI_PIPELINE_NAME") + experiment_msg = "[CI] " + os.environ.get("CI_PIPELINE_NAME") + " (" + os.environ.get("CI_PIPELINE_ID") + "_" + str(id) + ")" #TODO: cache images once we switch to a cache provider that works with DVC Studio with Live(exp_name = experiment_name, exp_message=experiment_msg, cache_images=False) as live: ### PARAMS ### From c4f7437fd6be9354997e261fba8be51d3efd3af9 Mon Sep 17 00:00:00 2001 From: Felix Jentzsch Date: Thu, 24 Apr 2025 11:03:50 +0200 Subject: [PATCH 088/125] Validate accuracy when synthesized with IODMA wrapper --- benchmarking/bench_base.py | 3 + benchmarking/collect.py | 5 + benchmarking/measure.py | 21 ++- src/finn/builder/build_dataflow_config.py | 3 + src/finn/builder/build_dataflow_steps.py | 2 +- 
.../qnn-data/templates/driver/validate.py | 172 +++++++++++++++--- .../fpgadataflow/make_pynq_driver.py | 13 +- 7 files changed, 183 insertions(+), 36 deletions(-) diff --git a/benchmarking/bench_base.py b/benchmarking/bench_base.py index dc51f690ed..39a16dd7bc 100644 --- a/benchmarking/bench_base.py +++ b/benchmarking/bench_base.py @@ -480,6 +480,9 @@ def steps_full_build_flow(self): else: cfg.target_fps = self.params["target_fps"] + if "validation_dataset" in self.params: + cfg.validation_dataset = self.params["validation_dataset"] + # Default of 1M cycles is insufficient for MetaFi (6M) and RN-50 (2.5M) # TODO: make configurable or set on pipeline level? os.environ["LIVENESS_THRESHOLD"] = "10000000" diff --git a/benchmarking/collect.py b/benchmarking/collect.py index 6fcd3be948..81dfbe339f 100644 --- a/benchmarking/collect.py +++ b/benchmarking/collect.py @@ -213,6 +213,11 @@ def log_nested_metrics_from_report(id, live, report_name, key_top, keys, prefix= # instrumentation measurement log_all_metrics_from_report(id, live, "measured_performance.json", prefix="measurement/performance/") + # IODMA validation accuracy + log_metrics_from_report(id, live, "validation.json", [ + "top-1_accuracy", + ], prefix="measurement/validation/") + # power measurement # TODO diff --git a/benchmarking/measure.py b/benchmarking/measure.py index a79632c168..7231991bde 100644 --- a/benchmarking/measure.py +++ b/benchmarking/measure.py @@ -26,11 +26,21 @@ # Run driver print("Running driver..") - result = subprocess.run(["python", f"{extract_dir}/driver/driver.py", - "--bitfile", f"{extract_dir}/bitfile/finn-accel.bit", - "--settingsfile", f"{extract_dir}/driver/settings.json", - "--reportfile", f"{extract_dir}/measured_performance.json", - ]) + # run validate.py (from IODMA driver) if present, otherwise driver.py from instrumentation + # TODO: unify IODMA/instrumentation shell & driver + if os.path.isfile(f"{extract_dir}/driver/validate.py"): + result = subprocess.run(["python", 
f"{extract_dir}/driver/validate.py", + "--bitfile", f"{extract_dir}/bitfile/finn-accel.bit", + "--settingsfile", f"{extract_dir}/driver/settings.json", + "--reportfile", f"{extract_dir}/validation.json", + "--dataset_root", "/home/xilinx/datasets", #TODO: env var + ]) + else: + result = subprocess.run(["python", f"{extract_dir}/driver/driver.py", + "--bitfile", f"{extract_dir}/bitfile/finn-accel.bit", + "--settingsfile", f"{extract_dir}/driver/settings.json", + "--reportfile", f"{extract_dir}/measured_performance.json", + ]) if result.returncode != 0: print("Driver reported error!") exit_code = 1 @@ -43,6 +53,7 @@ "fifo_depth_export.json", "fifo_sizing_graph.png", "folding_config_lfs.json", + "validation.json", ]: report_path = os.path.join(extract_dir, report) if os.path.isfile(report_path): diff --git a/src/finn/builder/build_dataflow_config.py b/src/finn/builder/build_dataflow_config.py index a3db23a714..b2814f31ab 100644 --- a/src/finn/builder/build_dataflow_config.py +++ b/src/finn/builder/build_dataflow_config.py @@ -369,6 +369,9 @@ class DataflowBuildConfig: #: rtlsim, otherwise they will be replaced by RTL implementations. 
rtlsim_use_vivado_comps: Optional[bool] = True + #: Specify validation dataset to be used for deployment of the generated driver + validation_dataset: Optional[str] = None + def _resolve_hls_clk_period(self): if self.hls_clk_period_ns is None: # use same clk for synth and hls if not explicitly specified diff --git a/src/finn/builder/build_dataflow_steps.py b/src/finn/builder/build_dataflow_steps.py index 7ff957af0a..1bd78c7f0a 100644 --- a/src/finn/builder/build_dataflow_steps.py +++ b/src/finn/builder/build_dataflow_steps.py @@ -852,7 +852,7 @@ def step_make_pynq_driver(model: ModelWrapper, cfg: DataflowBuildConfig): ) ) else: - model = model.transform(MakePYNQDriverIODMA(cfg._resolve_driver_platform())) + model = model.transform(MakePYNQDriverIODMA(cfg._resolve_driver_platform(), cfg.validation_dataset)) shutil.copytree(model.get_metadata_prop("pynq_driver_dir"), driver_dir, dirs_exist_ok=True) print("PYNQ Python driver written into " + driver_dir) return model diff --git a/src/finn/qnn-data/templates/driver/validate.py b/src/finn/qnn-data/templates/driver/validate.py index c8bc1c009d..55e7603650 100644 --- a/src/finn/qnn-data/templates/driver/validate.py +++ b/src/finn/qnn-data/templates/driver/validate.py @@ -27,10 +27,65 @@ # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
import argparse +import os import numpy as np +from PIL import Image +from dataset_loading import FileQueue, ImgQueue +import json +from pynq import PL from driver import io_shape_dict from driver_base import FINNExampleOverlay +def img_resize(img, size): + w, h = img.size + if (w <= h and w == size) or (h <= w and h == size): + return img + if w < h: + ow = size + oh = int(size * h / w) + return img.resize((ow, oh), Image.BILINEAR) + else: + oh = size + ow = int(size * w / h) + return img.resize((ow, oh), Image.BILINEAR) + +def img_center_crop(img, size): + crop_height, crop_width = (size, size) + image_width, image_height = img.size + crop_top = int(round((image_height - crop_height) / 2.)) + crop_left = int(round((image_width - crop_width) / 2.)) + return img.crop((crop_left, crop_top, crop_left + crop_width, crop_top + crop_height)) + +def pre_process(img_np): + img = Image.fromarray(img_np.astype(np.uint8)) + img = img_resize(img, 256) + img = img_center_crop(img, 224) + img = np.array(img, dtype=np.uint8) + return img + +def setup_dataloader(val_path, label_file_path = None, batch_size=100, n_images = 50000): + if label_file_path is None: + val_folders = [ f.name for f in os.scandir(val_path) if f.is_dir() ] + val_folders = sorted(val_folders) + assert len(val_folders) == 1000, "Expected 1000 subfolders in ILSVRC2012 val" + files = [] + labels = [] + for idx, folder in enumerate(val_folders): + current_files = sorted(os.listdir(os.path.join(val_path, folder))) + current_files = [os.path.join(folder, file) for file in current_files] + files.extend(current_files) + labels.extend([idx]*len(current_files)) + files = files[:n_images] + else: + files = ['ILSVRC2012_val_{:08d}.JPEG'.format(i) for i in range(1,n_images+1)] + labels = np.loadtxt(label_file_path, dtype=int, usecols=1) + + file_queue = FileQueue() + file_queue.load_epochs(list(zip(files,labels)), shuffle=False) + img_queue = ImgQueue(maxsize=batch_size) + img_queue.start_loaders(file_queue, 
num_threads=1, img_dir=val_path, transform=pre_process) + return img_queue + if __name__ == "__main__": parser = argparse.ArgumentParser( description="Validate top-1 accuracy for FINN-generated accelerator" @@ -38,7 +93,7 @@ parser.add_argument( "--batchsize", help="number of samples for inference", type=int, default=100 ) - parser.add_argument("--dataset", help="dataset to use (mnist of cifar10)", required=True) + parser.add_argument("--dataset", help="dataset to use (mnist, cifar10, cifar100, imagenet)", default="") parser.add_argument( "--platform", help="Target platform: zynq-iodma alveo", default="zynq-iodma" ) @@ -48,14 +103,43 @@ parser.add_argument( "--dataset_root", help="dataset root dir for download/reuse", default="/tmp" ) + parser.add_argument( + "--reportfile", + help="Name of output .json report file", + type=str, + default="validation.json", + ) + parser.add_argument( + "--settingsfile", help="Name of optional input .json settings file", type=str, default="" + ) # parse arguments args = parser.parse_args() bsize = args.batchsize dataset = args.dataset bitfile = args.bitfile platform = args.platform + reportfile = args.reportfile + settingsfile = args.settingsfile dataset_root = args.dataset_root + # overwrite settings if specified in settings file + if settingsfile != "": + with open(settingsfile, "r") as f: + settings = json.load(f) + if "validation_dataset" in settings: + dataset = settings["validation_dataset"] + + # program FPGA and load driver + PL.reset() # reset PYNQ cache + driver = FINNExampleOverlay( + bitfile_name=bitfile, + platform=platform, + io_shape_dict=io_shape_dict, + batch_size=bsize, + runtime_weight_dir="runtime_weights/", + ) + + # prepare dataset if dataset == "mnist": from dataset_loading import mnist @@ -68,40 +152,72 @@ trainx, trainy, testx, testy, valx, valy = cifar.load_cifar_data( dataset_root, download=True, one_hot=False ) + elif dataset == "cifar100": + from dataset_loading import cifar + trainx, trainy, testx, 
testy, valx, valy = cifar.load_cifar_data( + dataset_root, download=True, one_hot=False, cifar10=False + ) + elif dataset == "imagenet": + val_dir = dataset_root + "/ImageNet/2012/val" + label_file = dataset_root + "/ImageNet/2012/val.txt" + img_queue = setup_dataloader(val_dir, label_file, bsize) + total = 50000 else: raise Exception("Unrecognized dataset") - test_imgs = testx - test_labels = testy - - ok = 0 - nok = 0 - total = test_imgs.shape[0] + # run accelerator on dataset + if dataset in ["mnist", "cifar10", "cifar100"]: + test_imgs = testx + test_labels = testy - driver = FINNExampleOverlay( - bitfile_name=bitfile, - platform=platform, - io_shape_dict=io_shape_dict, - batch_size=bsize, - runtime_weight_dir="runtime_weights/", - ) + ok = 0 + nok = 0 + total = test_imgs.shape[0] - n_batches = int(total / bsize) + n_batches = int(total / bsize) - test_imgs = test_imgs.reshape(n_batches, bsize, -1) - test_labels = test_labels.reshape(n_batches, bsize) + test_imgs = test_imgs.reshape(n_batches, bsize, -1) + test_labels = test_labels.reshape(n_batches, bsize) - for i in range(n_batches): - ibuf_normal = test_imgs[i].reshape(driver.ibuf_packed_device[0].shape) - exp = test_labels[i] - driver.copy_input_data_to_device(ibuf_normal) - driver.execute_on_buffers() - obuf_normal = np.empty_like(driver.obuf_packed_device[0]) - driver.copy_output_data_from_device(obuf_normal) - ret = np.bincount(obuf_normal.flatten() == exp.flatten()) - nok += ret[0] - ok += ret[1] - print("batch %d / %d : total OK %d NOK %d" % (i + 1, n_batches, ok, nok)) + print("Starting validation..") + for i in range(n_batches): + ibuf_normal = test_imgs[i].reshape(driver.ishape_normal()) + exp = test_labels[i] + obuf_normal = driver.execute(ibuf_normal) + #obuf_normal = obuf_normal.reshape(bsize, -1)[:,0] + #TODO: detect automatically if argmax is needed or output is already top-1 + obuf_normal = np.argmax(obuf_normal, axis=1) + ret = np.bincount(obuf_normal.flatten() == exp.flatten(), minlength=2) 
+ nok += ret[0] + ok += ret[1] + print("batch %d / %d : total OK %d NOK %d" % (i + 1, n_batches, ok, nok)) + elif dataset in ["imagenet"]: + ok = 0 + nok = 0 + i = 0 + print("Starting validation..") + while not img_queue.last_batch: + imgs, lbls = img_queue.get_batch(bsize, timeout=None) + imgs = np.array(imgs) + exp = np.array(lbls) + ibuf_normal = imgs.reshape(driver.ishape_normal()) + obuf_normal = driver.execute(ibuf_normal) + #obuf_normal = obuf_normal.reshape(bsize, -1)[:,0] + #TODO: detect automatically if argmax is needed or output is already top-1 + obuf_normal = np.argmax(obuf_normal, axis=1) + ret = np.bincount(obuf_normal.flatten() == exp.flatten(), minlength=2) + nok += ret[0] + ok += ret[1] + i += 1 + print("batch %d : total OK %d NOK %d" % (i, ok, nok)) + # calculate top-1 accuracy acc = 100.0 * ok / (total) print("Final accuracy: %f" % acc) + + # write report to file + report = { + "top-1_accuracy": acc, + } + with open(reportfile, "w") as f: + json.dump(report, f, indent=2) diff --git a/src/finn/transformation/fpgadataflow/make_pynq_driver.py b/src/finn/transformation/fpgadataflow/make_pynq_driver.py index 42cc017d30..c6ddfbd173 100644 --- a/src/finn/transformation/fpgadataflow/make_pynq_driver.py +++ b/src/finn/transformation/fpgadataflow/make_pynq_driver.py @@ -74,9 +74,10 @@ class MakePYNQDriverIODMA(Transformation): under the runtime_weights/ subfolder of the pynq_driver_dir. 
""" - def __init__(self, platform): + def __init__(self, platform, validation_datset): super().__init__() self.platform = platform + self.validation_datset = validation_datset def apply(self, model): # create a temporary folder for the generated driver @@ -270,8 +271,16 @@ def apply(self, model): ) shutil.copy(validate_template, validate_py) - # generate weight files for runtime-writable layers + # generate settings.json for generated driver + if self.validation_datset is not None: + settings = { + "validation_datset": self.validation_datset, + } + settingsfile = pynq_driver_dir + "/settings.json" + with open(settingsfile, "w") as f: + json.dump(settings, f, indent=2) + # generate weight files for runtime-writable layers for sdp_ind, sdp_node in enumerate(model.graph.node): assert sdp_node.op_type == "StreamingDataflowPartition" # get dataflow model From d0e33d005cb82225dfdfb98eda6b4a43210752c4 Mon Sep 17 00:00:00 2001 From: Felix Jentzsch Date: Thu, 24 Apr 2025 11:05:17 +0200 Subject: [PATCH 089/125] Update gitignore --- .gitignore | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/.gitignore b/.gitignore index f40370b443..dbac36d4f9 100644 --- a/.gitignore +++ b/.gitignore @@ -49,6 +49,9 @@ __pycache__/* .settings .idea tags +poetry.lock +*.code-workspace +.env # Package files *.egg @@ -97,6 +100,7 @@ MANIFEST # downloaded dep repos /deps/ +# local test directories for benchmarking infrastructure bench_input bench_output bench_save From 5b45cde002b350fcc919d7ada8c41931342ed0fc Mon Sep 17 00:00:00 2001 From: Felix Jentzsch Date: Fri, 25 Apr 2025 15:44:39 +0200 Subject: [PATCH 090/125] Fix typo --- src/finn/transformation/fpgadataflow/make_pynq_driver.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/finn/transformation/fpgadataflow/make_pynq_driver.py b/src/finn/transformation/fpgadataflow/make_pynq_driver.py index c6ddfbd173..6dad5dc1d8 100644 --- a/src/finn/transformation/fpgadataflow/make_pynq_driver.py +++ 
b/src/finn/transformation/fpgadataflow/make_pynq_driver.py @@ -274,7 +274,7 @@ def apply(self, model): # generate settings.json for generated driver if self.validation_datset is not None: settings = { - "validation_datset": self.validation_datset, + "validation_dataset": self.validation_datset, } settingsfile = pynq_driver_dir + "/settings.json" with open(settingsfile, "w") as f: From b5aee28630d9958d0572ac927cda8b8cb9c9e69a Mon Sep 17 00:00:00 2001 From: Felix Jentzsch Date: Mon, 28 Apr 2025 12:48:53 +0200 Subject: [PATCH 091/125] Update gitignore --- .dvc/.gitignore | 3 +++ .gitignore | 9 +++++++++ 2 files changed, 12 insertions(+) create mode 100644 .dvc/.gitignore diff --git a/.dvc/.gitignore b/.dvc/.gitignore new file mode 100644 index 0000000000..528f30c71c --- /dev/null +++ b/.dvc/.gitignore @@ -0,0 +1,3 @@ +/config.local +/tmp +/cache diff --git a/.gitignore b/.gitignore index be61378730..dbac36d4f9 100644 --- a/.gitignore +++ b/.gitignore @@ -49,6 +49,9 @@ __pycache__/* .settings .idea tags +poetry.lock +*.code-workspace +.env # Package files *.egg @@ -96,3 +99,9 @@ MANIFEST # downloaded dep repos /deps/ + +# local test directories for benchmarking infrastructure +bench_input +bench_output +bench_save +bench_work From 4f9dc7ee13006b004bc3700c354011ae38608add Mon Sep 17 00:00:00 2001 From: Felix Jentzsch Date: Mon, 28 Apr 2025 13:08:51 +0200 Subject: [PATCH 092/125] [Driver] Increase recursion limit --- src/finn/qnn-data/templates/driver/driver_fifosizing.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/src/finn/qnn-data/templates/driver/driver_fifosizing.py b/src/finn/qnn-data/templates/driver/driver_fifosizing.py index a87342f79e..e86b28772d 100644 --- a/src/finn/qnn-data/templates/driver/driver_fifosizing.py +++ b/src/finn/qnn-data/templates/driver/driver_fifosizing.py @@ -279,7 +279,12 @@ def determine_start_depth( folding_config_lfs = settings["folding_config_before_lfs"] print("Programming FPGA..") - PL.reset() # reset PYNQ 
cache + # Increase recursion limit because the default value (1000) caused pickle RecursionErrors + # during PYNQ cache handling for accelerators with many FIFOs (exact reason unknown) + sys.setrecursionlimit(10000) + # Reset PYNQ cache, without this we encountered issues where PYNQ would try to load + # an incorrect combination of .bit and .hwh file, see https://github.com/Xilinx/PYNQ/issues/1409 + PL.reset() accel = FINNLiveFIFOOverlay( bitfile_name=bitfile, device=device, fclk_mhz=frequency, seed=seed, fifo_widths=fifo_widths ) From 46995244766d2f629ae2354c3a20ea907ee958d7 Mon Sep 17 00:00:00 2001 From: Felix Jentzsch Date: Tue, 6 May 2025 20:47:20 +0200 Subject: [PATCH 093/125] [Driver] Support top1 output --- src/finn/qnn-data/templates/driver/validate.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/finn/qnn-data/templates/driver/validate.py b/src/finn/qnn-data/templates/driver/validate.py index 55e7603650..16f1e7a029 100644 --- a/src/finn/qnn-data/templates/driver/validate.py +++ b/src/finn/qnn-data/templates/driver/validate.py @@ -185,8 +185,8 @@ def setup_dataloader(val_path, label_file_path = None, batch_size=100, n_images exp = test_labels[i] obuf_normal = driver.execute(ibuf_normal) #obuf_normal = obuf_normal.reshape(bsize, -1)[:,0] - #TODO: detect automatically if argmax is needed or output is already top-1 - obuf_normal = np.argmax(obuf_normal, axis=1) + if obuf_normal.shape[1] > 1: + obuf_normal = np.argmax(obuf_normal, axis=1) ret = np.bincount(obuf_normal.flatten() == exp.flatten(), minlength=2) nok += ret[0] ok += ret[1] @@ -203,8 +203,8 @@ def setup_dataloader(val_path, label_file_path = None, batch_size=100, n_images ibuf_normal = imgs.reshape(driver.ishape_normal()) obuf_normal = driver.execute(ibuf_normal) #obuf_normal = obuf_normal.reshape(bsize, -1)[:,0] - #TODO: detect automatically if argmax is needed or output is already top-1 - obuf_normal = np.argmax(obuf_normal, axis=1) + if obuf_normal.shape[1] > 1: + 
obuf_normal = np.argmax(obuf_normal, axis=1) ret = np.bincount(obuf_normal.flatten() == exp.flatten(), minlength=2) nok += ret[0] ok += ret[1] From 215b6ca272118a7ac6fccde7d63223a0fe3b213a Mon Sep 17 00:00:00 2001 From: Felix Jentzsch Date: Mon, 19 May 2025 19:13:12 +0200 Subject: [PATCH 094/125] [CI] Fix artifact pull from parent pipeline --- benchmarking/bench-ci.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/benchmarking/bench-ci.yml b/benchmarking/bench-ci.yml index 7e9376f3cf..28b3e9d83b 100644 --- a/benchmarking/bench-ci.yml +++ b/benchmarking/bench-ci.yml @@ -19,7 +19,7 @@ FINN Build: aud: https://git.uni-paderborn.de stage: build needs: - - job: Fetch Repos + - job: Build pipeline: $PARENT_PIPELINE_ID variables: SCHEDULER_PARAMETERS: "-A $PROJECT_ACCOUNT -p $SLURM_PARTITION -t $SLURM_TIMEOUT $SLURM_QOS --nodes 1 --ntasks 1 --cpus-per-task $CPU_CORES_BENCH --exclusive --array 0-$( expr $PARALLEL_JOBS - 1 )" From ffc9fd9650155570d19d198e3be31cc5ade31ec9 Mon Sep 17 00:00:00 2001 From: Felix Jentzsch Date: Tue, 20 May 2025 13:41:59 +0200 Subject: [PATCH 095/125] Fix make driver step name --- benchmarking/dut/metafi.py | 2 +- benchmarking/dut/mobilenetv1.py | 2 +- benchmarking/dut/mvau.py | 2 +- benchmarking/dut/resnet50.py | 2 +- benchmarking/dut/transformer.py | 2 +- benchmarking/dut/vgg10.py | 2 +- 6 files changed, 6 insertions(+), 6 deletions(-) diff --git a/benchmarking/dut/metafi.py b/benchmarking/dut/metafi.py index 4c9dec2521..72912c45fc 100644 --- a/benchmarking/dut/metafi.py +++ b/benchmarking/dut/metafi.py @@ -39,7 +39,7 @@ def step_build_setup(self): "step_measure_rtlsim_performance", "step_out_of_context_synthesis", "step_synthesize_bitfile", - "step_make_pynq_driver", + "step_make_driver", "step_deployment_package", ] diff --git a/benchmarking/dut/mobilenetv1.py b/benchmarking/dut/mobilenetv1.py index 06042816cf..a3899b1382 100644 --- a/benchmarking/dut/mobilenetv1.py +++ b/benchmarking/dut/mobilenetv1.py @@ -136,7 +136,7 @@ 
def step_build_setup(self): "step_hw_ipgen", "step_create_stitched_ip", "step_synthesize_bitfile", - "step_make_pynq_driver", + "step_make_driver", "step_deployment_package", ] # mobilenet_build_steps_alveo = [ diff --git a/benchmarking/dut/mvau.py b/benchmarking/dut/mvau.py index f62c6b59a7..d67a926160 100644 --- a/benchmarking/dut/mvau.py +++ b/benchmarking/dut/mvau.py @@ -315,7 +315,7 @@ def step_build_setup(self): "step_measure_rtlsim_performance", "step_out_of_context_synthesis", "step_synthesize_bitfile", - "step_make_pynq_driver", + "step_make_driver", "step_deployment_package", ] ) diff --git a/benchmarking/dut/resnet50.py b/benchmarking/dut/resnet50.py index bf5aed8ab4..0535db7269 100644 --- a/benchmarking/dut/resnet50.py +++ b/benchmarking/dut/resnet50.py @@ -31,7 +31,7 @@ def step_build_setup(self): "step_measure_rtlsim_performance", # was not in finn-examples "step_out_of_context_synthesis", # was not in finn-examples "step_synthesize_bitfile", - "step_make_pynq_driver", + "step_make_driver", "step_deployment_package", ] diff --git a/benchmarking/dut/transformer.py b/benchmarking/dut/transformer.py index 1798ea1410..d1b14fca72 100644 --- a/benchmarking/dut/transformer.py +++ b/benchmarking/dut/transformer.py @@ -971,7 +971,7 @@ def step_build_setup(self): # "step_measure_rtlsim_performance", # not possible due to float components "step_out_of_context_synthesis", # for synthesis results (e.g. 
utilization) "step_synthesize_bitfile", - "step_make_pynq_driver", + "step_make_driver", "step_deployment_package", ] ) diff --git a/benchmarking/dut/vgg10.py b/benchmarking/dut/vgg10.py index e64a58fb2f..516d5c47de 100644 --- a/benchmarking/dut/vgg10.py +++ b/benchmarking/dut/vgg10.py @@ -41,7 +41,7 @@ def step_build_setup(self): "step_measure_rtlsim_performance", "step_out_of_context_synthesis", "step_synthesize_bitfile", - "step_make_pynq_driver", + "step_make_driver", "step_deployment_package", ] From d20b10d23366cb9eba3c856e6b0020fd4b1e2dfa Mon Sep 17 00:00:00 2001 From: Felix Jentzsch Date: Tue, 20 May 2025 14:05:33 +0200 Subject: [PATCH 096/125] Move benchmarking scripts to FINN package --- .gitlab-ci.yml | 2 +- .../finn/benchmarking}/bench-ci.yml | 9 +++--- .../finn/benchmarking}/bench.py | 22 ++++++--------- .../finn/benchmarking}/bench_base.py | 4 +-- .../finn/benchmarking}/bench_rtl_swg.py | 0 .../finn/benchmarking}/cfg/metafi_test.json | 0 .../benchmarking}/cfg/mobilenetv1_test.json | 0 .../finn/benchmarking}/cfg/mvau_test.json | 0 .../finn/benchmarking}/cfg/resnet50_test.json | 0 .../benchmarking}/cfg/synthetic_fifotest.json | 0 .../cfg/transformer_gpt_all.json | 0 .../cfg/transformer_radioml_all.json | 0 .../benchmarking}/cfg/transformer_sweep.json | 0 .../benchmarking}/cfg/transformer_test.json | 0 .../finn/benchmarking}/cfg/vgg10_test.json | 0 .../finn/benchmarking}/collect.py | 4 +-- .../finn/benchmarking}/dut/metafi.py | 2 +- .../finn/benchmarking}/dut/mobilenetv1.py | 2 +- .../finn/benchmarking}/dut/mvau.py | 2 +- .../finn/benchmarking}/dut/resnet50.py | 4 +-- .../dut/resnet50_custom_steps.py | 0 .../benchmarking}/dut/synthetic_nonlinear.py | 4 +-- .../finn/benchmarking}/dut/transformer.py | 4 +-- .../dut/transformer_custom_steps.py | 0 .../finn/benchmarking}/dut/vgg10.py | 2 +- .../finn/benchmarking}/measure.py | 2 +- .../finn/benchmarking}/templates.py | 0 .../finn/benchmarking}/util.py | 0 src/finn/interface/run_finn.py | 28 
+++++++++++++++++++ 29 files changed, 57 insertions(+), 34 deletions(-) rename {benchmarking => src/finn/benchmarking}/bench-ci.yml (88%) rename {benchmarking => src/finn/benchmarking}/bench.py (93%) rename {benchmarking => src/finn/benchmarking}/bench_base.py (98%) rename {benchmarking => src/finn/benchmarking}/bench_rtl_swg.py (100%) rename {benchmarking => src/finn/benchmarking}/cfg/metafi_test.json (100%) rename {benchmarking => src/finn/benchmarking}/cfg/mobilenetv1_test.json (100%) rename {benchmarking => src/finn/benchmarking}/cfg/mvau_test.json (100%) rename {benchmarking => src/finn/benchmarking}/cfg/resnet50_test.json (100%) rename {benchmarking => src/finn/benchmarking}/cfg/synthetic_fifotest.json (100%) rename {benchmarking => src/finn/benchmarking}/cfg/transformer_gpt_all.json (100%) rename {benchmarking => src/finn/benchmarking}/cfg/transformer_radioml_all.json (100%) rename {benchmarking => src/finn/benchmarking}/cfg/transformer_sweep.json (100%) rename {benchmarking => src/finn/benchmarking}/cfg/transformer_test.json (100%) rename {benchmarking => src/finn/benchmarking}/cfg/vgg10_test.json (100%) rename {benchmarking => src/finn/benchmarking}/collect.py (99%) rename {benchmarking => src/finn/benchmarking}/dut/metafi.py (97%) rename {benchmarking => src/finn/benchmarking}/dut/mobilenetv1.py (99%) rename {benchmarking => src/finn/benchmarking}/dut/mvau.py (99%) rename {benchmarking => src/finn/benchmarking}/dut/resnet50.py (92%) rename {benchmarking => src/finn/benchmarking}/dut/resnet50_custom_steps.py (100%) rename {benchmarking => src/finn/benchmarking}/dut/synthetic_nonlinear.py (98%) rename {benchmarking => src/finn/benchmarking}/dut/transformer.py (99%) rename {benchmarking => src/finn/benchmarking}/dut/transformer_custom_steps.py (100%) rename {benchmarking => src/finn/benchmarking}/dut/vgg10.py (97%) rename {benchmarking => src/finn/benchmarking}/measure.py (98%) rename {benchmarking => src/finn/benchmarking}/templates.py (100%) rename 
{benchmarking => src/finn/benchmarking}/util.py (100%) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 990f2758ff..4d89ef0853 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -180,7 +180,7 @@ Bench: when: never - if: $MANUAL_CFG_PATH == "" trigger: - include: benchmarking/bench-ci.yml + include: src/finn/benchmarking/bench-ci.yml strategy: depend forward: pipeline_variables: true diff --git a/benchmarking/bench-ci.yml b/src/finn/benchmarking/bench-ci.yml similarity index 88% rename from benchmarking/bench-ci.yml rename to src/finn/benchmarking/bench-ci.yml index 28b3e9d83b..9e960f8ecd 100644 --- a/benchmarking/bench-ci.yml +++ b/src/finn/benchmarking/bench-ci.yml @@ -28,11 +28,10 @@ FINN Build: script: # Launch additional monitoring - $JOB_MONITORING_DIR/monitor.sh $JOB_MONITORING_DIR/$CI_PIPELINE_ID/$HOSTNAME.log & - # Launch benchmarking script directly (TODO: deeper integration) + # Launch benchmarking script via FINN CLI, includes deps update and environment preparation - | source ./finn-plus-venv/bin/activate - finn deps update - python ./finn-plus/benchmarking/bench.py $BENCH_CFG + finn bench $BENCH_CFG cache: key: $CI_COMMIT_SHA policy: pull @@ -56,7 +55,7 @@ Measurement: - when: always script: # Run as root and activate the PYNQ venv manually to use PYNQ outside of the typical Jupyter environment - - sudo bash -c "source /etc/profile.d/pynq_venv.sh && export XILINX_XRT=/usr && python benchmarking/measure.py" + - sudo bash -c "source /etc/profile.d/pynq_venv.sh && export XILINX_XRT=/usr && python src/finn/benchmarking/measure.py" artifacts: name: "measurement_artifacts" when: always @@ -74,5 +73,5 @@ Result Collection: # Also run on failure of previous tasks to collect partial results - when: always script: - - python3.10 benchmarking/collect.py + - python3.10 src/finn/benchmarking/collect.py - dvc exp push -f -j 4 -r push git@github.com:eki-project/finn-plus.git diff --git a/benchmarking/bench.py b/src/finn/benchmarking/bench.py similarity index 93% 
rename from benchmarking/bench.py rename to src/finn/benchmarking/bench.py index 54788ac6a5..8d87036477 100644 --- a/benchmarking/bench.py +++ b/src/finn/benchmarking/bench.py @@ -7,15 +7,15 @@ import onnxruntime as ort import importlib -from util import delete_dir_contents +from finn.benchmarking.util import delete_dir_contents -from dut.mvau import bench_mvau -from dut.resnet50 import bench_resnet50 -from dut.metafi import bench_metafi -from dut.synthetic_nonlinear import bench_synthetic_nonlinear -from dut.transformer import bench_transformer -from dut.vgg10 import bench_vgg10 -from dut.mobilenetv1 import bench_mobilenetv1 +from finn.benchmarking.dut.mvau import bench_mvau +from finn.benchmarking.dut.resnet50 import bench_resnet50 +from finn.benchmarking.dut.metafi import bench_metafi +from finn.benchmarking.dut.synthetic_nonlinear import bench_synthetic_nonlinear +from finn.benchmarking.dut.transformer import bench_transformer +from finn.benchmarking.dut.vgg10 import bench_vgg10 +from finn.benchmarking.dut.mobilenetv1 import bench_mobilenetv1 dut = dict() dut["mvau"] = bench_mvau @@ -27,7 +27,7 @@ dut["mobilenetv1"] = bench_mobilenetv1 -def main(config_name): +def start_bench_run(config_name): exit_code = 0 # Attempt to work around onnxruntime issue on Slurm-managed clusters: # See https://github.com/microsoft/onnxruntime/issues/8313 @@ -193,7 +193,3 @@ def get_default_session_options_new(): print("Stopping job") return exit_code - -if __name__ == "__main__": - exit_code = main(sys.argv[1]) - sys.exit(exit_code) diff --git a/benchmarking/bench_base.py b/src/finn/benchmarking/bench_base.py similarity index 98% rename from benchmarking/bench_base.py rename to src/finn/benchmarking/bench_base.py index 39a16dd7bc..16ef757389 100644 --- a/benchmarking/bench_base.py +++ b/src/finn/benchmarking/bench_base.py @@ -27,8 +27,8 @@ from finn.transformation.fpgadataflow.make_zynq_proj import collect_ip_dirs import finn.builder.build_dataflow_config as build_cfg from 
finn.util.basic import make_build_dir, pynq_native_port_width, part_map, alveo_default_platform, alveo_part_map -from templates import template_open, template_single_test, template_sim_power, template_switching_simulation_tb, zynq_harness_template -from util import summarize_table, summarize_section, power_xml_to_dict, delete_dir_contents +from finn.benchmarking.templates import template_open, template_single_test, template_sim_power, template_switching_simulation_tb, zynq_harness_template +from finn.benchmarking.util import summarize_table, summarize_section, power_xml_to_dict, delete_dir_contents from finn.transformation.fpgadataflow.replace_verilog_relpaths import ( ReplaceVerilogRelPaths, ) diff --git a/benchmarking/bench_rtl_swg.py b/src/finn/benchmarking/bench_rtl_swg.py similarity index 100% rename from benchmarking/bench_rtl_swg.py rename to src/finn/benchmarking/bench_rtl_swg.py diff --git a/benchmarking/cfg/metafi_test.json b/src/finn/benchmarking/cfg/metafi_test.json similarity index 100% rename from benchmarking/cfg/metafi_test.json rename to src/finn/benchmarking/cfg/metafi_test.json diff --git a/benchmarking/cfg/mobilenetv1_test.json b/src/finn/benchmarking/cfg/mobilenetv1_test.json similarity index 100% rename from benchmarking/cfg/mobilenetv1_test.json rename to src/finn/benchmarking/cfg/mobilenetv1_test.json diff --git a/benchmarking/cfg/mvau_test.json b/src/finn/benchmarking/cfg/mvau_test.json similarity index 100% rename from benchmarking/cfg/mvau_test.json rename to src/finn/benchmarking/cfg/mvau_test.json diff --git a/benchmarking/cfg/resnet50_test.json b/src/finn/benchmarking/cfg/resnet50_test.json similarity index 100% rename from benchmarking/cfg/resnet50_test.json rename to src/finn/benchmarking/cfg/resnet50_test.json diff --git a/benchmarking/cfg/synthetic_fifotest.json b/src/finn/benchmarking/cfg/synthetic_fifotest.json similarity index 100% rename from benchmarking/cfg/synthetic_fifotest.json rename to 
src/finn/benchmarking/cfg/synthetic_fifotest.json diff --git a/benchmarking/cfg/transformer_gpt_all.json b/src/finn/benchmarking/cfg/transformer_gpt_all.json similarity index 100% rename from benchmarking/cfg/transformer_gpt_all.json rename to src/finn/benchmarking/cfg/transformer_gpt_all.json diff --git a/benchmarking/cfg/transformer_radioml_all.json b/src/finn/benchmarking/cfg/transformer_radioml_all.json similarity index 100% rename from benchmarking/cfg/transformer_radioml_all.json rename to src/finn/benchmarking/cfg/transformer_radioml_all.json diff --git a/benchmarking/cfg/transformer_sweep.json b/src/finn/benchmarking/cfg/transformer_sweep.json similarity index 100% rename from benchmarking/cfg/transformer_sweep.json rename to src/finn/benchmarking/cfg/transformer_sweep.json diff --git a/benchmarking/cfg/transformer_test.json b/src/finn/benchmarking/cfg/transformer_test.json similarity index 100% rename from benchmarking/cfg/transformer_test.json rename to src/finn/benchmarking/cfg/transformer_test.json diff --git a/benchmarking/cfg/vgg10_test.json b/src/finn/benchmarking/cfg/vgg10_test.json similarity index 100% rename from benchmarking/cfg/vgg10_test.json rename to src/finn/benchmarking/cfg/vgg10_test.json diff --git a/benchmarking/collect.py b/src/finn/benchmarking/collect.py similarity index 99% rename from benchmarking/collect.py rename to src/finn/benchmarking/collect.py index 81dfbe339f..fa71c2a2aa 100644 --- a/benchmarking/collect.py +++ b/src/finn/benchmarking/collect.py @@ -1,9 +1,9 @@ import json import os import shutil -from dvclive import Live +from dvclive.live import Live -from util import delete_dir_contents +from finn.benchmarking.util import delete_dir_contents def log_dvc_metric(live, prefix, name, value): diff --git a/benchmarking/dut/metafi.py b/src/finn/benchmarking/dut/metafi.py similarity index 97% rename from benchmarking/dut/metafi.py rename to src/finn/benchmarking/dut/metafi.py index 72912c45fc..05c75eee08 100644 --- 
a/benchmarking/dut/metafi.py +++ b/src/finn/benchmarking/dut/metafi.py @@ -1,6 +1,6 @@ import finn.builder.build_dataflow_config as build_cfg -from bench_base import bench +from finn.benchmarking.bench_base import bench # # custom steps # from custom_steps import ( diff --git a/benchmarking/dut/mobilenetv1.py b/src/finn/benchmarking/dut/mobilenetv1.py similarity index 99% rename from benchmarking/dut/mobilenetv1.py rename to src/finn/benchmarking/dut/mobilenetv1.py index a3899b1382..d3c0968d1a 100644 --- a/benchmarking/dut/mobilenetv1.py +++ b/src/finn/benchmarking/dut/mobilenetv1.py @@ -1,4 +1,4 @@ -from bench_base import bench +from finn.benchmarking.bench_base import bench from qonnx.core.modelwrapper import ModelWrapper from qonnx.transformation.change_datalayout import ChangeDataLayoutQuantAvgPool2d from qonnx.transformation.double_to_single_float import DoubleToSingleFloat diff --git a/benchmarking/dut/mvau.py b/src/finn/benchmarking/dut/mvau.py similarity index 99% rename from benchmarking/dut/mvau.py rename to src/finn/benchmarking/dut/mvau.py index d67a926160..8ce89fdccc 100644 --- a/benchmarking/dut/mvau.py +++ b/src/finn/benchmarking/dut/mvau.py @@ -22,7 +22,7 @@ ) import finn.builder.build_dataflow_config as build_cfg -from bench_base import bench +from finn.benchmarking.bench_base import bench class bench_mvau(bench): diff --git a/benchmarking/dut/resnet50.py b/src/finn/benchmarking/dut/resnet50.py similarity index 92% rename from benchmarking/dut/resnet50.py rename to src/finn/benchmarking/dut/resnet50.py index 0535db7269..efcd0de275 100644 --- a/benchmarking/dut/resnet50.py +++ b/src/finn/benchmarking/dut/resnet50.py @@ -1,14 +1,14 @@ import finn.builder.build_dataflow_config as build_cfg from finn.util.basic import alveo_default_platform -from dut.resnet50_custom_steps import ( +from finn.benchmarking.dut.resnet50_custom_steps import ( step_resnet50_tidy, step_resnet50_streamline, step_resnet50_convert_to_hw, step_resnet50_slr_floorplan, ) -from 
bench_base import bench +from finn.benchmarking.bench_base import bench class bench_resnet50(bench): def step_build_setup(self): diff --git a/benchmarking/dut/resnet50_custom_steps.py b/src/finn/benchmarking/dut/resnet50_custom_steps.py similarity index 100% rename from benchmarking/dut/resnet50_custom_steps.py rename to src/finn/benchmarking/dut/resnet50_custom_steps.py diff --git a/benchmarking/dut/synthetic_nonlinear.py b/src/finn/benchmarking/dut/synthetic_nonlinear.py similarity index 98% rename from benchmarking/dut/synthetic_nonlinear.py rename to src/finn/benchmarking/dut/synthetic_nonlinear.py index eb91999b2e..b912e8b319 100644 --- a/benchmarking/dut/synthetic_nonlinear.py +++ b/src/finn/benchmarking/dut/synthetic_nonlinear.py @@ -24,13 +24,13 @@ import finn.builder.build_dataflow as build import finn.builder.build_dataflow_config as build_cfg from finn.util.basic import make_build_dir -from util import summarize_table, summarize_section, power_xml_to_dict, delete_dir_contents +from finn.benchmarking.util import summarize_table, summarize_section, power_xml_to_dict, delete_dir_contents from finn.util.test import get_trained_network_and_ishape from finn.util.basic import alveo_default_platform -from bench_base import bench +from finn.benchmarking.bench_base import bench def generate_random_threshold_values( data_type, num_input_channels, num_steps, narrow=False, per_tensor=False diff --git a/benchmarking/dut/transformer.py b/src/finn/benchmarking/dut/transformer.py similarity index 99% rename from benchmarking/dut/transformer.py rename to src/finn/benchmarking/dut/transformer.py index d1b14fca72..27583ec5e1 100644 --- a/benchmarking/dut/transformer.py +++ b/src/finn/benchmarking/dut/transformer.py @@ -22,14 +22,14 @@ import finn.builder.build_dataflow_config as build_cfg from finn.builder.build_dataflow_config import AutoFIFOSizingMethod from qonnx.core.modelwrapper import ModelWrapper -from bench_base import bench +from finn.benchmarking.bench_base import 
bench # Range information structure for seeding the range analysis for converting # quantized activations to MultiThreshold from qonnx.util.range_analysis import RangeInfo # Custom build steps required to streamline and convert the attention operator -from dut.transformer_custom_steps import ( +from finn.benchmarking.dut.transformer_custom_steps import ( prepare_graph, step_streamline, step_convert_attention_to_hw, diff --git a/benchmarking/dut/transformer_custom_steps.py b/src/finn/benchmarking/dut/transformer_custom_steps.py similarity index 100% rename from benchmarking/dut/transformer_custom_steps.py rename to src/finn/benchmarking/dut/transformer_custom_steps.py diff --git a/benchmarking/dut/vgg10.py b/src/finn/benchmarking/dut/vgg10.py similarity index 97% rename from benchmarking/dut/vgg10.py rename to src/finn/benchmarking/dut/vgg10.py index 516d5c47de..f799759108 100644 --- a/benchmarking/dut/vgg10.py +++ b/src/finn/benchmarking/dut/vgg10.py @@ -5,7 +5,7 @@ import finn.transformation.fpgadataflow.convert_to_hw_layers as to_hw import finn.transformation.streamline.absorb as absorb -from bench_base import bench +from finn.benchmarking.bench_base import bench def step_pre_streamline(model: ModelWrapper, cfg: DataflowBuildConfig): diff --git a/benchmarking/measure.py b/src/finn/benchmarking/measure.py similarity index 98% rename from benchmarking/measure.py rename to src/finn/benchmarking/measure.py index 7231991bde..9a44ff3192 100644 --- a/benchmarking/measure.py +++ b/src/finn/benchmarking/measure.py @@ -3,7 +3,7 @@ import subprocess import shutil -from util import delete_dir_contents +from finn.benchmarking.util import delete_dir_contents if __name__ == "__main__": diff --git a/benchmarking/templates.py b/src/finn/benchmarking/templates.py similarity index 100% rename from benchmarking/templates.py rename to src/finn/benchmarking/templates.py diff --git a/benchmarking/util.py b/src/finn/benchmarking/util.py similarity index 100% rename from 
benchmarking/util.py rename to src/finn/benchmarking/util.py diff --git a/src/finn/interface/run_finn.py b/src/finn/interface/run_finn.py index ca5faef96d..82f71316e0 100644 --- a/src/finn/interface/run_finn.py +++ b/src/finn/interface/run_finn.py @@ -32,6 +32,7 @@ from finn.interface.manage_deps import install_pyxsi, update_dependencies from finn.interface.manage_tests import run_test +from finn.benchmarking.bench import start_bench_run # Resolves the path to modules which are not part of the FINN package hierarchy def _resolve_module_path(name: str) -> str: @@ -260,6 +261,32 @@ def run(dependency_path: str, build_path: str, num_workers: int, script: str) -> ) +@click.command(help="Run a given benchmark configuration.") +@click.option( + "--bench_config", + help="Name or path of experiment configuration file", + default="", +) +@click.option("--dependency-path", "-d", default="") +@click.option("--num-workers", "-n", default=-1, show_default=True) +@click.option( + "--build-path", + "-b", + help="Specify a build temp path of your choice", + default="", +) +def bench( + bench_config: str, dependency_path: str, num_workers: int, build_path: str +) -> None: + console = Console() + build_dir = Path(build_path).expanduser() if build_path != "" else None + dep_path = Path(dependency_path).expanduser() if dependency_path != "" else None + prepare_finn(dep_path, Path(), build_dir, num_workers, is_test_run=True) + console.rule("RUNNING BENCHMARK") + exit_code = start_bench_run(bench_config) + sys.exit(exit_code) + + @click.command(help="Run a given test. 
Uses /tmp/FINN_TMP as the temporary file location") @click.option( "--variant", @@ -385,6 +412,7 @@ def main() -> None: main_group.add_command(config) main_group.add_command(deps) main_group.add_command(build) + main_group.add_command(bench) main_group.add_command(test) main_group.add_command(run) main_group() From 33921b84df16e881db47d4ddc16b3d9615528f63 Mon Sep 17 00:00:00 2001 From: Felix Jentzsch Date: Tue, 20 May 2025 14:13:25 +0200 Subject: [PATCH 097/125] Fix early import --- src/finn/interface/run_finn.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/src/finn/interface/run_finn.py b/src/finn/interface/run_finn.py index 82f71316e0..3661b414ab 100644 --- a/src/finn/interface/run_finn.py +++ b/src/finn/interface/run_finn.py @@ -32,7 +32,6 @@ from finn.interface.manage_deps import install_pyxsi, update_dependencies from finn.interface.manage_tests import run_test -from finn.benchmarking.bench import start_bench_run # Resolves the path to modules which are not part of the FINN package hierarchy def _resolve_module_path(name: str) -> str: @@ -275,14 +274,16 @@ def run(dependency_path: str, build_path: str, num_workers: int, script: str) -> help="Specify a build temp path of your choice", default="", ) -def bench( - bench_config: str, dependency_path: str, num_workers: int, build_path: str -) -> None: +def bench(bench_config: str, dependency_path: str, num_workers: int, build_path: str) -> None: console = Console() build_dir = Path(build_path).expanduser() if build_path != "" else None dep_path = Path(dependency_path).expanduser() if dependency_path != "" else None prepare_finn(dep_path, Path(), build_dir, num_workers, is_test_run=True) console.rule("RUNNING BENCHMARK") + + # Late import because we need prepare_finn to setup remaining dependencies first + from finn.benchmarking.bench import start_bench_run + exit_code = start_bench_run(bench_config) sys.exit(exit_code) From a49d003cb67023e8370522e6315906dc63cd0201 Mon Sep 17 00:00:00 
2001 From: Felix Jentzsch Date: Tue, 20 May 2025 14:29:57 +0200 Subject: [PATCH 098/125] Introduce custom step library --- src/finn/benchmarking/dut/mobilenetv1.py | 119 +----------------- src/finn/benchmarking/dut/vgg10.py | 18 --- .../builder/custom_step_library/__init__.py | 0 .../builder/custom_step_library/conv1d.py | 18 +++ .../builder/custom_step_library/mobilenet.py | 119 ++++++++++++++++++ .../custom_step_library/resnet.py} | 0 .../custom_step_library/transformer.py} | 0 7 files changed, 138 insertions(+), 136 deletions(-) create mode 100644 src/finn/builder/custom_step_library/__init__.py create mode 100644 src/finn/builder/custom_step_library/conv1d.py create mode 100644 src/finn/builder/custom_step_library/mobilenet.py rename src/finn/{benchmarking/dut/resnet50_custom_steps.py => builder/custom_step_library/resnet.py} (100%) rename src/finn/{benchmarking/dut/transformer_custom_steps.py => builder/custom_step_library/transformer.py} (100%) diff --git a/src/finn/benchmarking/dut/mobilenetv1.py b/src/finn/benchmarking/dut/mobilenetv1.py index d3c0968d1a..efcfb7b521 100644 --- a/src/finn/benchmarking/dut/mobilenetv1.py +++ b/src/finn/benchmarking/dut/mobilenetv1.py @@ -1,122 +1,5 @@ from finn.benchmarking.bench_base import bench -from qonnx.core.modelwrapper import ModelWrapper -from qonnx.transformation.change_datalayout import ChangeDataLayoutQuantAvgPool2d -from qonnx.transformation.double_to_single_float import DoubleToSingleFloat -from qonnx.transformation.general import ( - ApplyConfig, - GiveReadableTensorNames, - GiveUniqueNodeNames, -) -from qonnx.transformation.infer_data_layouts import InferDataLayouts -from qonnx.transformation.infer_datatypes import InferDataTypes -from qonnx.transformation.infer_shapes import InferShapes -from qonnx.transformation.lower_convs_to_matmul import LowerConvsToMatMul -from qonnx.transformation.remove import RemoveIdentityOps - -import finn.transformation.fpgadataflow.convert_to_hw_layers as to_hw -import 
finn.transformation.streamline.absorb as absorb -import finn.transformation.streamline.reorder as reorder -from finn.builder.build_dataflow_config import ( - DataflowBuildConfig, - ShellFlowType, - VerificationStepType, -) -from finn.builder.build_dataflow_steps import verify_step -from finn.transformation.streamline import Streamline -from finn.transformation.streamline.collapse_repeated import CollapseRepeatedMul -from finn.transformation.streamline.round_thresholds import RoundAndClipThresholds - - -def step_mobilenet_streamline(model: ModelWrapper, cfg: DataflowBuildConfig): - model = model.transform(Streamline()) - additional_streamline_transformations = [ - DoubleToSingleFloat(), - reorder.MoveMulPastDWConv(), - absorb.AbsorbMulIntoMultiThreshold(), - ChangeDataLayoutQuantAvgPool2d(), - InferDataLayouts(), - reorder.MoveTransposePastScalarMul(), - absorb.AbsorbTransposeIntoFlatten(), - reorder.MoveFlattenPastAffine(), - reorder.MoveFlattenPastTopK(), - reorder.MoveScalarMulPastMatMul(), - CollapseRepeatedMul(), - RemoveIdentityOps(), - RoundAndClipThresholds(), - ] - for trn in additional_streamline_transformations: - model = model.transform(trn) - model = model.transform(GiveUniqueNodeNames()) - model = model.transform(GiveReadableTensorNames()) - model = model.transform(InferDataTypes()) - - if VerificationStepType.STREAMLINED_PYTHON in cfg._resolve_verification_steps(): - verify_step(model, cfg, "streamlined_python", need_parent=False) - - return model - - -def step_mobilenet_lower_convs(model: ModelWrapper, cfg: DataflowBuildConfig): - model = model.transform(LowerConvsToMatMul()) - model = model.transform(absorb.AbsorbTransposeIntoMultiThreshold()) - model = model.transform(absorb.AbsorbConsecutiveTransposes()) - model = model.transform(GiveUniqueNodeNames()) - model = model.transform(GiveReadableTensorNames()) - model = model.transform(InferDataTypes()) - model = model.transform(RoundAndClipThresholds()) - model = model.transform(InferDataLayouts()) - 
return model - - -def step_mobilenet_convert_to_hw_layers(model: ModelWrapper, cfg: DataflowBuildConfig): - model = model.transform(to_hw.InferPool()) - model = model.transform(to_hw.InferConvInpGen()) - model = model.transform(to_hw.InferVectorVectorActivation()) - model = model.transform(to_hw.InferQuantizedMatrixVectorActivation()) - model = model.transform(to_hw.InferChannelwiseLinearLayer()) - model = model.transform(to_hw.InferLabelSelectLayer()) - model = model.transform(InferShapes()) - model = model.transform(GiveUniqueNodeNames()) - model = model.transform(GiveReadableTensorNames()) - return model - - -def step_mobilenet_slr_floorplan(model: ModelWrapper, cfg: DataflowBuildConfig): - if cfg.shell_flow_type == ShellFlowType.VITIS_ALVEO: - try: - from finnexperimental.analysis.partitioning import partition - - # apply partitioning of the model, restricting the first and last layers - # to SLR0 - default_slr = 0 - abs_anchors = [(0, [default_slr]), (-1, [default_slr])] - floorplan = partition( - model, - cfg.synth_clk_period_ns, - cfg.board, - abs_anchors=abs_anchors, - multivariant=False, - )[0] - # apply floorplan to model - model = model.transform(ApplyConfig(floorplan)) - print("SLR floorplanning applied") - except Exception: - print("No SLR floorplanning applied") - return model - - -def step_mobilenet_convert_to_hw_layers_separate_th(model: ModelWrapper, cfg: DataflowBuildConfig): - model = model.transform(to_hw.InferPool()) - model = model.transform(to_hw.InferConvInpGen()) - model = model.transform(to_hw.InferThresholdingLayer()) - model = model.transform(to_hw.InferVectorVectorActivation()) - model = model.transform(to_hw.InferQuantizedMatrixVectorActivation()) - model = model.transform(to_hw.InferChannelwiseLinearLayer()) - model = model.transform(to_hw.InferLabelSelectLayer()) - model = model.transform(InferShapes()) - model = model.transform(GiveUniqueNodeNames()) - model = model.transform(GiveReadableTensorNames()) - return model +from 
finn.builder.build_dataflow_config import DataflowBuildConfig class bench_mobilenetv1(bench): diff --git a/src/finn/benchmarking/dut/vgg10.py b/src/finn/benchmarking/dut/vgg10.py index f799759108..d34c186387 100644 --- a/src/finn/benchmarking/dut/vgg10.py +++ b/src/finn/benchmarking/dut/vgg10.py @@ -1,24 +1,6 @@ -from qonnx.core.modelwrapper import ModelWrapper from finn.builder.build_dataflow_config import DataflowBuildConfig -from qonnx.transformation.change_3d_tensors_to_4d import Change3DTo4DTensors -from qonnx.transformation.general import GiveUniqueNodeNames -import finn.transformation.fpgadataflow.convert_to_hw_layers as to_hw -import finn.transformation.streamline.absorb as absorb - from finn.benchmarking.bench_base import bench - -def step_pre_streamline(model: ModelWrapper, cfg: DataflowBuildConfig): - model = model.transform(Change3DTo4DTensors()) - model = model.transform(absorb.AbsorbScalarMulAddIntoTopK()) - return model - -def step_convert_final_layers(model: ModelWrapper, cfg: DataflowBuildConfig): - model = model.transform(to_hw.InferChannelwiseLinearLayer()) - model = model.transform(to_hw.InferLabelSelectLayer()) - model = model.transform(GiveUniqueNodeNames()) - return model - class bench_vgg10(bench): def step_build_setup(self): # create build config for VGG-10 (based on finn-examples) diff --git a/src/finn/builder/custom_step_library/__init__.py b/src/finn/builder/custom_step_library/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/src/finn/builder/custom_step_library/conv1d.py b/src/finn/builder/custom_step_library/conv1d.py new file mode 100644 index 0000000000..5545f66536 --- /dev/null +++ b/src/finn/builder/custom_step_library/conv1d.py @@ -0,0 +1,18 @@ +from qonnx.core.modelwrapper import ModelWrapper +from finn.builder.build_dataflow_config import DataflowBuildConfig +from qonnx.transformation.change_3d_tensors_to_4d import Change3DTo4DTensors +from qonnx.transformation.general import GiveUniqueNodeNames +import 
finn.transformation.fpgadataflow.convert_to_hw_layers as to_hw +import finn.transformation.streamline.absorb as absorb + + +def step_pre_streamline(model: ModelWrapper, cfg: DataflowBuildConfig): + model = model.transform(Change3DTo4DTensors()) + model = model.transform(absorb.AbsorbScalarMulAddIntoTopK()) + return model + +def step_convert_final_layers(model: ModelWrapper, cfg: DataflowBuildConfig): + model = model.transform(to_hw.InferChannelwiseLinearLayer()) + model = model.transform(to_hw.InferLabelSelectLayer()) + model = model.transform(GiveUniqueNodeNames()) + return model diff --git a/src/finn/builder/custom_step_library/mobilenet.py b/src/finn/builder/custom_step_library/mobilenet.py new file mode 100644 index 0000000000..6a2d8053b2 --- /dev/null +++ b/src/finn/builder/custom_step_library/mobilenet.py @@ -0,0 +1,119 @@ +from finn.benchmarking.bench_base import bench +from qonnx.core.modelwrapper import ModelWrapper +from qonnx.transformation.change_datalayout import ChangeDataLayoutQuantAvgPool2d +from qonnx.transformation.double_to_single_float import DoubleToSingleFloat +from qonnx.transformation.general import ( + ApplyConfig, + GiveReadableTensorNames, + GiveUniqueNodeNames, +) +from qonnx.transformation.infer_data_layouts import InferDataLayouts +from qonnx.transformation.infer_datatypes import InferDataTypes +from qonnx.transformation.infer_shapes import InferShapes +from qonnx.transformation.lower_convs_to_matmul import LowerConvsToMatMul +from qonnx.transformation.remove import RemoveIdentityOps + +import finn.transformation.fpgadataflow.convert_to_hw_layers as to_hw +import finn.transformation.streamline.absorb as absorb +import finn.transformation.streamline.reorder as reorder +from finn.builder.build_dataflow_config import ( + DataflowBuildConfig, + ShellFlowType, + VerificationStepType, +) +from finn.builder.build_dataflow_steps import verify_step +from finn.transformation.streamline import Streamline +from 
finn.transformation.streamline.collapse_repeated import CollapseRepeatedMul +from finn.transformation.streamline.round_thresholds import RoundAndClipThresholds + + +def step_mobilenet_streamline(model: ModelWrapper, cfg: DataflowBuildConfig): + model = model.transform(Streamline()) + additional_streamline_transformations = [ + DoubleToSingleFloat(), + reorder.MoveMulPastDWConv(), + absorb.AbsorbMulIntoMultiThreshold(), + ChangeDataLayoutQuantAvgPool2d(), + InferDataLayouts(), + reorder.MoveTransposePastScalarMul(), + absorb.AbsorbTransposeIntoFlatten(), + reorder.MoveFlattenPastAffine(), + reorder.MoveFlattenPastTopK(), + reorder.MoveScalarMulPastMatMul(), + CollapseRepeatedMul(), + RemoveIdentityOps(), + RoundAndClipThresholds(), + ] + for trn in additional_streamline_transformations: + model = model.transform(trn) + model = model.transform(GiveUniqueNodeNames()) + model = model.transform(GiveReadableTensorNames()) + model = model.transform(InferDataTypes()) + + if VerificationStepType.STREAMLINED_PYTHON in cfg._resolve_verification_steps(): + verify_step(model, cfg, "streamlined_python", need_parent=False) + + return model + + +def step_mobilenet_lower_convs(model: ModelWrapper, cfg: DataflowBuildConfig): + model = model.transform(LowerConvsToMatMul()) + model = model.transform(absorb.AbsorbTransposeIntoMultiThreshold()) + model = model.transform(absorb.AbsorbConsecutiveTransposes()) + model = model.transform(GiveUniqueNodeNames()) + model = model.transform(GiveReadableTensorNames()) + model = model.transform(InferDataTypes()) + model = model.transform(RoundAndClipThresholds()) + model = model.transform(InferDataLayouts()) + return model + + +def step_mobilenet_convert_to_hw_layers(model: ModelWrapper, cfg: DataflowBuildConfig): + model = model.transform(to_hw.InferPool()) + model = model.transform(to_hw.InferConvInpGen()) + model = model.transform(to_hw.InferVectorVectorActivation()) + model = model.transform(to_hw.InferQuantizedMatrixVectorActivation()) + model 
= model.transform(to_hw.InferChannelwiseLinearLayer()) + model = model.transform(to_hw.InferLabelSelectLayer()) + model = model.transform(InferShapes()) + model = model.transform(GiveUniqueNodeNames()) + model = model.transform(GiveReadableTensorNames()) + return model + + +def step_mobilenet_slr_floorplan(model: ModelWrapper, cfg: DataflowBuildConfig): + if cfg.shell_flow_type == ShellFlowType.VITIS_ALVEO: + try: + from finnexperimental.analysis.partitioning import partition + + # apply partitioning of the model, restricting the first and last layers + # to SLR0 + default_slr = 0 + abs_anchors = [(0, [default_slr]), (-1, [default_slr])] + floorplan = partition( + model, + cfg.synth_clk_period_ns, + cfg.board, + abs_anchors=abs_anchors, + multivariant=False, + )[0] + # apply floorplan to model + model = model.transform(ApplyConfig(floorplan)) + print("SLR floorplanning applied") + except Exception: + print("No SLR floorplanning applied") + return model + + +def step_mobilenet_convert_to_hw_layers_separate_th(model: ModelWrapper, cfg: DataflowBuildConfig): + model = model.transform(to_hw.InferPool()) + model = model.transform(to_hw.InferConvInpGen()) + model = model.transform(to_hw.InferThresholdingLayer()) + model = model.transform(to_hw.InferVectorVectorActivation()) + model = model.transform(to_hw.InferQuantizedMatrixVectorActivation()) + model = model.transform(to_hw.InferChannelwiseLinearLayer()) + model = model.transform(to_hw.InferLabelSelectLayer()) + model = model.transform(InferShapes()) + model = model.transform(GiveUniqueNodeNames()) + model = model.transform(GiveReadableTensorNames()) + return model \ No newline at end of file diff --git a/src/finn/benchmarking/dut/resnet50_custom_steps.py b/src/finn/builder/custom_step_library/resnet.py similarity index 100% rename from src/finn/benchmarking/dut/resnet50_custom_steps.py rename to src/finn/builder/custom_step_library/resnet.py diff --git a/src/finn/benchmarking/dut/transformer_custom_steps.py 
b/src/finn/builder/custom_step_library/transformer.py similarity index 100% rename from src/finn/benchmarking/dut/transformer_custom_steps.py rename to src/finn/builder/custom_step_library/transformer.py From cfdb04239a53227aa284b5716226650f18e68e3b Mon Sep 17 00:00:00 2001 From: Felix Jentzsch Date: Tue, 20 May 2025 17:23:35 +0200 Subject: [PATCH 099/125] Switch to YAML-based build config --- src/finn/benchmarking/bench.py | 21 ++-- src/finn/benchmarking/bench_base.py | 119 +++++++----------- src/finn/benchmarking/cfg/metafi_test.json | 14 --- src/finn/benchmarking/cfg/metafi_test.yml | 14 +++ .../benchmarking/cfg/mobilenetv1_test.json | 32 ----- .../benchmarking/cfg/mobilenetv1_test.yml | 31 +++++ .../cfg/{mvau_test.json => mvau_test.yml} | 2 +- src/finn/benchmarking/cfg/resnet50_test.json | 33 ----- src/finn/benchmarking/cfg/resnet50_test.yml | 33 +++++ ...c_fifotest.json => synthetic_fifotest.yml} | 28 +++-- ...r_gpt_all.json => transformer_gpt_all.yml} | 4 +- ...l_all.json => transformer_radioml_all.yml} | 8 +- ...ormer_sweep.json => transformer_sweep.yml} | 20 +-- ...sformer_test.json => transformer_test.yml} | 4 +- src/finn/benchmarking/cfg/vgg10_test.json | 32 ----- src/finn/benchmarking/cfg/vgg10_test.yml | 33 +++++ src/finn/benchmarking/dut/metafi.py | 61 --------- src/finn/benchmarking/dut/metafi.yml | 28 +++++ src/finn/benchmarking/dut/mobilenetv1.py | 48 ------- src/finn/benchmarking/dut/mobilenetv1.yml | 16 +++ src/finn/benchmarking/dut/resnet50.py | 42 ------- src/finn/benchmarking/dut/resnet50.yml | 19 +++ src/finn/benchmarking/dut/transformer.py | 2 +- src/finn/benchmarking/dut/vgg10.py | 35 ------ src/finn/benchmarking/dut/vgg10.yml | 23 ++++ src/finn/interface/run_finn.py | 2 +- 26 files changed, 280 insertions(+), 424 deletions(-) delete mode 100644 src/finn/benchmarking/cfg/metafi_test.json create mode 100644 src/finn/benchmarking/cfg/metafi_test.yml delete mode 100644 src/finn/benchmarking/cfg/mobilenetv1_test.json create mode 100644 
src/finn/benchmarking/cfg/mobilenetv1_test.yml rename src/finn/benchmarking/cfg/{mvau_test.json => mvau_test.yml} (75%) delete mode 100644 src/finn/benchmarking/cfg/resnet50_test.json create mode 100644 src/finn/benchmarking/cfg/resnet50_test.yml rename src/finn/benchmarking/cfg/{synthetic_fifotest.json => synthetic_fifotest.yml} (57%) rename src/finn/benchmarking/cfg/{transformer_gpt_all.json => transformer_gpt_all.yml} (72%) rename src/finn/benchmarking/cfg/{transformer_radioml_all.json => transformer_radioml_all.yml} (57%) rename src/finn/benchmarking/cfg/{transformer_sweep.json => transformer_sweep.yml} (82%) rename src/finn/benchmarking/cfg/{transformer_test.json => transformer_test.yml} (77%) delete mode 100644 src/finn/benchmarking/cfg/vgg10_test.json create mode 100644 src/finn/benchmarking/cfg/vgg10_test.yml delete mode 100644 src/finn/benchmarking/dut/metafi.py create mode 100644 src/finn/benchmarking/dut/metafi.yml delete mode 100644 src/finn/benchmarking/dut/mobilenetv1.py create mode 100644 src/finn/benchmarking/dut/mobilenetv1.yml delete mode 100644 src/finn/benchmarking/dut/resnet50.py create mode 100644 src/finn/benchmarking/dut/resnet50.yml delete mode 100644 src/finn/benchmarking/dut/vgg10.py create mode 100644 src/finn/benchmarking/dut/vgg10.yml diff --git a/src/finn/benchmarking/bench.py b/src/finn/benchmarking/bench.py index 8d87036477..745d6c62b2 100644 --- a/src/finn/benchmarking/bench.py +++ b/src/finn/benchmarking/bench.py @@ -1,30 +1,24 @@ import itertools -import sys import os import json +import yaml import time import traceback import onnxruntime as ort -import importlib from finn.benchmarking.util import delete_dir_contents +from finn.benchmarking.bench_base import bench from finn.benchmarking.dut.mvau import bench_mvau -from finn.benchmarking.dut.resnet50 import bench_resnet50 -from finn.benchmarking.dut.metafi import bench_metafi from finn.benchmarking.dut.synthetic_nonlinear import bench_synthetic_nonlinear from 
finn.benchmarking.dut.transformer import bench_transformer -from finn.benchmarking.dut.vgg10 import bench_vgg10 -from finn.benchmarking.dut.mobilenetv1 import bench_mobilenetv1 + +# Register custom bench subclasses that offer more control than YAML-based flow dut = dict() dut["mvau"] = bench_mvau -dut["resnet50"] = bench_resnet50 -dut["metafi"] = bench_metafi dut["synthetic_nonlinear"] = bench_synthetic_nonlinear dut["transformer"] = bench_transformer -dut["vgg10"] = bench_vgg10 -dut["mobilenetv1"] = bench_mobilenetv1 def start_bench_run(config_name): @@ -96,7 +90,7 @@ def get_default_session_options_new(): print("Loading config %s" % (config_path)) if os.path.exists(config_path): with open(config_path, "r") as f: - config = json.load(f) + config = yaml.load(f, Loader=yaml.SafeLoader) else: print("ERROR: config file not found") return @@ -150,8 +144,9 @@ def get_default_session_options_new(): if params["dut"] in dut: bench_object = dut[params["dut"]](params, task_id, run_id, work_dir, artifacts_dir, save_dir) else: - print("ERROR: unknown DUT specified") - return 1 + # If no custom bench subclass is defined, fall back to base class, + # expect DUT-specific YAML definition instead + bench_object = bench(params, task_id, run_id, work_dir, artifacts_dir, save_dir) else: print("ERROR: no DUT specified") return 1 diff --git a/src/finn/benchmarking/bench_base.py b/src/finn/benchmarking/bench_base.py index 16ef757389..dc1b40cee2 100644 --- a/src/finn/benchmarking/bench_base.py +++ b/src/finn/benchmarking/bench_base.py @@ -3,6 +3,7 @@ import subprocess import copy import json +import yaml import time import traceback import glob @@ -130,7 +131,14 @@ def __init__(self, params, task_id, run_id, work_dir, artifacts_dir, save_dir, d #TODO: setup a logger so output can go to console (with task id prefix) and log simultaneously #TODO: coordinate with new builder loggin setup - # General configuration + # Setup some basic global default configuration + # TODO: are these class 
members even used anymore? + if "synth_clk_period_ns" in params: + self.clock_period_ns = params["synth_clk_period_ns"] + else: + self.clock_period_ns = 10 + self.params["synth_clk_period_ns"] = self.clock_period_ns + # TODO: do not allow multiple targets in a single bench job due to measurement? if "board" in params: self.board = params["board"] @@ -144,12 +152,12 @@ def __init__(self, params, task_id, run_id, work_dir, artifacts_dir, save_dir, d self.part = part_map[self.board] else: raise Exception("No part specified for board %s" % self.board) - - if "clock_period_ns" in params: - self.clock_period_ns = params["clock_period_ns"] + + if self.board in alveo_part_map: + self.params["shell_flow_type"] = build_cfg.ShellFlowType.VITIS_ALVEO + self.params["vitis_platform"] = alveo_default_platform[self.board] else: - self.clock_period_ns = 10 - self.params["clock_period_ns"] = self.clock_period_ns + self.params["shell_flow_type"] = build_cfg.ShellFlowType.VIVADO_ZYNQ # Clear FINN tmp build dir before every run (to avoid excessive ramdisk usage and duplicate debug artifacts) print("Clearing FINN BUILD DIR ahead of run") @@ -214,14 +222,20 @@ def save_local_artifacts_collection(self): for (name, source_path, archive) in self.local_artifacts_collection: target_path = os.path.join(self.save_dir, name, "run_%d" % (self.run_id)) self.save_artifact(target_path, source_path, archive) - + # must be defined by subclass def step_export_onnx(self): pass - # must be defined by subclass + # can be overwritten by subclass if setup is too complex for YAML definition def step_build_setup(self): - pass + dut_yaml_name = self.params["dut"] + ".yml" + dut_path = os.path.join(os.path.dirname(__file__), "dut", dut_yaml_name) + if os.path.isfile(dut_path): + with open(dut_path, "r") as f: + return DataflowBuildConfig.from_yaml(f) + else: + raise Exception("No DUT-specific YAML build definition found") # defaults to normal build flow, may be overwritten by subclass def run(self): @@ -381,6 
+395,13 @@ def step_parse_builder_output(self, build_dir): def steps_full_build_flow(self): # Default step sequence for benchmarking a full FINN builder flow + ### LIST OF ADDITIONAL YAML OPTIONS (beyond DataflowBuildConfig) + custom_params = [ + "model_dir", # used to setup onnx/npy input + "model_path", # used to setup onnx/npy input + # model-gen parameters, such as seed, simd, pe, etc. (TODO: separate from builder options) + ] + ### MODEL CREATION/IMPORT ### # TODO: track fixed input onnx models with DVC if "model_dir" in self.params: @@ -398,26 +419,12 @@ def steps_full_build_flow(self): # microbenchmarks might skip because no valid model can be generated for given params return "skipped" - if "folding_path" in self.params: - self.build_inputs["folding_path"] = self.params["folding_path"] - if "specialize_path" in self.params: - self.build_inputs["specialize_path"] = self.params["specialize_path"] - if "floorplan_path" in self.params: - self.build_inputs["floorplan_path"] = self.params["floorplan_path"] - ### BUILD SETUP ### - # TODO: convert to YAML-based builder config - # TODO: split up into default config, dut-specific config, and run-specific config + # Initialize from YAML (default) or custom script (if dedicated subclass is defined) cfg = self.step_build_setup() - cfg.generate_outputs = self.params["output_products"] + + # Set some global defaults (could still be overwritten by run-specific YAML) cfg.output_dir = self.build_inputs["build_dir"] - cfg.synth_clk_period_ns = self.clock_period_ns - cfg.board = self.board - if self.board in alveo_part_map: - cfg.shell_flow_type=build_cfg.ShellFlowType.VITIS_ALVEO - cfg.vitis_platform=alveo_default_platform[self.board] - else: - cfg.shell_flow_type=build_cfg.ShellFlowType.VIVADO_ZYNQ # enable extra performance optimizations (physopt) # TODO: check OMX synth strategy again! 
cfg.vitis_opt_strategy=build_cfg.VitisOptStrategy.PERFORMANCE_BEST @@ -427,61 +434,21 @@ def steps_full_build_flow(self): cfg.force_python_rtlsim = False cfg.split_large_fifos = True cfg.save_intermediate_models = True # Save the intermediate model graphs - cfg.verify_save_full_context = True, # Output full context dump for verification steps + cfg.verify_save_full_context = True # Output full context dump for verification steps + cfg.enable_instrumentation = True #rtlsim_use_vivado_comps # TODO ? #cfg.default_swg_exception #cfg.large_fifo_mem_style - # Switch between instrumentation or IODMA wrapper (TODO: combine both in one bitstream) - if "enable_instrumentation" in self.params: - cfg.enable_instrumentation = self.params["enable_instrumentation"] - else: - cfg.enable_instrumentation = True - - # "manual or "characterize" or "largefifo_rtlsim" or "live" - if "fifo_method" in self.params: - if self.params["fifo_method"] == "manual": - cfg.auto_fifo_depths = False - elif self.params["fifo_method"] == "live": - cfg.auto_fifo_depths = False - cfg.live_fifo_sizing = True - cfg.enable_instrumentation = True - cfg.synth_clk_period_ns = 10 # force conservative 100 MHz clock + # Overwrite build config settings with run-specific YAML build definition + for key in self.params: + if hasattr(cfg, key): + setattr(cfg, key, self.params[key]) else: - cfg.auto_fifo_depths = True - cfg.auto_fifo_strategy = self.params["fifo_method"] - # only relevant for "characterize" method: "rtlsim" or "analytical" - if "fifo_strategy" in self.params: - cfg.characteristic_function_strategy = self.params["fifo_strategy"] - - # Batch size used for RTLSim performance measurement (and in-depth FIFO test here) - # TODO: determine automatically or replace by exact instr wrapper sim - if "rtlsim_n" in self.params: - cfg.rtlsim_batch_size=self.params["rtlsim_n"] - - # Batch size used for FIFO sizing (largefifo_rtlsim only) - if "fifo_rtlsim_n" in self.params: - 
cfg.fifosim_n_inferences=self.params["fifo_rtlsim_n"] - - # Manual correction factor for FIFO-Sim input throttling - if "fifo_throttle_factor" in self.params: - cfg.fifo_throttle_factor = self.params["fifo_throttle_factor"] - - if "folding_path" in self.build_inputs: - cfg.folding_config_file = self.build_inputs["folding_path"] - if "specialize_path" in self.build_inputs: - cfg.specialize_layers_config_file = self.build_inputs["specialize_path"] - if "floorplan_path" in self.build_inputs: - cfg.floorplan_path = self.build_inputs["floorplan_path"] - - if "target_fps" in self.params: - if self.params["target_fps"] == "None": - cfg.target_fps = None - else: - cfg.target_fps = self.params["target_fps"] - - if "validation_dataset" in self.params: - cfg.validation_dataset = self.params["validation_dataset"] + if key not in custom_params: + pass + #TODO: be more strict? support custom extra options like MetaFi uses? + #raise Exception("Unrecognized builder config defined in YAML: %s" % key) # Default of 1M cycles is insufficient for MetaFi (6M) and RN-50 (2.5M) # TODO: make configurable or set on pipeline level? 
diff --git a/src/finn/benchmarking/cfg/metafi_test.json b/src/finn/benchmarking/cfg/metafi_test.json deleted file mode 100644 index bc10f857c3..0000000000 --- a/src/finn/benchmarking/cfg/metafi_test.json +++ /dev/null @@ -1,14 +0,0 @@ -[ - { - "dut": ["metafi"], - "model_path": ["/pc2/groups/hpc-prf-ekiapp/felix/ci_models/metafi/model.onnx"], - "folding_path": ["/pc2/groups/hpc-prf-ekiapp/felix/ci_models/metafi/auto_folding_config_metaFi_f25.json"], - - "board": ["RFSoC2x2"], - "clock_period_ns": [10], - - "fifo_method": ["live"], - - "output_products": [["bitfile", "pynq_driver", "deployment_package"]] - } - ] \ No newline at end of file diff --git a/src/finn/benchmarking/cfg/metafi_test.yml b/src/finn/benchmarking/cfg/metafi_test.yml new file mode 100644 index 0000000000..711250bbdb --- /dev/null +++ b/src/finn/benchmarking/cfg/metafi_test.yml @@ -0,0 +1,14 @@ +[ + { + "dut": ["metafi"], + "model_path": ["/pc2/groups/hpc-prf-ekiapp/felix/ci_models/metafi/model.onnx"], + "folding_config_file": ["/pc2/groups/hpc-prf-ekiapp/felix/ci_models/metafi/auto_folding_config_metaFi_f25.json"], + + "board": ["RFSoC2x2"], + "synth_clk_period_ns": [10], + + "live_fifo_sizing": [True], + + "generate_outputs": [["bitfile", "pynq_driver", "deployment_package"]] + } + ] \ No newline at end of file diff --git a/src/finn/benchmarking/cfg/mobilenetv1_test.json b/src/finn/benchmarking/cfg/mobilenetv1_test.json deleted file mode 100644 index d080638722..0000000000 --- a/src/finn/benchmarking/cfg/mobilenetv1_test.json +++ /dev/null @@ -1,32 +0,0 @@ -[ - { - "dut": ["mobilenetv1"], - - "model_path": ["/pc2/groups/hpc-prf-ekiapp/felix/ci_models/MobileNetV1/mobilenetv1-w4a4_pre_post_tidy_opset-11.onnx"], - "folding_path": ["/pc2/groups/hpc-prf-ekiapp/felix/ci_models/MobileNetV1/ZCU102_folding_config.json"], - "specialize_path": ["/pc2/groups/hpc-prf-ekiapp/felix/ci_models/MobileNetV1/ZCU102_specialize_layers.json"], - - "board": ["RFSoC2x2"], - "clock_period_ns": [10], - - "fifo_method": 
["manual"], - - "rtlsim_n": [5], - "output_products": [["estimate_reports", "rtlsim_performance", "stitched_ip", "out_of_context_synth", "bitfile", "pynq_driver", "deployment_package"]] - }, - { - "dut": ["mobilenetv1"], - - "model_path": ["/pc2/groups/hpc-prf-ekiapp/felix/ci_models/MobileNetV1/mobilenetv1-w4a4_pre_post_tidy_opset-11.onnx"], - "folding_path": ["/pc2/groups/hpc-prf-ekiapp/felix/ci_models/MobileNetV1/ZCU102_folding_config.json"], - "specialize_path": ["/pc2/groups/hpc-prf-ekiapp/felix/ci_models/MobileNetV1/ZCU102_specialize_layers.json"], - - "board": ["RFSoC2x2"], - "clock_period_ns": [10], - - "fifo_method": ["live"], - - "rtlsim_n": [5], - "output_products": [["bitfile", "pynq_driver", "deployment_package"]] - } -] \ No newline at end of file diff --git a/src/finn/benchmarking/cfg/mobilenetv1_test.yml b/src/finn/benchmarking/cfg/mobilenetv1_test.yml new file mode 100644 index 0000000000..040fa380e4 --- /dev/null +++ b/src/finn/benchmarking/cfg/mobilenetv1_test.yml @@ -0,0 +1,31 @@ +[ + { + "dut": ["mobilenetv1"], + + "model_path": ["/pc2/groups/hpc-prf-ekiapp/felix/ci_models/MobileNetV1/mobilenetv1-w4a4_pre_post_tidy_opset-11.onnx"], + "folding_config_file": ["/pc2/groups/hpc-prf-ekiapp/felix/ci_models/MobileNetV1/ZCU102_folding_config.json"], + "specialize_layers_config_file": ["/pc2/groups/hpc-prf-ekiapp/felix/ci_models/MobileNetV1/ZCU102_specialize_layers.json"], + + "board": ["RFSoC2x2"], + "synth_clk_period_ns": [10], + + "auto_fifo_depths": [False], + + "rtlsim_batch_sizauto_fifo_depths": [["estimate_reports", "rtlsim_performance", "stitched_ip", "out_of_context_synth", "bitfile", "pynq_driver", "deployment_package"]] + }, + { + "dut": ["mobilenetv1"], + + "model_path": ["/pc2/groups/hpc-prf-ekiapp/felix/ci_models/MobileNetV1/mobilenetv1-w4a4_pre_post_tidy_opset-11.onnx"], + "folding_config_file": ["/pc2/groups/hpc-prf-ekiapp/felix/ci_models/MobileNetV1/ZCU102_folding_config.json"], + "specialize_layers_config_file": 
["/pc2/groups/hpc-prf-ekiapp/felix/ci_models/MobileNetV1/ZCU102_specialize_layers.json"], + + "board": ["RFSoC2x2"], + "synth_clk_period_ns": [10], + + "live_fifo_sizing": [True], + + "rtlsim_batch_size": [5], + "generate_outputs": [["bitfile", "pynq_driver", "deployment_package"]] + } +] \ No newline at end of file diff --git a/src/finn/benchmarking/cfg/mvau_test.json b/src/finn/benchmarking/cfg/mvau_test.yml similarity index 75% rename from src/finn/benchmarking/cfg/mvau_test.json rename to src/finn/benchmarking/cfg/mvau_test.yml index c42b16782c..7e0b3d14d2 100644 --- a/src/finn/benchmarking/cfg/mvau_test.json +++ b/src/finn/benchmarking/cfg/mvau_test.yml @@ -21,6 +21,6 @@ "dut_duplication": [1], - "output_products": [["estimate_reports", "stitched_ip", "rtlsim_performance", "out_of_context_synth", "bitfile", "pynq_driver", "deployment_package"]] + "generate_outputs": [["estimate_reports", "stitched_ip", "rtlsim_performance", "out_of_context_synth", "bitfile", "pynq_driver", "deployment_package"]] } ] diff --git a/src/finn/benchmarking/cfg/resnet50_test.json b/src/finn/benchmarking/cfg/resnet50_test.json deleted file mode 100644 index 06a96729ab..0000000000 --- a/src/finn/benchmarking/cfg/resnet50_test.json +++ /dev/null @@ -1,33 +0,0 @@ -[ - { - "dut": ["resnet50"], - - "model_path": ["/pc2/groups/hpc-prf-ekiapp/felix/ci_models/ResNet-50/resnet50_w1a2_exported.onnx"], - "folding_path": ["/pc2/groups/hpc-prf-ekiapp/felix/ci_models/ResNet-50/U250_folding_config.json"], - "specialize_path": ["/pc2/groups/hpc-prf-ekiapp/felix/ci_models/ResNet-50/U250_specialize_layers.json"], - "floorplan_path": ["/pc2/groups/hpc-prf-ekiapp/felix/ci_models/ResNet-50/floorplan_resnet50.json"], - - "board": ["U250"], - "clock_period_ns": [4], - - "fifo_method": ["manual"], - - "rtlsim_n": [5], - "output_products": [["estimate_reports", "rtlsim_performance", "stitched_ip", "out_of_context_synth"]] - }, - { - "dut": ["resnet50"], - - "model_path": 
["/pc2/groups/hpc-prf-ekiapp/felix/ci_models/ResNet-50/resnet50_w1a2_exported.onnx"], - "folding_path": ["/pc2/groups/hpc-prf-ekiapp/felix/ci_models/ResNet-50/U250_folding_config.json"], - "specialize_path": ["/pc2/groups/hpc-prf-ekiapp/felix/ci_models/ResNet-50/U250_specialize_layers.json"], - "floorplan_path": ["/pc2/groups/hpc-prf-ekiapp/felix/ci_models/ResNet-50/floorplan_resnet50.json"], - - "board": ["RFSoC2x2"], - "clock_period_ns": [10], - - "fifo_method": ["live"], - - "output_products": [["bitfile", "pynq_driver", "deployment_package"]] - } - ] \ No newline at end of file diff --git a/src/finn/benchmarking/cfg/resnet50_test.yml b/src/finn/benchmarking/cfg/resnet50_test.yml new file mode 100644 index 0000000000..e3acf9fa7d --- /dev/null +++ b/src/finn/benchmarking/cfg/resnet50_test.yml @@ -0,0 +1,33 @@ +[ + { + "dut": ["resnet50"], + + "model_path": ["/pc2/groups/hpc-prf-ekiapp/felix/ci_models/ResNet-50/resnet50_w1a2_exported.onnx"], + "folding_config_file": ["/pc2/groups/hpc-prf-ekiapp/felix/ci_models/ResNet-50/U250_folding_config.json"], + "specialize_layers_config_file": ["/pc2/groups/hpc-prf-ekiapp/felix/ci_models/ResNet-50/U250_specialize_layers.json"], + "floorplan_path": ["/pc2/groups/hpc-prf-ekiapp/felix/ci_models/ResNet-50/floorplan_resnet50.json"], + + "board": ["U250"], + "synth_clk_period_ns": [4], + + "auto_fifo_depths": [False], + + "rtlsim_batch_size": [5], + "generate_outputs": [["estimate_reports", "rtlsim_performance", "stitched_ip", "out_of_context_synth"]] + }, + { + "dut": ["resnet50"], + + "model_path": ["/pc2/groups/hpc-prf-ekiapp/felix/ci_models/ResNet-50/resnet50_w1a2_exported.onnx"], + "folding_config_file": ["/pc2/groups/hpc-prf-ekiapp/felix/ci_models/ResNet-50/U250_folding_config.json"], + "specialize_layers_config_file": ["/pc2/groups/hpc-prf-ekiapp/felix/ci_models/ResNet-50/U250_specialize_layers.json"], + "floorplan_path": ["/pc2/groups/hpc-prf-ekiapp/felix/ci_models/ResNet-50/floorplan_resnet50.json"], + + "board": 
["RFSoC2x2"], + "synth_clk_period_ns": [10], + + "live_fifo_sizing": [True], + + "generate_outputs": [["bitfile", "pynq_driver", "deployment_package"]] + } + ] \ No newline at end of file diff --git a/src/finn/benchmarking/cfg/synthetic_fifotest.json b/src/finn/benchmarking/cfg/synthetic_fifotest.yml similarity index 57% rename from src/finn/benchmarking/cfg/synthetic_fifotest.json rename to src/finn/benchmarking/cfg/synthetic_fifotest.yml index 7e362200af..58a49d108d 100644 --- a/src/finn/benchmarking/cfg/synthetic_fifotest.json +++ b/src/finn/benchmarking/cfg/synthetic_fifotest.yml @@ -12,11 +12,11 @@ "rb_num_layers": [4], "board": ["RFSoC2x2"], - "clock_period_ns": [10], + "synth_clk_period_ns": [10], "rtlsim_n": [5], - "fifo_method": ["live"], + "live_fifo_sizing": [True], "output_products": [["bitfile", "pynq_driver", "deployment_package"]] }, { @@ -32,13 +32,15 @@ "rb_num_layers": [4], "board": ["RFSoC2x2"], - "clock_period_ns": [10], + "synth_clk_period_ns": [10], - "rtlsim_n": [5], + "rtlsim_batch_size": [5], + + "auto_fifo_depths": [True], + "auto_fifo_strategy": ["characterize"], + "characteristic_function_strategy": ["analytical", "rtlsim"], - "fifo_method": ["characterize"], - "fifo_strategy": ["analytical", "rtlsim"], - "output_products": [["stitched_ip", "rtlsim_performance", "bitfile", "pynq_driver", "deployment_package"]] + "generate_outputs": [["stitched_ip", "rtlsim_performance", "bitfile", "pynq_driver", "deployment_package"]] }, { "dut": ["synthetic_nonlinear"], @@ -53,12 +55,14 @@ "rb_num_layers": [4], "board": ["RFSoC2x2"], - "clock_period_ns": [10], + "synth_clk_period_ns": [10], - "rtlsim_n": [5], + "rtlsim_batch_size": [5], + + "auto_fifo_depths": [True], + "auto_fifo_strategy": ["largefifo_rtlsim"], - "fifo_method": ["largefifo_rtlsim"], - "fifo_rtlsim_n": [2], - "output_products": [["stitched_ip", "rtlsim_performance", "bitfile", "pynq_driver", "deployment_package"]] + "fifosim_n_inferences": [2], + "generate_outputs": [["stitched_ip", 
"rtlsim_performance", "bitfile", "pynq_driver", "deployment_package"]] } ] \ No newline at end of file diff --git a/src/finn/benchmarking/cfg/transformer_gpt_all.json b/src/finn/benchmarking/cfg/transformer_gpt_all.yml similarity index 72% rename from src/finn/benchmarking/cfg/transformer_gpt_all.json rename to src/finn/benchmarking/cfg/transformer_gpt_all.yml index b0b70fb0aa..e0610c3d7e 100644 --- a/src/finn/benchmarking/cfg/transformer_gpt_all.json +++ b/src/finn/benchmarking/cfg/transformer_gpt_all.yml @@ -5,8 +5,8 @@ "model_dir": ["/pc2/groups/hpc-prf-ekiapp/felix/ci_models/gpt_a", "/pc2/groups/hpc-prf-ekiapp/felix/ci_models/gpt_b", "/pc2/groups/hpc-prf-ekiapp/felix/ci_models/gpt_c", "/pc2/groups/hpc-prf-ekiapp/felix/ci_models/gpt_d"], "board": ["U280"], - "clock_period_ns": [10], + "synth_clk_period_ns": [10], - "output_products": [["estimate_reports", "stitched_ip", "out_of_context_synth"]] + "generate_outputs": [["estimate_reports", "stitched_ip", "out_of_context_synth"]] } ] diff --git a/src/finn/benchmarking/cfg/transformer_radioml_all.json b/src/finn/benchmarking/cfg/transformer_radioml_all.yml similarity index 57% rename from src/finn/benchmarking/cfg/transformer_radioml_all.json rename to src/finn/benchmarking/cfg/transformer_radioml_all.yml index 5eeea031b2..dede0988c8 100644 --- a/src/finn/benchmarking/cfg/transformer_radioml_all.json +++ b/src/finn/benchmarking/cfg/transformer_radioml_all.yml @@ -5,9 +5,9 @@ "model_dir": ["/pc2/groups/hpc-prf-ekiapp/felix/ci_models/radioml_0"], "board": ["RFSoC2x2"], - "clock_period_ns": [10], + "synth_clk_period_ns": [10], - "output_products": [["estimate_reports", "bitfile", "pynq_driver", "deployment_package"]] + "generate_outputs": [["estimate_reports", "bitfile", "pynq_driver", "deployment_package"]] }, { "dut": ["transformer"], @@ -15,8 +15,8 @@ "model_dir": ["/pc2/groups/hpc-prf-ekiapp/felix/ci_models/radioml_convformer"], "board": ["RFSoC2x2"], - "clock_period_ns": [10], + "synth_clk_period_ns": [10], - 
"output_products": [["estimate_reports", "bitfile", "pynq_driver", "deployment_package"]] + "generate_outputs": [["estimate_reports", "bitfile", "pynq_driver", "deployment_package"]] } ] \ No newline at end of file diff --git a/src/finn/benchmarking/cfg/transformer_sweep.json b/src/finn/benchmarking/cfg/transformer_sweep.yml similarity index 82% rename from src/finn/benchmarking/cfg/transformer_sweep.json rename to src/finn/benchmarking/cfg/transformer_sweep.yml index e1795ff3f8..7fa9420d01 100644 --- a/src/finn/benchmarking/cfg/transformer_sweep.json +++ b/src/finn/benchmarking/cfg/transformer_sweep.yml @@ -14,9 +14,7 @@ "model_bits": [2], "model_norm": ["none"], "model_mask": ["none"], - "model_positional_encoding": ["binary"], - - "dut_duplication": [1] + "model_positional_encoding": ["binary"] }, { "dut": ["transformer"], @@ -33,9 +31,7 @@ "model_bits": [2], "model_norm": ["none"], "model_mask": ["none"], - "model_positional_encoding": ["binary"], - - "dut_duplication": [1] + "model_positional_encoding": ["binary"] }, { "dut": ["transformer"], @@ -52,9 +48,7 @@ "model_bits": [2], "model_norm": ["none"], "model_mask": ["none"], - "model_positional_encoding": ["binary"], - - "dut_duplication": [1] + "model_positional_encoding": ["binary"] }, { "dut": ["transformer"], @@ -71,9 +65,7 @@ "model_bits": [2], "model_norm": ["none"], "model_mask": ["none"], - "model_positional_encoding": ["binary"], - - "dut_duplication": [1] + "model_positional_encoding": ["binary"] }, { "dut": ["transformer"], @@ -90,8 +82,6 @@ "model_bits": [2, 4, 6, 8], "model_norm": ["none"], "model_mask": ["none"], - "model_positional_encoding": ["binary"], - - "dut_duplication": [1] + "model_positional_encoding": ["binary"] } ] diff --git a/src/finn/benchmarking/cfg/transformer_test.json b/src/finn/benchmarking/cfg/transformer_test.yml similarity index 77% rename from src/finn/benchmarking/cfg/transformer_test.json rename to src/finn/benchmarking/cfg/transformer_test.yml index 
e0fcbc160d..a529981fdc 100644 --- a/src/finn/benchmarking/cfg/transformer_test.json +++ b/src/finn/benchmarking/cfg/transformer_test.yml @@ -17,8 +17,8 @@ "model_positional_encoding": ["binary"], "board": ["RFSoC2x2"], - "clock_period_ns": [10], + "synth_clk_period_ns": [10], - "output_products": [["estimate_reports", "bitfile", "pynq_driver", "deployment_package"]] + "generate_outputs": [["estimate_reports", "bitfile", "pynq_driver", "deployment_package"]] } ] diff --git a/src/finn/benchmarking/cfg/vgg10_test.json b/src/finn/benchmarking/cfg/vgg10_test.json deleted file mode 100644 index 7a6e1a5deb..0000000000 --- a/src/finn/benchmarking/cfg/vgg10_test.json +++ /dev/null @@ -1,32 +0,0 @@ -[ - { - "dut": ["vgg10"], - - "model_path": ["/pc2/groups/hpc-prf-ekiapp/felix/ci_models/VGG-10/radioml_w4a4_small_tidy.onnx"], - "folding_path": ["/pc2/groups/hpc-prf-ekiapp/felix/ci_models/VGG-10/ZCU104_folding_config.json"], - "specialize_path": ["/pc2/groups/hpc-prf-ekiapp/felix/ci_models/VGG-10/ZCU104_specialize_layers.json"], - - "board": ["RFSoC2x2"], - "clock_period_ns": [10], - - "fifo_method": ["largefifo_rtlsim"], - - "rtlsim_n": [5], - "output_products": [["estimate_reports", "rtlsim_performance", "stitched_ip", "out_of_context_synth", "bitfile", "pynq_driver", "deployment_package"]] - }, - { - "dut": ["vgg10"], - - "model_path": ["/pc2/groups/hpc-prf-ekiapp/felix/ci_models/VGG-10/radioml_w4a4_small_tidy.onnx"], - "folding_path": ["/pc2/groups/hpc-prf-ekiapp/felix/ci_models/VGG-10/ZCU104_folding_config.json"], - "specialize_path": ["/pc2/groups/hpc-prf-ekiapp/felix/ci_models/VGG-10/ZCU104_specialize_layers.json"], - - "board": ["RFSoC2x2"], - "clock_period_ns": [10], - - "fifo_method": ["live"], - - "rtlsim_n": [5], - "output_products": [["bitfile", "pynq_driver", "deployment_package"]] - } -] \ No newline at end of file diff --git a/src/finn/benchmarking/cfg/vgg10_test.yml b/src/finn/benchmarking/cfg/vgg10_test.yml new file mode 100644 index 0000000000..e16122b130 
--- /dev/null +++ b/src/finn/benchmarking/cfg/vgg10_test.yml @@ -0,0 +1,33 @@ +[ + { + "dut": ["vgg10"], + + "model_path": ["/pc2/groups/hpc-prf-ekiapp/felix/ci_models/VGG-10/radioml_w4a4_small_tidy.onnx"], + "folding_config_file": ["/pc2/groups/hpc-prf-ekiapp/felix/ci_models/VGG-10/ZCU104_folding_config.json"], + "specialize_layers_config_file": ["/pc2/groups/hpc-prf-ekiapp/felix/ci_models/VGG-10/ZCU104_specialize_layers.json"], + + "board": ["RFSoC2x2"], + "synth_clk_period_ns": [10], + + "auto_fifo_depths": [True], + "auto_fifo_strategy": ["largefifo_rtlsim"], + + "rtlsim_batch_size": [5], + "generate_outputs": [["estimate_reports", "rtlsim_performance", "stitched_ip", "out_of_context_synth", "bitfile", "pynq_driver", "deployment_package"]] + }, + { + "dut": ["vgg10"], + + "model_path": ["/pc2/groups/hpc-prf-ekiapp/felix/ci_models/VGG-10/radioml_w4a4_small_tidy.onnx"], + "folding_config_file": ["/pc2/groups/hpc-prf-ekiapp/felix/ci_models/VGG-10/ZCU104_folding_config.json"], + "specialize_layers_config_file": ["/pc2/groups/hpc-prf-ekiapp/felix/ci_models/VGG-10/ZCU104_specialize_layers.json"], + + "board": ["RFSoC2x2"], + "synth_clk_period_ns": [10], + + "live_fifo_sizing": [True], + + "rtlsim_batch_size": [5], + "generate_outputs": [["bitfile", "pynq_driver", "deployment_package"]] + } +] \ No newline at end of file diff --git a/src/finn/benchmarking/dut/metafi.py b/src/finn/benchmarking/dut/metafi.py deleted file mode 100644 index 05c75eee08..0000000000 --- a/src/finn/benchmarking/dut/metafi.py +++ /dev/null @@ -1,61 +0,0 @@ -import finn.builder.build_dataflow_config as build_cfg - -from finn.benchmarking.bench_base import bench - -# # custom steps -# from custom_steps import ( -# step_extract_absorb_bias, -# step_pre_streamline, -# step_residual_convert_to_hw, -# step_residual_streamline, -# step_residual_tidy, -# step_residual_topo, -# step_set_preferred_impl_style, -# step_convert_final_layers -# ) - -class bench_metafi(bench): - def step_build_setup(self): - 
# create build config for MetaFi models - - steps = [ - # step_residual_tidy, - # step_extract_absorb_bias, - # step_residual_topo, - # step_pre_streamline, - # step_residual_streamline, - # step_residual_convert_to_hw, - "step_create_dataflow_partition", - # step_set_preferred_impl_style, - "step_specialize_layers", - "step_target_fps_parallelization", - "step_apply_folding_config", - "step_minimize_bit_width", - "step_generate_estimate_reports", - "step_set_fifo_depths", - "step_hw_codegen", - "step_hw_ipgen", - "step_create_stitched_ip", - "step_measure_rtlsim_performance", - "step_out_of_context_synthesis", - "step_synthesize_bitfile", - "step_make_driver", - "step_deployment_package", - ] - - cfg = build_cfg.DataflowBuildConfig( - steps=steps, - target_fps=None, #23 - # folding_config_file=folding_config_file, - # folding_config_file="/home/rz/project/finn-examples/build/vgg10-radioml/folding_config/auto_folding_config.json", - # specialize_layers_config_file = "output_%s_%s" % (model_name, release_platform_name) + "/template_specialize_layers_config.json", - # specialize_layers_config_file = "/home/rz/project/finn-examples/build/vgg10-radioml/specialize_layers_config/template_specialize_layers_config.json", - - #large_fifo_mem_style=build_cfg.LargeFIFOMemStyle.AUTO, - # standalone_thresholds=True, - ) - - # where is this used and why? 
- cfg.use_conv_rtl = True, # use rtl for conv layers (MVAU cannot use rtl in our model) - - return cfg \ No newline at end of file diff --git a/src/finn/benchmarking/dut/metafi.yml b/src/finn/benchmarking/dut/metafi.yml new file mode 100644 index 0000000000..d3ea2c69ff --- /dev/null +++ b/src/finn/benchmarking/dut/metafi.yml @@ -0,0 +1,28 @@ +steps: + - # step_residual_tidy + - # step_extract_absorb_bias + - # step_residual_topo + - # step_pre_streamline + - # step_residual_streamline + - # step_residual_convert_to_hw + - step_create_dataflow_partition + - # step_set_preferred_impl_style + - step_specialize_layers + - step_target_fps_parallelization + - step_apply_folding_config + - step_minimize_bit_width + - step_generate_estimate_reports + - step_set_fifo_depths + - step_hw_codegen + - step_hw_ipgen + - step_create_stitched_ip + - step_measure_rtlsim_performance + - step_out_of_context_synthesis + - step_synthesize_bitfile + - step_make_driver + - step_deployment_package + +target_fps: null # 23 + +#TODO: where is this used and why? 
+use_conv_rtl: True # use rtl for conv layers (MVAU cannot use rtl in our model) diff --git a/src/finn/benchmarking/dut/mobilenetv1.py b/src/finn/benchmarking/dut/mobilenetv1.py deleted file mode 100644 index efcfb7b521..0000000000 --- a/src/finn/benchmarking/dut/mobilenetv1.py +++ /dev/null @@ -1,48 +0,0 @@ -from finn.benchmarking.bench_base import bench -from finn.builder.build_dataflow_config import DataflowBuildConfig - - -class bench_mobilenetv1(bench): - def step_build_setup(self): - # create build config for MobileNetV1 (based on finn-examples) - mobilenet_build_steps = [ - step_mobilenet_streamline, - step_mobilenet_lower_convs, - step_mobilenet_convert_to_hw_layers_separate_th, - "step_create_dataflow_partition", - "step_specialize_layers", - "step_apply_folding_config", - "step_minimize_bit_width", - "step_generate_estimate_reports", - "step_set_fifo_depths", - "step_hw_codegen", - "step_hw_ipgen", - "step_create_stitched_ip", - "step_synthesize_bitfile", - "step_make_driver", - "step_deployment_package", - ] - # mobilenet_build_steps_alveo = [ - # step_mobilenet_streamline, - # step_mobilenet_lower_convs, - # step_mobilenet_convert_to_hw_layers, - # "step_create_dataflow_partition", - # "step_specialize_layers", - # "step_apply_folding_config", - # "step_minimize_bit_width", - # "step_generate_estimate_reports", - # "step_hw_codegen", - # "step_hw_ipgen", - # "step_set_fifo_depths", - # "step_create_stitched_ip", - # step_mobilenet_slr_floorplan, - # "step_synthesize_bitfile", - # "step_make_pynq_driver", - # "step_deployment_package", - # ] - - cfg = DataflowBuildConfig( - steps=mobilenet_build_steps, - ) - - return cfg diff --git a/src/finn/benchmarking/dut/mobilenetv1.yml b/src/finn/benchmarking/dut/mobilenetv1.yml new file mode 100644 index 0000000000..71a80c4f2a --- /dev/null +++ b/src/finn/benchmarking/dut/mobilenetv1.yml @@ -0,0 +1,16 @@ +steps: + - finn.builder.custom_step_library.mobilenet.step_mobilenet_streamline # Custom step + - 
finn.builder.custom_step_library.mobilenet.step_mobilenet_lower_convs # Custom step + - finn.builder.custom_step_library.mobilenet.step_mobilenet_convert_to_hw_layers_separate_th # Custom step + - step_create_dataflow_partition + - step_specialize_layers + - step_apply_folding_config + - step_minimize_bit_width + - step_generate_estimate_reports + - step_set_fifo_depths + - step_hw_codegen + - step_hw_ipgen + - step_create_stitched_ip + - step_synthesize_bitfile + - step_make_driver + - step_deployment_package diff --git a/src/finn/benchmarking/dut/resnet50.py b/src/finn/benchmarking/dut/resnet50.py deleted file mode 100644 index efcd0de275..0000000000 --- a/src/finn/benchmarking/dut/resnet50.py +++ /dev/null @@ -1,42 +0,0 @@ -import finn.builder.build_dataflow_config as build_cfg -from finn.util.basic import alveo_default_platform - -from finn.benchmarking.dut.resnet50_custom_steps import ( - step_resnet50_tidy, - step_resnet50_streamline, - step_resnet50_convert_to_hw, - step_resnet50_slr_floorplan, - ) - -from finn.benchmarking.bench_base import bench - -class bench_resnet50(bench): - def step_build_setup(self): - # create build config for ResNet-50 (based on finn-examples) - - resnet50_build_steps = [ - step_resnet50_tidy, - step_resnet50_streamline, - step_resnet50_convert_to_hw, - "step_create_dataflow_partition", - "step_specialize_layers", - "step_apply_folding_config", - "step_minimize_bit_width", - "step_generate_estimate_reports", - "step_set_fifo_depths", - "step_hw_codegen", - "step_hw_ipgen", - step_resnet50_slr_floorplan, - "step_create_stitched_ip", # was not in finn-examples - "step_measure_rtlsim_performance", # was not in finn-examples - "step_out_of_context_synthesis", # was not in finn-examples - "step_synthesize_bitfile", - "step_make_driver", - "step_deployment_package", - ] - - cfg = build_cfg.DataflowBuildConfig( - steps=resnet50_build_steps, - ) - - return cfg \ No newline at end of file diff --git a/src/finn/benchmarking/dut/resnet50.yml 
b/src/finn/benchmarking/dut/resnet50.yml new file mode 100644 index 0000000000..6d6d4bcc31 --- /dev/null +++ b/src/finn/benchmarking/dut/resnet50.yml @@ -0,0 +1,19 @@ +steps: + - finn.builder.custom_step_library.resnet.step_resnet50_tidy # Custom step + - finn.builder.custom_step_library.resnet.step_resnet50_streamline # Custom step + - finn.builder.custom_step_library.resnet.step_resnet50_convert_to_hw # Custom step + - step_create_dataflow_partition + - step_specialize_layers + - step_apply_folding_config + - step_minimize_bit_width + - step_generate_estimate_reports + - step_set_fifo_depths + - step_hw_codegen + - step_hw_ipgen + - finn.builder.custom_step_library.resnet.step_resnet50_slr_floorplan # Custom step + - step_create_stitched_ip + - step_measure_rtlsim_performance + - step_out_of_context_synthesis + - step_synthesize_bitfile + - step_make_driver + - step_deployment_package diff --git a/src/finn/benchmarking/dut/transformer.py b/src/finn/benchmarking/dut/transformer.py index 27583ec5e1..48152ce9d5 100644 --- a/src/finn/benchmarking/dut/transformer.py +++ b/src/finn/benchmarking/dut/transformer.py @@ -29,7 +29,7 @@ from qonnx.util.range_analysis import RangeInfo # Custom build steps required to streamline and convert the attention operator -from finn.benchmarking.dut.transformer_custom_steps import ( +from finn.builder.custom_step_library.transformer import ( prepare_graph, step_streamline, step_convert_attention_to_hw, diff --git a/src/finn/benchmarking/dut/vgg10.py b/src/finn/benchmarking/dut/vgg10.py deleted file mode 100644 index d34c186387..0000000000 --- a/src/finn/benchmarking/dut/vgg10.py +++ /dev/null @@ -1,35 +0,0 @@ -from finn.builder.build_dataflow_config import DataflowBuildConfig -from finn.benchmarking.bench_base import bench - -class bench_vgg10(bench): - def step_build_setup(self): - # create build config for VGG-10 (based on finn-examples) - vgg10_build_steps = [ - "step_tidy_up", - step_pre_streamline, - "step_streamline", - 
"step_convert_to_hw", - step_convert_final_layers, - "step_create_dataflow_partition", - "step_specialize_layers", - "step_target_fps_parallelization", - "step_apply_folding_config", - "step_minimize_bit_width", - "step_generate_estimate_reports", - "step_set_fifo_depths", - "step_hw_codegen", - "step_hw_ipgen", - "step_create_stitched_ip", - "step_measure_rtlsim_performance", - "step_out_of_context_synthesis", - "step_synthesize_bitfile", - "step_make_driver", - "step_deployment_package", - ] - - cfg = DataflowBuildConfig( - steps=vgg10_build_steps, - standalone_thresholds=True, - ) - - return cfg diff --git a/src/finn/benchmarking/dut/vgg10.yml b/src/finn/benchmarking/dut/vgg10.yml new file mode 100644 index 0000000000..9e271a6921 --- /dev/null +++ b/src/finn/benchmarking/dut/vgg10.yml @@ -0,0 +1,23 @@ +steps: + - step_tidy_up + - finn.builder.custom_step_library.conv1d.step_pre_streamline # Custom step + - step_streamline + - step_convert_to_hw + - finn.builder.custom_step_library.conv1d.step_convert_final_layers # Custom step + - step_create_dataflow_partition + - step_specialize_layers + - step_target_fps_parallelization + - step_apply_folding_config + - step_minimize_bit_width + - step_generate_estimate_reports + - step_set_fifo_depths + - step_hw_codegen + - step_hw_ipgen + - step_create_stitched_ip + - step_measure_rtlsim_performance + - step_out_of_context_synthesis + - step_synthesize_bitfile + - step_make_driver + - step_deployment_package + +standalone_thresholds: True diff --git a/src/finn/interface/run_finn.py b/src/finn/interface/run_finn.py index 3661b414ab..40c186a434 100644 --- a/src/finn/interface/run_finn.py +++ b/src/finn/interface/run_finn.py @@ -264,7 +264,7 @@ def run(dependency_path: str, build_path: str, num_workers: int, script: str) -> @click.option( "--bench_config", help="Name or path of experiment configuration file", - default="", + required=True ) @click.option("--dependency-path", "-d", default="") @click.option("--num-workers", 
"-n", default=-1, show_default=True) From 7a3f928dc83ea8b98fe4464d6b8a9217a8d879b4 Mon Sep 17 00:00:00 2001 From: Felix Jentzsch Date: Tue, 20 May 2025 17:50:02 +0200 Subject: [PATCH 100/125] Adapt to FINN_ROOT refactoring --- src/finn/transformation/fpgadataflow/instrumentation.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/finn/transformation/fpgadataflow/instrumentation.py b/src/finn/transformation/fpgadataflow/instrumentation.py index 7f37c5ed14..a22d770307 100644 --- a/src/finn/transformation/fpgadataflow/instrumentation.py +++ b/src/finn/transformation/fpgadataflow/instrumentation.py @@ -28,7 +28,7 @@ def collect_ip_dirs(model, ipstitch_path): ip_dirs += [ipstitch_path + "/ip"] if need_memstreamer: # add RTL streamer IP - ip_dirs.append("$::env(FINN_ROOT)/finn-rtllib/memstream") + ip_dirs.append("$::env(FINN_RTLLIB)/memstream") return ip_dirs @@ -71,7 +71,7 @@ def apply(self, model): ko = out_shape_folded[-1] # fill out instrumentation wrapper template with open( - os.path.join(os.environ["FINN_ROOT"], "custom_hls", "instrumentation.template.cpp"), "r" + os.path.join(os.environ["FINN_CUSTOM_HLS"], "instrumentation.template.cpp"), "r" ) as f: instrwrp_cpp = f.read() instrwrp_cpp = instrwrp_cpp.replace("@PENDING@", str(pending)) @@ -150,7 +150,7 @@ def apply(self, model): # TODO: Support simulation with AXI-lite control interfaces (e.g., for dynamic pipelines) # fill in testbench template with open( - os.path.join(os.environ["FINN_ROOT"], "custom_hls", "instrumentation_tb.template.sv"), + os.path.join(os.environ["FINN_CUSTOM_HLS"], "instrumentation_tb.template.sv"), "r", ) as f: testbench_sv = f.read() @@ -158,7 +158,7 @@ def apply(self, model): f.write(testbench_sv) # fill in testbench project creator template with open( - os.path.join(os.environ["FINN_ROOT"], "custom_hls", "instrumentation_sim.template.tcl"), + os.path.join(os.environ["FINN_CUSTOM_HLS"], "instrumentation_sim.template.tcl"), "r", ) as f: testbench_tcl = 
f.read() From ccebbdca2b6eb88dffded9b1e794ce9912b7af89 Mon Sep 17 00:00:00 2001 From: Felix Jentzsch Date: Tue, 20 May 2025 17:59:48 +0200 Subject: [PATCH 101/125] Fix use of deprecated FINN_ROOT --- src/finn/transformation/fpgadataflow/make_driver.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/finn/transformation/fpgadataflow/make_driver.py b/src/finn/transformation/fpgadataflow/make_driver.py index b17cb9c8e8..1cea95f9c5 100644 --- a/src/finn/transformation/fpgadataflow/make_driver.py +++ b/src/finn/transformation/fpgadataflow/make_driver.py @@ -477,8 +477,7 @@ def apply(self, model): # create (copy) the static instrumentation driver driver_template = ( - os.environ["FINN_ROOT"] - + "/src/finn/qnn-data/templates/driver/driver_instrumentation.py" + os.environ["FINN_QNN_DATA"] + "/templates/driver/driver_instrumentation.py" ) driver_py = pynq_driver_dir + "/driver.py" shutil.copy(driver_template, driver_py) From 6511559f8038e2551ec01dfca966251a5c120e01 Mon Sep 17 00:00:00 2001 From: Felix Jentzsch Date: Tue, 20 May 2025 21:17:32 +0200 Subject: [PATCH 102/125] Fix bench cmd --- src/finn/benchmarking/bench-ci.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/finn/benchmarking/bench-ci.yml b/src/finn/benchmarking/bench-ci.yml index 9e960f8ecd..2738ad3d56 100644 --- a/src/finn/benchmarking/bench-ci.yml +++ b/src/finn/benchmarking/bench-ci.yml @@ -31,7 +31,7 @@ FINN Build: # Launch benchmarking script via FINN CLI, includes deps update and environment preparation - | source ./finn-plus-venv/bin/activate - finn bench $BENCH_CFG + finn bench --bench_config $BENCH_CFG cache: key: $CI_COMMIT_SHA policy: pull From cf6254dcfb3f4ed372e1fb4bb0a03fa7bb157d5e Mon Sep 17 00:00:00 2001 From: Felix Jentzsch Date: Wed, 21 May 2025 13:56:39 +0200 Subject: [PATCH 103/125] Fix CLI call --- src/finn/benchmarking/bench-ci.yml | 2 +- src/finn/benchmarking/bench.py | 2 +- src/finn/interface/run_finn.py | 8 ++------ 3 files changed, 4 
insertions(+), 8 deletions(-) diff --git a/src/finn/benchmarking/bench-ci.yml b/src/finn/benchmarking/bench-ci.yml index 2738ad3d56..8a1269ff9e 100644 --- a/src/finn/benchmarking/bench-ci.yml +++ b/src/finn/benchmarking/bench-ci.yml @@ -31,7 +31,7 @@ FINN Build: # Launch benchmarking script via FINN CLI, includes deps update and environment preparation - | source ./finn-plus-venv/bin/activate - finn bench --bench_config $BENCH_CFG + finn bench --dependency-path ./finn-plus/deps --build-path $FINN_BUILD_DIR --num-workers $CPU_CORES_BENCH --bench_config $BENCH_CFG cache: key: $CI_COMMIT_SHA policy: pull diff --git a/src/finn/benchmarking/bench.py b/src/finn/benchmarking/bench.py index 745d6c62b2..7a9b0877e6 100644 --- a/src/finn/benchmarking/bench.py +++ b/src/finn/benchmarking/bench.py @@ -47,7 +47,7 @@ def get_default_session_options_new(): config_path = os.path.join(os.environ.get("LOCAL_CFG_DIR"), os.environ.get("MANUAL_CFG_PATH")) else: configs_path = os.path.join(os.path.dirname(__file__), "cfg") - config_select = config_name + ".json" + config_select = config_name + ".yml" config_path = os.path.join(configs_path, config_select) print("Job launched with SLURM ID: %d" % (job_id)) except KeyError: diff --git a/src/finn/interface/run_finn.py b/src/finn/interface/run_finn.py index 40c186a434..a01b70bfb4 100644 --- a/src/finn/interface/run_finn.py +++ b/src/finn/interface/run_finn.py @@ -261,11 +261,7 @@ def run(dependency_path: str, build_path: str, num_workers: int, script: str) -> @click.command(help="Run a given benchmark configuration.") -@click.option( - "--bench_config", - help="Name or path of experiment configuration file", - required=True -) +@click.option("--bench_config", help="Name or path of experiment configuration file", required=True) @click.option("--dependency-path", "-d", default="") @click.option("--num-workers", "-n", default=-1, show_default=True) @click.option( @@ -278,7 +274,7 @@ def bench(bench_config: str, dependency_path: str, 
num_workers: int, build_path: console = Console() build_dir = Path(build_path).expanduser() if build_path != "" else None dep_path = Path(dependency_path).expanduser() if dependency_path != "" else None - prepare_finn(dep_path, Path(), build_dir, num_workers, is_test_run=True) + prepare_finn(dep_path, Path(), build_dir, num_workers) console.rule("RUNNING BENCHMARK") # Late import because we need prepare_finn to setup remaining dependencies first From cc0be94bb0ae15e8721ad6c9c5a525602ae9de81 Mon Sep 17 00:00:00 2001 From: Felix Jentzsch Date: Wed, 21 May 2025 16:16:05 +0200 Subject: [PATCH 104/125] [CI] Adapt to recent runner version change --- .gitlab-ci.yml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index ebdad54bee..a2f9527976 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -93,11 +93,11 @@ Sync finn-dev: .setup_venv_from_whl: &setup_venv_from_whl # Move everything to working directory (e.g., RAMdisk) - - cp -dfR .. $PATH_WORKDIR + - cp -dfR . 
$PATH_WORKDIR - cd $PATH_WORKDIR # Create fresh virtual environment and install finn-plus from .whl (artifact) - python3 -m venv finn-plus-venv - - finn-plus-venv/bin/pip install ./finn-plus/dist/*.whl + - finn-plus-venv/bin/pip install dist/*.whl Build: id_tokens: @@ -171,8 +171,8 @@ FINN Test Suite 2022.2: - $JOB_MONITORING_DIR/monitor.sh $JOB_MONITORING_DIR/$CI_PIPELINE_ID/$HOSTNAME.log & # Launch FINN via test command, includes preparation of (cached) dependencies - | - source ./finn-plus-venv/bin/activate - finn test --variant $TEST_SUITE --dependency-path ./finn-plus/deps --build-path $FINN_BUILD_DIR --num-workers 1 --num-test-workers $PYTEST_PARALLEL + source finn-plus-venv/bin/activate + finn test --variant $TEST_SUITE --dependency-path ./deps --build-path $FINN_BUILD_DIR --num-workers 1 --num-test-workers $PYTEST_PARALLEL artifacts: name: "test_reports" when: always From d1708971c55285fabaa6fbdf5e24fe284ceedbfb Mon Sep 17 00:00:00 2001 From: Felix Jentzsch Date: Wed, 21 May 2025 17:39:57 +0200 Subject: [PATCH 105/125] Minor fixes --- src/finn/benchmarking/dut/metafi.yml | 14 +++++++------- src/finn/builder/build_dataflow.py | 4 ++-- 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/src/finn/benchmarking/dut/metafi.yml b/src/finn/benchmarking/dut/metafi.yml index d3ea2c69ff..fba5a68fe5 100644 --- a/src/finn/benchmarking/dut/metafi.yml +++ b/src/finn/benchmarking/dut/metafi.yml @@ -1,12 +1,12 @@ steps: - - # step_residual_tidy - - # step_extract_absorb_bias - - # step_residual_topo - - # step_pre_streamline - - # step_residual_streamline - - # step_residual_convert_to_hw + #- step_residual_tidy + #- step_extract_absorb_bias + #- step_residual_topo + #- step_pre_streamline + #- step_residual_streamline + #- step_residual_convert_to_hw - step_create_dataflow_partition - - # step_set_preferred_impl_style + #- step_set_preferred_impl_style - step_specialize_layers - step_target_fps_parallelization - step_apply_folding_config diff --git 
a/src/finn/builder/build_dataflow.py b/src/finn/builder/build_dataflow.py index b14d69a1f9..f6f3f6127d 100644 --- a/src/finn/builder/build_dataflow.py +++ b/src/finn/builder/build_dataflow.py @@ -253,7 +253,7 @@ def build_dataflow_cfg(model_filename, cfg: DataflowBuildConfig): print("Build failed") metadata = { "status": "failed", - "tool_version": os.path.basename(os.environ.get("VIVADO_PATH")), + "tool_version": os.path.basename(os.environ.get("$XILINX_VIVADO")), } with open(cfg.output_dir + "/report/metadata_builder.json", "w") as f: json.dump(metadata, f, indent=2) @@ -264,7 +264,7 @@ def build_dataflow_cfg(model_filename, cfg: DataflowBuildConfig): json.dump(time_per_step, f, indent=2) metadata = { "status": "ok", - "tool_version": os.path.basename(os.environ.get("VIVADO_PATH")), + "tool_version": os.path.basename(os.environ.get("$XILINX_VIVADO")), } with open(cfg.output_dir + "/report/metadata_builder.json", "w") as f: json.dump(metadata, f, indent=2) From 9718a30442e99a6525431fbb6070c459ad3473e8 Mon Sep 17 00:00:00 2001 From: Felix Jentzsch Date: Wed, 21 May 2025 17:48:26 +0200 Subject: [PATCH 106/125] [CI] Use empty git strategy for benchmarking as well --- src/finn/benchmarking/bench-ci.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/src/finn/benchmarking/bench-ci.yml b/src/finn/benchmarking/bench-ci.yml index 691ddeb5fe..0212aee122 100644 --- a/src/finn/benchmarking/bench-ci.yml +++ b/src/finn/benchmarking/bench-ci.yml @@ -22,6 +22,7 @@ FINN Build: - job: Build pipeline: $PARENT_PIPELINE_ID variables: + GIT_STRATEGY: empty # Do not pull repository, use PyPI installation instead SCHEDULER_PARAMETERS: "-A $PROJECT_ACCOUNT -p $SLURM_PARTITION -t $SLURM_TIMEOUT $SLURM_QOS --nodes 1 --ntasks 1 --cpus-per-task $CPU_CORES_BENCH --exclusive --array 0-$( expr $PARALLEL_JOBS - 1 )" NUM_DEFAULT_WORKERS: "$CPU_CORES_BENCH" extends: .setup_full_2022_2 From bd36b8fbccd8b10a9677f88dce5d5775eea4a760 Mon Sep 17 00:00:00 2001 From: Felix Jentzsch Date: Wed, 21 May 
2025 21:06:35 +0200 Subject: [PATCH 107/125] Fix typo --- .gitlab-ci.yml | 2 +- src/finn/benchmarking/bench-ci.yml | 2 +- src/finn/builder/build_dataflow.py | 4 ++-- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 0a7aaab37e..09fa9e0930 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -132,7 +132,7 @@ FINN Test Suite 2022.2: paths: - deps variables: - GIT_STRATEGY: empty # Do not pull repository, use PyPI installation instead + GIT_STRATEGY: empty # Do not pull repository, install from wheel (artifact) instead SCHEDULER_PARAMETERS: "-A $PROJECT_ACCOUNT -p $SLURM_PARTITION -t $SLURM_TIMEOUT $SLURM_QOS --nodes 1 --ntasks 1 --cpus-per-task $CPU_CORES --exclusive" PYTEST_PARALLEL: "$CPU_CORES" extends: .setup_full_2022_2 diff --git a/src/finn/benchmarking/bench-ci.yml b/src/finn/benchmarking/bench-ci.yml index 0212aee122..0f039180d1 100644 --- a/src/finn/benchmarking/bench-ci.yml +++ b/src/finn/benchmarking/bench-ci.yml @@ -22,7 +22,7 @@ FINN Build: - job: Build pipeline: $PARENT_PIPELINE_ID variables: - GIT_STRATEGY: empty # Do not pull repository, use PyPI installation instead + GIT_STRATEGY: empty # Do not pull repository, install from wheel (artifact) instead SCHEDULER_PARAMETERS: "-A $PROJECT_ACCOUNT -p $SLURM_PARTITION -t $SLURM_TIMEOUT $SLURM_QOS --nodes 1 --ntasks 1 --cpus-per-task $CPU_CORES_BENCH --exclusive --array 0-$( expr $PARALLEL_JOBS - 1 )" NUM_DEFAULT_WORKERS: "$CPU_CORES_BENCH" extends: .setup_full_2022_2 diff --git a/src/finn/builder/build_dataflow.py b/src/finn/builder/build_dataflow.py index f6f3f6127d..b29e36ab56 100644 --- a/src/finn/builder/build_dataflow.py +++ b/src/finn/builder/build_dataflow.py @@ -253,7 +253,7 @@ def build_dataflow_cfg(model_filename, cfg: DataflowBuildConfig): print("Build failed") metadata = { "status": "failed", - "tool_version": os.path.basename(os.environ.get("$XILINX_VIVADO")), + "tool_version": os.path.basename(os.environ.get("XILINX_VIVADO")), } with 
open(cfg.output_dir + "/report/metadata_builder.json", "w") as f: json.dump(metadata, f, indent=2) @@ -264,7 +264,7 @@ def build_dataflow_cfg(model_filename, cfg: DataflowBuildConfig): json.dump(time_per_step, f, indent=2) metadata = { "status": "ok", - "tool_version": os.path.basename(os.environ.get("$XILINX_VIVADO")), + "tool_version": os.path.basename(os.environ.get("XILINX_VIVADO")), } with open(cfg.output_dir + "/report/metadata_builder.json", "w") as f: json.dump(metadata, f, indent=2) From a942390d20d27e8d2c9a1ea70e95bea523b91442 Mon Sep 17 00:00:00 2001 From: Felix Jentzsch Date: Wed, 21 May 2025 21:09:59 +0200 Subject: [PATCH 108/125] Refactor remaining MakePYNQDriver calls --- notebooks/end2end_example/bnn-pynq/cnv_end2end_example.ipynb | 4 ++-- notebooks/end2end_example/bnn-pynq/tfc_end2end_example.ipynb | 4 ++-- src/finn/qnn-data/templates/driver/driver_base.py | 2 +- tests/end2end/test_end2end_bnn_pynq.py | 4 ++-- 4 files changed, 7 insertions(+), 7 deletions(-) diff --git a/notebooks/end2end_example/bnn-pynq/cnv_end2end_example.ipynb b/notebooks/end2end_example/bnn-pynq/cnv_end2end_example.ipynb index 2b01f24557..014a13db27 100644 --- a/notebooks/end2end_example/bnn-pynq/cnv_end2end_example.ipynb +++ b/notebooks/end2end_example/bnn-pynq/cnv_end2end_example.ipynb @@ -456,8 +456,8 @@ "metadata": {}, "outputs": [], "source": [ - "from finn.transformation.fpgadataflow.make_driver import MakePYNQDriver\n", - "model = model.transform(MakePYNQDriver(\"zynq-iodma\"))" + "from finn.transformation.fpgadataflow.make_driver import MakePYNQDriverIODMA\n", + "model = model.transform(MakePYNQDriverIODMA(\"zynq-iodma\"))" ] }, { diff --git a/notebooks/end2end_example/bnn-pynq/tfc_end2end_example.ipynb b/notebooks/end2end_example/bnn-pynq/tfc_end2end_example.ipynb index b0510b0fdb..de6de23d3f 100644 --- a/notebooks/end2end_example/bnn-pynq/tfc_end2end_example.ipynb +++ b/notebooks/end2end_example/bnn-pynq/tfc_end2end_example.ipynb @@ -751,8 +751,8 @@ "metadata": {}, 
"outputs": [], "source": [ - "from finn.transformation.fpgadataflow.make_driver import MakePYNQDriver\n", - "model = model.transform(MakePYNQDriver(\"zynq-iodma\"))" + "from finn.transformation.fpgadataflow.make_driver import MakePYNQDriverIODMA\n", + "model = model.transform(MakePYNQDriverIODMA(\"zynq-iodma\"))" ] }, { diff --git a/src/finn/qnn-data/templates/driver/driver_base.py b/src/finn/qnn-data/templates/driver/driver_base.py index a6ff29d608..af55ee13df 100644 --- a/src/finn/qnn-data/templates/driver/driver_base.py +++ b/src/finn/qnn-data/templates/driver/driver_base.py @@ -38,7 +38,7 @@ # Driver base class for FINN-generated dataflow accelerators. # The particulars of the generated accelerator are specified via the -# io_shape_dict (generated by the MakePYNQDriver transformation). +# io_shape_dict (generated by the MakePYNQDriverIODMA transformation). class FINNExampleOverlay(Overlay): diff --git a/tests/end2end/test_end2end_bnn_pynq.py b/tests/end2end/test_end2end_bnn_pynq.py index 9a2da7a45e..9d40b3ba93 100644 --- a/tests/end2end/test_end2end_bnn_pynq.py +++ b/tests/end2end/test_end2end_bnn_pynq.py @@ -73,7 +73,7 @@ from finn.transformation.fpgadataflow.create_stitched_ip import CreateStitchedIP from finn.transformation.fpgadataflow.hlssynth_ip import HLSSynthIP from finn.transformation.fpgadataflow.insert_dwc import InsertDWC -from finn.transformation.fpgadataflow.make_driver import MakePYNQDriver +from finn.transformation.fpgadataflow.make_driver import MakePYNQDriverIODMA from finn.transformation.fpgadataflow.minimize_accumulator_width import MinimizeAccumulatorWidth from finn.transformation.fpgadataflow.minimize_weight_bit_width import MinimizeWeightBitWidth from finn.transformation.fpgadataflow.prepare_cppsim import PrepareCppSim @@ -812,7 +812,7 @@ def test_make_pynq_driver(self, topology, wbits, abits, board): prev_chkpt_name = get_checkpoint_name(board, topology, wbits, abits, "build") model = load_test_checkpoint_or_skip(prev_chkpt_name) 
board_to_driver_platform = "alveo" if build_data["kind"] == "alveo" else "zynq-iodma" - model = model.transform(MakePYNQDriver(board_to_driver_platform)) + model = model.transform(MakePYNQDriverIODMA(board_to_driver_platform)) model.save(get_checkpoint_name(board, topology, wbits, abits, "driver")) def test_deploy(self, topology, wbits, abits, board): From 4ee4da19f8ec46c1b701f4218ea7041f8bbbf840 Mon Sep 17 00:00:00 2001 From: Felix Jentzsch Date: Thu, 22 May 2025 19:34:15 +0200 Subject: [PATCH 109/125] Adapt virtual FIFO output stream naming --- src/finn/custom_op/fpgadataflow/hls/streamingfifo_hls.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/finn/custom_op/fpgadataflow/hls/streamingfifo_hls.py b/src/finn/custom_op/fpgadataflow/hls/streamingfifo_hls.py index f17bc48fc6..e7d02a4915 100644 --- a/src/finn/custom_op/fpgadataflow/hls/streamingfifo_hls.py +++ b/src/finn/custom_op/fpgadataflow/hls/streamingfifo_hls.py @@ -66,7 +66,7 @@ def strm_decl(self): ) ) self.code_gen_dict["$STREAMDECLARATIONS$"].append( - 'hls::stream> out_{} ("out_{}");'.format( + 'hls::stream> out0_{} ("out0_{}");'.format( self.get_outstream_width(), self.hls_sname(), self.hls_sname() ) ) @@ -88,7 +88,7 @@ def docompute(self): VirtualFIFO(in_fifo, out_fifo, mode, depth, occupancy, max_occupancy); // FIFO -> AXI-Stream - move(out_fifo, out_%s); + move(out_fifo, out0_%s); """ % (self.hls_sname(), self.hls_sname()) ] @@ -99,7 +99,7 @@ def blackboxfunction(self): out_packed_bits = self.get_outstream_width() out_packed_hls_type = "ap_uint<%d>" % out_packed_bits self.code_gen_dict["$BLACKBOXFUNCTION$"] = [ - """void %s(hls::stream<%s > &in0_%s, hls::stream<%s > &out_%s, ap_uint<32> mode, + """void %s(hls::stream<%s > &in0_%s, hls::stream<%s > &out0_%s, ap_uint<32> mode, ap_uint<32> depth, ap_uint<32> &occupancy, ap_uint<32> &max_occupancy)""" % ( self.onnx_node.name, @@ -115,7 +115,7 @@ def pragmas(self): "#pragma HLS INTERFACE axis port=in0_" + self.hls_sname() ] 
self.code_gen_dict["$PRAGMAS$"].append( - "#pragma HLS INTERFACE axis port=out_" + self.hls_sname() + "#pragma HLS INTERFACE axis port=out0_" + self.hls_sname() ) self.code_gen_dict["$PRAGMAS$"].append("#pragma HLS INTERFACE s_axilite port=mode") self.code_gen_dict["$PRAGMAS$"].append("#pragma HLS INTERFACE s_axilite port=depth") From fb1853751c84d5b89299bcfad6a1e81c6dbac877 Mon Sep 17 00:00:00 2001 From: Felix Jentzsch Date: Thu, 22 May 2025 21:40:58 +0200 Subject: [PATCH 110/125] Move CI-specific scripts --- .gitlab-ci.yml | 6 +- .../bench-ci.yml => ci/.gitlab-bench.yml | 6 +- .gitlab-ci-base.yml => ci/.gitlab-setup.yml | 0 ci/collect.py | 412 +++++++++ {src/finn/benchmarking => ci}/measure.py | 70 +- driver/iterative_live_fifosizing_driver.ipynb | 833 ------------------ src/finn/benchmarking/bench_rtl_swg.py | 403 --------- src/finn/benchmarking/collect.py | 280 ------ 8 files changed, 466 insertions(+), 1544 deletions(-) rename src/finn/benchmarking/bench-ci.yml => ci/.gitlab-bench.yml (93%) rename .gitlab-ci-base.yml => ci/.gitlab-setup.yml (100%) create mode 100644 ci/collect.py rename {src/finn/benchmarking => ci}/measure.py (51%) delete mode 100644 driver/iterative_live_fifosizing_driver.ipynb delete mode 100644 src/finn/benchmarking/bench_rtl_swg.py delete mode 100644 src/finn/benchmarking/collect.py diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 09fa9e0930..ad524d0fd7 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -1,4 +1,4 @@ -include: .gitlab-ci-base.yml +include: ci/.gitlab-setup.yml stages: - sync @@ -164,7 +164,7 @@ Bench (Manual): when: never - if: $MANUAL_CFG_PATH != "" trigger: - include: benchmarking/bench-ci.yml + include: ci/.gitlab-bench.yml strategy: depend forward: pipeline_variables: true @@ -180,7 +180,7 @@ Bench: when: never - if: $MANUAL_CFG_PATH == "" trigger: - include: src/finn/benchmarking/bench-ci.yml + include: ci/.gitlab-bench.yml strategy: depend forward: pipeline_variables: true diff --git 
a/src/finn/benchmarking/bench-ci.yml b/ci/.gitlab-bench.yml similarity index 93% rename from src/finn/benchmarking/bench-ci.yml rename to ci/.gitlab-bench.yml index 0f039180d1..f3139c0fbd 100644 --- a/src/finn/benchmarking/bench-ci.yml +++ b/ci/.gitlab-bench.yml @@ -1,4 +1,4 @@ -include: .gitlab-ci-base.yml +include: ci/.gitlab-setup.yml stages: - build @@ -56,7 +56,7 @@ Measurement: - when: always script: # Run as root and activate the PYNQ venv manually to use PYNQ outside of the typical Jupyter environment - - sudo bash -c "source /etc/profile.d/pynq_venv.sh && export XILINX_XRT=/usr && python src/finn/benchmarking/measure.py" + - sudo bash -c "source /etc/profile.d/pynq_venv.sh && export XILINX_XRT=/usr && python ci/measure.py" artifacts: name: "measurement_artifacts" when: always @@ -74,5 +74,5 @@ Result Collection: # Also run on failure of previous tasks to collect partial results - when: always script: - - python3.10 src/finn/benchmarking/collect.py + - python3.10 ci/collect.py - dvc exp push -f -j 4 -r push git@github.com:eki-project/finn-plus.git diff --git a/.gitlab-ci-base.yml b/ci/.gitlab-setup.yml similarity index 100% rename from .gitlab-ci-base.yml rename to ci/.gitlab-setup.yml diff --git a/ci/collect.py b/ci/collect.py new file mode 100644 index 0000000000..b833278fe9 --- /dev/null +++ b/ci/collect.py @@ -0,0 +1,412 @@ +import json +import os +import shutil +from dvclive.live import Live + + +def delete_dir_contents(dir): + for filename in os.listdir(dir): + file_path = os.path.join(dir, filename) + try: + if os.path.isfile(file_path) or os.path.islink(file_path): + os.unlink(file_path) + elif os.path.isdir(file_path): + shutil.rmtree(file_path) + except Exception as e: + print("Failed to delete %s. 
Reason: %s" % (file_path, e)) + + +def log_dvc_metric(live, prefix, name, value): + # sanitize '/' in name because DVC uses it to nest metrics (which we do via prefix) + live.log_metric(prefix + name.replace("/", "-"), value, plot=False) + + +def open_json_report(id, report_name): + # look in both, build & measurement, artifacts + path1 = os.path.join("build_artifacts", "runs_output", "run_%d" % (id), "reports", report_name) + path2 = os.path.join( + "measurement_artifacts", "runs_output", "run_%d" % (id), "reports", report_name + ) + if os.path.isfile(path1): + with open(path1, "r") as f: + report = json.load(f) + return report + elif os.path.isfile(path2): + with open(path2, "r") as f: + report = json.load(f) + return report + else: + return None + + +def log_all_metrics_from_report(id, live, report_name, prefix=""): + report = open_json_report(id, report_name) + if report: + for key in report: + log_dvc_metric(live, prefix, key, report[key]) + + +def log_metrics_from_report(id, live, report_name, keys, prefix=""): + report = open_json_report(id, report_name) + if report: + for key in keys: + if key in report: + log_dvc_metric(live, prefix, key, report[key]) + + +def log_nested_metrics_from_report(id, live, report_name, key_top, keys, prefix=""): + report = open_json_report(id, report_name) + if report: + if key_top in report: + for key in keys: + if key in report[key_top]: + log_dvc_metric(live, prefix, key, report[key_top][key]) + + +if __name__ == "__main__": + # Go through all runs found in the artifacts and log their results to DVC + run_dir_list = os.listdir(os.path.join("build_artifacts", "runs_output")) + print("Looking for runs in build artifacts") + run_ids = [] + for run_dir in run_dir_list: + if run_dir.startswith("run_"): + run_id = int(run_dir[4:]) + run_ids.append(run_id) + run_ids.sort() + print("Found %d runs" % len(run_ids)) + + follow_up_bench_cfg = list() + # Prepare (local) output directory where follow-up bench configs will be stored + 
output_cfg_dir = os.path.join( + os.environ.get("LOCAL_CFG_DIR_STORE"), "lfs", "CI_" + os.environ.get("CI_PIPELINE_ID") + ) + output_folding_dir = os.path.join(output_cfg_dir, "folding") + output_cfg_path = os.path.join(output_cfg_dir, "follow-up.json") + + for id in run_ids: + print("Processing run %d" % id) + experiment_name = "CI_" + os.environ.get("CI_PIPELINE_ID") + "_" + str(id) + experiment_msg = ( + "[CI] " + + os.environ.get("CI_PIPELINE_NAME") + + " (" + + os.environ.get("CI_PIPELINE_ID") + + "_" + + str(id) + + ")" + ) + # TODO: cache images once we switch to a cache provider that works with DVC Studio + with Live(exp_name=experiment_name, exp_message=experiment_msg, cache_images=False) as live: + # PARAMS + # input parameters logged by benchmarking infrastructure + metadata_bench = open_json_report(id, "metadata_bench.json") + params = {"params": metadata_bench["params"]} + live.log_params(params) + + # optional metadata logged by builder + metadata_builder = open_json_report(id, "metadata_builder.json") + if metadata_builder: + metadata = { + "metadata": { + "tool_version": metadata_builder["tool_version"], + } + } + live.log_params(metadata) + + # optional dut_info.json (additional information generated during model generation) + dut_info_report = open_json_report(id, "dut_info.json") + if dut_info_report: + dut_info = {"dut_info": dut_info_report} + live.log_params(dut_info) + + # METRICS + # TODO: for microbenchmarks, only summarize results for target node (surrounding SDP?) 
+ # TODO: make all logs consistent (at generation), e.g., BRAM vs BRAM18 vs BRAM36) + + # status + status = metadata_bench["status"] + if status == "ok": + # mark as failed if either bench or builder indicates failure + if metadata_builder: + status_builder = metadata_builder["status"] + if status_builder == "failed": + status = "failed" + log_dvc_metric(live, "", "status", status) + + # verification steps + if "output" in metadata_bench: + if "builder_verification" in metadata_bench["output"]: + log_dvc_metric( + live, + "", + "verification", + metadata_bench["output"]["builder_verification"]["verification"], + ) + + # estimate_layer_resources.json + log_nested_metrics_from_report( + id, + live, + "estimate_layer_resources.json", + "total", + [ + "LUT", + "DSP", + "BRAM_18K", + "URAM", + ], + prefix="estimate/resources/", + ) + + # estimate_layer_resources_hls.json + log_nested_metrics_from_report( + id, + live, + "estimate_layer_resources_hls.json", + "total", + [ + "LUT", + "FF", + "DSP", + "DSP48E", + "DSP58E", # TODO: aggregate/unify DSP reporting + "BRAM_18K", + "URAM", + ], + prefix="hls_estimate/resources/", + ) + + # estimate_network_performance.json + log_metrics_from_report( + id, + live, + "estimate_network_performance.json", + [ + "critical_path_cycles", + "max_cycles", + "max_cycles_node_name", + "estimated_throughput_fps", + "estimated_latency_ns", + ], + prefix="estimate/performance/", + ) + + # rtlsim_performance.json + log_metrics_from_report( + id, + live, + "rtlsim_performance.json", + [ + "N", + "TIMEOUT", + "latency_cycles", + "cycles", + "fclk[mhz]", + "throughput[images/s]", + "stable_throughput[images/s]", + # add INPUT_DONE, OUTPUT_DONE, number transactions? 
+ ], + prefix="rtlsim/performance/", + ) + + # fifo_sizing.json + log_metrics_from_report( + id, live, "fifo_sizing.json", ["total_fifo_size_kB"], prefix="fifosizing/" + ) + + # stitched IP DCP synth resource report + log_nested_metrics_from_report( + id, + live, + "post_synth_resources_dcp.json", + "(top)", + [ + "LUT", + "FF", + "SRL", + "DSP", + "BRAM_18K", + "BRAM_36K", + "URAM", + ], + prefix="synth(dcp)/resources/", + ) + + # stitched IP DCP synth resource breakdown + # TODO: generalize to all build flows and bitfile synth + layer_categories = ["MAC", "Eltwise", "Thresholding", "FIFO", "DWC", "SWG", "Other"] + for category in layer_categories: + log_nested_metrics_from_report( + id, + live, + "res_breakdown_build_output.json", + category, + [ + "LUT", + "FF", + "SRL", + "DSP", + "BRAM_18K", + "BRAM_36K", + "URAM", + ], + prefix="synth(dcp)/resources(breakdown)/" + category + "/", + ) + + # ooc_synth_and_timing.json (OOC synth / step_out_of_context_synthesis) + log_metrics_from_report( + id, + live, + "ooc_synth_and_timing.json", + [ + "LUT", + "LUTRAM", + "FF", + "DSP", + "BRAM", + "BRAM_18K", + "BRAM_36K", + "URAM", + ], + prefix="synth(ooc)/resources/", + ) + log_metrics_from_report( + id, + live, + "ooc_synth_and_timing.json", + [ + "WNS", + "fmax_mhz", + # add TNS? what is "delay"? 
+ ], + prefix="synth(ooc)/timing/", + ) + + # post_synth_resources.json (shell synth / step_synthesize_bitfile) + log_nested_metrics_from_report( + id, + live, + "post_synth_resources.json", + "(top)", + [ + "LUT", + "FF", + "SRL", + "DSP", + "BRAM_18K", + "BRAM_36K", + "URAM", + ], + prefix="synth/resources/", + ) + + # post synth timing report + # TODO: only exported as post_route_timing.rpt, not .json + + # instrumentation measurement + log_all_metrics_from_report( + id, live, "measured_performance.json", prefix="measurement/performance/" + ) + + # IODMA validation accuracy + log_metrics_from_report( + id, + live, + "validation.json", + [ + "top-1_accuracy", + ], + prefix="measurement/validation/", + ) + + # power measurement + # TODO + + # live fifosizing report + graph png + log_metrics_from_report( + id, + live, + "fifo_sizing_report.json", + [ + "error", + "fifo_size_total_kB", + ], + prefix="fifosizing/live/", + ) + + image = os.path.join( + "measurement_artifacts", + "runs_output", + "run_%d" % (id), + "reports", + "fifo_sizing_graph.png", + ) + if os.path.isfile(image): + live.log_image("fifosizing_pass_1", image) + + # time_per_step.json + log_metrics_from_report(id, live, "time_per_step.json", ["total_build_time"]) + + # ARTIFACTS + # Log build reports as they come from GitLab artifacts, + # but copy them to a central dir first so all runs share the same path + run_report_dir1 = os.path.join( + "build_artifacts", "runs_output", "run_%d" % (id), "reports" + ) + run_report_dir2 = os.path.join( + "measurement_artifacts", "runs_output", "run_%d" % (id), "reports" + ) + dvc_report_dir = "reports" + os.makedirs(dvc_report_dir, exist_ok=True) + delete_dir_contents(dvc_report_dir) + if os.path.isdir(run_report_dir1): + shutil.copytree(run_report_dir1, dvc_report_dir, dirs_exist_ok=True) + if os.path.isdir(run_report_dir2): + shutil.copytree(run_report_dir2, dvc_report_dir, dirs_exist_ok=True) + live.log_artifact(dvc_report_dir) + + # Prepare benchmarking config 
for follow-up runs after live FIFO-sizing + folding_config_lfs_path = os.path.join( + "measurement_artifacts", + "runs_output", + "run_%d" % (id), + "reports", + "folding_config_lfs.json", + ) + if os.path.isfile(folding_config_lfs_path): + # Copy folding config produced by live FIFO-sizing + output_folding_path = os.path.join(output_folding_dir, experiment_name + ".json") + os.makedirs(output_folding_dir, exist_ok=True) + print( + "Saving lfs-generated folding config of this run to use in future builds: %s" + % output_folding_path + ) + shutil.copy(folding_config_lfs_path, output_folding_path) + + # Create benchmarking config + metadata_bench = open_json_report(id, "metadata_bench.json") + configuration = dict() + for key in metadata_bench["params"]: + # wrap in list + configuration[key] = [metadata_bench["params"][key]] + # overwrite FIFO-related params + import_folding_path = os.path.join( + os.environ.get("LOCAL_CFG_DIR"), + "lfs", + "CI_" + os.environ.get("CI_PIPELINE_ID"), + "folding", + experiment_name + ".json", + ) + configuration["fifo_method"] = ["manual"] + configuration["target_fps"] = ["None"] + configuration["folding_path"] = [import_folding_path] + + follow_up_bench_cfg.append(configuration) + + # Save aggregated benchmarking config for follow-up job + if follow_up_bench_cfg: + print("Saving follow-up bench config for lfs: %s" % output_cfg_path) + with open(output_cfg_path, "w") as f: + json.dump(follow_up_bench_cfg, f, indent=2) + + print("Done") diff --git a/src/finn/benchmarking/measure.py b/ci/measure.py similarity index 51% rename from src/finn/benchmarking/measure.py rename to ci/measure.py index 9a44ff3192..42db938d33 100644 --- a/src/finn/benchmarking/measure.py +++ b/ci/measure.py @@ -1,9 +1,19 @@ import os -import sys -import subprocess import shutil +import subprocess +import sys + -from finn.benchmarking.util import delete_dir_contents +def delete_dir_contents(dir): + for filename in os.listdir(dir): + file_path = os.path.join(dir, 
filename) + try: + if os.path.isfile(file_path) or os.path.islink(file_path): + os.unlink(file_path) + elif os.path.isdir(file_path): + shutil.rmtree(file_path) + except Exception as e: + print("Failed to delete %s. Reason: %s" % (file_path, e)) if __name__ == "__main__": @@ -26,21 +36,36 @@ # Run driver print("Running driver..") - # run validate.py (from IODMA driver) if present, otherwise driver.py from instrumentation + # run validate.py (from IODMA driver) if present, otherwise driver.py (instrumentation) # TODO: unify IODMA/instrumentation shell & driver if os.path.isfile(f"{extract_dir}/driver/validate.py"): - result = subprocess.run(["python", f"{extract_dir}/driver/validate.py", - "--bitfile", f"{extract_dir}/bitfile/finn-accel.bit", - "--settingsfile", f"{extract_dir}/driver/settings.json", - "--reportfile", f"{extract_dir}/validation.json", - "--dataset_root", "/home/xilinx/datasets", #TODO: env var - ]) + result = subprocess.run( + [ + "python", + f"{extract_dir}/driver/validate.py", + "--bitfile", + f"{extract_dir}/bitfile/finn-accel.bit", + "--settingsfile", + f"{extract_dir}/driver/settings.json", + "--reportfile", + f"{extract_dir}/validation.json", + "--dataset_root", + "/home/xilinx/datasets", # TODO: env var + ] + ) else: - result = subprocess.run(["python", f"{extract_dir}/driver/driver.py", - "--bitfile", f"{extract_dir}/bitfile/finn-accel.bit", - "--settingsfile", f"{extract_dir}/driver/settings.json", - "--reportfile", f"{extract_dir}/measured_performance.json", - ]) + result = subprocess.run( + [ + "python", + f"{extract_dir}/driver/driver.py", + "--bitfile", + f"{extract_dir}/bitfile/finn-accel.bit", + "--settingsfile", + f"{extract_dir}/driver/settings.json", + "--reportfile", + f"{extract_dir}/measured_performance.json", + ] + ) if result.returncode != 0: print("Driver reported error!") exit_code = 1 @@ -48,13 +73,14 @@ print("Driver finished successfully.") # Copy results back to artifact directory - for report in 
["measured_performance.json", - "fifo_sizing_report.json", - "fifo_depth_export.json", - "fifo_sizing_graph.png", - "folding_config_lfs.json", - "validation.json", - ]: + for report in [ + "measured_performance.json", + "fifo_sizing_report.json", + "fifo_depth_export.json", + "fifo_sizing_graph.png", + "folding_config_lfs.json", + "validation.json", + ]: report_path = os.path.join(extract_dir, report) if os.path.isfile(report_path): print("Copying %s to %s" % (report_path, reports_dir)) diff --git a/driver/iterative_live_fifosizing_driver.ipynb b/driver/iterative_live_fifosizing_driver.ipynb deleted file mode 100644 index 83a329d263..0000000000 --- a/driver/iterative_live_fifosizing_driver.ipynb +++ /dev/null @@ -1,833 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": 1, - "id": "0ee21ecb", - "metadata": {}, - "outputs": [ - { - "data": { - "application/javascript": [ - "\n", - "try {\n", - "require(['notebook/js/codecell'], function(codecell) {\n", - " codecell.CodeCell.options_default.highlight_modes[\n", - " 'magic_text/x-csrc'] = {'reg':[/^%%microblaze/]};\n", - " Jupyter.notebook.events.one('kernel_ready.Kernel', function(){\n", - " Jupyter.notebook.get_cells().map(function(cell){\n", - " if (cell.cell_type == 'code'){ cell.auto_highlight(); } }) ;\n", - " });\n", - "});\n", - "} catch (e) {};\n" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "application/javascript": [ - "\n", - "try {\n", - "require(['notebook/js/codecell'], function(codecell) {\n", - " codecell.CodeCell.options_default.highlight_modes[\n", - " 'magic_text/x-csrc'] = {'reg':[/^%%pybind11/]};\n", - " Jupyter.notebook.events.one('kernel_ready.Kernel', function(){\n", - " Jupyter.notebook.get_cells().map(function(cell){\n", - " if (cell.cell_type == 'code'){ cell.auto_highlight(); } }) ;\n", - " });\n", - "});\n", - "} catch (e) {};\n" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "import time\n", - 
"import json\n", - "import matplotlib as mpl\n", - "import matplotlib.pyplot as plt\n", - "from IPython.display import clear_output\n", - "import numpy as np\n", - "from pynq import Overlay\n", - "\n", - "path = \"bitstreams/resnet50/live_instrumentation\"\n", - "bitstream = path + \"/finn-accel.bit\"\n", - "\n", - "# Program FPGA\n", - "ol = Overlay(bitstream, download=True, device=None)" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "id": "f476fd87", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "#FIFO IP detected: 266\n", - "#FIFO width information found: 266\n" - ] - } - ], - "source": [ - "### Sanity checks\n", - "# We expect 3 AXI-Lite peripherals next to the virtual FIFOs: instrumentation_wrap_0, axi_gpio_0 (for reset), zynq_ps\n", - "# We don't expect any additional FINN SDPs with AXI-Lite interface, such as runtime-writable weights\n", - "print(\"#FIFO IP detected: %d\" % (len(ol.ip_dict.keys()) - 3))\n", - "\n", - "# We expect a fifo_widths.json file exported by FINN listing the width of each FIFO, e.g.,\n", - "# {'fifo_widths': {'StreamingFIFO_hls_0': 8, 'StreamingFIFO_hls_1': 32, 'StreamingFIFO_hls_2': 24}}\n", - "with open(path + \"/fifo_widths.json\", \"r\") as f:\n", - " fifo_info = json.load(f)\n", - "print(\"#FIFO width information found: %d\" % len(fifo_info[\"fifo_widths\"]))" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "id": "e419656f", - "metadata": {}, - "outputs": [], - "source": [ - "### Instrumentation driver\n", - "# Register map\n", - "#ap_uint<32> cfg, \t// [0] - 0:hold, 1:lfsr; [31:16] - LFSR seed\n", - "#ap_uint<32> &status,\t// [0] - timestamp overflow; [1] - timestamp underflow\n", - "#ap_uint<32> &latency,\n", - "#ap_uint<32> &interval,\n", - "#ap_uint<32> &checksum,\n", - "#ap_uint<32> &min_latency\n", - "\n", - "def read_register(ol, name):\n", - " return 
ol.instrumentation_wrap_0.read(offset=ol.ip_dict[\"instrumentation_wrap_0\"][\"registers\"][name][\"address_offset\"])\n", - "\n", - "def write_register(ol, name, value):\n", - " return ol.instrumentation_wrap_0.write(offset=ol.ip_dict[\"instrumentation_wrap_0\"][\"registers\"][name][\"address_offset\"], value=value)\n", - "\n", - "def observe_instrumentation(debug_print=True):\n", - " status_reg = read_register(ol, \"status\")\n", - " chksum_reg = read_register(ol, \"checksum\")\n", - " min_latency = read_register(ol, \"min_latency\")\n", - " latency = read_register(ol, \"latency\")\n", - " interval = read_register(ol, \"interval\")\n", - "\n", - " frame = (chksum_reg >> 24) & 0x000000ff\n", - " checksum = chksum_reg & 0x00ffffff\n", - " overflow_err = (status_reg & 0x00000001) != 0\n", - " underflow_err = (status_reg & 0x00000002) != 0\n", - "\n", - " if debug_print:\n", - " print(\"---INSTRUMENTATION_REPORT---\")\n", - " if overflow_err or underflow_err:\n", - " print(\"Status ERROR\")\n", - " print(\"Overflow error: %s\" % overflow_err)\n", - " print(\"Underflow error: %s\" % underflow_err)\n", - " else:\n", - " print(\"Status OK\")\n", - " print(\"Frame number (8-bit): %d\" % frame)\n", - " print(\"Checksum: 0x%06x\" % checksum)\n", - " print(\"Min Latency (cycles): %d\" % min_latency)\n", - " print(\"Latency (cycles): %d\" % latency)\n", - " print(\"Interval (cycles): %d\" % interval)\n", - " print(\"----------------------------\")\n", - "\n", - " return (overflow_err, underflow_err, frame, checksum, min_latency, latency, interval)\n", - "\n", - "def start_accelerator():\n", - " lfsr_seed = 0x00010000 # upper 16 bits\n", - " write_register(ol, \"cfg\", lfsr_seed + 1) # start operation\n", - "\n", - "### Virtual FIFO driver\n", - "# Register map\n", - "mode_offset = 0x10\n", - "depth_offset = 0x18\n", - "occupancy_offset = 0x20\n", - "occupancy_ctrl_offset = 0x24\n", - "max_occupancy_offset = 0x30\n", - "max_occupancy_ctrl_offset = 0x34\n", - "\n", - "def 
configure_fifo(ol, i, mode, depth = 2):\n", - " ip_name = \"StreamingDataflowPartition_%d\" % i\n", - " getattr(ol, ip_name).write(offset=mode_offset, value = mode)\n", - " getattr(ol, ip_name).write(offset=depth_offset, value = depth)\n", - "\n", - "def total_fifo_size(depths):\n", - " # Assuming FIFO SDP/AXI-Lite interfaces are ordered consistently with FIFO IDs\n", - " total_size_bits = 0\n", - " for i, depth in enumerate(depths):\n", - " total_size_bits += depth * fifo_info[\"fifo_widths\"][\"StreamingFIFO_hls_%d\" % i]\n", - " total_size_kB = total_size_bits / 8.0 / 1000.0\n", - " return total_size_kB\n", - "\n", - "### GPIO Reset Driver\n", - "def reset_accelerator():\n", - " ol.axi_gpio_0.write(offset=ol.ip_dict[\"axi_gpio_0\"][\"registers\"][\"GPIO_DATA\"][\"address_offset\"], value=0)" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "id": "2e2a4b88", - "metadata": {}, - "outputs": [], - "source": [ - "### Iterative FIFO-sizing function\n", - "def size_iteratively(start_depth, iteration_runtime, reduction_factor = 0.5):\n", - " num_fifos = len(fifo_info[\"fifo_widths\"])\n", - " fifo_minimum_reached = [False] * num_fifos\n", - " \n", - " if isinstance(start_depth, list):\n", - " # Individual start depth for each FIFO has been supplied\n", - " fifo_depths = start_depth\n", - " else:\n", - " # Initialize all depths to the same start depth\n", - " fifo_depths = [start_depth] * num_fifos\n", - " \n", - " # Reset accelerator and configure FIFOs\n", - " reset_accelerator()\n", - " for i in range(0, num_fifos):\n", - " configure_fifo(ol, i, mode = 1, depth = fifo_depths[i])\n", - "\n", - " # Run once to determine target interval\n", - " start_accelerator()\n", - " time.sleep(1)\n", - " (overflow_err, underflow_err, frame, checksum, min_latency, latency, interval) = observe_instrumentation(False)\n", - " log_total_fifo_size = [int(total_fifo_size(fifo_depths))]\n", - " log_interval = [interval]\n", - " log_min_latency = [min_latency]\n", - " 
log_latency = [latency]\n", - " target_interval = interval\n", - " \n", - " # Iteratively reduce FIFO depth until all FIFOs are minimized\n", - " iteration = 0\n", - " start_time = time.time()\n", - " while not all(fifo_minimum_reached):\n", - " for fifo_id in range(0, num_fifos):\n", - " if not fifo_minimum_reached[fifo_id]:\n", - " fifo_depth_before = fifo_depths[fifo_id]\n", - " fifo_depths[fifo_id] = int(fifo_depths[fifo_id] * reduction_factor)\n", - "\n", - " # Reset accelerator\n", - " reset_accelerator()\n", - "\n", - " # Configure all FIFOs\n", - " for i in range(0, num_fifos):\n", - " configure_fifo(ol, i, mode = 1, depth = fifo_depths[i])\n", - "\n", - " # Start accelerator\n", - " start_accelerator()\n", - "\n", - " # Let it run\n", - " time.sleep(iteration_runtime)\n", - "\n", - " # Check if throughput dropped or deadlock occured \n", - " (overflow_err, underflow_err, frame, checksum, min_latency, latency, interval) = observe_instrumentation(False)\n", - "\n", - " if interval > target_interval or interval == 0 or overflow_err or underflow_err:\n", - " # Revert depth reduction and mark FIFO as minimized\n", - " fifo_depths[fifo_id] = fifo_depth_before\n", - " fifo_minimum_reached[fifo_id] = True\n", - " else:\n", - " log_total_fifo_size.append(int(total_fifo_size(fifo_depths)))\n", - " log_interval.append(interval)\n", - " log_min_latency.append(min_latency)\n", - " log_latency.append(latency) \n", - "\n", - " if fifo_depths[fifo_id] == 1:\n", - " fifo_minimum_reached[fifo_id] = True\n", - "\n", - " # Report status\n", - " clear_output(wait=True)\n", - " print(\"Iteration: %d\" % iteration)\n", - " print(\"Reducing depth of FIFO: %d/%d\" % (fifo_id, num_fifos))\n", - " print(\"Numer of minimized FIFOs: %d/%d\" % (sum(fifo_minimum_reached), num_fifos))\n", - " print(\"Interval: %d\" % log_interval[-1])\n", - " print(\"Min. 
latency / latency: %d/%d\" % (log_min_latency[-1], log_latency[-1]))\n", - " print(\"Total FIFO Size (kB): %d\" % log_total_fifo_size[-1])\n", - "\n", - " iteration += 1\n", - "\n", - " end_time = time.time()\n", - " print(\"Done (%d seconds)\" % int(end_time - start_time))\n", - " \n", - " return fifo_depths, log_total_fifo_size, log_interval, log_min_latency, log_latency" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "id": "2ebb2aa3", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Testing start depth of 64\n", - "---INSTRUMENTATION_REPORT---\n", - "Status OK\n", - "Frame number (8-bit): 0\n", - "Checksum: 0x000000\n", - "Min Latency (cycles): 4294967295\n", - "Latency (cycles): 0\n", - "Interval (cycles): 0\n", - "----------------------------\n", - "Testing start depth of 128\n", - "---INSTRUMENTATION_REPORT---\n", - "Status OK\n", - "Frame number (8-bit): 0\n", - "Checksum: 0x000000\n", - "Min Latency (cycles): 4294967295\n", - "Latency (cycles): 0\n", - "Interval (cycles): 0\n", - "----------------------------\n", - "Testing start depth of 256\n", - "---INSTRUMENTATION_REPORT---\n", - "Status OK\n", - "Frame number (8-bit): 0\n", - "Checksum: 0x000000\n", - "Min Latency (cycles): 4294967295\n", - "Latency (cycles): 0\n", - "Interval (cycles): 0\n", - "----------------------------\n", - "Testing start depth of 512\n", - "---INSTRUMENTATION_REPORT---\n", - "Status OK\n", - "Frame number (8-bit): 0\n", - "Checksum: 0x000000\n", - "Min Latency (cycles): 4294967295\n", - "Latency (cycles): 0\n", - "Interval (cycles): 0\n", - "----------------------------\n", - "Testing start depth of 1024\n", - "---INSTRUMENTATION_REPORT---\n", - "Status OK\n", - "Frame number (8-bit): 0\n", - "Checksum: 0x000000\n", - "Min Latency (cycles): 4294967295\n", - "Latency (cycles): 0\n", - "Interval (cycles): 0\n", - "----------------------------\n", - "Testing start depth of 2048\n", - "---INSTRUMENTATION_REPORT---\n", 
- "Status OK\n", - "Frame number (8-bit): 0\n", - "Checksum: 0x000000\n", - "Min Latency (cycles): 4294967295\n", - "Latency (cycles): 0\n", - "Interval (cycles): 0\n", - "----------------------------\n", - "Testing start depth of 4096\n", - "---INSTRUMENTATION_REPORT---\n", - "Status OK\n", - "Frame number (8-bit): 0\n", - "Checksum: 0x000000\n", - "Min Latency (cycles): 4294967295\n", - "Latency (cycles): 0\n", - "Interval (cycles): 0\n", - "----------------------------\n", - "Testing start depth of 8192\n", - "---INSTRUMENTATION_REPORT---\n", - "Status OK\n", - "Frame number (8-bit): 108\n", - "Checksum: 0x000000\n", - "Min Latency (cycles): 2548522\n", - "Latency (cycles): 5030984\n", - "Interval (cycles): 903174\n", - "----------------------------\n", - "Testing start depth of 16384\n", - "---INSTRUMENTATION_REPORT---\n", - "Status OK\n", - "Frame number (8-bit): 108\n", - "Checksum: 0x000000\n", - "Min Latency (cycles): 2548522\n", - "Latency (cycles): 7496520\n", - "Interval (cycles): 903174\n", - "----------------------------\n", - "Determined start depth for all FIFOs: 8192\n", - "Determined iteration runtime based on performance: 0.127426 s\n" - ] - } - ], - "source": [ - "### Attempt to determine start depth for all FIFOs automatically\n", - "# If it doesn't find a working setting, start depth must be set manually, potentially on per-FIFO basis\n", - "start_depth = 64\n", - "last_interval = 0\n", - "start_depth_found = False\n", - "\n", - "while not start_depth_found:\n", - " print(\"Testing start depth of %d\" % start_depth)\n", - " reset_accelerator()\n", - "\n", - " # Configure FIFOs\n", - " num_fifos = len(fifo_info[\"fifo_widths\"])\n", - " for i in range(0, num_fifos):\n", - " configure_fifo(ol, i, mode = 1, depth = start_depth)\n", - " \n", - " # Start accelerator and let it run for a long time\n", - " start_accelerator()\n", - " time.sleep(1)\n", - " \n", - " # Examine performance\n", - " (overflow_err, underflow_err, frame, checksum, 
min_latency, latency, interval) = observe_instrumentation()\n", - " if interval > 0 and interval == last_interval and not overflow_err and not underflow_err:\n", - " # Accelerator runs with stable interval, reset to previous start depth\n", - " start_depth_found = True\n", - " start_depth = last_start_depth\n", - " else:\n", - " # Start depth is still too small, increase for next try\n", - " last_start_depth = start_depth\n", - " start_depth = start_depth * 2\n", - " \n", - " last_interval = interval\n", - " \n", - "# Determine runtime per iteration based on performance, so that stable-state is guaranteed\n", - "# Use a simple overestimation for now to be safe\n", - "iteration_runtime = max(0.01, (min_latency * 5) * 10 / 1000 / 1000 / 1000)\n", - "\n", - "print(\"Determined start depth for all FIFOs: %d\" % start_depth)\n", - "print(\"Determined iteration runtime based on performance: %f s\" % iteration_runtime)" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "id": "4ba40f96", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Iteration: 12\n", - "Reducing depth of FIFO: 265/266\n", - "Numer of minimized FIFOs: 266/266\n", - "Interval: 903174\n", - "Min. 
latency / latency: 2549314/2580777\n", - "Total FIFO Size (kB): 244\n", - "Done (389 seconds)\n" - ] - } - ], - "source": [ - "### First pass\n", - "(fifo_depths,\n", - " log_total_fifo_size,\n", - " log_interval,\n", - " log_min_latency,\n", - " log_latency) = size_iteratively(start_depth, iteration_runtime)" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "id": "ebf027a4", - "metadata": {}, - "outputs": [ - { - "data": { - "image/png": "iVBORw0KGgoAAAANSUhEUgAAAdgAAAE3CAYAAAAJy1DOAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjUuMSwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy/YYfK9AAAACXBIWXMAAAxOAAAMTgF/d4wjAABNoElEQVR4nO3dd5wU5f3A8c+ze527oyPlhKHpDjZEUEFRMRjLGjTRoCZijMZIJImKbWPys0XjGiOaWGLFCnZAdAELitgQVCAis1KXKkXKHe3a7vz+mNljOa7M7u3eXvm+X6993e48U76znnzveeYpyjRNhBBCCJFcrnQHIIQQQrREkmCFEEKIFJAEK4QQQqSAJFghhBAiBSTBCiGEECkgCVYIIYRIAUmwQgghRApkpDsAIYQQLYPmC/wHGAX0Ao4K+b1LHByTDTwAnAmUAwtDfu+lKQ20kUgNVgghRLK8AZwMrInjGD8QAQ4L+b1HADelIrB0UDKTkxBCiGTSfIEQcG60Bqv5Av2Bh4AuQBbwRMjvfUzzBdoAG4CikN+7O03hpow0EQshhEgZzRdwA5OBMSG/N6j5AnnAPM0XmAdUAtuAv2m+wEhgH3BHyO+dnb6Ik0eaiIUQQqTS4cARwCuaL7AI+BwoAAYAmUAfYGnI7x0M/NHer3OaYk0qqcEKIYRIJQX8GPJ7B1Yv0HyBTljPXycBhPzexZovsBorIc9pxBhTQmqwQgghUul7YK/mC1wW3aD5Av00X6BDyO/9EZiN1YMYzRfoBfS2j2n2pJOTEEKIpNB8gUeB84CuwI/A7pDf28/u5PQg0BNwA1uBX4f83g2aL9AHmAh0BMLAnSG/d2pabiDJJMEKIYQQKSBNxEIIIUQKtKpOTkop0+VK8G8K07ReiR4vhBAiLpFIBNM0VbrjSFSrSrAul4twOJzQsfv+9z9Coy+i0zV/oPOf/5zkyIQQQlSnlErsH+wmQqpjDmX17AlA5dYf0xyJEEKI5kASrEOuNm0ACO/cmd5AhBBCNAuSYB1SmZlk9evLrg8+YNfsFjGLlxBCiBRqVcN03G63megzWIDydetYecZPyR8xgkP/+1gSIxMtkWmaVS8hxMGUUtTV8VQpFTZNs9n2FWq2gadD1qGHkqVp7J0/n/DOnbjbtUt3SKIJikQibNmyhZ07d0pyFaIemZmZ9OzZk6ysrHSHknSSYOPU7sIL2PKvB9j+wgvSm1jUaM2aNbhcLjRNIzMzM93hCNFkmabJtm3bWLt2Lf369Ut3OEknCTZO7UaPZsu/HqBy69Z0hyKaoEgkQmlpKf379ycjQ/73EqI+HTt2ZPv27UQikTqbi5ujlnU3jcCVlwdAxcYf0hyJaIqiTcJKNdux8UI0quj/Ky3xcYokWIciEZPSijCmy01G926Ur1mT7pCEEEI0YZJgHZrw/jI8/zeLVT/uIW/gQCrWr2fntGnpDksIRzRNo0uXLlRUVFRt+/DDD1FKceONNwIwffp0brrppnrPtXHjRkaMGJGyWBMxfvx4XnnlFQAWLFjAsGHDyMv
L48ILL3R0/LRp05g/f36t5StXrmTQoEEce+yxPPvss0mJOV433HADL7/8cq3lp512Gu+8805c57zjjjsoLy9vaGiiFvKQyKEMt9WMURmJ0OWWW9i74Cu2PvgQ7c4/P72BCeFQz549mT59OhdccAEAEydOZPDgwVXlo0aNYtSoUfWep3v37nz00UcpizNeGzZsYObMmTzwwAMAdOvWjYceeoiFCxfy/vvvOzrHtGnTGDx4MMcff3yN5W+88QZDhw7l0UcfPaissrKyUZ6333LLLQwfPpyLLrooac8q77zzTm688cZm34NX8wVCQKn9Arg35Pe+WsN+VwI+rMrlbOCakN9bmaq4JME6lOm2fqErwyaZPQ4hb8hgSmbMZO/CheQde2yaoxNN2aiXR7Fyx8qUnLtv+75Mv2S6o32vuOIKJk6cyAUXXEBxcTHz5s3jkksuYd++fQA899xzvPPOO7zxxhvMmTOH6667jmHDhvHZZ59RWVnJ888/z+DBgwmFQgwePJgff7SmDVVK8Y9//IOpU6fy448/8uSTTzJ79mxmzZpFeXk5r732GkcccQRz5szhxhtv5KuvvgJgyZIlnHvuuYRCoapzjh07lkAgwL59+3jppZd48sknmTdvHjk5OUybNo3u3bsfdF8TJ07kwgsvrHqWV1RURFFREUuXLj1o33nz5jFu3DjC4TCVlZWMGzeOXr16MX36dD744AOefvpp/vjHP/K73/2u6pgXXniBBx98kEgkwmeffcbkyZO55pprOOmkk5g3bx4A7777Ll6vl23btrFv3z4GDhzIU089RV5eHs899xyTJ0+mQ4cOLFq0iO7du/Pwww9z8803s3z5cgYNGsTkyZNxuVzs2rWL8ePHs3jxYkpLSxk2bBgPP/wwmZmZdOnShd69ezN79mzOOOMMx78jEyZM4OWXX6ayspLMzEwefvhhTjjhBMaOHQvAsGHDcLlcvPfee+Tm5tZ6/dNOO40TTjiBzz//nI0bN3LGGWfw+OOPA1BcXMwNN9zAl19+icvl4rjjjuOxxx5D0zQWLFjAoYceCsBf/vIXIpEI9913n+P443BhyO9dUluh5gv0Bv4OHAtsAd4CrgSeSEUwIE3EjmW4rP95K8IRANpfeikAJTNmpi0mIeJxyimnsGrVKjZs2MDLL7/ML3/5S9xud637f/fdd1xxxRUsXryYP/3pT/z1r3+tdd/CwkLmz5/Pfffdx3nnncfJJ5/MwoUL+c1vfsM999zjKL5t27YxdOhQFi5cyJVXXsnIkSO55ppr+N///sfgwYN55JFHajxuzpw5DBs2zNE17r33Xm644QYWLVrEkiVLuPjiiznnnHMYNWoUPp+PRYsWHZBcAS677DLGjh3LZZddxqJFixgwYAAAixYtYtasWcyePRu3283kyZP56quvWLJkCYWFhTz22P7JaBYsWMC//vUvgsEgeXl5/OpXv2Ly5MksXbqUpUuX8sEHHwBWM/App5zC/PnzWbx4MZWVlQfc97Bhw5gd50xyY8aMYcGCBSxcuJD//Oc/XHnllQBVyfHzzz9n0aJFdOnSpd7rr1y5kjlz5rBkyRLeffddvvjiCwCuu+46cnNzWbx4MYsXL+a+++4jJyeHK6+8kieesPJXWVkZzz77LH/4wx/iij+JLgSmhvzezSG/1wQeBy5J5QWlButQRrQGG7F6uuUcfjgAkV270haTaB6c1jAbw5gxY3j++eeZNm0akyZNYtKkSbXue/jhh1c1IQ8dOpR//etfte570UUXATBo0CBcLhderxeA4447jilTpjiKLT8/v+q4QYMGUVRUxMCBA6vOU1tz7/r16+natauja4wYMYK7776bFStWcPrpp3PyySc7Oq4mY8aMqRrnbJomDz74IIFAgMrKSoqLiznllFOq9j3ppJMoKioC4Nhjj0XTNNq2bQvAMcccw6pVqwCrqXrevHlVzd379u07oPm2a9euzJ07N644Fy5cyD333MO2bdvIyMh
g6dKllJeX19gsXN/1L774YtxuN7m5uQwcOJCVK1cydOhQ3nnnHb7++uuqpuvOnTsDcM0113DCCSdw22238corr3DCCSegaVo84buUUutjPk8wTXNCLftO0nwBF/Al8JeQ31t9LGVPILZ3asjeljKSYB3KdB9Yg1V5eZCZSdmKFekMS4i4XH755QwaNIjDDjuM/v3717lvTk5O1Xu3201lZe2PqqL7ut1usrOzazwuIyPjgOUiS0tLDzhH9eOcXj8vL6+qmbs+1113HaNGjWL27NnceuutHHnkkQfUNOORn59f9X7y5Ml8/PHHzJ07l4KCAv7zn/8ckAir30tt92aaJtOmTaNPnz41XrO0tJTc3FzHMZaXl3PBBRcwZ84cjjvuOEpKSmjbtm2tCba+68fzOwHQo0cPhg8fzhtvvMGjjz7quDUjRsQ0zSIH+50S8nvXar5AJnA38DxwTg37xY4FSvlYOmkidijDtf8ZLFjPnbL79aNi3Toi1f6hEKKp6t69O/fee2+qnoHVqXfv3qxevZpt27YB8OKLLyblvEcffTTBYNDRvt9//z19+vThqquu4tZbb616hlpYWEhxcXHCMezYsYOOHTtSUFDArl27eO655xI6z6hRo/D7/VWJa8eOHayI+SPeMAyOOeYYx+crLS2loqKi6hnoww8/fEB5QUHBAfdd3/Xrivv+++8nErEqIFtjJuK59tprueWWWygpKWHkyJGOY49HyO9da/+sAB4Chtew21pAi/ncy96WMpJgHYrtRRxVeOaZhIuL2XTHnekKS4i4/fa3v2Xo0KGNft0ePXpw4403MnjwYEaMGEG7JM3lfeGFFzJz5v6+ECtXrqSoqIjx48czY8YMioqKqmqpDz/8MEcccQTHHnssf/vb36qaQseMGcPkyZMZOHAgTz/9dNwxXHbZZezevZsBAwbwi1/8guHDa/r3vX4PPfQQGRkZDBw4kKOPPpqRI0cSCoUAq3Y5e/ZszjvvvFqPv/zyy6s6eRUVFfHdd99x1113cfzxx3PKKacc0EoA1jPf008/nYEDB7Jly5Y6r1+XBx98kL1793LkkUcycOBAbr311qqyE088kXbt2jFu3LiUTMCi+QJtNF+gXcymS4CFNez6JvBzzRc4RPMFFDAWeCXpAcWQ1XQcmrpwPde/upgnxhzHmUdYz3vMSITQL0dTtmwZhy9aiKqjw4hoHcLhMMuWLeOwww6rswORSJ5IJMKQIUN46623qp5ztkSzZs1i0qRJSav5N5Z169Zx/PHHs2zZMgoKCg4qr+v/GSer6Wi+QB+s5OnGavZdBVwb8ntDmi/wNDA95PdOt/e9CrgFq3L5IfAHu9abEvIM1qHqTcQAyuUi74QTKP3uO3a88godfv3rdIUnRKvlcrl44oknCIVCLTrBFhcXp6VpvyFuu+02Jk6ciN/vrzG5JkPI712FNfSmprLfVfv8FPBUSgKpgSRYhzJraCIG6HDZGLZPnCidnYRIo9gJM1qqaE/t5uSuu+7irrvuSncYaSPPYB2K1mArwgc2qbvbtwcgvG17o8ckhBCi6Up5Ddbw6NnAA8CZQDmwUA8alxoevQvwAtAXKAPG6kHjU/uYPOAZYAgQAXx60Jhil7mAf2N1wTaBCXrQSKyffRyqOjmFD6zBqqwsXHl5lK9NaWc00Uy05JVBhEiFlrwCVWM0EfuxkuRhetAwDY/eLWb7PD1onGV49CHAG4ZH76sHjUrgRqBMDxr9DI/eG/jC8Ogf6UFjB3ApMAA4DGgLfGN49A/1oOGsn36ColMlVkQO/IdTKUWWphHeuTOVlxfNhMvlIicnhw0bNnDIIYfIgutC1CG64HpmZmaLWwsWUpxgDY/eBvgtUKQHDRNADxrRhVRHA73tbQsMj74ZOBmYA1wEXG6XrTY8+lzgPOA5u+xxPWiEge2GR38NuBi4I5X3kpNp9W7bU3bwwGpXQQFlq1dTsWkTmQ5nlBEtV69evdiyZQuhUEhqskL
UIzMzk549UzqhUtqkugbbF9gG/M3w6COBfViJcBHg0oNG7FRWIfZPW1XXlFY1ldXYw0EpNR4YH/M5kXsAoGtbawaTLSVlB5W1u+AX7P3ySzaMvwFtcu1Tz4nWweVy0bVrVw455BBM05QkK0QtlFItsuYaleoEmwn0AZbqQcNnePRjgA+AIzlwyio4eNqquqa0cjTdlT1nZdW8lW63O+F/6dzRZ2sHhQ1tR41i+wsvUr56daKnFy2QUqpFPlcSQjiT6j8d1mA9f50EoAeNxcBqQAcwPHrnmH1jp62qa0qrRp/uCsBeTIfaKiMZnToR3rEDsyJlY5aFEEI0IylNsHrQ+BFrUdszAQyP3gvruev3wOvAOHv7EKAr8Kl9aGxZb+BUYHpM2dWGR3cbHr0D1jPZgxbWTbqqBFtzhs3qa02OXTJrVspDEUII0fQ1RuP3WOBmw6N/i7XA7e/tjk63AMMMj74cq/PSGLsHMcD9QK7h0VcA7wLj9KARHWj6IlaCXgYsAO7Xg4aR6ptQRJuIa1ZoL4BcuXlzqkMRQgjRDKR8mI4eNFYBp9WwfTPw01qO2YNVM62pLIxdu21Mqp4mYldhIQAVm7c0UkRCCCGaspbbfSvJol1VaurkBJDZvTuuwkJKZs2ssVwIIUTrIgnWIVfVDD21lOfk0OaE4wn/uE2GZQghhJAE61S0iThSR+505ReAacqsTkIIISTBOqViGolrk9XLmgujLJjSWRuFEEI0A5JgnaqnkxNA7rGDAPjx0ccwy8sbISghhBBNlSRYh+rrRQyQd/wQ2l10EXu/+oo9X85vnMCEEEI0SZJgHXLVMVVilFKKwjOtkUfFU6c0SlxCCCGaJkmwDkWfwNbVyQkg74QTyOrbl10fzUl1SEIIIZowSbAOOWkiBlBuN9mH9cfctw+z8uCl7YQQQrQOkmAd2j9VYv1jXN1t2wJQsWFDSmMSQgjRdEmCdUjVP0qnSnb//gDsW7QoZfEIIYRo2iTBxsnJHE0Fp52Gq6CATX+/mwqZ/F8IIVolSbAORXsRRxxMg5jZowddbryRyO7d7Pns81SHJoQQogmSBOuQ005OUXlDBgOw/dlnUxSREEKIpkwSrENxPIIFILtPH9qcdBLloVCKIhJCCNGUSYJ1SFWtpuN8pRx3+/aYFRVEyspSFZYQQogmShKsQ/HWYAEyunQBoMwwkh6PEEKIpk0SrEPxDNOJyj36aABKZs5KfkBCCCGaNEmwDqk4ehFH5Z8+gpwBA9j+/POULl2aqtCEEEI0QZJg46CU817EAK6sLDr89nIASmbOTE1QQgghmiRJsHFQOJsqMVb+iBG4O3Zk5xtvpiYoIYQQTZIk2DgopeKqwQK48/PJG3Qs4eLiuHogCyGEaN4kwcZBUf9ydTVxtcmHSITw9u1Jj0kIIUTTJAk2Dm2yM9hdVhH3cVl9+gBQulSG6wghRGshCTYO3drmsKm4NO7j2px4AgA/3H4b4ZKSZIclhBCiCZIEG4eubXP4obg07mepuUcfTYcrrqBy4w+yhJ0QQrQSkmDj0LUwh7LKCDv3xt9MnHvUkQBE9uxJdlhCCCGaoIx0B9CctMm2vq59FWHax3msu2NHAHbN/pDCs89OcmRCCNG6ab7A7cAdwFEhv3dJtbLTgBnAspjNQ0N+775UxiQJNg5ZGVaFv7wyEvexeYMH4+7UibJly+rfWQghhGOaLzAIOBFYW8duS0N+7+BGCglohARrePQQUGq/AO7Vg8arhkfvArwA9AXKgLF60PjUPiYPeAYYAkQAnx40pthlLuDfwDlYMwNP0IPGY6m+D4Ast5VgyxJIsMrlIvOQQ6jcIUN1hBAiWTRfIBt4FPgV8FGawzlAYz2DvVAPGgPt16v2Nj8wTw8a/YHfApMMjx5N+DcCZXrQ6AecCTxmePRoq+ylwADgMOB44GbDo3sa4yYaUoMFyOjalcqNP1D6vdRihRDCAZdSan3Ma3wN+9wFvBTye1fXc67DNV/
gG80XWKD5AtekINaDpLOT02isvzrQg8YCYDNwsl12UUzZamAucF5M2eN60AjrQWM78BpwcWMEnB1NsOFwQscXnnUWAGXfB5MWkxBCtGAR0zSLYl4TYgs1X2AoVktnfa2Y3wBFIb93EPBzYKzmC4xOTcj7NVaCnWR49G8Nj/604dE7Gx69I+DSg8bWmH1CQE/7fU9gTQJlB1BKjY/966ehUxVGE2wiTcQAGZ07AVC5TZqJhRAiCU4FPMBqzRcIAUXAu5ovcEBP0pDfWxLye4vt9+uBl4HhqQ6uMTo5naIHjbWGR88E7gaeB8Zw8MqqqtpnM8Gy/TtZf+1U/cXjdrsblGEb2kSc2a2bdfyqVQ0JQwghBBDye/1YjxsBsJPsuTX0Iu4GbA75vRHNFygAzsXq55NSKa/B6kFjrf2zAngIGK4HjW0AhkfvHLNrL/b3AFsLaAmUpVSDE2zPnqjsbPZ+9RXh3TIeVgghUkXzBZ7WfIFR9scLgG81X2AxMA94H3g21TGktAZrePQ2QKYeNHbamy4BFtrvXwfGAXcYHn0I0BX4tFrZ5YZH743VDDA2puxqw6NPAdpiPZM9K5X3EZXldgNQHk4swSql6PSHsWx96N/seOlFOo0dW/9BQgghHAn5vVrM+9/FvH8EeKSx40l1DfYQ4CPDo//P8OjfYiXKy+yyW4BhhkdfDjwHjNGDRqVddj+Qa3j0FcC7wDi7QxPAi8D3WAOGFwD360GjUWbRz8m0vq7SisQSLED7S8cAsOvDjzAjiZ9HCCFE05bSGqweNFYBx9ZSthn4aS1le7BqpjWVhbFqt40uJ9Oqwe6rSKwXMYA7vw35I3/C7g9mU7Z8OTmHH56s8IQQQjQhMhdxHKIJtmRf/HMRx2pz4lAAwjuLGxyTEEKIpkkSbBzy7bmIF4QaNszGXZAPQPmaUENDEkII0URJgo1DUftcADq2yW7QebLtZuHy0Jp69hRCCNFcSYKNg7JH3JoHDeGNT1afPmR07syOSZMoW748CZEJIYRoaiTBxkFRlWEbxJWVRdc778QsK2PXh01qbmohhBBJUm8vYntlm/pE9KBRWv9uzZuqdc6o+OUeOxCVnc32l16k45VXoDJk5UAhhGhJnNRgdwO77J/VX9HtK1MVYFPUwAosABnt21Po9RLe+iPhYulNLIQQLY2TatNiPWjUOJY1yvDoC+sqFzXL6NgBgHBxMRkdO6Y5GiGEEMnkpAb7pyTt02I0dFWeqIzO1lTMZd9/n5TzCSGEaDrqTbB60Pg0Gfu0BCo5fZyq5Bx1FAAl776XtKQthBCiaXDSySkXuBzYgbW4+T+BM7HmA75WDxobUhlgU6JqXxkvIbnHHEP+iBHsmjWLfZddRt6gOlvihRBCNCNOmoifAs4Bfg+8B7QDbgZWA4+nLLImLFmVTeVy0e7CCwDY9e67yTmpEEKIJsFJgh2kB42fYSXZwcDv9aAxUw8aNwG9UxpdE5PsJmKANiedREbXrhS//XYSzyqEECLdnCTYMgB7nOtqPWjErrFWnpKomqjkNhBbXDk55AwYQHjXLnkOK4QQLYiTYTrZhkfXsfJL7HuAnJRF1oQlOxG6CwqgooLwtm1kdOqU1HMLIYRwTvMFtjjYbVPI7z26vp2cJNg8YEbM5xm17djSqWRO5RQju38/AEq/+478U09NyTWEEEI4shXrkWhtFDDdyYnqTbB60NCcxdR6JLsht83w4fDQv9l05130mTUTV1ZWkq8ghBDCoTtDfm+dS51pvsDdTk7keLJ/w6OfWcO2sU6Pbwmq6q9JzrA5hx9Oh8suo2LjRsqWLk3uyYUQQjgW8ntfS8Y+EN9qOvcbHv2o6AfDo48Brojj+GYvRS3EALQZeiIA2yY+m7qLCCGEcETzBe7SfIF2mi+gNF8goPkCP2q+wAXxnCOeBHsxMNnw6N0Nj/4L4Ebg7Hgu1lI0dD3YmuQPH05G926Ur5FF2IUQogk4L+T37gRGApXAScBf4zm
B4wSrB42lwJ+xJpv4O3CmHjS2xXOx5i5VnZyiMtq1p3KLkw5sQgghUiw6JPVU4PWQ3xv3pPFOpkr8Z7VNlcByYLzh0dGDxs3xXrS5S9Vw1YyuXSldupRwcTHutm1TcxEhhBBO7NF8AR9W6+1Jmi/gAuLqgeqkBrun2msqsCTmc6uTqgSbe7Q1rGrnlKmpuYAQQginLge6AjeH/N7NQB9gUjwnUK1p9iC3222Gw+EGnaP3XwKcOaArj485LklR7RfeuZPVoy+icssWDv/ma5QrnkfkQgjRsiilwqZpOpmvISU0X8ANHBrye0OJHF/vv+CGR6+3p7CTfVqSVHRyAnC3a0f+Kadglpay+6OPUnINIYQQ9dN8geHAGmCu/XmI5gu8GM85nPxlcKPh0b+g7ql4rwMmxnPh5kqRuiZigA6X/podL71ESWAGBT/5SeouJIQQoi7/xOrg9AZAyO9doPkCg+I5QSJTJdZkazwXbc5S3ZM4S9NQOTmEd+9K6XWEEELUKSPk967UfIHYbXEtcCNTJSYg1U+t3W3bUr5iZYqvIoQQog6lmi+Qj/1PvuYLHAGUxnMC6UUTp9TWXy3Z/fpRsWULlTt2NMLVhBBC1ODvwLtAd80XeA6YDfxfPCdotN5Zhke/HbgDOEoPGksMj94FeAHoi7Xm7Fg9aHxq75sHPAMMwRrs69ODxhS7zAX8G2u1AxOYoAeNxxrrPiC1z2ABCs85hz2ffca6q36P9uorKLc7tRcUQghxgJDf+57mCywHzsKqW90d8ntXxHOORkmwhkcfBJwIrI3Z7Afm6UHjLMOjDwHeMDx6Xz1oVGJNw1imB41+hkfvDXxhePSP9KCxA7gUGAAcBrQFvjE8+od60Ag2xr1Yj2BTm2HbXfAL9s7/kuK3plPxww9kFRWl9HpCCCEOFvJ7VwP/TfT4uBOs4dEz7CTodP9s4FHgV0Ds2JPRQG8APWgsMDz6ZuBkYA5wEdYgX/Sgsdrw6HOB84Dn7LLH9aARBrYbHv01rJk27oj3XhKhGqWRGDJ79gQgsnt3o1xPCCEEaL7AAuqoRYX83uOdnstxgjU8+hFYs1h0BA41PPpxwGg9aNxSz6F3AS/ZiTJ6ro6ASw8asb2PQ0BP+31PrPFHTssG13RhpdR4YHzM53pCdaYx5ubI6NQZgF0fzCbH40n9BYUQQoDVgpoU8XRyegT4I/Cj/fkbwFvXAYZHH4r1HLWmZ6TV01T17GcmWLZ/J9OcYJpmUfSVlATbOBVYCr3WV1u2Iq4mfyGEEA0Q8ns/Dvm9HwNfAnNjPn9ib3MsngRbEO2EBKAHDROoqOeYUwEPsNrw6CGgCKtX1vEAhkfvHLNvL/Y/o10LaAmUNYrGmFzSnd8GlZdHZJeMhxVCiDT4ECiM+VwAfBDPCeJ5BltpePRM7PxiePQi9i/nUyM9aPixOjNhHxMCzrV7Eb8OjAPusDs5dQWiCTxadrndyelUYGxM2dWGR5+C1cnpIqxeXo3CmsmpceZvzuzShb1ffSWr6wghRD00X6BqpErI711SQ/mVgA+rYjkbuCbk99bVnygv5PcWRz+E/N5izRdoE09M8TYRTwU6GR79Dqz5Ge+P52LV3AIMMzz6cqzOS2NiOk/dD+QaHn0FVo13nB40tttlLwLfA8uABcD9etAwGhBHXFI8kdMBCs4+C7OsjPJ16xvvokII0czYUxhWH6kSW94ba1zryUA/rArdlfWc1hWbUDVfoADIjCcuxzVYPWi8ZHj0VVi9efOA3+hB45N4LhY7K5QeNDYDP61lvz1YNdOaysJYtdu0aaz1hzI6dQIgLBNOCCFEjTRfoLaRKrEuBKbay86h+QKPAzcDT9Rx6knAe5ovEB2m8wfg+Xhic1yDNTz6cKxxq7foQeNmPWh8Yo9vbVUUqlF6EQNkdLYeUZevkmkThRCtkksptT7mNb6Gfe4CXrLHrNamrpEpNQr5vfcBTwKj7Nd/Q35vXK228TyD/Qh4z/DoF+p
BY6+97WmgVSXZxmwizj3iCAB2f/wx7X/9a1RG2pZFFEKIdIiYplnrTDuaLxAdqeJzcC5Ho09izt0u5Pc+T5y11ljxPIP9FqsT0lzDox9ib2vEdNN0NFYTcWaPHhSO+hl7Pv+CXR/MbqSrCiFEs1E1UkXzBULYI1U0X+DsavslMvpkueYLPKX5AkcnGlw8VSJTDxr/MDz6Wqwkez6Nl2uajMb+i6LDmDGUTH+bXe+9S+FZZzby1YUQoukK+b0HjFSxk+y5NfQifhP4VPMF7gK2YI1KeaWe0/fD6gj1puYLbAIeBt4M+b1hp/HFU4NVYHV2wnrYOwPoEcfxLUZjDdMByBkwgOz+/SmZOYtIaVwrJQkhRKul+QJPa77AKICQ37sKuB34DFiJlWSfqev4kN9bHPJ7J4T83v5YSfxfwFrNF/ir0+E68dRgH4m+0YPGh4ZH/xkx0xC2FqlecP2g67nd5A0ZQtny5UR278aVk9Oo1xdCiOYi5PdqMe9/V63sKeCpeM5nD825HLgG+M4+/ifALGB4fcfHM0znmWqflwBXxBFri5COh86uggIAKjZurBq6I4QQInXsoTznYTUvnx/ye7+3i6ZovoCjuRfqTbCGR39RDxpjDI9e4woDetBwvLJAS9GILcQAZPftA0DZ8uXkHp3w83YhhBDOrQA8sbM5xTjdyQmc1GAfsn8mbYWBZi0NVdi8445DZWWx5YEJ5J9+Ohnt2zd+EEII0bp8TMx0wJovUAgcFvJ7vwr5vT84OUG9CVYPGl/bPz+ObjM8ejs9aOyMO9wWwmzkztOZPXrQ+dpr2XL//ez9cr70JhZCiNR7AmuMbdRee9txTk9Qby9iw6NfZ3h03X7vMjz621gLnW+1l6NrVazJ/hv/unnHWy3x2597rvEvLoQQrY8rdkiOvTBAXLP9OBmm8zusbs0Av8QaG9QNq2fVffFcrCVo7F7EUblHHUnuscdStrqu2cCEEEIkSbnmC/SNftB8gX7Uv0TrAZxk40o9aJTb738CvGhP1B8wPPrd8VyspUhHDRbA3aEDkYULMU0zbYleCCFaiTuxJqcI2J/Ppv4VeA7gpAabYXj06L/mQ4HPY8riWrqnJVCq8Z/BRmV0tobolK9alZbrCyFEaxHyewPAKcA39uuUkN87K55zOKnBzgZeNjz6JqwFzj8FMDx6V6AsrohbgHTWG3OPOoqdr7zKrtkfkt23b/0HCCGESFjI710OLE/0eCc12BuA+fb7s2IWRe8PTEj0ws1ZupqIC848k8yiIrY9/TSRslb3t40QQqSc5gtMS8Y+4GyYTiU1JNJ4F1tvKdL57NOdn0+HMZey+V4/u97/gLbnetMWixBCtFBDNV/gn/Xsc4STE8Uz2b+wpasGC1A4ahRkZrJj0qT0BSGEEC3XY8Ceel6POzmRrOAdJ5eC+aHtLN+8C5crHbXZTHaOPJfIzGlUbt9ORocOaYhBCCFappDfe2eyzqUac+m1dHO73WY47Hgpvxqd+/AnLNlQkqSIEnfR97O5as939Jk2FVdeXrrDEUKIpFNKhU3TbLYVQccJ1vDoI4BB9sdv9KDxUcqiSpFkJNhNxaW8umAdlZFI/TungGnCIx+t4Az3Dsa/eQ+F55xDjwkPpCUWIYRIpRafYA2PXggEAA34GmukyiBgDXCOHjTSX51zKBkJtinod+sMTuvXgVteupXKjT/Q//PPpKlYCNHiNPcE66ST0z+BhUAfPWicrweN84C+9rZ/pTI4UbOcTDdlpqLtz0YBsO2JJ9IckRBCtCyaL3C15gs06PmbkwQ7ErhODxpVczDaUydejzV1omhkOZlu9pWH6TDmUlyFhWx//gV2zZ6d7rCEEKIlORVYrfkCD9rzEMfNSYKt0IPGQQ8c7fGx5TXsL1IsN8vFvoowGZ060XPiRAC2PPhgmqMSQoiWI+T3/go4BtgJfKT5AjM0X+CceM7hJMHuMjz60dU3Gh79GKzxQKKR5WS4Ka2wniXnHnk
EOUccQfmKlZQtT3hGLyGEENWE/N5N9rCdXwNHAi9pvkBQ8wUctd46eXh8F/tXzpkHmMAw4G/AHxILWzREXnYGq7furlpVp91Fo9l02+2sueIK+r37rgzbEUKIBtJ8gRzgV8A4oBS4CXgDa8H117A6/tbJyVSJ7xgevRL4K/unTPwauEoPGjMTilw0SI92OSxet5OKsElWhqL96NHsW7iI4qlT2f7CC3QaOzbdIQohRHMXAt4Hxob83gUx2+drvsD7Tk6Q8okmDI/+HtAViAC7gD/pQWOR4dG7AC9g9UguA8bqQSO6Uk8e8AwwxD7OpweNKXaZC/g3cA5WbXqCHjQecxJLSxmmc/2ri5i6cAPBv59FTqYbgModO1g+dBi5AweivfJymiMUQoiGS+cwHc0X6Bbye39oyDnqfQZrePTHYt6fl8A1RutB42g9aAwEHgAm2tv9wDw9aPQHfgtMMjx69Iu8ESjTg0Y/4EzgMcOjt7fLLgUGAIcBxwM3Gx7dk0BczVZ0vYHYv40y2rfHlZ9PeSjEvv/9Lz2BCSFEyzFW8wU6Rj9ovkAnzRe4PZ4TOOnkdGLM+7hODqAHjZ0xH9ti1UgBRgOP2vssADYDJ9tlF8WUrQbmAufFlD2uB42wHjS2Y7WFXxxvXM2Zy86w4WqtD11uuonwnj388H+3pSMsIYRoSc4L+b3boh9Cfu+PwPnxnMBJ1VvV8t4xw6O/AIywP55lePSOgEsPGltjdgsBPe33PbFminJaNrim6yqlxgPjYz4nEn6T47bvI1Itwba/aDS73nuPPZ99Run335Nz+OHpCE8IIVqCmhJGZjwncJJgsw2PrtsXi30PgB40ltZ3Aj1oXAZgePTfAPcDY7Cen8aqfjNmgmX7dzLNCcSsZet2u1vEygYuu93BrGE65EKvlz2ffUbxlKnk/MXXuIEJIUTLsUzzBcYDD2LlmeuBYDwncNJEnAfMwJqPODfmfQB4J56L6UHjefbXZDE8eueY4l7AWvv9Wg7sAu20rFVQtdRgAQrOGAmZmeycMqWxwxJCiJbkWuBcYB/WnA9nAX+K5wROhuloiUQGVQsF5OtBY6P9+efANmA78DrW+KI7DI8+BKun8af2odGyyw2P3htryqqxMWVXGx59CtYz3YuwbrzViC5DW/0ZLIC7oICC005j1/vvs+eLL2gzdGgjRyeEEM1fyO/dCJyu+QJt7M9xT6yU6u7PbYE3DY+ei9W5aStwrh40TMOj3wK8aHj05VhTLo6xp18Eqxl5ouHRV9jHjbM7NAG8iDV8Z1l0Xz1oGCm+jyaltmewUW1/fj673n+fdWP/QL8P3iejc+ca9xNCCFE7zRfoBvQGMjRfAICQ3zvX6fFOlqvbysHPS8Fqkzb1oNHFcbRp1lLGwd4x/Tue+zzEl7f+hEMKc2rc58ennmLrAxNoN3o03e66s5EjFEKIhkvzONi/Ys3etAqIJg4z5Pce7/QcTgKvsYeuSB9XPTVYgI6/+x0/PvIoJTNmcMjf/oorK6uxwhNCiJbgCqCfPTwnIU4S7B49aCR8AZF8Vc9gI7UnWKUUhed6KX5zChtvupmifz/UOMEJIUTLsKkhyRWcJdj3gEEAhkd/Rg8aVzbkgqLhXHaGrW+Wy663386ez79g17vvsmfel7Q58YRGiE4IIVqEdzVf4AFgEtZk/wCE/N56h6ZGORmmEzvO9FjnsYlUcdJEDODKyqLr3/4KQPHUqSmPSwghWpDfAr8A3iTBoalOarAtYnKGliTaRFxHC3GVNsOGAVD81lsUnnsu+cNPrucIIYQQIb+3d0PP4STB9jA8+j9reA+AHjRubmgQIj6FudZsXT/uLqN3pzZ17uvKzeXQp55i3VVXsfHmm+k35yNc2dmNEaYQQjRrmi9wHuAJ+b33ab5Ad6BjyO/91unxTpqIH8OaxWJPtffRl2hknq4FACxet9PR/vnDT6bQ6yW8Ywf7Fi1OYWRCCNEyaL7AHVgTHEX7HZnA4/G
cw8lMTjKIsokZ0L0QgNA253/f5I8YQUkgwIZrr6X31ClkduuWqvCEEKJRab7AQeuOh/zeRdX2OQ1rqt9lMZuHhvzefbWc9nzgOOArgJDf+4PmCxTEE1daBvCKhinMsZqId5dW1rNnzDHecyhbtoxtTz7Jjskv0+WG8fUfJIQQzcPokN+7E0DzBc7HWnd8UA37LQ35vU7ndigN+b3h6AxOiXDSRCyamJxMN1luF7viSLBKKTpc/hsAiqdPT1VoQgjR6KLJ1Ra77nhDrNF8gZMBU/MFXJov8DfA8fNXkBpss1WQkxFXggXI6NCB/JE/YfcHs9m3eDG5xxyTouiEECIpXEqp9TGfJ9jLkB5E8wUOWHe8lvMdrvkC32BNffhsyO99rI5r/xl4HjgS2At8AlwaV/Dx7Cyajra5mezYWx73ce0uvBCAHZMnJzskIYRItohpmkUxrxqTK0DI770s5PceCvwNa8GY6r4BikJ+7yDg58BYzRcYXcf5Nof83rOAdkCnkN97Rsjv3RxP8E4m+19AHWNh9aDheOLjdGspk/0DXPncAuYu38rSu84i0+3876TIvn18f+wgXHl5aK++Qnb//imMUgghEpfoZP+aL7APK5luq2OfvwDdQ35vjWu8ar7A/OoT+9e0rS5OAr/R6clE4+nWLoeKsMnOvRV0LnA+rtWVm0vX229j0513sfbK39F31kxceXkpjFQIIVJH8wUKgXx7/VY0XyB23fHY/boBm0N+b8TuDXwu8Ewdpz4gP2q+gBvIjyc2J8N0Po7nhKJxRKdLNBOYaKv9JZew95uFlLz9NjunTaPDr36V7PCEEKKxtAXe1HyBA9YdD/m9puYLPA1MD/m904ELgD9ovkAlVu57HXi2+sk0X+Am4GagreYLbIkpysOal9ixepuIowyP3gm4HTgGqFqEVJqI0+P2t5bw/BdrmH/rT+hSy5qwdSk1DFb//Bdk6zraKy/L7E5CiCYnHevBar5AW6A98F+siSaiSkJ+7454zhVP4BOBz4AzgRuAq4GF8VxMJI+qqsEmJkfXyR6gU7bUYNNtt9P9Pn/yghNCiGYq5PcWA8XA2Q09VzwJtqceNEYZHv3XetB42/Do7wIzGxqASIydX+tdsq4u2ssvs3z4KRS/9RYdf38V2X37Jic4IYRo5jRfoC/wENVabUN+bxen54hnmE50TEiZ4dE7AJVAURzHiyRSOFuyri6u7Gy6jLdmdNr55pSkxCWEEC3E08BLWFMv/gSYhpVwHYsnwX5vJ9aXgHnAl0gTcdpU1WAbeJ78U4aDUmyfOJGS999vcFxCCNFCtA35va8CEXsFnauBM+I5geMEqweNMXrQ2K4HjX9jLUR7JyDdT9PEzq847aRWm8zu3en5rNWRbtP/3UakrKyBkQkhRItQYf/cpfkCvYBsoFc8J3CcYA2PXjWllB40PtODxjvAI/FcTCSPy151vYH5FYA2J55A4aifEd65U5azE0IIy8eaL9ABK899BawA4prIPZ5OTifWsG1oPBcTybO/Bpuc8+Wfciol099mw4030Gf6dDLat0/OiYUQohkK+b03228na77AJ1jjbbfXcchB6k2whkf/JTAa0AyP/lpMUVtkwfX0qXoGm5wMW+g9h73z57PztdfYePMt9HzqyaScVwghmruQ37sOWKf5AmuBnk6Pc1KDXQYEgOPtn1ElwOx4ghTJE+1FnKwarFKKrrffxu5PP2HPJ59glpejsrKSc3IhhGgZVP277OdkqsTFwGLDowf0oLE14bBEUrmS1Is4lnK7yT/lFHa+8io/3HEn3e65u2pCCyGEEPH9kxvPM9gMw6O/w/719mYDV+tB44d4LiiSI5r3GjIOtiadr72WPV98QfGUKeQceYTMUyyEaFU0X2BAHcVxTdsYzzjYJ4HPgR7263N7m0iDZDcRR2W0b0/PZyYC8ON//5vckwshRNMXqONVGs+J4snGh+pB42cxn/2GR18Uz8VE8uxvuU1yhgW
yinqQO3Ag+xYtomTWuxSedWbSryGEEE1RyO/tnaxzxZNgXYZH76oHjU0AhkfvQj0PfA2PngO8AgwA9gKbgLF60AjZx78A9AXK7O2f2sflYa3TNwRr+SGfHjSm2GUu4N/AOVjZZYIeNB6jlama7D/5+RWATn8Yy/o/X8uG668nu+9bsjC7EELEqd4mYsOjv2y/vR9YaHj0Jw2P/gTwtb2tPk8Ch+tBYyDwDvublf3APD1o9MeaGWqS4dGjCf9GoEwPGv2wVu95zPDo0YGZl2Il7MOwejbfbHh0j4M4WpToXzaRFCXY/FNPpdvdfwfTZMerr9V/gBBCiAM4eQbrAdCDxotY8zD+D1gCnKkHjZfqOlAPGqV60JihB41oGpgH9LHfjwYetfdbAGwGTrbLLoopWw3MBc6LKXtcDxphPWhsB14DLnZwHy2KSvI42JoUjBwJQMnMmQ2eklEIIVobJ03EVf+y6kFjCVZyTdSfgbcNj94RcFUb9hNi/wDensCaOMoG13QxpdR4YHzM58Qjb2JS1ckplis3l/yRP2H3B7PZ++WXtDmxpsm8hBBC1MRJgj3K8OhbatiuAFMPGo7WxjM8+q1Af6wV4nM5uHdO9exnJli2fyfTnABMiH52u90tphpWNQ42xXfU/pe/ZPcHs9n52uuSYIUQIg5OmoiXYXU2qv4abP+sl+HRbwR+AZytB429etDYZm/vHLNbL2Ct/X4toCVQ1mqkahxsdXlDhkBmJiUzZlDy7nspvZYQQrQkTmqwZXrQWFP/bjUzPPp44BJgpB40dsYUvQ6MA+4wPPoQoCvwabWyyw2P3hs4FavmGy272vDoU7DmQ74IOCvR+JqrxmruduXl0WviM6wZcxmbbr+dghGnyRSKQgjhgJMabML/khsevQh4AGgHfGR49EWGR//SLr4FGGZ49OXAc8AYPWhU2mX3A7mGR18BvAuMszs0AbwIfI9Vs14A3K8HDSPRGJu7xuh7lDdkCIU/s5az27toUeovKIQQLYBqTb1D3W63GQ6H0x1GUvx3zkrumxXkrXEnccyh7VJ+vZKZM9lw/XgyunSh76yZuPLyUn5NIUTrppQKm6YZ1/SETUk8UyWKJkSlYLL/uhSefTZtL/gFlVu2sOmeexrpqkII0Xw1278MWrtou/0bX69LeUenKPOKa1n76f8w53xD//U7GdC9LW5Xyxn6JIQQySRNxM3Uu99t4uoXv05rDHeddwSXDdXSGoMQouVq7k3EzTbw1u7MI7ry+KXHsW773ka9bnjXLta/OY2Xugxm5aIgSIIVQogaSYJtxs46smtarrtNb8dLTy7hh7mfU6KZFJ59dlriEEKIpkw6OYm4tdd6ojDZlZnHhuvHs/7P16Y7JCGEaHIkwYq4uVyKdnlZbDliMK7CQna99x4bb7mFcElJukMTQogmQxKsSEhR+zx2VULv114lW9cpfms66676PWZ5ebpDE0KIJkESrEhIXpabyohJlqbR+43XyezZk32LF7P81NMoX78h3eEJIUTaSYIVCcl0u6iojACg3G56T5lCm2HDCO/YwborryS8e0+aIxRCiPSSBCsSkuFWVEQiVZ/d+W049KknAShfs4ZVo36GWVlZ2+FCCNHiSYIVCcl0u6gMHzhJiXK76f/pJ6isLCo3/sDme/1pik4IIdJPEqxISKZbURkxqT4TWEanTvSdOQOAHZMmsW3is5gtZPYsIYSIhyRYkZCC7EwAtu4uO6gss0cPDn3qKVz5+Wz55z/ZLIsDCCFaIUmwIiHHae0BmBPcWmN5/vCT6fPO26AUOya/zJrLfkOk7OBkLIQQLZUkWJGQY+01aNftqH0u5MyuXekzI4C7Uyf2zp/P+mvGyWQUQohWQ+YiFgk5pG0OAJuKS+vcL7t3b/q8PZ1VZ5/Dns8+Y8WI0zls/pcot7sxwhRCtAKaL/Ae0BWIALuAP4X83kU17Hcl4MOqXM4Grgn5vSkb7iA1WJGQguwM8rMzmLlkU73
7ZrRvT/9P5pIzYACRPXtYNuwkSr9f1ghRCiFaidEhv/fokN87EHgAmFh9B80X6A38HTgZ6IeVkK9MZVCSYEVClFKUVoSpjBkLW+f+mZn0evEF8k87jUhxMWt+/WsipXXXfoUQwomQ37sz5mNbrJpsdRcCU0N+7+aQ32sCjwOXpDIuSbAiYcP7d4prf1ebNhQ9+giZvXoS2b2bjTfdhFlRkaLohBAtgEsptT7mNb62HTVf4AXNF1gH3A38poZdegJrYj6H7G0pIwlWJMylFA4rsFWU24328suonBx2vf8B3x9/gnR8EkLUJmKaZlHMa0JtO4b83stCfu+hwN+A+2vZLXbgvkpmoDWRBCsS5nIpItUmmnAio0MH+n88h5yjj8bct481v76Ufd99l4IIhRCtTcjvfR4YofkCHasVrQW0mM+97G0pIwlWJMylIJxAggVwt21Lr+efI//00ylbvpzQBRdStmpVkiMUQrR0mi9QqPkC3WM+/xzYBmyvtuubwM81X+AQzRdQwFjglVTGJglWJMztUpgmB02X6JQrN5dDH3uUTtf8AYDV5/+c7ZMmJTNEIUTL1xaYpvkC32q+wGJgHHBuyO81NV/gac0XGAUQ8ntXAbcDnwErgS3AM6kMTCX6j2Nz5Ha7zbDMi5s04yZ/Q+B/P7DyH+fgdjXscUbx22+z8aabAejmv5e2552HUil/RCKEaMKUUmHTNJvtfA1SgxUJc9sJMJHnsNW1/dnPKHr8v+By8YPvL6y94ooGn1MIIdJJEqxIWLTSGo4kpxWk4LTT6PfB+7jy89n7xTzW/OZyKjZvTsq5hRCisUmCFQlz2Rk2mU8ZMrt3p9ekSWT378/eL78kdNHFVGyqf7YoIYRoaiTBioS57CbiRHsS1ybn8MPoPf0tsj0eKjdtYsVpI9j37bdJvYYQQqRayh8eGx79P8AorDFHR+lBY4m9vQvwAtAXKAPG6kHjU7ssD6t31xCsKa98etCYYpe5gH8D52ANGp6gB43HUn0f4mDJfAZbnVKK3q+9ykbfXyiZMYN1v7uKfp/MxZWVlfRrCSFEKjRGDfYNrMmV11Tb7gfm6UGjP/BbYJLh0aMJ/0agTA8a/YAzgccMj97eLrsUGAAcBhwP3Gx4dE+K70HUwGX/9phxzubklMrKoseEB8ju359wcTEbrrue8M6dqbmYEEIkWcprsHrQmAtgePTqRaOB3vY+CwyPvhkrEc8BLgIut8tWGx59LnAe8Jxd9rgeNMLAdsOjvwZcDNyR2jsR1UWH5pz177l0KcxJ3YVG3kBZn1WY+0pRf32dbF0HpcjNdPHPC46hZ8e81F1bCCESlJbxRYZH7wi49KCxNWZziP0TL9c1KXNNZYNruo49MfT4mM8NiFpU9/NjezB/9XZ2l1aytSTFK+N07Unltm2Y5eWwch1bswsBmLpwA9eO7J/aawshRALSOYC3+oO76tmvrkmZHU3YbE8MXTU5tNvtbj2zajSC43p14L3rT22060X27GHT3fdQPHUquws78MvTb+WzlT9KghVCNElp6UWsB41tAIZH7xyzOXbi5bomZW70CZtF0+Bq04bu9/6DtuefT37JdrLCFexZt57ydevSHZoQQhwkncN0XseaMxLDow/BWl3+0xrKegOnAtNjyq42PLrb8OgdsJ7JvtqIcYs063bvPyj672N0qdjFnm07WXnGT1n1s1Hs/uRTTJkKUwjRRKQ8wRoe/VHDo68HioAPDI++wi66BRhmePTlWJ2XxuhBo9Iuux/Itfd9FxinB43oyggvAt8Dy4AFwP160DBSfR+i6VBKUTBiBG21nhR37Ia7a1fKli9n3VVXsWrUeRS/9VbCCxAIIUSyyGT/otn627RveWneWmb8+WS0jcvZdM8/KLP/1soZMIBOfxxHwemnpzlKIUSiZLJ/IdLknCO7AfDMpyHyBg+mz9Qp9PtwNrnHHUfp0qWsv2YcG/9yK2WrVqc5UiFEayQJVjRbQ3p3AMD
4oaRqW2b37miTXqLnC8/jatuW4qlTWXXOOay/7nr2fv11ukIVQrRCkmBFs5XpdtG7UxvKwwdPJdXm+OPpP/djekx4gMxDD2XXrFms+fWlrLnsN5TMmiXPaIUQKScJVjRr2RkuKmtIsACu7GwKzzmHvu+9S89nJ5J34onsnT+fDdddz7LBQyiZMaORoxVCtCaSYEWzlul2URGuuzaqlKLN0KH0eu5Zek95k/yRPyGyZw8bxt/A+j9fy+65cxspWiFEayIJVjRrGW5FZcT5agM5AwZw6COPoL3yMrnHHMOu995j3e+vZvUvR7Pj9dcx4ziXEELURRKsaNYyXS4q66nB1iR34EC0V1+hT+AdCs46i9Jvv2XT/93GsqHD2P7884SLi1MQrRCiNZFxsKJZ+9VT85i/ejunHd4FAKWsyamtn8r6ab+nqsyavlrF7B/ZvZuKUIjyVatQmCgTcjyHkd2nL67cnAPOhf0+w6W4bGgv+nUpSM/NC9HCNfdxsJJgRbM24b3vefzjVZiYmKa1CoRpmvbP1F//mKK2vPXHk1N/ISFaIUmwzYgk2NbJNM2qZFs9AUcTM9U+h8vK2Pna62yfPJnKLVsxFbS/4grannceGYd0xQRO8n9IRTjC8nvOlqUQhUgBSbDNiCRYkYidU6byw//9H9i/O+1Gj6bDby7jtoV7ePWrdTx+6SDOsmeVEkIkjyTYZkQSrEhUePduiqdOY9tTT1G5ZQsAG0aez+/yT6Z9tosF44eR0bZtmqMUomWRBNuMSIIVDWWaJiXvvMOOyS+zb/Firjn1Ola37c606T7yCvPJ7t+fzEMPJfeoI2kzfDiZPXpI87EQCZIE24xIghXJVLljB3+ZNJ/X11cy0/yC3NAKylasILJnT9U+GV27kjfoWLIPO4wsTSOrVy8yu3fHLbVdIeolCbYZkQQrku3v7yzlmU9Xc/7A7uRmuQGo3LWbyu07qNy6lYptPxIu2YU1GAhMBSYKd2Eh7q5dcbdvj6ugAJWTCy57H7s3tPXerHqP3QkLoF+XAq4f2V9qx6JFa+4JttkGLkRTcNgh+QBMW7SxhtLO0L4ztK/l4FLghzD8sBPYGeeVN+E9qhuHd5UxuEI0VVKDFaKBSkoriERMazILsCa02P/2oIktAKgMU75yBfu+XULFmjWUr1pF+Zo1VG7YgIqEq/bHNMk95hjyTzqJ7N69ye6jsUi145IXFjGsb0ce/dUg2rfJasS7FaLxNPcarCRYIZqQSHk5FevWUb5mLeWrVrJnwQL2fPIpxMyRHEHx91OuZl6Hfly1L8hv22wno2NHMjp3IuOQrmT26EFG506427bFlZOTxrsRomEkwTYjkmBFcxTevYeKdWspW7GS8lCI8jVr2LB2E6P7XMwhe7dTtGtL7Qe7XKjMTFwZGaisLFR2Nioz03plZaEyMlCZmZCRgSszA+XOiD4urtFPB3TlVyf0TP5NClEDSbDNiCRY0VJUhiP84r+fs3LLbsDqDIUZgYhp95IyqxaVN83922pjxmZVZTdrx75QlJrWPocXunG5XCiXwuV2oVwulNtt/VQqZj5odcC80Pvngj54nuiqJvUDjgGXUrTLyyIn8+B1SWq7m9pv8+CC2vatdXstV619f+fnr+3ccW6mtn/Ta9oaf9zOzw1w7U/6c9ghifcTkATbjEiCFa2ZGYkQ3r6d8M6dhHfupGLzZiK7dhEuLrG2FRcT2VVCeMdOyteupfLHHw9oml7YuT+v9R/BpjYdwFoSAdNe/MAE+7MCl/1Tueyfan+Zsn/an62zq6re1Sax57NeFbKCYLM1+aoTGNa3U8LHS4JtRiTBCuGcGYkQLi62knJxCeGSYsLbthMpK8UsL8fct4/I3r1E9uwlsncvZnkZkX2l1vuyMiKlMe/37iVSWgoVFXHHsTszh7CqeWVNFf3nK1qLzs6ymrzdLpQrA+VyQYbbaibPykS53Ci329rf5YKMDFR2JsqdiXIpcLlRbqsmjtu
FUi5UZhYq0z6X/QcEygUul33p/duVy4XKyrb3jWkJsHu+7X974PbYZaCiZSp6TIYblZlt3W9sT7mYRgdX9H3ssC0V7XZ34LYDNsXs79rfjLB/9/0njjmH/SnaeS+mEx/VzpFz5JFkduhQ4387J5p7gm22gScikhfhiMeOSHcYQrQcufbLKdNV1ZRd1XQdidhN2hH7s9XcbUbsOqxZbG0jegzW9mhzuDVAuFpTeMx+sWVV22PLIvv3jSmi0n4f/98EwpY5vweuvHh+QVqWVpVghRBpprBrfwf3pWoyU2bEJtuqZZdin5CaHPjQsXrSrvb5oHMf/KHm3avFUPuJ6lfn7g7OleDxKqt1DyGTJmIhhBBNkjQRCyGEEGmi+QI5wCvAAGAvsAkYG/J7Q9X2Ow2YASyL2Tw05PfuS1VskmCFEEI0d08CM0N+r6n5An+0P/+0hv2WhvzewY0VlCRYIYQQzVbI7y3FqplGzQOuS080B5IEK4QQoqlyKaXWx3yeYJrmhHqO+TPwdi1lh2u+wDdAGHg25Pc+lowga9NsE6zh0fsDzwOdsJYiuVwPGkvTGpQQQohkipimWeR0Z80XuBXoD4ytofgboCjk9xZrvkARMEPzBX4M+b2vJSnWg9Q8ert5eAJ4Ug8ahwH/BJ5JczxCCCHSRPMFbgR+AZwd8nv3Vi8P+b0lIb+32H6/HngZGJ7KmJplgjU8ehdgEPCSvelNoLfh0bW0BSWEECItNF9gPHAJcEbI791Zyz7dNF/AZb8vAM4FFqYyrubaRHwosFEPGpUAetAwDY++FugJhKI7KaXGA+NjD1RKNWQgrAuQmVEt8l1Y5HvYT74Li3wP+zX0u3DXt4Pd3PsAsAr4SPMFAMpCfu8Jmi/wNDA95PdOBy4A/qD5ApVYue914NkGxFavZjnRhOHRjwNe0IPGETHbFgA36EFjbqquq5RaH8/zgJZMvguLfA/7yXdhke9hv9b+XTTLJmJgHVBkePQMAMOjK6xa7dq0RiWEEELYmmWC1YPGFqy280vtTRcAIT1ohNIWlBBCCBGjuT6DBbgaeM7w6LcCJcBvGuGa9Y2/ak3ku7DI97CffBcW+R72a9XfRbN8BiuEEEI0dc2yiVgIIYRo6iTBCiGEECkgCVYIIYRIAUmwDiml+iulPldKLVNKzVdKDUh3TKmilAoppYJKqUX26yJ7exel1Cyl1HKl1BKl1Mkxx+QppV5WSq2wv6NfpO8OEqOU+o9976ZS6siY7Qndt1LKpZR6WCm10i6/prHvKVF1fBdzlFKrYn43ro8pa3HfhVIqRyk1zb6fRfbvgWaXtarfi3q+i1b1e+GYaZrycvACPgQut99fCHyR7phSeK8h4Mgatk8E7rDfDwHWABn259uA5+z3vbEWPW6f7nuJ875PAYqq33+i9w1cBszGmo2mg31eT7rvs4HfxRzg3FqOaXHfBZADnMP+DqF/BN5rjb8X9XwXrer3wulLarAOKKVqnPs4+tdbKzIaeBTANM0FwGYg+lf7RTFlq4G5wHlpiDFhpmnONU1zfQ1Fid73RcDjpmmGTdPcDrwGXJy6O0ieOr6LurS478I0zVLTNGeYdjbAWmu0j/2+Vf1e1PNd1KXFfRdOSYJ15lBgo2malQD2L1h07uOWapJS6lul1NNKqc5KqY6AyzTNrTH7hNj/HfTE+gu+prJmq4H33SK/E+B++3fjVaVU7D+wreG7+DPwtvxeAAevu9qafy9qJAnWueoDhlVaomgcp5imeQxWrX0b1rq7UP93YNZR1pw15L5b2ncyxjRNHTga+AR4p1p5i/0ulFLRtUb/am9qtb8XNXwXrfb3oi6SYJ1ZBxQppTIAlFIteu5j0zTX2j8rgIeA4aZpbgNQSnWO2bUX+7+DtYBWS1mz1cD7bnHfiWma6+yfpmmajwB97NoctODvQilVtdaoaZp7W/PvRfXvAlrv70V9JME6YJpmjXMfm6YZSltQKaKUaqOUahez6RL2r5n
4OjDO3m8I0BX4tIay3sCpwPRGCLkxJHrfrwNXK6XcSqkOWM+bXm3EuJNKKZWhlDok5vMFwOZosqGFfhfKWvbyEuAM0zR3xhS1ut+Lmr6L1vp74Ui6e1k1lxdwOPAFsAz4Cjgi3TGl6D77YCXU/wHfAm8Bml12CPAesBz4Djg15rg2WP9jrLC/owvTfS8J3PujwHqgEqun44qG3DdWz8hHgZX264/pvseGfBf2vX5l/14sxur9eUxL/i6welKbdsyL7NeXrfH3orbvojX+Xjh9yVzEQgghRApIE7EQQgiRApJghRBCiBSQBCuEEEKkgCRYIYQQIgUkwQohhBApIAlWiEZir05zpFLqcqXUYSk4fzul1M3Vtj2tlBqe7GsJIeonCVaIxnc5EHeCtZf2quv/2XbAAQnWNM3fmab5SbzXEkI0nCRYIRrXacBg4D/2upnngDX9nLLWGf5GKTVDKXWovf0OpdSLSqkpWAP7uyml7ldKLbCP/1gp1d8+9+NAO3v7V/bxc5RS59rvD1FKTbUnZF+ilPp9NCi7dn27stY8Xq2U+ltjfSFCtFQZ6Q5AiFZmDtasN/8yTfMdAKXUr7BqtENN0wwrpcYAj7B/Sa8RwCDTmrITpdR9pmneZL+/GHgQOBcYC3xlmubAWq79HyBomubP7SUYv1ZKLTJNc75d3s40zWH2/LorlFLPmqa5Ial3L0QrIglWiPQ7H6tW+7W1jgRuIBxT/k40udp+qpT6E1CA1QpV6PA6I4FjwJpf264V/wSIJthJdtlWpdQqrMWxJcEKkSBJsEKknwLuNk1zYi3lu6t2VKonVk30eNM0VymljgY+jONa1edGjf1cGvM+jPz7IESDyDNYIRpfCdA25vN04Bp7NRGUUplKqWNrObYtUA5sspdN/GO18+ZFl1WswQfA7+1rdAZ+TnzJWQgRB0mwQjS+J4Hbop2cTNN8EXgJmKOUWozVmWlETQeapvkt1hJf32E9z10bU7Ydq5n322gnp2r+DBytlPof8BFwT8zzVyFEkslqOkIIIUQKSA1WCCGESAFJsEIIIUQKSIIVQgghUkASrBBCCJECkmCFEEKIFJAEK4QQQqSAJFghhBAiBSTBCiGEECnw/9u1/1xcs4TpAAAAAElFTkSuQmCC\n", - "text/plain": [ - "
" - ] - }, - "metadata": { - "needs_background": "light" - }, - "output_type": "display_data" - } - ], - "source": [ - "### Visualize results\n", - "mpl.rcParams['figure.dpi'] = 80\n", - "fig, ax1 = plt.subplots()\n", - "\n", - "color = 'tab:red'\n", - "ax1.set_xlabel('Iteration')\n", - "ax1.set_ylabel('Total FIFO Size [kB]', color=color)\n", - "ax1.plot(range(len(log_total_fifo_size)), log_total_fifo_size, color=color)\n", - "ax1.tick_params(axis='y', labelcolor=color)\n", - "ax1.set_ylim(0, max(log_total_fifo_size))\n", - " \n", - "ax2 = ax1.twinx() # instantiate a second axes that shares the same x-axis\n", - "\n", - "color = 'tab:blue'\n", - "ax2.set_ylabel('Latency [cycles]', color=color)\n", - "ax2.plot(range(len(log_total_fifo_size)), log_latency, color=color)\n", - "ax2.tick_params(axis='y', labelcolor=color)\n", - "#ax2.set_ylim(0, max(log_latency))\n", - "\n", - "ax2.axhline(log_min_latency[0], color=\"green\", label=\"Minimum (1st frame) Latency\")\n", - "ax2.legend()\n", - "\n", - "plt.tight_layout()\n", - "plt.savefig('fifo_iterative_graph.png', dpi = 300)\n", - "plt.show()" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "id": "466f818f", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Iteration: 11\n", - "Reducing depth of FIFO: 48/266\n", - "Numer of minimized FIFOs: 266/266\n", - "Interval: 903174\n", - "Min. 
latency / latency: 2549314/2580781\n", - "Total FIFO Size (kB): 226\n", - "Done (49 seconds)\n" - ] - } - ], - "source": [ - "### Optional second pass for fine-tuning\n", - "(fifo_depths,\n", - " log_total_fifo_size,\n", - " log_interval,\n", - " log_min_latency,\n", - " log_latency) = size_iteratively(fifo_depths, iteration_runtime, reduction_factor = 0.95)" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "id": "2c707459", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "FIFO DEPTH | SIZE\n", - "FIFO 000: 1 | 24\n", - "FIFO 001: 2 | 48\n", - "FIFO 002: 2 | 48\n", - "FIFO 003: 16 | 2048\n", - "FIFO 004: 8 | 64\n", - "FIFO 005: 2 | 16\n", - "FIFO 006: 8 | 64\n", - "FIFO 007: 32 | 256\n", - "FIFO 008: 32 | 128\n", - "FIFO 009: 32 | 128\n", - "FIFO 010: 2 | 8\n", - "FIFO 011: 128 | 8192\n", - "FIFO 012: 1 | 32\n", - "FIFO 013: 1 | 2\n", - "FIFO 014: 16 | 128\n", - "FIFO 015: 256 | 2048\n", - "FIFO 016: 2 | 16\n", - "FIFO 017: 2 | 16\n", - "FIFO 018: 355 | 45440\n", - "FIFO 019: 1 | 4\n", - "FIFO 020: 4 | 256\n", - "FIFO 021: 1 | 8\n", - "FIFO 022: 1 | 10\n", - "FIFO 023: 1 | 8\n", - "FIFO 024: 4096 | 32768\n", - "FIFO 025: 1 | 8\n", - "FIFO 026: 1 | 4\n", - "FIFO 027: 4096 | 32768\n", - "FIFO 028: 1 | 64\n", - "FIFO 029: 256 | 1024\n", - "FIFO 030: 256 | 2048\n", - "FIFO 031: 2 | 16\n", - "FIFO 032: 2 | 16\n", - "FIFO 033: 288 | 36864\n", - "FIFO 034: 1 | 4\n", - "FIFO 035: 1 | 64\n", - "FIFO 036: 1 | 8\n", - "FIFO 037: 1 | 10\n", - "FIFO 038: 4 | 32\n", - "FIFO 039: 4 | 32\n", - "FIFO 040: 4096 | 32768\n", - "FIFO 041: 4096 | 32768\n", - "FIFO 042: 8 | 32\n", - "FIFO 043: 16 | 1024\n", - "FIFO 044: 256 | 1024\n", - "FIFO 045: 256 | 2048\n", - "FIFO 046: 2 | 16\n", - "FIFO 047: 2 | 16\n", - "FIFO 048: 288 | 36864\n", - "FIFO 049: 1 | 4\n", - "FIFO 050: 1 | 128\n", - "FIFO 051: 1 | 8\n", - "FIFO 052: 1 | 10\n", - "FIFO 053: 1 | 8\n", - "FIFO 054: 1 | 4\n", - "FIFO 055: 1 | 4\n", - "FIFO 056: 1 | 4\n", - 
"FIFO 057: 1 | 8\n", - "FIFO 058: 28 | 3584\n", - "FIFO 059: 1 | 4\n", - "FIFO 060: 1 | 8\n", - "FIFO 061: 1 | 8\n", - "FIFO 062: 114 | 14592\n", - "FIFO 063: 1 | 8\n", - "FIFO 064: 2 | 16\n", - "FIFO 065: 1 | 8\n", - "FIFO 066: 243 | 31104\n", - "FIFO 067: 1 | 4\n", - "FIFO 068: 2 | 128\n", - "FIFO 069: 1 | 8\n", - "FIFO 070: 1 | 10\n", - "FIFO 071: 1 | 8\n", - "FIFO 072: 1 | 8\n", - "FIFO 073: 4096 | 32768\n", - "FIFO 074: 4096 | 32768\n", - "FIFO 075: 1 | 4\n", - "FIFO 076: 6 | 384\n", - "FIFO 077: 60 | 240\n", - "FIFO 078: 128 | 1024\n", - "FIFO 079: 2 | 16\n", - "FIFO 080: 2 | 16\n", - "FIFO 081: 394 | 50432\n", - "FIFO 082: 1 | 4\n", - "FIFO 083: 1 | 64\n", - "FIFO 084: 15 | 120\n", - "FIFO 085: 15 | 150\n", - "FIFO 086: 16 | 128\n", - "FIFO 087: 16 | 128\n", - "FIFO 088: 4096 | 32768\n", - "FIFO 089: 4096 | 32768\n", - "FIFO 090: 16 | 64\n", - "FIFO 091: 32 | 2048\n", - "FIFO 092: 64 | 256\n", - "FIFO 093: 128 | 1024\n", - "FIFO 094: 32 | 256\n", - "FIFO 095: 2 | 16\n", - "FIFO 096: 394 | 50432\n", - "FIFO 097: 1 | 4\n", - "FIFO 098: 1 | 64\n", - "FIFO 099: 15 | 120\n", - "FIFO 100: 15 | 150\n", - "FIFO 101: 16 | 128\n", - "FIFO 102: 16 | 128\n", - "FIFO 103: 4096 | 32768\n", - "FIFO 104: 4096 | 32768\n", - "FIFO 105: 16 | 64\n", - "FIFO 106: 32 | 2048\n", - "FIFO 107: 64 | 256\n", - "FIFO 108: 128 | 1024\n", - "FIFO 109: 32 | 256\n", - "FIFO 110: 2 | 16\n", - "FIFO 111: 394 | 50432\n", - "FIFO 112: 1 | 4\n", - "FIFO 113: 1 | 64\n", - "FIFO 114: 1 | 8\n", - "FIFO 115: 8 | 80\n", - "FIFO 116: 8 | 64\n", - "FIFO 117: 8 | 32\n", - "FIFO 118: 1 | 4\n", - "FIFO 119: 8 | 32\n", - "FIFO 120: 1 | 8\n", - "FIFO 121: 16 | 2048\n", - "FIFO 122: 8 | 32\n", - "FIFO 123: 1 | 8\n", - "FIFO 124: 8 | 64\n", - "FIFO 125: 121 | 15488\n", - "FIFO 126: 1 | 8\n", - "FIFO 127: 2 | 16\n", - "FIFO 128: 1 | 8\n", - "FIFO 129: 243 | 31104\n", - "FIFO 130: 2 | 8\n", - "FIFO 131: 8 | 512\n", - "FIFO 132: 1 | 8\n", - "FIFO 133: 8 | 80\n", - "FIFO 134: 8 | 64\n", - "FIFO 135: 8 | 64\n", - 
"FIFO 136: 1024 | 8192\n", - "FIFO 137: 8192 | 65536\n", - "FIFO 138: 8 | 32\n", - "FIFO 139: 16 | 1024\n", - "FIFO 140: 4 | 16\n", - "FIFO 141: 8 | 64\n", - "FIFO 142: 2 | 16\n", - "FIFO 143: 2 | 16\n", - "FIFO 144: 512 | 65536\n", - "FIFO 145: 1 | 4\n", - "FIFO 146: 1 | 64\n", - "FIFO 147: 30 | 240\n", - "FIFO 148: 32 | 320\n", - "FIFO 149: 32 | 256\n", - "FIFO 150: 32 | 256\n", - "FIFO 151: 1024 | 8192\n", - "FIFO 152: 8192 | 65536\n", - "FIFO 153: 32 | 128\n", - "FIFO 154: 32 | 2048\n", - "FIFO 155: 32 | 128\n", - "FIFO 156: 32 | 256\n", - "FIFO 157: 2 | 16\n", - "FIFO 158: 2 | 16\n", - "FIFO 159: 512 | 65536\n", - "FIFO 160: 1 | 4\n", - "FIFO 161: 1 | 64\n", - "FIFO 162: 30 | 240\n", - "FIFO 163: 32 | 320\n", - "FIFO 164: 32 | 256\n", - "FIFO 165: 32 | 256\n", - "FIFO 166: 1024 | 8192\n", - "FIFO 167: 8192 | 65536\n", - "FIFO 168: 32 | 128\n", - "FIFO 169: 32 | 2048\n", - "FIFO 170: 32 | 128\n", - "FIFO 171: 32 | 256\n", - "FIFO 172: 2 | 16\n", - "FIFO 173: 2 | 16\n", - "FIFO 174: 512 | 65536\n", - "FIFO 175: 1 | 4\n", - "FIFO 176: 1 | 64\n", - "FIFO 177: 30 | 240\n", - "FIFO 178: 32 | 320\n", - "FIFO 179: 32 | 256\n", - "FIFO 180: 32 | 256\n", - "FIFO 181: 1024 | 8192\n", - "FIFO 182: 8192 | 65536\n", - "FIFO 183: 32 | 128\n", - "FIFO 184: 32 | 2048\n", - "FIFO 185: 32 | 128\n", - "FIFO 186: 32 | 256\n", - "FIFO 187: 2 | 16\n", - "FIFO 188: 2 | 16\n", - "FIFO 189: 512 | 65536\n", - "FIFO 190: 1 | 4\n", - "FIFO 191: 1 | 64\n", - "FIFO 192: 30 | 240\n", - "FIFO 193: 32 | 320\n", - "FIFO 194: 32 | 256\n", - "FIFO 195: 1024 | 8192\n", - "FIFO 196: 32 | 256\n", - "FIFO 197: 32 | 128\n", - "FIFO 198: 8192 | 65536\n", - "FIFO 199: 32 | 2048\n", - "FIFO 200: 32 | 128\n", - "FIFO 201: 32 | 256\n", - "FIFO 202: 2 | 16\n", - "FIFO 203: 2 | 16\n", - "FIFO 204: 512 | 65536\n", - "FIFO 205: 1 | 4\n", - "FIFO 206: 1 | 64\n", - "FIFO 207: 1 | 8\n", - "FIFO 208: 1 | 10\n", - "FIFO 209: 1 | 8\n", - "FIFO 210: 1 | 10\n", - "FIFO 211: 1 | 4\n", - "FIFO 212: 1 | 4\n", - "FIFO 
213: 1 | 4\n", - "FIFO 214: 1 | 8\n", - "FIFO 215: 8 | 1024\n", - "FIFO 216: 1 | 4\n", - "FIFO 217: 1 | 8\n", - "FIFO 218: 2 | 16\n", - "FIFO 219: 121 | 15488\n", - "FIFO 220: 1 | 8\n", - "FIFO 221: 2 | 16\n", - "FIFO 222: 1 | 8\n", - "FIFO 223: 218 | 27904\n", - "FIFO 224: 4 | 16\n", - "FIFO 225: 8 | 512\n", - "FIFO 226: 3 | 24\n", - "FIFO 227: 4 | 40\n", - "FIFO 228: 8 | 64\n", - "FIFO 229: 8 | 64\n", - "FIFO 230: 3696 | 29568\n", - "FIFO 231: 7782 | 62256\n", - "FIFO 232: 8 | 32\n", - "FIFO 233: 64 | 4096\n", - "FIFO 234: 16 | 64\n", - "FIFO 235: 16 | 128\n", - "FIFO 236: 2 | 16\n", - "FIFO 237: 2 | 16\n", - "FIFO 238: 512 | 65536\n", - "FIFO 239: 4 | 16\n", - "FIFO 240: 8 | 512\n", - "FIFO 241: 3 | 24\n", - "FIFO 242: 4 | 40\n", - "FIFO 243: 8 | 64\n", - "FIFO 244: 8 | 64\n", - "FIFO 245: 3696 | 29568\n", - "FIFO 246: 7782 | 62256\n", - "FIFO 247: 8 | 32\n", - "FIFO 248: 64 | 4096\n", - "FIFO 249: 16 | 64\n", - "FIFO 250: 16 | 128\n", - "FIFO 251: 2 | 16\n", - "FIFO 252: 2 | 16\n", - "FIFO 253: 512 | 65536\n", - "FIFO 254: 4 | 16\n", - "FIFO 255: 8 | 512\n", - "FIFO 256: 2 | 16\n", - "FIFO 257: 2 | 20\n", - "FIFO 258: 2 | 16\n", - "FIFO 259: 2 | 20\n", - "FIFO 260: 4 | 80\n", - "FIFO 261: 2 | 40\n", - "FIFO 262: 1 | 16\n", - "FIFO 263: 1 | 20\n", - "FIFO 264: 1 | 21\n", - "FIFO 265: 1 | 16\n" - ] - } - ], - "source": [ - "### Display resulting FIFO depths\n", - "print(\"FIFO DEPTH | SIZE\")\n", - "for fifo, depth in enumerate(fifo_depths):\n", - " size = depth * fifo_info[\"fifo_widths\"][\"StreamingFIFO_hls_%d\" % fifo]\n", - " print(\"FIFO %03d: \"%(fifo) + (\"%d\"%(depth)).rjust(7) + \" | %d\"%(size))" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "id": "64c444f5", - "metadata": {}, - "outputs": [], - "source": [ - "### Export for use in FINN\n", - "fifo_depth_export = {}\n", - "for fifo, depth in enumerate(fifo_depths):\n", - " fifo_depth_export[\"StreamingFIFO_rtl_%d\" % fifo] = {}\n", - " # Try to account for additional registers 
introduced by virtual FIFO HLS implementation\n", - " fifo_depth_export[\"StreamingFIFO_rtl_%d\" % fifo][\"depth\"] = depth + 4\n", - "\n", - "with open(\"fifo_depth_export.json\", \"w\") as f:\n", - " json.dump(fifo_depth_export, f, indent=2)" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3 (ipykernel)", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.10.4" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} diff --git a/src/finn/benchmarking/bench_rtl_swg.py b/src/finn/benchmarking/bench_rtl_swg.py deleted file mode 100644 index 37995be10e..0000000000 --- a/src/finn/benchmarking/bench_rtl_swg.py +++ /dev/null @@ -1,403 +0,0 @@ -import numpy as np -from onnx import TensorProto, helper -from qonnx.core.datatype import DataType -from qonnx.core.modelwrapper import ModelWrapper -from qonnx.custom_op.general.im2col import compute_conv_output_dim -from qonnx.custom_op.registry import getCustomOp -from qonnx.transformation.general import GiveUniqueNodeNames -from qonnx.util.basic import gen_finn_dt_tensor - -import finn.core.onnx_exec as oxe -from finn.analysis.fpgadataflow.exp_cycles_per_layer import exp_cycles_per_layer -from finn.analysis.fpgadataflow.hls_synth_res_estimation import hls_synth_res_estimation -from finn.analysis.fpgadataflow.res_estimation import res_estimation -from finn.transformation.fpgadataflow.create_stitched_ip import CreateStitchedIP -from finn.transformation.fpgadataflow.hlssynth_ip import HLSSynthIP -from finn.transformation.fpgadataflow.prepare_ip import PrepareIP -from finn.transformation.fpgadataflow.prepare_rtlsim import PrepareRTLSim -from finn.transformation.fpgadataflow.replace_verilog_relpaths import ( - ReplaceVerilogRelPaths, -) -from 
finn.transformation.fpgadataflow.set_exec_mode import SetExecMode -from finn.transformation.fpgadataflow.synth_ooc import SynthOutOfContext - - -def make_single_im2col_modelwrapper(k, ifm_ch, ifm_dim, ofm_dim, stride, dilation, idt): - k_h, k_w = k - ifm_dim_h, ifm_dim_w = ifm_dim - stride_h, stride_w = stride - dilation_h, dilation_w = dilation - ofm_dim_h, ofm_dim_w = ofm_dim - - odt = idt - inp = helper.make_tensor_value_info("inp", TensorProto.FLOAT, [1, ifm_dim_h, ifm_dim_w, ifm_ch]) - outp = helper.make_tensor_value_info( - "outp", TensorProto.FLOAT, [1, ofm_dim_h, ofm_dim_w, k_h * k_w * ifm_ch] - ) - - im2col_node = helper.make_node( - "Im2Col", - ["inp"], - ["outp"], - domain="finn.custom_op.general", - stride=[stride_h, stride_w], - kernel_size=[k_h, k_w], - input_shape=str((1, ifm_dim_h, ifm_dim_w, ifm_ch)), - dilations=[dilation_h, dilation_w], - pad_amount=[0, 0, 0, 0], - pad_value=0, - ) - graph = helper.make_graph( - nodes=[im2col_node], name="im2col_graph", inputs=[inp], outputs=[outp] - ) - - model = helper.make_model(graph, producer_name="im2col-model") - model = ModelWrapper(model) - - model.set_tensor_datatype("inp", idt) - model.set_tensor_datatype("outp", odt) - - return model - - -def make_single_slidingwindow_modelwrapper( - type, - k, - ifm_ch, - ifm_dim, - ofm_dim, - simd, - m, - parallel_window, - stride, - dilation, - idt, - dw=0, - ram_style="auto", -): - k_h, k_w = k - ifm_dim_h, ifm_dim_w = ifm_dim - stride_h, stride_w = stride - dilation_h, dilation_w = dilation - ofm_dim_h, ofm_dim_w = ofm_dim - - odt = idt - inp = helper.make_tensor_value_info("inp", TensorProto.FLOAT, [1, ifm_dim_h, ifm_dim_w, ifm_ch]) - outp = helper.make_tensor_value_info( - "outp", TensorProto.FLOAT, [1, ofm_dim_h, ofm_dim_w, k_h * k_w * ifm_ch] - ) - - SlidingWindow_node = helper.make_node( - type, - ["inp"], - ["outp"], - domain="finn.custom_op.fpgadataflow", - backend="fpgadataflow", - ConvKernelDim=[k_h, k_w], - IFMChannels=ifm_ch, - IFMDim=[ifm_dim_h, 
ifm_dim_w], - OFMDim=[ofm_dim_h, ofm_dim_w], - SIMD=simd, - M=m, - parallel_window=parallel_window, - Stride=[stride_h, stride_w], - Dilation=[dilation_h, dilation_w], - inputDataType=idt.name, - outputDataType=odt.name, - depthwise=dw, - ram_style=ram_style, - ) - graph = helper.make_graph( - nodes=[SlidingWindow_node], - name="slidingwindow_graph", - inputs=[inp], - outputs=[outp], - ) - - model = helper.make_model(graph, producer_name="slidingwindow-model") - model = ModelWrapper(model) - - model.set_tensor_datatype("inp", idt) - model.set_tensor_datatype("outp", odt) - - # DEBUG - # swg_node = model.get_nodes_by_op_type("ConvolutionInputGenerator_rtl")[0] - # swg_inst = getCustomOp(swg_node) - # swg_inst.set_nodeattr("rtlsim_trace", "/workspace/finn/finn-rtllib/swg/swg_test_trace.vcd") - - return model - - -def prepare_inputs(input_tensor): - return {"inp": input_tensor} - - -def bench_rtl_swg(params, task_id, run_id, results_dir): - # Read params - idt = params["idt"] - k = params["k"] - ifm_dim = params["ifm_dim"] - ifm_ch = params["ifm_ch"] - stride = params["stride"] - dilation = params["dilation"] - dw = params["dw"] - simd = params["simd"] - m = params["m"] - parallel_window = params["parallel_window"] - flip = params["flip"] - ram_style = params["ram_style"] - - only_estimates = params["only_estimates"] - skip_rtlsim = params["skip_rtlsim"] - skip_synth = params["skip_synth"] - synthesize_hls_comparison = params["synthesize_hls_comparison"] - - output_dict = {} - - # convert string to FINN DataType - idt = DataType[idt] - - if flip: - if ( - ifm_dim[0] == ifm_dim[1] - and k[0] == k[1] - and stride[0] == stride[1] - and dilation[0] == dilation[1] - ): - return - k = k[::-1] - ifm_dim = ifm_dim[::-1] - stride = stride[::-1] - dilation = dilation[::-1] - - k_h, k_w = k - ifm_dim_h, ifm_dim_w = ifm_dim - stride_h, stride_w = stride - dilation_h, dilation_w = dilation - - kernel_width = (k_w - 1) * dilation_w + 1 # incl. 
dilation - kernel_height = (k_h - 1) * dilation_h + 1 # incl. dilation - - # inter-dependent test parameters - if simd == "ifm_ch": - simd = ifm_ch - - # skip conditions - if simd > ifm_ch: - return - if ifm_ch % simd != 0: - return - if kernel_width > ifm_dim_w or stride_w > ifm_dim_w: - return - if kernel_height > ifm_dim_h or stride_h > ifm_dim_h: - return - if (k_h == 1 and (stride_h != 1 or dilation_h != 1)) or ( - k_w == 1 and (stride_w != 1 or dilation_w != 1) - ): - return - if k_h == 1 and k_w == 1 and simd != ifm_ch: - return - if parallel_window and simd != ifm_ch: - return - if not parallel_window and m > 1: - return - - ofm_dim_h = compute_conv_output_dim(ifm_dim_h, k_h, stride_h, 0, dilation_h) - ofm_dim_w = compute_conv_output_dim(ifm_dim_w, k_w, stride_w, 0, dilation_w) - ofm_dim = [ofm_dim_h, ofm_dim_w] - - x = gen_finn_dt_tensor(idt, (1, ifm_dim_h, ifm_dim_w, ifm_ch)) - model = make_single_slidingwindow_modelwrapper( - type="ConvolutionInputGenerator_rtl", - k=k, - ifm_ch=ifm_ch, - ifm_dim=ifm_dim, - ofm_dim=ofm_dim, - simd=simd, - m=m, - parallel_window=parallel_window, - stride=stride, - dilation=dilation, - idt=idt, - dw=dw, - ram_style=ram_style, - ) - - model = model.transform(SetExecMode("rtlsim")) - model = model.transform(GiveUniqueNodeNames()) - if not only_estimates: - model = model.transform(PrepareIP("xczu7ev-ffvc1156-2-e", 5)) - model = model.transform(PrepareRTLSim()) - - node = model.get_nodes_by_op_type("ConvolutionInputGenerator_rtl")[0] - inst = getCustomOp(node) - - exp_cycles_dict = model.analysis(exp_cycles_per_layer) - exp_cycles = exp_cycles_dict[node.name] - exp_res_dict = model.analysis(res_estimation) - exp_res = exp_res_dict[node.name] - - output_dict["est_Cycles"] = exp_cycles - output_dict["est_LUT"] = exp_res["LUT"] - output_dict["est_BRAM"] = exp_res["BRAM_18K"] * 0.5 - output_dict["est_URAM"] = exp_res["URAM"] - - if only_estimates: - return output_dict - - if not skip_rtlsim: - # prepare input data - input_dict = 
prepare_inputs(x) - # execute model - oxe.execute_onnx(model, input_dict)["outp"] - - cycles_rtlsim = inst.get_nodeattr("cycles_rtlsim") - output_dict["Cycles"] = cycles_rtlsim - print("RTLSIM cycles: %d" % cycles_rtlsim) - - if not skip_synth: - model = model.transform(ReplaceVerilogRelPaths()) - model = model.transform(CreateStitchedIP("xczu7ev-ffvc1156-2-e", 5)) - model = model.transform(SynthOutOfContext(part="xczu7ev-ffvc1156-2-e", clk_period_ns=5)) - ooc_res_dict = eval(model.get_metadata_prop("res_total_ooc_synth")) - output_dict["LUT"] = ooc_res_dict["LUT"] - output_dict["BRAM"] = ooc_res_dict["BRAM_18K"] * 0.5 + ooc_res_dict["BRAM_36K"] - output_dict["URAM"] = ooc_res_dict["URAM"] - output_dict["WNS"] = ooc_res_dict["WNS"] - output_dict["Fmax"] = ooc_res_dict["fmax_mhz"] - - ############################################################### - # HLS COMPARISON: - if synthesize_hls_comparison: - output_dict["HLS_compatible"] = "yes" - - is_square = True - props_to_check = [k, ifm_dim, ofm_dim, stride, dilation] - for prop in props_to_check: - is_square = prop[0] == prop[1] - if not is_square: - is_square = False - - if not is_square or dilation[0] != 1 or dilation[1] != 1: - # try 1D HLS ConvInpGen - - # rectangular case not supported - if ifm_dim[0] == 1: - if ofm_dim[0] != 1 or k[0] != 1 or stride[0] != 1 or dilation[0] != 1: - output_dict["HLS_compatible"] = "no" - elif ifm_dim[1] == 1: - if ofm_dim[1] != 1 or k[1] != 1 or stride[1] != 1 or dilation[1] != 1: - output_dict["HLS_compatible"] = "no" - else: - output_dict["HLS_compatible"] = "no" - - # unsupported parallelization - if m > 1: - output_dict["HLS_compatible"] = "no" - if parallel_window > 0: - fully_unfolded = simd == ifm_ch - non_dws = dw == 0 - no_stride = stride_h == 1 and stride_w == 1 - no_dilation = dilation_h == 1 and dilation_w == 1 - supported_ram_style = ram_style in ["auto", "distributed"] - if not ( - fully_unfolded and non_dws and no_stride and no_dilation and supported_ram_style - ): 
- output_dict["HLS_compatible"] = "no" - - # unsupported hyperparams - if (dilation_h > 1 or dilation_w > 1) and (stride_h > 1 or stride_w > 1): - output_dict["HLS_compatible"] = "no" - if (dilation_h > 1 or dilation_w > 1) and dw == 0: - output_dict["HLS_compatible"] = "no" - - model = make_single_slidingwindow_modelwrapper( - type="ConvolutionInputGenerator1D", - k=k, - ifm_ch=ifm_ch, - ifm_dim=ifm_dim, - ofm_dim=ofm_dim, - simd=simd, - m=m, - parallel_window=parallel_window, - stride=stride, - dilation=dilation, - idt=idt, - dw=dw, - ram_style=ram_style, - ) - else: - # try 2D HLS ConvInpGen - - # unsupported parallelization - if m > 1 or parallel_window > 0: - output_dict["HLS_compatible"] = "no" - - model = make_single_slidingwindow_modelwrapper( - type="ConvolutionInputGenerator", - k=k, - ifm_ch=ifm_ch, - ifm_dim=ifm_dim, - ofm_dim=ofm_dim, - simd=simd, - m=m, - parallel_window=parallel_window, - stride=stride, - dilation=dilation, - idt=idt, - dw=dw, - ram_style=ram_style, - ) - - if output_dict["HLS_compatible"] == "no": - return output_dict - - # perform usual RTLSIM steps - model = model.transform(SetExecMode("rtlsim")) - model = model.transform(GiveUniqueNodeNames()) - model = model.transform(PrepareIP("xczu7ev-ffvc1156-2-e", 5)) - model = model.transform(HLSSynthIP()) - model = model.transform(PrepareRTLSim()) - - # extract first results (estimates) - node_ = model.get_nodes_by_op_type("ConvolutionInputGenerator") - if len(node_) == 0: - node_ = model.get_nodes_by_op_type("ConvolutionInputGenerator1D") - node = node_[0] - inst = getCustomOp(node) - - exp_cycles_dict = model.analysis(exp_cycles_per_layer) - exp_cycles = exp_cycles_dict[node.name] - output_dict["HLS_FINN_est_Cycles"] = exp_cycles - - exp_res_dict = model.analysis(res_estimation) - exp_res = exp_res_dict[node.name] - output_dict["HLS_FINN_est_LUT"] = exp_res["LUT"] - output_dict["HLS_FINN_est_BRAM"] = exp_res["BRAM_18K"] * 0.5 - output_dict["HLS_FINN_est_URAM"] = exp_res["URAM"] - - 
exp_res_dict_hls = model.analysis(hls_synth_res_estimation) - exp_res_hls = exp_res_dict_hls[node.name] - output_dict["HLS_HLS_est_LUT"] = int(exp_res_hls["LUT"]) - output_dict["HLS_HLS_est_BRAM"] = int(exp_res_hls["BRAM_18K"]) * 0.5 - output_dict["HLS_HLS_est_URAM"] = int(exp_res_hls["URAM"]) - - # perform rtlsim (for cycle measurement) - if not skip_rtlsim: - input_dict = prepare_inputs(x) - oxe.execute_onnx(model, input_dict)["outp"] - cycles_rtlsim = inst.get_nodeattr("cycles_rtlsim") - output_dict["HLS_Cycles"] = cycles_rtlsim - - # perform ooc synthesis (for resource/slack measurement) - model = model.transform(ReplaceVerilogRelPaths()) - model = model.transform(CreateStitchedIP("xczu7ev-ffvc1156-2-e", 5)) - model = model.transform(SynthOutOfContext(part="xczu7ev-ffvc1156-2-e", clk_period_ns=5)) - ooc_res_dict = eval(model.get_metadata_prop("res_total_ooc_synth")) - output_dict["HLS_LUT"] = ooc_res_dict["LUT"] - output_dict["HLS_BRAM"] = ooc_res_dict["BRAM_18K"] * 0.5 + ooc_res_dict["BRAM_36K"] - output_dict["HLS_URAM"] = ooc_res_dict["URAM"] - output_dict["HLS_WNS"] = ooc_res_dict["WNS"] - output_dict["HLS_Fmax"] = ooc_res_dict["fmax_mhz"] - - return output_dict diff --git a/src/finn/benchmarking/collect.py b/src/finn/benchmarking/collect.py deleted file mode 100644 index fa71c2a2aa..0000000000 --- a/src/finn/benchmarking/collect.py +++ /dev/null @@ -1,280 +0,0 @@ -import json -import os -import shutil -from dvclive.live import Live - -from finn.benchmarking.util import delete_dir_contents - - -def log_dvc_metric(live, prefix, name, value): - # sanitize '/' in name because DVC uses it to nest metrics (which we do via prefix) - live.log_metric(prefix + name.replace("/", "-"), value, plot=False) - -def open_json_report(id, report_name): - # look in both, build & measurement, artifacts - path1 = os.path.join("build_artifacts", "runs_output", "run_%d" % (id), "reports", report_name) - path2 = os.path.join("measurement_artifacts", "runs_output", "run_%d" % (id), 
"reports", report_name) - if os.path.isfile(path1): - with open(path1, "r") as f: - report = json.load(f) - return report - elif os.path.isfile(path2): - with open(path2, "r") as f: - report = json.load(f) - return report - else: - return None - -def log_all_metrics_from_report(id, live, report_name, prefix=""): - report = open_json_report(id, report_name) - if report: - for key in report: - log_dvc_metric(live, prefix, key, report[key]) - -def log_metrics_from_report(id, live, report_name, keys, prefix=""): - report = open_json_report(id, report_name) - if report: - for key in keys: - if key in report: - log_dvc_metric(live, prefix, key, report[key]) - -def log_nested_metrics_from_report(id, live, report_name, key_top, keys, prefix=""): - report = open_json_report(id, report_name) - if report: - if key_top in report: - for key in keys: - if key in report[key_top]: - log_dvc_metric(live, prefix, key, report[key_top][key]) - -if __name__ == "__main__": - # Go through all runs found in the artifacts and log their results to DVC - run_dir_list = os.listdir(os.path.join("build_artifacts", "runs_output")) - print("Looking for runs in build artifacts") - run_ids = [] - for run_dir in run_dir_list: - if run_dir.startswith("run_"): - run_id = int(run_dir[4:]) - run_ids.append(run_id) - run_ids.sort() - print("Found %d runs" % len(run_ids)) - - follow_up_bench_cfg = list() - # Prepare (local) output directory where follow-up bench configs will be stored - output_cfg_dir = os.path.join(os.environ.get("LOCAL_CFG_DIR_STORE"), "lfs", "CI_" + os.environ.get("CI_PIPELINE_ID")) - output_folding_dir = os.path.join(output_cfg_dir, "folding") - output_cfg_path = os.path.join(output_cfg_dir, "follow-up.json") - - for id in run_ids: - print("Processing run %d" % id) - experiment_name = "CI_" + os.environ.get("CI_PIPELINE_ID") + "_" + str(id) - experiment_msg = "[CI] " + os.environ.get("CI_PIPELINE_NAME") + " (" + os.environ.get("CI_PIPELINE_ID") + "_" + str(id) + ")" - #TODO: cache 
images once we switch to a cache provider that works with DVC Studio - with Live(exp_name = experiment_name, exp_message=experiment_msg, cache_images=False) as live: - ### PARAMS ### - # input parameters logged by benchmarking infrastructure - metadata_bench = open_json_report(id, "metadata_bench.json") - params = {"params": metadata_bench["params"]} - live.log_params(params) - - # optional metadata logged by builder - metadata_builder = open_json_report(id, "metadata_builder.json") - if metadata_builder: - metadata = { - "metadata": { - "tool_version": metadata_builder["tool_version"], - } - } - live.log_params(metadata) - - # optional dut_info.json (additional information about DUT generated during model generation) - dut_info_report = open_json_report(id, "dut_info.json") - if dut_info_report: - dut_info = {"dut_info": dut_info_report} - live.log_params(dut_info) - - ### METRICS ### - # TODO: for microbenchmarks, only summarize results for target node (or surrounding SDP?) (see old step_finn_estimate etc.) - # TODO: make all logs consistent at the point of generation (e.g. 
BRAM vs BRAM18 vs BRAM36) - - # status - status = metadata_bench["status"] - if status == "ok": - # mark as failed if either bench or builder indicates failure - if metadata_builder: - status_builder = metadata_builder["status"] - if status_builder == "failed": - status = "failed" - log_dvc_metric(live, "", "status", status) - - # verification steps - if "output" in metadata_bench: - if "builder_verification" in metadata_bench["output"]: - log_dvc_metric(live, "", "verification", metadata_bench["output"]["builder_verification"]["verification"]) - - # estimate_layer_resources.json - log_nested_metrics_from_report(id, live, "estimate_layer_resources.json", "total", [ - "LUT", - "DSP", - "BRAM_18K", - "URAM", - ], prefix="estimate/resources/") - - # estimate_layer_resources_hls.json - log_nested_metrics_from_report(id, live, "estimate_layer_resources_hls.json", "total", [ - "LUT", - "FF", - "DSP", - "DSP48E", - "DSP58E", # TODO: aggregate/unify DSP reporting - "BRAM_18K", - "URAM", - ], prefix="hls_estimate/resources/") - - # estimate_network_performance.json - log_metrics_from_report(id, live, "estimate_network_performance.json", [ - "critical_path_cycles", - "max_cycles", - "max_cycles_node_name", - "estimated_throughput_fps", - "estimated_latency_ns", - ], prefix="estimate/performance/") - - # rtlsim_performance.json - log_metrics_from_report(id, live, "rtlsim_performance.json", [ - "N", - "TIMEOUT", - "latency_cycles", - "cycles", - "fclk[mhz]", - "throughput[images/s]", - "stable_throughput[images/s]", - # add INPUT_DONE, OUTPUT_DONE, number transactions? 
- ], prefix="rtlsim/performance/") - - # fifo_sizing.json - log_metrics_from_report(id, live, "fifo_sizing.json", ["total_fifo_size_kB"], prefix="fifosizing/") - - # stitched IP DCP synth resource report - log_nested_metrics_from_report(id, live, "post_synth_resources_dcp.json", "(top)", [ - "LUT", - "FF", - "SRL", - "DSP", - "BRAM_18K", - "BRAM_36K", - "URAM", - ], prefix="synth(dcp)/resources/") - - # stitched IP DCP synth resource breakdown - # TODO: generalize to all build flows and bitfile synth - layer_categories = ["MAC", "Eltwise", "Thresholding", "FIFO", "DWC", "SWG", "Other"] - for category in layer_categories: - log_nested_metrics_from_report(id, live, "res_breakdown_build_output.json", category, [ - "LUT", - "FF", - "SRL", - "DSP", - "BRAM_18K", - "BRAM_36K", - "URAM", - ], prefix="synth(dcp)/resources(breakdown)/" + category + "/") - - # ooc_synth_and_timing.json (OOC synth / step_out_of_context_synthesis) - log_metrics_from_report(id, live, "ooc_synth_and_timing.json", [ - "LUT", - "LUTRAM", - "FF", - "DSP", - "BRAM", - "BRAM_18K", - "BRAM_36K", - "URAM", - ], prefix="synth(ooc)/resources/") - log_metrics_from_report(id, live, "ooc_synth_and_timing.json", [ - "WNS", - "fmax_mhz", - # add TNS? what is "delay"? 
- ], prefix="synth(ooc)/timing/") - - # post_synth_resources.json (shell synth / step_synthesize_bitfile) - log_nested_metrics_from_report(id, live, "post_synth_resources.json", "(top)", [ - "LUT", - "FF", - "SRL", - "DSP", - "BRAM_18K", - "BRAM_36K", - "URAM", - ], prefix="synth/resources/") - - # post synth timing report - # TODO: only exported as post_route_timing.rpt, not .json - - # instrumentation measurement - log_all_metrics_from_report(id, live, "measured_performance.json", prefix="measurement/performance/") - - # IODMA validation accuracy - log_metrics_from_report(id, live, "validation.json", [ - "top-1_accuracy", - ], prefix="measurement/validation/") - - # power measurement - # TODO - - # live fifosizing report + graph png - log_metrics_from_report(id, live, "fifo_sizing_report.json", [ - "error", - "fifo_size_total_kB", - ], prefix="fifosizing/live/") - - image = os.path.join("measurement_artifacts", "runs_output", "run_%d" % (id), "reports", "fifo_sizing_graph.png") - if os.path.isfile(image): - live.log_image("fifosizing_pass_1", image) - - # time_per_step.json - log_metrics_from_report(id, live, "time_per_step.json", ["total_build_time"]) - - ### ARTIFACTS ### - # Log build reports as they come from GitLab artifacts, - # but copy them to a central dir first so all runs share the same path - run_report_dir1 = os.path.join("build_artifacts", "runs_output", "run_%d" % (id), "reports") - run_report_dir2 = os.path.join("measurement_artifacts", "runs_output", "run_%d" % (id), "reports") - dvc_report_dir = "reports" - os.makedirs(dvc_report_dir, exist_ok=True) - delete_dir_contents(dvc_report_dir) - if os.path.isdir(run_report_dir1): - shutil.copytree(run_report_dir1, dvc_report_dir, dirs_exist_ok=True) - if os.path.isdir(run_report_dir2): - shutil.copytree(run_report_dir2, dvc_report_dir, dirs_exist_ok=True) - live.log_artifact(dvc_report_dir) - - # Prepare benchmarking config for follow-up runs after live FIFO-sizing - folding_config_lfs_path = 
os.path.join("measurement_artifacts", "runs_output", "run_%d" % (id), "reports", "folding_config_lfs.json") - if os.path.isfile(folding_config_lfs_path): - # Copy folding config produced by live FIFO-sizing - output_folding_path = os.path.join(output_folding_dir, experiment_name + ".json") - os.makedirs(output_folding_dir, exist_ok=True) - print("Saving lfs-generated folding config of this run to use in a future follow-up run: %s" % output_folding_path) - shutil.copy(folding_config_lfs_path, output_folding_path) - - # Create benchmarking config - metadata_bench = open_json_report(id, "metadata_bench.json") - configuration = dict() - for key in metadata_bench["params"]: - # wrap in list - configuration[key] = [metadata_bench["params"][key]] - # overwrite FIFO-related params - import_folding_path = os.path.join(os.environ.get("LOCAL_CFG_DIR"), "lfs", "CI_" + os.environ.get("CI_PIPELINE_ID"), "folding", experiment_name + ".json") - configuration["fifo_method"] = ["manual"] - configuration["target_fps"] = ["None"] - configuration["folding_path"] = [import_folding_path] - - follow_up_bench_cfg.append(configuration) - - # Save aggregated benchmarking config for follow-up job - if follow_up_bench_cfg: - print("Saving follow-up bench config for lfs: %s" % output_cfg_path) - with open(output_cfg_path, "w") as f: - json.dump(follow_up_bench_cfg, f, indent=2) - - print("Done") From 9a1682e79b5cfc6b0896e5b2b7329eaf0982ee25 Mon Sep 17 00:00:00 2001 From: Felix Jentzsch Date: Fri, 23 May 2025 10:22:14 +0200 Subject: [PATCH 111/125] Move VGG10 files to dvc --- .gitignore | 1 + ci/.gitlab-bench.yml | 2 +- .../benchmarking => ci}/cfg/metafi_test.yml | 0 .../cfg/mobilenetv1_test.yml | 5 ++- .../benchmarking => ci}/cfg/mvau_test.yml | 0 .../benchmarking => ci}/cfg/resnet50_test.yml | 4 +- .../cfg/synthetic_fifotest.yml | 2 +- .../cfg/transformer_gpt_all.yml | 0 .../cfg/transformer_radioml_all.yml | 0 .../cfg/transformer_sweep.yml | 0 .../cfg/transformer_test.yml | 0 
ci/cfg/vgg10_test.yml | 33 ++++++++++++++ models.dvc | 6 +++ src/finn/benchmarking/bench.py | 2 +- src/finn/benchmarking/cfg/vgg10_test.yml | 33 -------------- src/finn/benchmarking/dut/resnet50.yml | 2 +- .../builder/custom_step_library/resnet.py | 44 ------------------- 17 files changed, 49 insertions(+), 85 deletions(-) rename {src/finn/benchmarking => ci}/cfg/metafi_test.yml (100%) rename {src/finn/benchmarking => ci}/cfg/mobilenetv1_test.yml (83%) rename {src/finn/benchmarking => ci}/cfg/mvau_test.yml (100%) rename {src/finn/benchmarking => ci}/cfg/resnet50_test.yml (84%) rename {src/finn/benchmarking => ci}/cfg/synthetic_fifotest.yml (95%) rename {src/finn/benchmarking => ci}/cfg/transformer_gpt_all.yml (100%) rename {src/finn/benchmarking => ci}/cfg/transformer_radioml_all.yml (100%) rename {src/finn/benchmarking => ci}/cfg/transformer_sweep.yml (100%) rename {src/finn/benchmarking => ci}/cfg/transformer_test.yml (100%) create mode 100644 ci/cfg/vgg10_test.yml create mode 100644 models.dvc delete mode 100644 src/finn/benchmarking/cfg/vgg10_test.yml diff --git a/.gitignore b/.gitignore index 7ddc2c6d67..2d48ddac55 100644 --- a/.gitignore +++ b/.gitignore @@ -106,3 +106,4 @@ bench_input bench_output bench_save bench_work +/models diff --git a/ci/.gitlab-bench.yml b/ci/.gitlab-bench.yml index f3139c0fbd..ca98a4b115 100644 --- a/ci/.gitlab-bench.yml +++ b/ci/.gitlab-bench.yml @@ -22,7 +22,6 @@ FINN Build: - job: Build pipeline: $PARENT_PIPELINE_ID variables: - GIT_STRATEGY: empty # Do not pull repository, install from wheel (artifact) instead SCHEDULER_PARAMETERS: "-A $PROJECT_ACCOUNT -p $SLURM_PARTITION -t $SLURM_TIMEOUT $SLURM_QOS --nodes 1 --ntasks 1 --cpus-per-task $CPU_CORES_BENCH --exclusive --array 0-$( expr $PARALLEL_JOBS - 1 )" NUM_DEFAULT_WORKERS: "$CPU_CORES_BENCH" extends: .setup_full_2022_2 @@ -32,6 +31,7 @@ FINN Build: # Launch benchmarking script via FINN CLI, includes deps update and environment preparation - | source finn-plus-venv/bin/activate 
+ dvc pull finn bench --dependency-path ./deps --build-path $FINN_BUILD_DIR --num-workers $CPU_CORES_BENCH --bench_config $BENCH_CFG cache: key: $CI_COMMIT_SHA diff --git a/src/finn/benchmarking/cfg/metafi_test.yml b/ci/cfg/metafi_test.yml similarity index 100% rename from src/finn/benchmarking/cfg/metafi_test.yml rename to ci/cfg/metafi_test.yml diff --git a/src/finn/benchmarking/cfg/mobilenetv1_test.yml b/ci/cfg/mobilenetv1_test.yml similarity index 83% rename from src/finn/benchmarking/cfg/mobilenetv1_test.yml rename to ci/cfg/mobilenetv1_test.yml index 040fa380e4..e43fc5d081 100644 --- a/src/finn/benchmarking/cfg/mobilenetv1_test.yml +++ b/ci/cfg/mobilenetv1_test.yml @@ -11,7 +11,8 @@ "auto_fifo_depths": [False], - "rtlsim_batch_sizauto_fifo_depths": [["estimate_reports", "rtlsim_performance", "stitched_ip", "out_of_context_synth", "bitfile", "pynq_driver", "deployment_package"]] + "rtlsim_batch_size": [2], + "generate_outputs": [["estimate_reports", "rtlsim_performance", "stitched_ip", "out_of_context_synth", "bitfile", "pynq_driver", "deployment_package"]] }, { "dut": ["mobilenetv1"], @@ -25,7 +26,7 @@ "live_fifo_sizing": [True], - "rtlsim_batch_size": [5], + "rtlsim_batch_size": [2], "generate_outputs": [["bitfile", "pynq_driver", "deployment_package"]] } ] \ No newline at end of file diff --git a/src/finn/benchmarking/cfg/mvau_test.yml b/ci/cfg/mvau_test.yml similarity index 100% rename from src/finn/benchmarking/cfg/mvau_test.yml rename to ci/cfg/mvau_test.yml diff --git a/src/finn/benchmarking/cfg/resnet50_test.yml b/ci/cfg/resnet50_test.yml similarity index 84% rename from src/finn/benchmarking/cfg/resnet50_test.yml rename to ci/cfg/resnet50_test.yml index e3acf9fa7d..937d106474 100644 --- a/src/finn/benchmarking/cfg/resnet50_test.yml +++ b/ci/cfg/resnet50_test.yml @@ -5,7 +5,7 @@ "model_path": ["/pc2/groups/hpc-prf-ekiapp/felix/ci_models/ResNet-50/resnet50_w1a2_exported.onnx"], "folding_config_file": 
["/pc2/groups/hpc-prf-ekiapp/felix/ci_models/ResNet-50/U250_folding_config.json"], "specialize_layers_config_file": ["/pc2/groups/hpc-prf-ekiapp/felix/ci_models/ResNet-50/U250_specialize_layers.json"], - "floorplan_path": ["/pc2/groups/hpc-prf-ekiapp/felix/ci_models/ResNet-50/floorplan_resnet50.json"], + "vitis_floorplan_file": ["/pc2/groups/hpc-prf-ekiapp/felix/ci_models/ResNet-50/floorplan_resnet50.json"], "board": ["U250"], "synth_clk_period_ns": [4], @@ -21,7 +21,7 @@ "model_path": ["/pc2/groups/hpc-prf-ekiapp/felix/ci_models/ResNet-50/resnet50_w1a2_exported.onnx"], "folding_config_file": ["/pc2/groups/hpc-prf-ekiapp/felix/ci_models/ResNet-50/U250_folding_config.json"], "specialize_layers_config_file": ["/pc2/groups/hpc-prf-ekiapp/felix/ci_models/ResNet-50/U250_specialize_layers.json"], - "floorplan_path": ["/pc2/groups/hpc-prf-ekiapp/felix/ci_models/ResNet-50/floorplan_resnet50.json"], + "vitis_floorplan_file": ["/pc2/groups/hpc-prf-ekiapp/felix/ci_models/ResNet-50/floorplan_resnet50.json"], "board": ["RFSoC2x2"], "synth_clk_period_ns": [10], diff --git a/src/finn/benchmarking/cfg/synthetic_fifotest.yml b/ci/cfg/synthetic_fifotest.yml similarity index 95% rename from src/finn/benchmarking/cfg/synthetic_fifotest.yml rename to ci/cfg/synthetic_fifotest.yml index 58a49d108d..d0daa12d6a 100644 --- a/src/finn/benchmarking/cfg/synthetic_fifotest.yml +++ b/ci/cfg/synthetic_fifotest.yml @@ -17,7 +17,7 @@ "rtlsim_n": [5], "live_fifo_sizing": [True], - "output_products": [["bitfile", "pynq_driver", "deployment_package"]] + "generate_outputs": [["bitfile", "pynq_driver", "deployment_package"]] }, { "dut": ["synthetic_nonlinear"], diff --git a/src/finn/benchmarking/cfg/transformer_gpt_all.yml b/ci/cfg/transformer_gpt_all.yml similarity index 100% rename from src/finn/benchmarking/cfg/transformer_gpt_all.yml rename to ci/cfg/transformer_gpt_all.yml diff --git a/src/finn/benchmarking/cfg/transformer_radioml_all.yml b/ci/cfg/transformer_radioml_all.yml similarity index 100% 
rename from src/finn/benchmarking/cfg/transformer_radioml_all.yml rename to ci/cfg/transformer_radioml_all.yml diff --git a/src/finn/benchmarking/cfg/transformer_sweep.yml b/ci/cfg/transformer_sweep.yml similarity index 100% rename from src/finn/benchmarking/cfg/transformer_sweep.yml rename to ci/cfg/transformer_sweep.yml diff --git a/src/finn/benchmarking/cfg/transformer_test.yml b/ci/cfg/transformer_test.yml similarity index 100% rename from src/finn/benchmarking/cfg/transformer_test.yml rename to ci/cfg/transformer_test.yml diff --git a/ci/cfg/vgg10_test.yml b/ci/cfg/vgg10_test.yml new file mode 100644 index 0000000000..33b5e7ba5f --- /dev/null +++ b/ci/cfg/vgg10_test.yml @@ -0,0 +1,33 @@ +[ + { + "dut": ["vgg10"], + + "model_path": ["models/vgg10/radioml_w4a4_small_tidy.onnx"], + "folding_config_file": ["models/vgg10/ZCU104_folding_config.json"], + "specialize_layers_config_file": ["models/vgg10/ZCU104_specialize_layers.json"], + + "board": ["RFSoC2x2"], + "synth_clk_period_ns": [10], + + "auto_fifo_depths": [True], + "auto_fifo_strategy": ["largefifo_rtlsim"], + + "rtlsim_batch_size": [5], + "generate_outputs": [["estimate_reports", "rtlsim_performance", "stitched_ip", "out_of_context_synth", "bitfile", "pynq_driver", "deployment_package"]] + }, + { + "dut": ["vgg10"], + + "model_path": ["models/vgg10/radioml_w4a4_small_tidy.onnx"], + "folding_config_file": ["models/vgg10/ZCU104_folding_config.json"], + "specialize_layers_config_file": ["models/vgg10/ZCU104_specialize_layers.json"], + + "board": ["RFSoC2x2"], + "synth_clk_period_ns": [10], + + "live_fifo_sizing": [True], + + "rtlsim_batch_size": [5], + "generate_outputs": [["bitfile", "pynq_driver", "deployment_package"]] + } +] \ No newline at end of file diff --git a/models.dvc b/models.dvc new file mode 100644 index 0000000000..75a6adb5e4 --- /dev/null +++ b/models.dvc @@ -0,0 +1,6 @@ +outs: +- md5: 888f3cd73800cf97d94d78e71456370f.dir + size: 348910 + nfiles: 3 + hash: md5 + path: models diff --git 
a/src/finn/benchmarking/bench.py b/src/finn/benchmarking/bench.py index 7a9b0877e6..d47a98bd44 100644 --- a/src/finn/benchmarking/bench.py +++ b/src/finn/benchmarking/bench.py @@ -46,7 +46,7 @@ def get_default_session_options_new(): if config_name == "manual": config_path = os.path.join(os.environ.get("LOCAL_CFG_DIR"), os.environ.get("MANUAL_CFG_PATH")) else: - configs_path = os.path.join(os.path.dirname(__file__), "cfg") + configs_path = os.path.join(os.path.dirname(__file__), "../../..", "ci/cfg") config_select = config_name + ".yml" config_path = os.path.join(configs_path, config_select) print("Job launched with SLURM ID: %d" % (job_id)) diff --git a/src/finn/benchmarking/cfg/vgg10_test.yml b/src/finn/benchmarking/cfg/vgg10_test.yml deleted file mode 100644 index e16122b130..0000000000 --- a/src/finn/benchmarking/cfg/vgg10_test.yml +++ /dev/null @@ -1,33 +0,0 @@ -[ - { - "dut": ["vgg10"], - - "model_path": ["/pc2/groups/hpc-prf-ekiapp/felix/ci_models/VGG-10/radioml_w4a4_small_tidy.onnx"], - "folding_config_file": ["/pc2/groups/hpc-prf-ekiapp/felix/ci_models/VGG-10/ZCU104_folding_config.json"], - "specialize_layers_config_file": ["/pc2/groups/hpc-prf-ekiapp/felix/ci_models/VGG-10/ZCU104_specialize_layers.json"], - - "board": ["RFSoC2x2"], - "synth_clk_period_ns": [10], - - "auto_fifo_depths": [True], - "auto_fifo_strategy": ["largefifo_rtlsim"], - - "rtlsim_batch_size": [5], - "generate_outputs": [["estimate_reports", "rtlsim_performance", "stitched_ip", "out_of_context_synth", "bitfile", "pynq_driver", "deployment_package"]] - }, - { - "dut": ["vgg10"], - - "model_path": ["/pc2/groups/hpc-prf-ekiapp/felix/ci_models/VGG-10/radioml_w4a4_small_tidy.onnx"], - "folding_config_file": ["/pc2/groups/hpc-prf-ekiapp/felix/ci_models/VGG-10/ZCU104_folding_config.json"], - "specialize_layers_config_file": ["/pc2/groups/hpc-prf-ekiapp/felix/ci_models/VGG-10/ZCU104_specialize_layers.json"], - - "board": ["RFSoC2x2"], - "synth_clk_period_ns": [10], - - "live_fifo_sizing": 
[True], - - "rtlsim_batch_size": [5], - "generate_outputs": [["bitfile", "pynq_driver", "deployment_package"]] - } -] \ No newline at end of file diff --git a/src/finn/benchmarking/dut/resnet50.yml b/src/finn/benchmarking/dut/resnet50.yml index 6d6d4bcc31..7452ef5df9 100644 --- a/src/finn/benchmarking/dut/resnet50.yml +++ b/src/finn/benchmarking/dut/resnet50.yml @@ -10,7 +10,7 @@ steps: - step_set_fifo_depths - step_hw_codegen - step_hw_ipgen - - finn.builder.custom_step_library.resnet.step_resnet50_slr_floorplan # Custom step + #- finn.builder.custom_step_library.resnet.step_resnet50_slr_floorplan # Custom step - step_create_stitched_ip - step_measure_rtlsim_performance - step_out_of_context_synthesis diff --git a/src/finn/builder/custom_step_library/resnet.py b/src/finn/builder/custom_step_library/resnet.py index 90deae5721..a4082b1adf 100644 --- a/src/finn/builder/custom_step_library/resnet.py +++ b/src/finn/builder/custom_step_library/resnet.py @@ -207,47 +207,3 @@ def step_resnet50_convert_to_hw(model: ModelWrapper, cfg: DataflowBuildConfig): model = model.transform(SortGraph()) return model - - -def step_resnet50_slr_floorplan(model: ModelWrapper, cfg: DataflowBuildConfig): - if cfg.shell_flow_type == ShellFlowType.VITIS_ALVEO: - # previously, we would always ran the finn experimental partitioner on ResNet-50 - # this is now changed and a fixed floorplan is applied - model = model.transform(GiveUniqueNodeNames()) - model = model.transform(ApplyConfig(cfg.floorplan_path)) - print("Fixed SLR floorplanning applied") - - # if you would like to try out the experimental partitioner - # please uncomment the lines (that are not marked as comment) below. 
- - # import numpy as np - # from finnexperimental.analysis.partitioning import partition - - # comment: apply partitioning of the model, restricting the first and last layer to SLR0 - # default_slr = 0 - # abs_anchors = [(0, [default_slr]), (-1, [default_slr])] - - # comment: increase resource limits to make partitioning feasible, except for SLR0 - # comment: which also has DDR subsystem - # limits = np.array( - # [ - # [0.75, 0.5, 0.7, 0.6, 0.6], - # [1, 0.7, 0.9, 0.8, 0.8], - # [1, 0.7, 0.9, 0.8, 0.8], - # [1, 0.7, 0.9, 0.8, 0.8], - # ] - # ) - # floorplan = partition( - # model, - # cfg.synth_clk_period_ns, - # cfg.board, - # abs_anchors=abs_anchors, - # multivariant=False, - # linear_cuts=True, - # limits=limits, - # )[0] - - # comment: apply floorplan to model - # model = model.transform(ApplyConfig(floorplan)) - # print("SLR floorplanning applied from partitioner") - return model From 881432fa713b616d427fe24458cd6e7834dc5868 Mon Sep 17 00:00:00 2001 From: Felix Jentzsch Date: Fri, 23 May 2025 10:35:30 +0200 Subject: [PATCH 112/125] Fix cfg path --- src/finn/benchmarking/bench.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/finn/benchmarking/bench.py b/src/finn/benchmarking/bench.py index d47a98bd44..738d8a9c85 100644 --- a/src/finn/benchmarking/bench.py +++ b/src/finn/benchmarking/bench.py @@ -46,7 +46,7 @@ def get_default_session_options_new(): if config_name == "manual": config_path = os.path.join(os.environ.get("LOCAL_CFG_DIR"), os.environ.get("MANUAL_CFG_PATH")) else: - configs_path = os.path.join(os.path.dirname(__file__), "../../..", "ci/cfg") + configs_path = os.path.join("ci", "cfg") config_select = config_name + ".yml" config_path = os.path.join(configs_path, config_select) print("Job launched with SLURM ID: %d" % (job_id)) From 1568af68163507f7172636f6fb2c3b9d6306cce4 Mon Sep 17 00:00:00 2001 From: Felix Jentzsch Date: Fri, 23 May 2025 13:37:17 +0200 Subject: [PATCH 113/125] Move all models to dvc, refactor configs --- 
.gitlab-ci.yml | 45 +++++- ci/.gitlab-bench.yml | 1 + ci/cfg/live_fifosizing.yml | 50 +++++++ ci/cfg/metafi_test.yml | 14 -- ...mvau_test.yml => microbenchmark_basic.yml} | 24 +++- ci/cfg/mobilenetv1_test.yml | 32 ----- ci/cfg/regression_basic.yml | 10 ++ ci/cfg/regression_extended.yml | 48 +++++++ ci/cfg/resnet50_test.yml | 33 ----- ci/cfg/synthetic_fifotest.yml | 68 --------- ci/cfg/transformer_gpt_all.yml | 12 -- ci/cfg/transformer_radioml_all.yml | 22 --- ci/cfg/transformer_sweep.yml | 87 ----------- ci/cfg/transformer_test.yml | 24 ---- ci/cfg/vgg10_test.yml | 33 ----- ci/collect.py | 5 +- models.dvc | 6 +- src/finn/benchmarking/bench.py | 10 +- src/finn/benchmarking/bench_base.py | 135 ------------------ src/finn/benchmarking/dut/metafi.yml | 28 ---- src/finn/benchmarking/dut/mobilenetv1.yml | 7 + src/finn/benchmarking/dut/resnet50.yml | 9 +- src/finn/benchmarking/dut/transformer.py | 2 +- src/finn/benchmarking/dut/vgg10.yml | 8 ++ 24 files changed, 207 insertions(+), 506 deletions(-) create mode 100644 ci/cfg/live_fifosizing.yml delete mode 100644 ci/cfg/metafi_test.yml rename ci/cfg/{mvau_test.yml => microbenchmark_basic.yml} (52%) delete mode 100644 ci/cfg/mobilenetv1_test.yml create mode 100644 ci/cfg/regression_basic.yml create mode 100644 ci/cfg/regression_extended.yml delete mode 100644 ci/cfg/resnet50_test.yml delete mode 100644 ci/cfg/synthetic_fifotest.yml delete mode 100644 ci/cfg/transformer_gpt_all.yml delete mode 100644 ci/cfg/transformer_radioml_all.yml delete mode 100644 ci/cfg/transformer_sweep.yml delete mode 100644 ci/cfg/transformer_test.yml delete mode 100644 ci/cfg/vgg10_test.yml delete mode 100644 src/finn/benchmarking/dut/metafi.yml diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index ad524d0fd7..23eb8c39fe 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -11,7 +11,7 @@ variables: value: "" TEST_SUITE: description: "Select test suite to run" - value: "quicktest_ci" # DEBUG + value: "full_ci" options: - "none" - "quicktest_ci" @@ 
-35,15 +35,14 @@ variables: description: "Optional QoS option (include --qos, e.g., --qos express)" value: "" MANUAL_CFG_PATH: - description: "Use this config file instead of configs stored in the repo. Path must be accessible to runner" + description: "Name (in ci/cfg/) or path (relative to LOCAL_CFG_DIR) of benchmarking config to run" value: "" workflow: name: '$PIPELINE_NAME' rules: - # Run pipeline for GitHub PRs to dev or main (does not support PRs from forks) + # Run pipeline for GitHub PRs to dev (does not support PRs from forks) - if: $CI_PIPELINE_SOURCE == "external_pull_request_event" && $CI_EXTERNAL_PULL_REQUEST_TARGET_BRANCH_NAME == "dev" - - if: $CI_PIPELINE_SOURCE == "external_pull_request_event" && $CI_EXTERNAL_PULL_REQUEST_TARGET_BRANCH_NAME == "main" # Run pipeline for pushes to dev or main - if: $CI_COMMIT_BRANCH == "dev" || $CI_COMMIT_BRANCH == "main" # Run pipeline if manually triggered via API or web GUI @@ -124,6 +123,9 @@ FINN Test Suite 2022.2: # Do not run if test suite has been deselected - if: $TEST_SUITE == "none" when: never + # Do not run for PRs to dev (run only for pushes) + - if: $CI_PIPELINE_SOURCE == "external_pull_request_event" && $CI_EXTERNAL_PULL_REQUEST_TARGET_BRANCH_NAME == "dev" + when: never # Always run, as long as there was no prior failure - when: on_success cache: @@ -155,6 +157,15 @@ FINN Test Suite 2024.2: extends: - FINN Test Suite 2022.2 - .setup_full_2024_2 + rules: + # Do not run on a schedule + - if: $CI_PIPELINE_SOURCE == "schedule" + when: never + # Do not run if test suite has been deselected + - if: $TEST_SUITE == "none" + when: never + # Always run, as long as there was no prior failure + - when: on_success Bench (Manual): stage: test @@ -172,7 +183,7 @@ Bench (Manual): PARENT_PIPELINE_ID: $CI_PIPELINE_ID BENCH_CFG: "manual" -Bench: +Bench (Basic): stage: test rules: # Do not run on a schedule @@ -188,4 +199,26 @@ Bench: PARENT_PIPELINE_ID: $CI_PIPELINE_ID parallel: matrix: - - BENCH_CFG: [mvau_test, 
resnet50_test, metafi_test, transformer_test, transformer_radioml_all, synthetic_fifotest, vgg10_test, mobilenetv1_test] + - BENCH_CFG: [regression_basic] + +Bench (Extended): + stage: test + rules: + # Do not run on a schedule + - if: $CI_PIPELINE_SOURCE == "schedule" + when: never + # Do not run for PRs to dev (run only for pushes) + - if: $CI_PIPELINE_SOURCE == "external_pull_request_event" && $CI_EXTERNAL_PULL_REQUEST_TARGET_BRANCH_NAME == "dev" + when: never + - if: $MANUAL_CFG_PATH == "" + trigger: + include: ci/.gitlab-bench.yml + strategy: depend + forward: + pipeline_variables: true + variables: + PARENT_PIPELINE_ID: $CI_PIPELINE_ID + PARALLEL_JOBS: "4" + parallel: + matrix: + - BENCH_CFG: [regression_extended, microbenchmark_basic] diff --git a/ci/.gitlab-bench.yml b/ci/.gitlab-bench.yml index ca98a4b115..b5d17d7fdc 100644 --- a/ci/.gitlab-bench.yml +++ b/ci/.gitlab-bench.yml @@ -29,6 +29,7 @@ FINN Build: # Launch additional monitoring - $JOB_MONITORING_DIR/monitor.sh $JOB_MONITORING_DIR/$CI_PIPELINE_ID/$HOSTNAME.log & # Launch benchmarking script via FINN CLI, includes deps update and environment preparation + # TODO: cache dvc pull - | source finn-plus-venv/bin/activate dvc pull diff --git a/ci/cfg/live_fifosizing.yml b/ci/cfg/live_fifosizing.yml new file mode 100644 index 0000000000..f121bacf6d --- /dev/null +++ b/ci/cfg/live_fifosizing.yml @@ -0,0 +1,50 @@ +[ + # Real models + { + "dut": ["vgg10"], + "live_fifo_sizing": [True], + "generate_outputs": [["bitfile", "pynq_driver", "deployment_package"]] + }, + { + "dut": ["mobilenetv1"], + "live_fifo_sizing": [True], + "generate_outputs": [["bitfile", "pynq_driver", "deployment_package"]] + }, + { + "dut": ["resnet50"], + "live_fifo_sizing": [True], + "generate_outputs": [["bitfile", "pynq_driver", "deployment_package"]] + }, + + # Synthetic non-linear models + { + "dut": ["synthetic_nonlinear"], + "dim": [64], + "kernel_size": [5], + "ch": [8], + "simd": [8], + "pe": [8], + "parallel_window": [1], + + 
"lb_num_layers": [1], + "rb_num_layers": [4, 8, 16], + + "live_fifo_sizing": [True], + "generate_outputs": [["bitfile", "pynq_driver", "deployment_package"]] + }, + { + "dut": ["synthetic_nonlinear"], + "dim": [64], + "kernel_size": [5], + "ch": [8], + "simd": [1], + "pe": [1], + "parallel_window": [0], + + "lb_num_layers": [1], + "rb_num_layers": [4, 8, 16], + + "live_fifo_sizing": [True], + "generate_outputs": [["bitfile", "pynq_driver", "deployment_package"]] + } +] diff --git a/ci/cfg/metafi_test.yml b/ci/cfg/metafi_test.yml deleted file mode 100644 index 711250bbdb..0000000000 --- a/ci/cfg/metafi_test.yml +++ /dev/null @@ -1,14 +0,0 @@ -[ - { - "dut": ["metafi"], - "model_path": ["/pc2/groups/hpc-prf-ekiapp/felix/ci_models/metafi/model.onnx"], - "folding_config_file": ["/pc2/groups/hpc-prf-ekiapp/felix/ci_models/metafi/auto_folding_config_metaFi_f25.json"], - - "board": ["RFSoC2x2"], - "synth_clk_period_ns": [10], - - "live_fifo_sizing": [True], - - "generate_outputs": [["bitfile", "pynq_driver", "deployment_package"]] - } - ] \ No newline at end of file diff --git a/ci/cfg/mvau_test.yml b/ci/cfg/microbenchmark_basic.yml similarity index 52% rename from ci/cfg/mvau_test.yml rename to ci/cfg/microbenchmark_basic.yml index 7e0b3d14d2..e9a102e51c 100644 --- a/ci/cfg/mvau_test.yml +++ b/ci/cfg/microbenchmark_basic.yml @@ -1,4 +1,5 @@ [ + # MVAU Test { "dut": ["mvau"], "idt": ["INT4","INT2"], @@ -22,5 +23,26 @@ "dut_duplication": [1], "generate_outputs": [["estimate_reports", "stitched_ip", "rtlsim_performance", "out_of_context_synth", "bitfile", "pynq_driver", "deployment_package"]] + }, + + # Transformer Dummy + { + "dut": ["transformer"], + "seed": [12], + + "calibration_passes": [32], + + "model_num_heads": [1], + "model_num_layers": [1], + "model_bias":[true], + "model_emb_dim": [32], + "model_mlp_dim": [192], + "model_seq_len": [64], + "model_bits": [2], + "model_norm": ["none"], + "model_mask": ["none"], + "model_positional_encoding": ["binary"], + + 
"generate_outputs": [["estimate_reports", "bitfile", "pynq_driver", "deployment_package"]] } - ] +] diff --git a/ci/cfg/mobilenetv1_test.yml b/ci/cfg/mobilenetv1_test.yml deleted file mode 100644 index e43fc5d081..0000000000 --- a/ci/cfg/mobilenetv1_test.yml +++ /dev/null @@ -1,32 +0,0 @@ -[ - { - "dut": ["mobilenetv1"], - - "model_path": ["/pc2/groups/hpc-prf-ekiapp/felix/ci_models/MobileNetV1/mobilenetv1-w4a4_pre_post_tidy_opset-11.onnx"], - "folding_config_file": ["/pc2/groups/hpc-prf-ekiapp/felix/ci_models/MobileNetV1/ZCU102_folding_config.json"], - "specialize_layers_config_file": ["/pc2/groups/hpc-prf-ekiapp/felix/ci_models/MobileNetV1/ZCU102_specialize_layers.json"], - - "board": ["RFSoC2x2"], - "synth_clk_period_ns": [10], - - "auto_fifo_depths": [False], - - "rtlsim_batch_size": [2], - "generate_outputs": [["estimate_reports", "rtlsim_performance", "stitched_ip", "out_of_context_synth", "bitfile", "pynq_driver", "deployment_package"]] - }, - { - "dut": ["mobilenetv1"], - - "model_path": ["/pc2/groups/hpc-prf-ekiapp/felix/ci_models/MobileNetV1/mobilenetv1-w4a4_pre_post_tidy_opset-11.onnx"], - "folding_config_file": ["/pc2/groups/hpc-prf-ekiapp/felix/ci_models/MobileNetV1/ZCU102_folding_config.json"], - "specialize_layers_config_file": ["/pc2/groups/hpc-prf-ekiapp/felix/ci_models/MobileNetV1/ZCU102_specialize_layers.json"], - - "board": ["RFSoC2x2"], - "synth_clk_period_ns": [10], - - "live_fifo_sizing": [True], - - "rtlsim_batch_size": [2], - "generate_outputs": [["bitfile", "pynq_driver", "deployment_package"]] - } -] \ No newline at end of file diff --git a/ci/cfg/regression_basic.yml b/ci/cfg/regression_basic.yml new file mode 100644 index 0000000000..9a7604fe19 --- /dev/null +++ b/ci/cfg/regression_basic.yml @@ -0,0 +1,10 @@ +[ + { + "dut": ["vgg10"], + "generate_outputs": [["estimate_reports", "bitfile", "pynq_driver", "deployment_package"]] + }, + { + "dut": ["mobilenetv1"], + "generate_outputs": [["estimate_reports", "bitfile", "pynq_driver", 
"deployment_package"]] + } +] diff --git a/ci/cfg/regression_extended.yml b/ci/cfg/regression_extended.yml new file mode 100644 index 0000000000..d4c2d127a2 --- /dev/null +++ b/ci/cfg/regression_extended.yml @@ -0,0 +1,48 @@ +[ + # ResNet-50 + { + "dut": ["resnet50"], + "board": ["U250"], + "synth_clk_period_ns": [4], + "rtlsim_batch_size": [3], + # no deployment package because Alveo deployment is not yet supported by CI + "generate_outputs": [["estimate_reports", "rtlsim_performance", "stitched_ip", "out_of_context_synth", "bitfile"]] + }, + + # 4x GPT Transformer models (currently disabled due to streamlining issues!) + # { + # "dut": ["transformer"], + # "seed": [12], + # "model_dir": ["models/gpt_a_6b_gpt2-s256-t2048-l2-h4-e256", + # "models/gpt_b_4b_gpt2-s256-t2048-l2-h4-e256", + # "models/gpt_c_gpt2-s512-t2048-l2-h4-e512", + # "models/gpt_d_gpt2-s256-t2048-l1-h2-e256"], + # "board": ["U280"], + # "synth_clk_period_ns": [10], + # "generate_outputs": [["estimate_reports", "stitched_ip", "out_of_context_synth"]] + # } + + # 5x RadioML Transformer models + { + "dut": ["transformer"], + "seed": [12], + "model_dir": ["models/rml_transformer_0", + "models/rml_transformer_a", + "models/rml_transformer_b", + "models/rml_transformer_c", + "models/rml_transformer_d",], + "board": ["RFSoC2x2"], + "synth_clk_period_ns": [10], + "generate_outputs": [["estimate_reports", "bitfile", "pynq_driver", "deployment_package"]] + }, + + # 1x RadioML Conformer model + { + "dut": ["transformer"], + "seed": [12], + "model_dir": ["models/rml_conformer"], + "board": ["RFSoC2x2"], + "synth_clk_period_ns": [10], + "generate_outputs": [["estimate_reports", "bitfile", "pynq_driver", "deployment_package"]] + } +] diff --git a/ci/cfg/resnet50_test.yml b/ci/cfg/resnet50_test.yml deleted file mode 100644 index 937d106474..0000000000 --- a/ci/cfg/resnet50_test.yml +++ /dev/null @@ -1,33 +0,0 @@ -[ - { - "dut": ["resnet50"], - - "model_path": 
["/pc2/groups/hpc-prf-ekiapp/felix/ci_models/ResNet-50/resnet50_w1a2_exported.onnx"], - "folding_config_file": ["/pc2/groups/hpc-prf-ekiapp/felix/ci_models/ResNet-50/U250_folding_config.json"], - "specialize_layers_config_file": ["/pc2/groups/hpc-prf-ekiapp/felix/ci_models/ResNet-50/U250_specialize_layers.json"], - "vitis_floorplan_file": ["/pc2/groups/hpc-prf-ekiapp/felix/ci_models/ResNet-50/floorplan_resnet50.json"], - - "board": ["U250"], - "synth_clk_period_ns": [4], - - "auto_fifo_depths": [False], - - "rtlsim_batch_size": [5], - "generate_outputs": [["estimate_reports", "rtlsim_performance", "stitched_ip", "out_of_context_synth"]] - }, - { - "dut": ["resnet50"], - - "model_path": ["/pc2/groups/hpc-prf-ekiapp/felix/ci_models/ResNet-50/resnet50_w1a2_exported.onnx"], - "folding_config_file": ["/pc2/groups/hpc-prf-ekiapp/felix/ci_models/ResNet-50/U250_folding_config.json"], - "specialize_layers_config_file": ["/pc2/groups/hpc-prf-ekiapp/felix/ci_models/ResNet-50/U250_specialize_layers.json"], - "vitis_floorplan_file": ["/pc2/groups/hpc-prf-ekiapp/felix/ci_models/ResNet-50/floorplan_resnet50.json"], - - "board": ["RFSoC2x2"], - "synth_clk_period_ns": [10], - - "live_fifo_sizing": [True], - - "generate_outputs": [["bitfile", "pynq_driver", "deployment_package"]] - } - ] \ No newline at end of file diff --git a/ci/cfg/synthetic_fifotest.yml b/ci/cfg/synthetic_fifotest.yml deleted file mode 100644 index d0daa12d6a..0000000000 --- a/ci/cfg/synthetic_fifotest.yml +++ /dev/null @@ -1,68 +0,0 @@ -[ - { - "dut": ["synthetic_nonlinear"], - "dim": [64], - "kernel_size": [5], - "ch": [8], - "simd": [8], - "pe": [8], - "parallel_window": [1], - - "lb_num_layers": [1], - "rb_num_layers": [4], - - "board": ["RFSoC2x2"], - "synth_clk_period_ns": [10], - - "rtlsim_n": [5], - - "live_fifo_sizing": [True], - "generate_outputs": [["bitfile", "pynq_driver", "deployment_package"]] - }, - { - "dut": ["synthetic_nonlinear"], - "dim": [64], - "kernel_size": [5], - "ch": [8], - "simd": 
[8], - "pe": [8], - "parallel_window": [1], - - "lb_num_layers": [1], - "rb_num_layers": [4], - - "board": ["RFSoC2x2"], - "synth_clk_period_ns": [10], - - "rtlsim_batch_size": [5], - - "auto_fifo_depths": [True], - "auto_fifo_strategy": ["characterize"], - "characteristic_function_strategy": ["analytical", "rtlsim"], - - "generate_outputs": [["stitched_ip", "rtlsim_performance", "bitfile", "pynq_driver", "deployment_package"]] - }, - { - "dut": ["synthetic_nonlinear"], - "dim": [64], - "kernel_size": [5], - "ch": [8], - "simd": [8], - "pe": [8], - "parallel_window": [1], - - "lb_num_layers": [1], - "rb_num_layers": [4], - - "board": ["RFSoC2x2"], - "synth_clk_period_ns": [10], - - "rtlsim_batch_size": [5], - - "auto_fifo_depths": [True], - "auto_fifo_strategy": ["largefifo_rtlsim"], - - "fifosim_n_inferences": [2], - "generate_outputs": [["stitched_ip", "rtlsim_performance", "bitfile", "pynq_driver", "deployment_package"]] - } -] \ No newline at end of file diff --git a/ci/cfg/transformer_gpt_all.yml b/ci/cfg/transformer_gpt_all.yml deleted file mode 100644 index e0610c3d7e..0000000000 --- a/ci/cfg/transformer_gpt_all.yml +++ /dev/null @@ -1,12 +0,0 @@ -[ - { - "dut": ["transformer"], - "seed": [12], - "model_dir": ["/pc2/groups/hpc-prf-ekiapp/felix/ci_models/gpt_a", "/pc2/groups/hpc-prf-ekiapp/felix/ci_models/gpt_b", "/pc2/groups/hpc-prf-ekiapp/felix/ci_models/gpt_c", "/pc2/groups/hpc-prf-ekiapp/felix/ci_models/gpt_d"], - - "board": ["U280"], - "synth_clk_period_ns": [10], - - "generate_outputs": [["estimate_reports", "stitched_ip", "out_of_context_synth"]] - } -] diff --git a/ci/cfg/transformer_radioml_all.yml b/ci/cfg/transformer_radioml_all.yml deleted file mode 100644 index dede0988c8..0000000000 --- a/ci/cfg/transformer_radioml_all.yml +++ /dev/null @@ -1,22 +0,0 @@ -[ - { - "dut": ["transformer"], - "seed": [12], - "model_dir": ["/pc2/groups/hpc-prf-ekiapp/felix/ci_models/radioml_0"], - - "board": ["RFSoC2x2"], - "synth_clk_period_ns": [10], - - 
"generate_outputs": [["estimate_reports", "bitfile", "pynq_driver", "deployment_package"]] - }, - { - "dut": ["transformer"], - "seed": [12], - "model_dir": ["/pc2/groups/hpc-prf-ekiapp/felix/ci_models/radioml_convformer"], - - "board": ["RFSoC2x2"], - "synth_clk_period_ns": [10], - - "generate_outputs": [["estimate_reports", "bitfile", "pynq_driver", "deployment_package"]] - } -] \ No newline at end of file diff --git a/ci/cfg/transformer_sweep.yml b/ci/cfg/transformer_sweep.yml deleted file mode 100644 index 7fa9420d01..0000000000 --- a/ci/cfg/transformer_sweep.yml +++ /dev/null @@ -1,87 +0,0 @@ -[ - { - "dut": ["transformer"], - "seed": [12], - - "calibration_passes": [32], - - "model_num_heads": [1], - "model_num_layers": [1], - "model_bias":[true], - "model_emb_dim": [32], - "model_mlp_dim": [1536], - "model_seq_len": [512], - "model_bits": [2], - "model_norm": ["none"], - "model_mask": ["none"], - "model_positional_encoding": ["binary"] - }, - { - "dut": ["transformer"], - "seed": [12], - - "calibration_passes": [32], - - "model_num_heads": [8], - "model_num_layers": [1], - "model_bias":[true], - "model_emb_dim": [256], - "model_mlp_dim": [1536], - "model_seq_len": [512], - "model_bits": [2], - "model_norm": ["none"], - "model_mask": ["none"], - "model_positional_encoding": ["binary"] - }, - { - "dut": ["transformer"], - "seed": [12], - - "calibration_passes": [32], - - "model_num_heads": [12], - "model_num_layers": [1], - "model_bias":[true], - "model_emb_dim": [384], - "model_mlp_dim": [1536], - "model_seq_len": [512], - "model_bits": [2], - "model_norm": ["none"], - "model_mask": ["none"], - "model_positional_encoding": ["binary"] - }, - { - "dut": ["transformer"], - "seed": [12], - - "calibration_passes": [32], - - "model_num_heads": [12], - "model_num_layers": [1], - "model_bias":[true], - "model_emb_dim": [96], - "model_mlp_dim": [1536], - "model_seq_len": [512], - "model_bits": [2], - "model_norm": ["none"], - "model_mask": ["none"], - 
"model_positional_encoding": ["binary"] - }, - { - "dut": ["transformer"], - "seed": [12], - - "calibration_passes": [32], - - "model_num_heads": [1], - "model_num_layers": [1], - "model_bias":[true], - "model_emb_dim": [32], - "model_mlp_dim": [1536], - "model_seq_len": [512], - "model_bits": [2, 4, 6, 8], - "model_norm": ["none"], - "model_mask": ["none"], - "model_positional_encoding": ["binary"] - } -] diff --git a/ci/cfg/transformer_test.yml b/ci/cfg/transformer_test.yml deleted file mode 100644 index a529981fdc..0000000000 --- a/ci/cfg/transformer_test.yml +++ /dev/null @@ -1,24 +0,0 @@ -[ - { - "dut": ["transformer"], - "seed": [12], - - "calibration_passes": [32], - - "model_num_heads": [1], - "model_num_layers": [1], - "model_bias":[true], - "model_emb_dim": [32], - "model_mlp_dim": [192], - "model_seq_len": [64], - "model_bits": [2], - "model_norm": ["none"], - "model_mask": ["none"], - "model_positional_encoding": ["binary"], - - "board": ["RFSoC2x2"], - "synth_clk_period_ns": [10], - - "generate_outputs": [["estimate_reports", "bitfile", "pynq_driver", "deployment_package"]] - } -] diff --git a/ci/cfg/vgg10_test.yml b/ci/cfg/vgg10_test.yml deleted file mode 100644 index 33b5e7ba5f..0000000000 --- a/ci/cfg/vgg10_test.yml +++ /dev/null @@ -1,33 +0,0 @@ -[ - { - "dut": ["vgg10"], - - "model_path": ["models/vgg10/radioml_w4a4_small_tidy.onnx"], - "folding_config_file": ["models/vgg10/ZCU104_folding_config.json"], - "specialize_layers_config_file": ["models/vgg10/ZCU104_specialize_layers.json"], - - "board": ["RFSoC2x2"], - "synth_clk_period_ns": [10], - - "auto_fifo_depths": [True], - "auto_fifo_strategy": ["largefifo_rtlsim"], - - "rtlsim_batch_size": [5], - "generate_outputs": [["estimate_reports", "rtlsim_performance", "stitched_ip", "out_of_context_synth", "bitfile", "pynq_driver", "deployment_package"]] - }, - { - "dut": ["vgg10"], - - "model_path": ["models/vgg10/radioml_w4a4_small_tidy.onnx"], - "folding_config_file": 
["models/vgg10/ZCU104_folding_config.json"], - "specialize_layers_config_file": ["models/vgg10/ZCU104_specialize_layers.json"], - - "board": ["RFSoC2x2"], - "synth_clk_period_ns": [10], - - "live_fifo_sizing": [True], - - "rtlsim_batch_size": [5], - "generate_outputs": [["bitfile", "pynq_driver", "deployment_package"]] - } -] \ No newline at end of file diff --git a/ci/collect.py b/ci/collect.py index b833278fe9..c7042abf25 100644 --- a/ci/collect.py +++ b/ci/collect.py @@ -397,9 +397,10 @@ def log_nested_metrics_from_report(id, live, report_name, key_top, keys, prefix= "folding", experiment_name + ".json", ) - configuration["fifo_method"] = ["manual"] + configuration["live_fifo_sizing"] = [False] + configuration["auto_fifo_depths"] = [False] configuration["target_fps"] = ["None"] - configuration["folding_path"] = [import_folding_path] + configuration["folding_config_file"] = [import_folding_path] follow_up_bench_cfg.append(configuration) diff --git a/models.dvc b/models.dvc index 75a6adb5e4..784500a21f 100644 --- a/models.dvc +++ b/models.dvc @@ -1,6 +1,6 @@ outs: -- md5: 888f3cd73800cf97d94d78e71456370f.dir - size: 348910 - nfiles: 3 +- md5: 5db49af689e7827c32280837e0c80470.dir + size: 202993533 + nfiles: 40 hash: md5 path: models diff --git a/src/finn/benchmarking/bench.py b/src/finn/benchmarking/bench.py index 738d8a9c85..8233707260 100644 --- a/src/finn/benchmarking/bench.py +++ b/src/finn/benchmarking/bench.py @@ -44,11 +44,13 @@ def get_default_session_options_new(): # Gather benchmarking configs if config_name == "manual": - config_path = os.path.join(os.environ.get("LOCAL_CFG_DIR"), os.environ.get("MANUAL_CFG_PATH")) + # First check if the repo contains a config with this name (in ci/cfg/*) + config_path = os.path.join("ci", "cfg", os.environ.get("MANUAL_CFG_PATH") + ".yml") + if not os.path.exists(config_path): + # Otherwise look in LOCAL_CFG_DIR for the filename + config_path = os.path.join(os.environ.get("LOCAL_CFG_DIR"), 
os.environ.get("MANUAL_CFG_PATH")) else: - configs_path = os.path.join("ci", "cfg") - config_select = config_name + ".yml" - config_path = os.path.join(configs_path, config_select) + config_path = os.path.join("ci", "cfg", config_name + ".yml") print("Job launched with SLURM ID: %d" % (job_id)) except KeyError: # Launched without SLURM, assume test run on local machine diff --git a/src/finn/benchmarking/bench_base.py b/src/finn/benchmarking/bench_base.py index dc1b40cee2..4fe8e77168 100644 --- a/src/finn/benchmarking/bench_base.py +++ b/src/finn/benchmarking/bench_base.py @@ -241,141 +241,6 @@ def step_build_setup(self): def run(self): return self.steps_full_build_flow() - # def step_finn_estimate(self): - # # Gather FINN estimates - # print("Gathering FINN estimates") - - # model = self.model_initial - # finn_resources_model = res_estimation(model, fpgapart=self.part) - # finn_cycles_model = model.analysis(exp_cycles_per_layer) - # if self.target_node: - # node = model.get_nodes_by_op_type(self.target_node)[0] - # finn_resources = finn_resources_model[node.name] - # finn_cycles = finn_cycles_model[node.name] - # else: - # finn_resources = finn_resources_model # TODO: aggregate? - # finn_cycles = 0 # TODO: aggregate or drop - # finn_estimates = finn_resources - # finn_estimates["CYCLES"] = finn_cycles - # self.output_dict["finn_estimates"] = finn_estimates - - # def step_hls(self): - # # Perform Vitis HLS synthesis for HLS resource/performance reports - # start_time = time.time() - # print("Performing Vitis HLS synthesis") - # model = self.model_initial - # model = model.transform(PrepareIP(self.part, self.clock_period_ns)) - # model = model.transform(HLSSynthIP()) - - # hls_resources_model = model.analysis(hls_synth_res_estimation) - # if self.target_node: - # node = model.get_nodes_by_op_type(self.target_node)[0] - # hls_resources = hls_resources_model[node.name] - # else: - # hls_resources = hls_resources_model # TODO: aggregate? 
- # self.output_dict["hls_estimates"] = hls_resources - # self.output_dict["hls_time"] = int(time.time() - start_time) - - # self.model_step_hls = copy.deepcopy(model) - - # def step_rtlsim(self): - # # Perform RTL simulation for performance measurement - # start_time = time.time() - # print("Performing Verilator RTL simulation (n=1)") - # # Prepare - # model = self.model_step_hls - # model = model.transform(SetExecMode("rtlsim")) - # model = model.transform(PrepareRTLSim()) - # # Generate input data - # input_tensor = model.graph.input[0] - # input_shape = model.get_tensor_shape(input_tensor.name) - # input_dtype = model.get_tensor_datatype(input_tensor.name) - # x = gen_finn_dt_tensor(input_dtype, input_shape) - # input_dict = prepare_inputs(x, input_dtype, None) # TODO: fix Bipolar conversion case - # # Run - # oxe.execute_onnx(model, input_dict)["outp"] # do not check output for correctness TODO: add functional verification throughout benchmarking steps - # # Log result - # node = model.get_nodes_by_op_type("MVAU_hls")[0] - # inst = getCustomOp(node) - # rtlsim_cycles = inst.get_nodeattr("cycles_rtlsim") - # self.output_dict["rtlsim_cycles"] = rtlsim_cycles - # self.output_dict["rtlsim_time"] = int(time.time() - start_time) - -# TODO: re-introduce simple Vivado power estimation as new builder step - # def step_synthesis(self): - # # Perform Vivado synthesis for accurate resource/timing and inaccurate power reports - # start_time = time.time() - # print("Performing Vivado (stitched-ip, out-of-context) synthesis") - # model = self.model_step_hls - # model = model.transform(ReplaceVerilogRelPaths()) - # model = model.transform(CreateStitchedIP(self.part, self.clock_period_ns)) - # model = model.transform(SynthOutOfContext(part=self.part, clk_period_ns=self.clock_period_ns)) - # ooc_synth_results = eval(model.get_metadata_prop("res_total_ooc_synth")) - - # start_test_batch_fast( - # results_path=self.artifacts_dir_power, - # project_path=os.path.join( - # 
ooc_synth_results["vivado_proj_folder"], "vivadocompile", "vivadocompile.xpr" - # ), - # run_target="impl_1", - # pairs=[(25, 0.5), (50, 0.5), (75, 0.5)], - # ) - - # # Log most important power results directly (refer to detailed logs for more) - # for reportname in ["25_0.5", "50_0.5", "75_0.5"]: - # with open(os.path.join(self.artifacts_dir_power, "%s.json" % reportname), "r") as f: - # report = json.load(f) - # power = float(report["Summary"]["tables"][0]["Total On-Chip Power (W)"][0]) - # power_dyn = float(report["Summary"]["tables"][0]["Dynamic (W)"][0]) - # ooc_synth_results["power_%s" % reportname] = power - # ooc_synth_results["power_dyn_%s" % reportname] = power_dyn - - # self.output_dict["ooc_synth"] = ooc_synth_results - # self.output_dict["ooc_synth_time"] = int(time.time() - start_time) - - # # Save model for logging purposes - # model.save(os.path.join(self.artifacts_dir_models, "model_%d_synthesis.onnx" % (self.run_id))) - # self.model_step_synthesis = copy.deepcopy(model) - -# TODO: re-introduce sim-based Vivado power estimation as new builder step - # def step_sim_power(self): - # # Perform Vivado simulation for accurate power report - # start_time = time.time() - # if "ooc_synth" not in self.output_dict: - # print("ERROR: step_sim_power requires step_synthesis") - # print("Performing Vivado simulation for power report") - # if "rtlsim_cycles" in self.output_dict: - # sim_duration_ns = self.output_dict["rtlsim_cycles"] * 3 * self.clock_period_ns - # else: - # sim_duration_ns = self.output_dict["finn_estimates"]["CYCLES"] * 3 * self.clock_period_ns - - # model = self.model_step_synthesis - # input_tensor = model.graph.input[0] - # output_tensor = model.graph.output[0] - # input_node_inst = getCustomOp(model.find_consumer(input_tensor.name)) - # output_node_inst = getCustomOp(model.find_producer(output_tensor.name)) - # sim_power_report( - # results_path=self.artifacts_dir_power, - # project_path=os.path.join( - # 
self.output_dict["ooc_synth"]["vivado_proj_folder"], "vivadocompile", "vivadocompile.xpr" - # ), - # in_width=input_node_inst.get_instream_width(), - # out_width=output_node_inst.get_outstream_width(), - # dtype_width=model.get_tensor_datatype(input_tensor.name).bitwidth(), - # sim_duration_ns=sim_duration_ns, - # ) - - # # Log most important power results directly (refer to detailed logs for more) - # for reportname in ["sim"]: - # with open(os.path.join(self.artifacts_dir_power, "%s.json" % reportname), "r") as f: - # report = json.load(f) - # power = float(report["Summary"]["tables"][0]["Total On-Chip Power (W)"][0]) - # power_dyn = float(report["Summary"]["tables"][0]["Dynamic (W)"][0]) - # self.output_dict["power_%s" % reportname] = power - # self.output_dict["power_dyn%s" % reportname] = power_dyn - - # self.output_dict["sim_power_time"] = int(time.time() - start_time) - def step_parse_builder_output(self, build_dir): # TODO: output as .json or even add as new build step ### CHECK FOR VERIFICATION STEP SUCCESS ### diff --git a/src/finn/benchmarking/dut/metafi.yml b/src/finn/benchmarking/dut/metafi.yml deleted file mode 100644 index fba5a68fe5..0000000000 --- a/src/finn/benchmarking/dut/metafi.yml +++ /dev/null @@ -1,28 +0,0 @@ -steps: - #- step_residual_tidy - #- step_extract_absorb_bias - #- step_residual_topo - #- step_pre_streamline - #- step_residual_streamline - #- step_residual_convert_to_hw - - step_create_dataflow_partition - #- step_set_preferred_impl_style - - step_specialize_layers - - step_target_fps_parallelization - - step_apply_folding_config - - step_minimize_bit_width - - step_generate_estimate_reports - - step_set_fifo_depths - - step_hw_codegen - - step_hw_ipgen - - step_create_stitched_ip - - step_measure_rtlsim_performance - - step_out_of_context_synthesis - - step_synthesize_bitfile - - step_make_driver - - step_deployment_package - -target_fps: null # 23 - -#TODO: where is this used and why? 
-use_conv_rtl: True # use rtl for conv layers (MVAU cannot use rtl in our model) diff --git a/src/finn/benchmarking/dut/mobilenetv1.yml b/src/finn/benchmarking/dut/mobilenetv1.yml index 71a80c4f2a..bb3b26f436 100644 --- a/src/finn/benchmarking/dut/mobilenetv1.yml +++ b/src/finn/benchmarking/dut/mobilenetv1.yml @@ -1,3 +1,7 @@ +model_path: models/mobilenetv1/mobilenetv1-w4a4_pre_post_tidy_opset-11.onnx +folding_config_file: models/mobilenetv1/ZCU102_folding_config.json +specialize_layers_config_file: models/mobilenetv1/ZCU102_specialize_layers.json + steps: - finn.builder.custom_step_library.mobilenet.step_mobilenet_streamline # Custom step - finn.builder.custom_step_library.mobilenet.step_mobilenet_lower_convs # Custom step @@ -14,3 +18,6 @@ steps: - step_synthesize_bitfile - step_make_driver - step_deployment_package + +# folding config comes with FIFO sizes +auto_fifo_depths: False diff --git a/src/finn/benchmarking/dut/resnet50.yml b/src/finn/benchmarking/dut/resnet50.yml index 7452ef5df9..3a3211aad1 100644 --- a/src/finn/benchmarking/dut/resnet50.yml +++ b/src/finn/benchmarking/dut/resnet50.yml @@ -1,3 +1,8 @@ +model_path: models/resnet50/resnet50_w1a2_exported.onnx +folding_config_file: models/resnet50/U250_folding_config.json +specialize_layers_config_file: models/resnet50/U250_specialize_layers.json +vitis_floorplan_file: models/resnet50/floorplan_resnet50.json + steps: - finn.builder.custom_step_library.resnet.step_resnet50_tidy # Custom step - finn.builder.custom_step_library.resnet.step_resnet50_streamline # Custom step @@ -10,10 +15,12 @@ steps: - step_set_fifo_depths - step_hw_codegen - step_hw_ipgen - #- finn.builder.custom_step_library.resnet.step_resnet50_slr_floorplan # Custom step - step_create_stitched_ip - step_measure_rtlsim_performance - step_out_of_context_synthesis - step_synthesize_bitfile - step_make_driver - step_deployment_package + +# folding config comes with FIFO sizes +auto_fifo_depths: False diff --git 
a/src/finn/benchmarking/dut/transformer.py b/src/finn/benchmarking/dut/transformer.py index 48152ce9d5..9023c94aff 100644 --- a/src/finn/benchmarking/dut/transformer.py +++ b/src/finn/benchmarking/dut/transformer.py @@ -977,7 +977,7 @@ def step_build_setup(self): ) # TESTING custom vs live FIFO-sizing - if self.params.get("fifo_method") == "live": + if self.params.get("live_fifo_sizing"): # insert default FIFO-sizing step (behind step_generate_estimate_reports) for i in range(len(cfg.steps)): if cfg.steps[i] == "step_generate_estimate_reports": diff --git a/src/finn/benchmarking/dut/vgg10.yml b/src/finn/benchmarking/dut/vgg10.yml index 9e271a6921..99a9ab333d 100644 --- a/src/finn/benchmarking/dut/vgg10.yml +++ b/src/finn/benchmarking/dut/vgg10.yml @@ -1,3 +1,7 @@ +model_path: models/vgg10/radioml_w4a4_small_tidy.onnx +folding_config_file: models/vgg10/ZCU104_folding_config.json +specialize_layers_config_file: models/vgg10/ZCU104_specialize_layers.json + steps: - step_tidy_up - finn.builder.custom_step_library.conv1d.step_pre_streamline # Custom step @@ -20,4 +24,8 @@ steps: - step_make_driver - step_deployment_package +# folding config doesn't come with FIFO sizes +auto_fifo_depths: True +auto_fifo_strategy: largefifo_rtlsim + standalone_thresholds: True From 9710dffe8493ab4366c2c8cdce1866c19df03d46 Mon Sep 17 00:00:00 2001 From: Felix Jentzsch Date: Fri, 23 May 2025 14:09:14 +0200 Subject: [PATCH 114/125] Fix linting --- .pre-commit-config.yaml | 1 + ci/cfg/regression_extended.yml | 2 +- src/finn/benchmarking/bench.py | 36 ++- src/finn/benchmarking/bench_base.py | 160 +++++----- src/finn/benchmarking/dut/mvau.py | 74 +++-- .../benchmarking/dut/synthetic_nonlinear.py | 31 +- src/finn/benchmarking/dut/transformer.py | 298 ++++++++---------- src/finn/benchmarking/templates.py | 1 + src/finn/benchmarking/util.py | 16 +- .../builder/custom_step_library/conv1d.py | 4 +- .../builder/custom_step_library/mobilenet.py | 9 +- .../builder/custom_step_library/resnet.py | 3 +- 
.../custom_step_library/transformer.py | 19 +- .../qnn-data/templates/driver/validate.py | 36 ++- 14 files changed, 340 insertions(+), 350 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 048a3becda..10ff4d4415 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -43,6 +43,7 @@ repos: - id: check-merge-conflict - id: check-xml - id: check-yaml + args: ['--unsafe'] - id: debug-statements exclude: '^src/finn/builder/build_dataflow.py$' - id: end-of-file-fixer diff --git a/ci/cfg/regression_extended.yml b/ci/cfg/regression_extended.yml index d4c2d127a2..f40c11ab11 100644 --- a/ci/cfg/regression_extended.yml +++ b/ci/cfg/regression_extended.yml @@ -13,7 +13,7 @@ # { # "dut": ["transformer"], # "seed": [12], - # "model_dir": ["models/gpt_a_6b_gpt2-s256-t2048-l2-h4-e256", + # "model_dir": ["models/gpt_a_6b_gpt2-s256-t2048-l2-h4-e256", # "models/gpt_b_4b_gpt2-s256-t2048-l2-h4-e256", # "models/gpt_c_gpt2-s512-t2048-l2-h4-e512", # "models/gpt_d_gpt2-s256-t2048-l1-h2-e256"], diff --git a/src/finn/benchmarking/bench.py b/src/finn/benchmarking/bench.py index 8233707260..995b3b565c 100644 --- a/src/finn/benchmarking/bench.py +++ b/src/finn/benchmarking/bench.py @@ -1,18 +1,16 @@ import itertools -import os import json -import yaml +import onnxruntime as ort +import os import time import traceback -import onnxruntime as ort +import yaml -from finn.benchmarking.util import delete_dir_contents from finn.benchmarking.bench_base import bench - from finn.benchmarking.dut.mvau import bench_mvau from finn.benchmarking.dut.synthetic_nonlinear import bench_synthetic_nonlinear from finn.benchmarking.dut.transformer import bench_transformer - +from finn.benchmarking.util import delete_dir_contents # Register custom bench subclasses that offer more control than YAML-based flow dut = dict() @@ -27,19 +25,24 @@ def start_bench_run(config_name): # See https://github.com/microsoft/onnxruntime/issues/8313 # This seems to happen only when assigned 
CPU cores are not contiguous _default_session_options = ort.capi._pybind_state.get_default_session_options() + def get_default_session_options_new(): _default_session_options.inter_op_num_threads = 1 _default_session_options.intra_op_num_threads = 1 return _default_session_options + ort.capi._pybind_state.get_default_session_options = get_default_session_options_new try: # Launched via SLURM, expect additional CI env vars job_id = int(os.environ["SLURM_JOB_ID"]) - # experiment_dir = os.environ.get("EXPERIMENT_DIR") # original experiment dir (before potential copy to ramdisk) + # original experiment dir (before potential copy to ramdisk): + # experiment_dir = os.environ.get("EXPERIMENT_DIR") experiment_dir = os.environ.get("CI_PROJECT_DIR") - save_dir = os.path.join(os.environ.get("LOCAL_ARTIFACT_DIR"), - "CI_" + os.environ.get("CI_PIPELINE_ID") + "_" + os.environ.get("CI_PIPELINE_NAME")) + save_dir = os.path.join( + os.environ.get("LOCAL_ARTIFACT_DIR"), + "CI_" + os.environ.get("CI_PIPELINE_ID") + "_" + os.environ.get("CI_PIPELINE_NAME"), + ) work_dir = os.environ["PATH_WORKDIR"] # Gather benchmarking configs @@ -48,7 +51,9 @@ def get_default_session_options_new(): config_path = os.path.join("ci", "cfg", os.environ.get("MANUAL_CFG_PATH") + ".yml") if not os.path.exists(config_path): # Otherwise look in LOCAL_CFG_DIR for the filename - config_path = os.path.join(os.environ.get("LOCAL_CFG_DIR"), os.environ.get("MANUAL_CFG_PATH")) + config_path = os.path.join( + os.environ.get("LOCAL_CFG_DIR"), os.environ.get("MANUAL_CFG_PATH") + ) else: config_path = os.path.join("ci", "cfg", config_name + ".yml") print("Job launched with SLURM ID: %d" % (job_id)) @@ -60,7 +65,7 @@ def get_default_session_options_new(): work_dir = "bench_work" os.makedirs(work_dir, exist_ok=True) delete_dir_contents(work_dir) - config_path = config_name # expect caller to provide direct path to a single config file + config_path = config_name # expect caller to provide direct path to a single config 
file print("Local test job launched without SLURM") try: @@ -129,7 +134,8 @@ def get_default_session_options_new(): # Run benchmark # TODO: integrate this loop (especially status logging) into the bench class - # TODO: log stdout of individual tasks of the job array into seperate files as artifacts (GitLab web interface is not readable), coordinate with new logging + # TODO: log stdout of individual tasks of the job array into seperate files as artifacts + # (GitLab web interface is not readable), coordinate with new logging for run, run_id in enumerate(selected_runs): print( "Starting run %d/%d (id %d of %d total runs)" @@ -144,7 +150,9 @@ def get_default_session_options_new(): # Create bench object for respective DUT if "dut" in params: if params["dut"] in dut: - bench_object = dut[params["dut"]](params, task_id, run_id, work_dir, artifacts_dir, save_dir) + bench_object = dut[params["dut"]]( + params, task_id, run_id, work_dir, artifacts_dir, save_dir + ) else: # If no custom bench subclass is defined, fall back to base class, # expect DUT-specific YAML definition instead @@ -168,7 +176,7 @@ def get_default_session_options_new(): log_dict["output"] = bench_object.output_dict - # examine status reported by builder (which catches all exceptions before they reach us here) + # examine status reported by builder (which catches all exceptions before they reach us) # we could also fail the pipeline if functional verification fails (TODO) builder_log_path = os.path.join(bench_object.report_dir, "metadata_builder.json") if os.path.isfile(builder_log_path): diff --git a/src/finn/benchmarking/bench_base.py b/src/finn/benchmarking/bench_base.py index 4fe8e77168..5cebe09878 100644 --- a/src/finn/benchmarking/bench_base.py +++ b/src/finn/benchmarking/bench_base.py @@ -1,49 +1,25 @@ -import itertools -import os -import subprocess -import copy -import json -import yaml -import time -import traceback import glob +import json +import os import shutil -import numpy as np +import 
subprocess from shutil import copy as shcopy from shutil import copytree -import finn.core.onnx_exec as oxe -from qonnx.custom_op.registry import getCustomOp -from qonnx.transformation.base import Transformation -from finn.transformation.fpgadataflow.compile_cppsim import CompileCppSim -from finn.transformation.fpgadataflow.create_stitched_ip import CreateStitchedIP -from finn.transformation.fpgadataflow.hlssynth_ip import HLSSynthIP -from finn.transformation.fpgadataflow.prepare_cppsim import PrepareCppSim -from finn.transformation.fpgadataflow.prepare_ip import PrepareIP -from finn.transformation.fpgadataflow.prepare_rtlsim import PrepareRTLSim -from finn.transformation.fpgadataflow.set_exec_mode import SetExecMode -from finn.transformation.fpgadataflow.synth_ooc import SynthOutOfContext -from finn.analysis.fpgadataflow.exp_cycles_per_layer import exp_cycles_per_layer -from finn.analysis.fpgadataflow.hls_synth_res_estimation import hls_synth_res_estimation -from finn.analysis.fpgadataflow.res_estimation import res_estimation -from finn.transformation.fpgadataflow.make_zynq_proj import collect_ip_dirs + +import finn.builder.build_dataflow as build import finn.builder.build_dataflow_config as build_cfg -from finn.util.basic import make_build_dir, pynq_native_port_width, part_map, alveo_default_platform, alveo_part_map -from finn.benchmarking.templates import template_open, template_single_test, template_sim_power, template_switching_simulation_tb, zynq_harness_template -from finn.benchmarking.util import summarize_table, summarize_section, power_xml_to_dict, delete_dir_contents -from finn.transformation.fpgadataflow.replace_verilog_relpaths import ( - ReplaceVerilogRelPaths, +from finn.benchmarking.templates import ( + template_open, + template_sim_power, + template_single_test, + template_switching_simulation_tb, ) -from qonnx.util.basic import ( - gen_finn_dt_tensor, - roundup_to_integer_multiple, -) -import finn.builder.build_dataflow as build -from 
finn.analysis.fpgadataflow.post_synth_res import post_synth_res -from qonnx.core.modelwrapper import ModelWrapper +from finn.benchmarking.util import delete_dir_contents, power_xml_to_dict from finn.builder.build_dataflow_config import DataflowBuildConfig -import pandas as pd -import onnxruntime as ort -#TODO: merge this file into bench.py once most functionality has been moved to builder +from finn.util.basic import alveo_default_platform, alveo_part_map, part_map + +# TODO: merge this file into bench.py once most functionality has been moved to builder + def start_test_batch_fast(results_path, project_path, run_target, pairs): # Prepare tcl script @@ -87,7 +63,7 @@ def sim_power_report(results_path, project_path, in_width, out_width, dtype_widt script = script.replace("$SAIF_FILE_PATH$", os.getcwd() + "/switching.saif") script = script.replace("$SIM_DURATION_NS$", str(int(sim_duration_ns))) script = script.replace("$REPORT_PATH$", results_path) - script = script.replace("$REPORT_NAME$", f"sim") + script = script.replace("$REPORT_NAME$", "sim") with open(os.getcwd() + "/power_report.tcl", "w") as tcl_file: tcl_file.write(script) @@ -117,7 +93,8 @@ def sim_power_report(results_path, project_path, in_width, out_width, dtype_widt with open(power_report_json, "w") as json_file: json_file.write(json.dumps(power_report_dict, indent=2)) -class bench(): + +class bench: def __init__(self, params, task_id, run_id, work_dir, artifacts_dir, save_dir, debug=True): super().__init__() self.params = params @@ -128,8 +105,8 @@ def __init__(self, params, task_id, run_id, work_dir, artifacts_dir, save_dir, d self.save_dir = save_dir self.debug = debug - #TODO: setup a logger so output can go to console (with task id prefix) and log simultaneously - #TODO: coordinate with new builder loggin setup + # TODO: setup a logger so output can go to console (with task id prefix) + # TODO: coordinate with new builder loggin setup # Setup some basic global default configuration # TODO: are 
these class members even used anymore? @@ -152,42 +129,46 @@ def __init__(self, params, task_id, run_id, work_dir, artifacts_dir, save_dir, d self.part = part_map[self.board] else: raise Exception("No part specified for board %s" % self.board) - + if self.board in alveo_part_map: self.params["shell_flow_type"] = build_cfg.ShellFlowType.VITIS_ALVEO self.params["vitis_platform"] = alveo_default_platform[self.board] else: self.params["shell_flow_type"] = build_cfg.ShellFlowType.VIVADO_ZYNQ - # Clear FINN tmp build dir before every run (to avoid excessive ramdisk usage and duplicate debug artifacts) + # Clear FINN tmp build dir before every run print("Clearing FINN BUILD DIR ahead of run") delete_dir_contents(os.environ["FINN_BUILD_DIR"]) # Initialize dictionary to collect all benchmark results - # TODO: remove completely or only use for meta data, actual results go into run-specific .json files within /report + # TODO: remove completely or only use for meta data, + # actual results go into run-specific .json files within /report self.output_dict = {} - # Inputs (e.g., ONNX model, golden I/O pair, folding config, etc.) for custom FINN build flow + # Inputs (e.g., ONNX model, golden I/O pair, folding config, etc.) self.build_inputs = {} - # Collect tuples of (name, source path, archive?) to save as pipeline artifacts upon run completion or fail by exception + # Collect tuples of (name, source path, archive?) to save as pipeline artifacts self.artifacts_collection = [] - # Collect tuples of (name, source path, archive?) to save as local artifacts upon run completion or fail by exception + # Collect tuples of (name, source path, archive?) 
to save as local artifacts self.local_artifacts_collection = [] if self.debug: - # Save entire FINN build dir and working dir - # TODO: add option to only save upon exception (in FINN builder or benchmarking infrastructure) - self.local_artifacts_collection.append(("debug_finn_tmp", os.environ["FINN_BUILD_DIR"], True)) - #self.local_artifacts_collection.append(("debug_finn_cwd", os.environ["FINN_ROOT"], False)) + # Save entire FINN_BUILD_DIR + # TODO: add option to only save upon error/exception + self.local_artifacts_collection.append( + ("debug_finn_tmp", os.environ["FINN_BUILD_DIR"], True) + ) - ### SETUP ### + # SETUP # Use a temporary dir for buildflow-related files (next to FINN_BUILD_DIR) # Ensure it exists but is empty (clear potential artifacts from previous runs) tmp_buildflow_dir = os.path.join(self.work_dir, "buildflow") os.makedirs(tmp_buildflow_dir, exist_ok=True) delete_dir_contents(tmp_buildflow_dir) - self.build_inputs["build_dir"] = os.path.join(tmp_buildflow_dir, "build_output") # TODO remove in favor of self.build_dir + self.build_inputs["build_dir"] = os.path.join( + tmp_buildflow_dir, "build_output" + ) # TODO remove in favor of self.build_dir self.build_dir = os.path.join(tmp_buildflow_dir, "build_output") self.report_dir = os.path.join(self.build_dir, "report") os.makedirs(self.report_dir, exist_ok=True) @@ -196,7 +177,9 @@ def __init__(self, params, task_id, run_id, work_dir, artifacts_dir, save_dir, d self.local_artifacts_collection.append(("build_output", self.build_dir, False)) # Save reports and deployment package as pipeline artifacts self.artifacts_collection.append(("reports", self.report_dir, False)) - self.artifacts_collection.append(("reports", os.path.join(self.build_dir, "build_dataflow.log"), False)) + self.artifacts_collection.append( + ("reports", os.path.join(self.build_dir, "build_dataflow.log"), False) + ) self.artifacts_collection.append(("deploy", os.path.join(self.build_dir, "deploy"), True)) def save_artifact(self, 
target_path, source_path, archive=False): @@ -213,13 +196,15 @@ def save_artifact(self, target_path, source_path, archive=False): def save_artifacts_collection(self): # this should be called upon successful or failed completion of a run - for (name, source_path, archive) in self.artifacts_collection: - target_path = os.path.join(self.artifacts_dir, "runs_output", "run_%d" % (self.run_id), name) + for name, source_path, archive in self.artifacts_collection: + target_path = os.path.join( + self.artifacts_dir, "runs_output", "run_%d" % (self.run_id), name + ) self.save_artifact(target_path, source_path, archive) def save_local_artifacts_collection(self): # this should be called upon successful or failed completion of a run - for (name, source_path, archive) in self.local_artifacts_collection: + for name, source_path, archive in self.local_artifacts_collection: target_path = os.path.join(self.save_dir, name, "run_%d" % (self.run_id)) self.save_artifact(target_path, source_path, archive) @@ -235,7 +220,7 @@ def step_build_setup(self): with open(dut_path, "r") as f: return DataflowBuildConfig.from_yaml(f) else: - raise Exception("No DUT-specific YAML build definition found") + raise Exception("No DUT-specific YAML build definition found") # defaults to normal build flow, may be overwritten by subclass def run(self): @@ -243,31 +228,32 @@ def run(self): def step_parse_builder_output(self, build_dir): # TODO: output as .json or even add as new build step - ### CHECK FOR VERIFICATION STEP SUCCESS ### - if (os.path.exists(os.path.join(build_dir, "verification_output"))): + # CHECK FOR VERIFICATION STEP SUCCESS + if os.path.exists(os.path.join(build_dir, "verification_output")): # Collect all verification output filenames outputs = glob.glob(os.path.join(build_dir, "verification_output/*.npy")) # Extract the verification status for each verification output by matching # to the SUCCESS string contained in the filename - status = all([ - out.split("_")[-1].split(".")[0] == 
"SUCCESS" for out in outputs - ]) - + status = all([out.split("_")[-1].split(".")[0] == "SUCCESS" for out in outputs]) + # Construct a dictionary reporting the verification status as string - self.output_dict["builder_verification"] = {"verification": {True: "success", False: "fail"}[status]} + self.output_dict["builder_verification"] = { + "verification": {True: "success", False: "fail"}[status] + } # TODO: mark job as failed if verification fails? def steps_full_build_flow(self): # Default step sequence for benchmarking a full FINN builder flow - ### LIST OF ADDITIONAL YAML OPTIONS (beyond DataflowBuildConfig) + # LIST OF ADDITIONAL YAML OPTIONS (beyond DataflowBuildConfig) custom_params = [ - "model_dir", # used to setup onnx/npy input - "model_path", # used to setup onnx/npy input - # model-gen parameters, such as seed, simd, pe, etc. (TODO: separate from builder options) + "model_dir", # used to setup onnx/npy input + "model_path", # used to setup onnx/npy input + # model-gen parameters, such as seed, simd, pe, etc. 
+ # TODO: separate these from builder options ] - ### MODEL CREATION/IMPORT ### + # MODEL CREATION/IMPORT # TODO: track fixed input onnx models with DVC if "model_dir" in self.params: # input ONNX model and verification input/output pairs are provided @@ -279,12 +265,14 @@ def steps_full_build_flow(self): self.build_inputs["onnx_path"] = self.params["model_path"] else: # input ONNX model (+ optional I/O pair for verification) will be generated - self.build_inputs["onnx_path"] = os.path.join(self.build_inputs["build_dir"], "model_export.onnx") + self.build_inputs["onnx_path"] = os.path.join( + self.build_inputs["build_dir"], "model_export.onnx" + ) if self.step_export_onnx(self.build_inputs["onnx_path"]) == "skipped": - # microbenchmarks might skip because no valid model can be generated for given params + # microbenchmarks might skip because no model can be generated for given params return "skipped" - ### BUILD SETUP ### + # BUILD SETUP # Initialize from YAML (default) or custom script (if dedicated subclass is defined) cfg = self.step_build_setup() @@ -292,18 +280,18 @@ def steps_full_build_flow(self): cfg.output_dir = self.build_inputs["build_dir"] # enable extra performance optimizations (physopt) # TODO: check OMX synth strategy again! 
- cfg.vitis_opt_strategy=build_cfg.VitisOptStrategy.PERFORMANCE_BEST + cfg.vitis_opt_strategy = build_cfg.VitisOptStrategy.PERFORMANCE_BEST cfg.verbose = False cfg.enable_build_pdb_debug = False - #cfg.stitched_ip_gen_dcp = False # only needed for further manual integration + # cfg.stitched_ip_gen_dcp = False # only needed for further manual integration cfg.force_python_rtlsim = False cfg.split_large_fifos = True - cfg.save_intermediate_models = True # Save the intermediate model graphs - cfg.verify_save_full_context = True # Output full context dump for verification steps + cfg.save_intermediate_models = True # Save the intermediate model graphs + cfg.verify_save_full_context = True # Output full context dump for verification steps cfg.enable_instrumentation = True - #rtlsim_use_vivado_comps # TODO ? - #cfg.default_swg_exception - #cfg.large_fifo_mem_style + # rtlsim_use_vivado_comps # TODO ? + # cfg.default_swg_exception + # cfg.large_fifo_mem_style # Overwrite build config settings with run-specific YAML build definition for key in self.params: @@ -312,15 +300,15 @@ def steps_full_build_flow(self): else: if key not in custom_params: pass - #TODO: be more strict? support custom extra options like MetaFi uses? - #raise Exception("Unrecognized builder config defined in YAML: %s" % key) + # TODO: be more strict? support custom extra options like MetaFi uses? + # raise Exception("Unrecognized builder config defined in YAML: %s" % key) # Default of 1M cycles is insufficient for MetaFi (6M) and RN-50 (2.5M) # TODO: make configurable or set on pipeline level? 
os.environ["LIVENESS_THRESHOLD"] = "10000000" - ### BUILD ### + # BUILD build.build_dataflow_cfg(self.build_inputs["onnx_path"], cfg) - ### ANALYSIS ### + # ANALYSIS self.step_parse_builder_output(self.build_inputs["build_dir"]) diff --git a/src/finn/benchmarking/dut/mvau.py b/src/finn/benchmarking/dut/mvau.py index 8ce89fdccc..2c4a6b730a 100644 --- a/src/finn/benchmarking/dut/mvau.py +++ b/src/finn/benchmarking/dut/mvau.py @@ -1,31 +1,24 @@ - +import json import math import numpy as np -import json from onnx import TensorProto, helper from qonnx.core.datatype import DataType from qonnx.core.modelwrapper import ModelWrapper -from qonnx.custom_op.registry import getCustomOp from qonnx.transformation.general import GiveUniqueNodeNames from qonnx.transformation.infer_datatypes import InferDataTypes -from qonnx.transformation.infer_shapes import InferShapes from qonnx.util.basic import ( calculate_matvec_accumulator_range, gen_finn_dt_tensor, - qonnx_make_model + qonnx_make_model, ) -from finn.transformation.fpgadataflow.minimize_accumulator_width import ( - MinimizeAccumulatorWidth, -) -from finn.transformation.fpgadataflow.minimize_weight_bit_width import ( - MinimizeWeightBitWidth, -) -import finn.builder.build_dataflow_config as build_cfg +import finn.builder.build_dataflow_config as build_cfg from finn.benchmarking.bench_base import bench +from finn.transformation.fpgadataflow.minimize_accumulator_width import MinimizeAccumulatorWidth +from finn.transformation.fpgadataflow.minimize_weight_bit_width import MinimizeWeightBitWidth -class bench_mvau(bench): +class bench_mvau(bench): def _make_single_mvau_model( self, W, @@ -77,7 +70,7 @@ def _make_single_mvau_model( actval = 0 no_act = 1 mvau_node = helper.make_node( - "MVAU_hls", #TODO: add rtl support (configurable as param) + "MVAU_hls", # TODO: add rtl support (configurable as param) node_inp_list, ["outp"], domain="finn.custom_op.fpgadataflow.hls", @@ -101,7 +94,9 @@ def _make_single_mvau_model( 
runtime_writeable_weights=0, ) - graph = helper.make_graph(nodes=[mvau_node], name="mvau_graph", inputs=[inp], outputs=[outp]) + graph = helper.make_graph( + nodes=[mvau_node], name="mvau_graph", inputs=[inp], outputs=[outp] + ) model = qonnx_make_model(graph, producer_name="mvau-model") model = ModelWrapper(model) @@ -194,10 +189,14 @@ def step_export_onnx(self, onnx_export_path): W[idx] = 0.0 W = np.reshape(W, (mw, mh)) elif sparsity_type == "rows_random": - idx_mw = np.random.choice(mw, size=int(self.params["sparsity_amount"] * mw), replace=False) + idx_mw = np.random.choice( + mw, size=int(self.params["sparsity_amount"] * mw), replace=False + ) W[idx_mw, :] = 0.0 elif sparsity_type == "cols_random": - idx_mh = np.random.choice(mh, size=int(self.params["sparsity_amount"] * mh), replace=False) + idx_mh = np.random.choice( + mh, size=int(self.params["sparsity_amount"] * mh), replace=False + ) W[:, idx_mh] = 0.0 elif sparsity_type == "rows_regular": if self.params["sparsity_amount"] == 0.25: @@ -206,7 +205,11 @@ def step_export_onnx(self, onnx_export_path): idx_mw = np.arange(0, mw, step=2) elif self.params["sparsity_amount"] == 0.75: idx_mw = np.concatenate( - (np.arange(0, mw, step=4), np.arange(1, mw, step=4), np.arange(2, mw, step=4)) + ( + np.arange(0, mw, step=4), + np.arange(1, mw, step=4), + np.arange(2, mw, step=4), + ) ) else: print("regular sparsity only applicable for amount 0.25/0.5/0.75, skipping") @@ -219,7 +222,11 @@ def step_export_onnx(self, onnx_export_path): idx_mh = np.arange(0, mh, step=2) elif self.params["sparsity_amount"] == 0.75: idx_mh = np.concatenate( - (np.arange(0, mh, step=4), np.arange(1, mh, step=4), np.arange(2, mh, step=4)) + ( + np.arange(0, mh, step=4), + np.arange(1, mh, step=4), + np.arange(2, mh, step=4), + ) ) else: print("regular sparsity only applicable for amount 0.25/0.5/0.75, skipping") @@ -262,13 +269,15 @@ def step_export_onnx(self, onnx_export_path): odt = DataType["INT32"] else: odt = act - # set range for 
threshold values according to worst-case accumulator range (not weight value specific) + # set range for threshold values according to worst-case accumulator range + # (not weight value specific) # this could result in some thresholds being clipped by MinimizeAccumulatorWidth # lower_range = calculate_matvec_accumulator_range(wdt.min() * np.ones_like(W), idt) # upper_range = calculate_matvec_accumulator_range(wdt.max() * np.ones_like(W), idt) # acc_min = min(min(lower_range), min(upper_range)) # acc_max = max(max(lower_range), max(upper_range)) - # set range for threshold values according to actual accumulator range for the generated weights + # set range for threshold values according to actual accumulator range + # for the generated weights (acc_min, acc_max) = calculate_matvec_accumulator_range(W, idt) n_steps = act.get_num_possible_values() - 1 T = np.random.randint(acc_min, acc_max - 1, (mh, n_steps)).astype(np.float32) @@ -285,13 +294,26 @@ def step_export_onnx(self, onnx_export_path): # Create model model = self._make_single_mvau_model( - W, numInputVectors, pe, simd, m, wdt, idt, odt, T, tdt, mem_mode, ram_style, ram_style_thr + W, + numInputVectors, + pe, + simd, + m, + wdt, + idt, + odt, + T, + tdt, + mem_mode, + ram_style, + ram_style_thr, ) model = model.transform(GiveUniqueNodeNames()) - node = model.get_nodes_by_op_type("MVAU_hls")[0] - inst = getCustomOp(node) + # node = model.get_nodes_by_op_type("MVAU_hls")[0] + # inst = getCustomOp(node) - self.target_node = "MVAU_hls" # display results of analysis passes only for the first occurence of this op type + # display results of analysis passes only for the first occurence of this op type + self.target_node = "MVAU_hls" # log additional info about the generated model (e.g. 
SIMD/PE or sparsity) with open(self.build_inputs["build_dir"] + "/report/dut_info.json", "w") as f: @@ -317,6 +339,6 @@ def step_build_setup(self): "step_synthesize_bitfile", "step_make_driver", "step_deployment_package", - ] + ], ) return cfg diff --git a/src/finn/benchmarking/dut/synthetic_nonlinear.py b/src/finn/benchmarking/dut/synthetic_nonlinear.py index b912e8b319..ff33436976 100644 --- a/src/finn/benchmarking/dut/synthetic_nonlinear.py +++ b/src/finn/benchmarking/dut/synthetic_nonlinear.py @@ -1,15 +1,8 @@ -import json import numpy as np -import os -import shutil -import torch -import copy -from brevitas.export import export_qonnx from onnx import TensorProto, helper from qonnx.core.datatype import DataType from qonnx.core.modelwrapper import ModelWrapper from qonnx.custom_op.general.im2col import compute_conv_output_dim -from qonnx.custom_op.registry import getCustomOp from qonnx.transformation.general import ( GiveRandomTensorNames, GiveReadableTensorNames, @@ -21,16 +14,12 @@ from qonnx.transformation.infer_shapes import InferShapes from qonnx.transformation.merge_onnx_models import MergeONNXModels from qonnx.util.basic import gen_finn_dt_tensor, qonnx_make_model -import finn.builder.build_dataflow as build -import finn.builder.build_dataflow_config as build_cfg -from finn.util.basic import make_build_dir -from finn.benchmarking.util import summarize_table, summarize_section, power_xml_to_dict, delete_dir_contents -from finn.util.test import get_trained_network_and_ishape -from finn.util.basic import alveo_default_platform +import finn.builder.build_dataflow_config as build_cfg +from finn.benchmarking.bench_base import bench +from finn.util.basic import make_build_dir -from finn.benchmarking.bench_base import bench def generate_random_threshold_values( data_type, num_input_channels, num_steps, narrow=False, per_tensor=False @@ -50,6 +39,7 @@ def generate_random_threshold_values( def sort_thresholds_increasing(thresholds): return np.sort(thresholds, 
axis=1) + def make_conv_building_block(ifm_dim, ch, kernel_size, simd, pe, parallel_window=0): # hardcoded parameters idt = DataType["UINT4"] @@ -164,7 +154,9 @@ def make_conv_building_block(ifm_dim, ch, kernel_size, simd, pe, parallel_window def combine_blocks(lb, rb, ifm_dim, ch, pe): - # assumes left branch (lb) and right branch (rb) each have a single (dynamic) input/output with the same shape + # assumes left branch (lb) and right branch (rb) each have a + # single (dynamic) input/output with the same shape + # to avoid mix-ups, start by giving all tensors random names lb = lb.transform(GiveRandomTensorNames()) rb = rb.transform(GiveRandomTensorNames()) @@ -249,17 +241,14 @@ def combine_blocks(lb, rb, ifm_dim, ch, pe): model = model.transform(GiveReadableTensorNames()) return model + class bench_synthetic_nonlinear(bench): def step_export_onnx(self, onnx_export_path): np.random.seed(0) tmp_output_dir = make_build_dir("test_fifosizing") - #TODO: allow manual folding/fifo config as input - - #TODO: is a scenario possible where reducing depth of a single FIFO at a time is not sufficient for testing tightness? - # e.g. reducing > 1 FIFOs simultaneously does not cause a throughput drop while reducing a single FIFO does? - - #TODO: how to determine rtlsim_n automatically? + # TODO: allow manual folding/fifo config as input + # TODO: how to determine rtlsim_n automatically? 
# conv parameters dim = self.params["dim"] diff --git a/src/finn/benchmarking/dut/transformer.py b/src/finn/benchmarking/dut/transformer.py index 9023c94aff..83002ef418 100644 --- a/src/finn/benchmarking/dut/transformer.py +++ b/src/finn/benchmarking/dut/transformer.py @@ -1,52 +1,54 @@ # Adapted from Christoph's attention-dummy repository # PyTorch base package: Math and Tensor Stuff +import json +import numpy as np +import random import torch -# Brevitas wrapper around PyTorch tensors adding quantization information -from brevitas.quant_tensor import QuantTensor +from brevitas.export import export_qonnx + # Brevitas: Quantized versions of PyTorch layers from brevitas.nn import ( - QuantMultiheadAttention, QuantEltwiseAdd, QuantIdentity, QuantLinear, - QuantReLU + QuantMultiheadAttention, + QuantReLU, ) -# Progressbar -from tqdm import trange -import numpy as np -from brevitas.export import export_qonnx -import random -import json -# FINN dataflow builder -import finn.builder.build_dataflow_config as build_cfg -from finn.builder.build_dataflow_config import AutoFIFOSizingMethod + +# Brevitas wrapper around PyTorch tensors adding quantization information +from brevitas.quant_tensor import QuantTensor from qonnx.core.modelwrapper import ModelWrapper -from finn.benchmarking.bench_base import bench # Range information structure for seeding the range analysis for converting # quantized activations to MultiThreshold from qonnx.util.range_analysis import RangeInfo +# Progressbar +from tqdm import trange + +# FINN dataflow builder +import finn.builder.build_dataflow_config as build_cfg +from finn.benchmarking.bench_base import bench + # Custom build steps required to streamline and convert the attention operator from finn.builder.custom_step_library.transformer import ( + node_by_node_cppsim, prepare_graph, - step_streamline, + set_fifo_depths, + set_target_parallelization, + step_apply_folding_config, step_convert_attention_to_hw, + step_convert_depth_wise_to_hw, 
step_convert_elementwise_binary_to_hw, step_convert_lookup_to_hw, step_convert_split_concat_to_hw, - step_convert_depth_wise_to_hw, step_replicate_streams, - set_target_parallelization, - set_fifo_depths, - step_apply_folding_config, - node_by_node_rtlsim, # noqa: Maybe unused, only for debugging - node_by_node_cppsim, + step_streamline, ) -### ADAPTED FROM utils.py +# ADAPTED FROM utils.py # Seeds all relevant random number generators to the same seed for # reproducibility def seed(s): @@ -54,14 +56,15 @@ def seed(s): np.random.seed(s) torch.manual_seed(s) -### ADAPTED FROM model.py + +# ADAPTED FROM model.py # Derives a weight quantizer from the brevitas bases leaving bit-width and # signedness configurable def weight_quantizer(bits, _signed=True): # Brevitas quantizer base classes - from brevitas.quant.base import NarrowIntQuant, MaxStatsScaling - from brevitas.quant.solver import WeightQuantSolver from brevitas.inject.enum import RestrictValueType + from brevitas.quant.base import MaxStatsScaling, NarrowIntQuant + from brevitas.quant.solver import WeightQuantSolver # Derive a Quantizer from the brevitas bases class Quantizer(NarrowIntQuant, MaxStatsScaling, WeightQuantSolver): @@ -103,14 +106,12 @@ class Quantizer(IntBias): # signedness configurable def act_quantizer(bits, _signed=True): # Brevitas quantizer base classes + from brevitas.inject.enum import RestrictValueType from brevitas.quant.base import IntQuant, ParamFromRuntimePercentileScaling from brevitas.quant.solver import ActQuantSolver - from brevitas.inject.enum import RestrictValueType # Derive a Quantizer from the brevitas bases - class Quantizer( - IntQuant, ParamFromRuntimePercentileScaling, ActQuantSolver - ): + class Quantizer(IntQuant, ParamFromRuntimePercentileScaling, ActQuantSolver): # Configure the quantization bit-width bit_width = bits # Signedness of the quantization output @@ -141,7 +142,8 @@ def forward(self, x): # noqa: May be static "layer-norm": torch.nn.LayerNorm( # Note: Disable 
affine parameters as potential negative scale causes # streamlining issues later - normalized_shape=normalized_shape, elementwise_affine=False + normalized_shape=normalized_shape, + elementwise_affine=False, ), # PyTorch default 1-dimensional batch normalization. Needs to transpose # embedding and sequence dimension to normalized over the embedding @@ -149,11 +151,13 @@ def forward(self, x): # noqa: May be static "batch-norm": torch.nn.Sequential( # Note: Disable affine parameters as potential negative scale causes # streamlining issues later - Transpose(), torch.nn.LazyBatchNorm1d(affine=False), Transpose() + Transpose(), + torch.nn.LazyBatchNorm1d(affine=False), + Transpose(), ), # No normalization by a PyTorch built-in identity layer. Should not # appear in the graph. - "none": torch.nn.Identity() + "none": torch.nn.Identity(), } # Select the normalization layer by key @@ -172,7 +176,7 @@ def get_mask(key, length): # probability each "random": torch.where( # noqa: Confused by types? torch.rand(length, length) > 0.5, -torch.inf, 0.0 - ) + ), } # Select the mask type by key return masks[key] @@ -181,9 +185,7 @@ def get_mask(key, length): # Single-layer scaled dot-product attention block with MLP and normalization class TransformerBlock(torch.nn.Module): # Initializes the model and registers the module parameters - def __init__( - self, num_heads, emb_dim, mlp_dim, seq_len, bias, norm, mask, bits - ): + def __init__(self, num_heads, emb_dim, mlp_dim, seq_len, bias, norm, mask, bits): # Initialize the PyTorch Module superclass super().__init__() @@ -197,7 +199,7 @@ def __init__( # Quantize at the output act_quant=act_quantizer(bits, _signed=True), # Pass quantization information on to the next layer. 
- return_quant_tensor=True + return_quant_tensor=True, ) # Quantized scaled dot-product attention operator self.sdp = QuantMultiheadAttention( @@ -232,28 +234,24 @@ def __init__( # No quantization in front of the input projections as this is # either done by a standalone quantizer preceding the whole block in_proj_input_quant=None, - # Quantize the output projections weights as configured out_proj_weight_quant=weight_quantizer(bits, _signed=True), # Quantize the bias of the output projections as configured out_proj_bias_quant=bias_quantizer(bits, _signed=True), # Quantize the input to the output projection as configured out_proj_input_quant=act_quantizer(bits, _signed=True), - # Quantizer the key after projections as configured k_transposed_quant=act_quantizer(bits, _signed=True), # Quantize the queries after projections as configured q_scaled_quant=act_quantizer(bits, _signed=True), # Quantize the values after projection as configured v_quant=act_quantizer(bits, _signed=True), - # No output quantization for now, as stacking multiple layers # results in multiple multi-thresholds in succession out_proj_output_quant=None, - # Return the quantization parameters so the next layer can # quantize the bias - return_quant_tensor=True + return_quant_tensor=True, ) # Residual branch addition skipping over the attention layer self.residual_sdp = QuantEltwiseAdd( @@ -266,7 +264,7 @@ def __init__( # fine and does not require re-quantization. output_quant=None, # Pass quantization information on to the next layer. - return_quant_tensor=True + return_quant_tensor=True, ) # Normalization following the attention layer self.norm_sdp = torch.nn.Sequential( @@ -284,7 +282,7 @@ def __init__( # Quantize at the output act_quant=act_quantizer(bits, _signed=True), # Pass quantization information on to the next layer. 
- return_quant_tensor=True + return_quant_tensor=True, ), # First mlp layer projecting to the mlp dimension QuantLinear( @@ -309,7 +307,7 @@ def __init__( output_quant=None, # Return the quantization parameters so the next layer can # quantize the bias - return_quant_tensor=True + return_quant_tensor=True, ), # Use the ReLU activation function instead of the more commonly used # GELU, as the latter is not mapped easily to hardware with FINN @@ -318,7 +316,7 @@ def __init__( act_quant=act_quantizer(bits, _signed=False), # Return the quantization parameters so the next layer can # quantize the bias - return_quant_tensor=True + return_quant_tensor=True, ), # Second mlp layer projecting back to the embedding dimension QuantLinear( @@ -342,7 +340,7 @@ def __init__( # quantized element-wise addition taking care of quantization output_quant=None, # Pass quantization information on to the next layer. - return_quant_tensor=True + return_quant_tensor=True, ), ) # Residual branch addition skipping over the MLP layer @@ -359,7 +357,7 @@ def __init__( # Note: Not for the last layer to allow this to be combined with # standard pytorch calls like .detach() or .numpy(), which are # not directly available on QuantTensor. 
- return_quant_tensor=True + return_quant_tensor=True, ) # Normalization following the attention layer self.norm_mlp = torch.nn.Sequential( @@ -378,9 +376,7 @@ def forward(self, x): # Quantize the input to the attention block q = self.sdp_input_quant(x) # Scaled dot-product attention with residual branch and normalization - x = self.norm_sdp( - self.residual_sdp(x, self.sdp(q, q, q, attn_mask=mask)[0]) - ) + x = self.norm_sdp(self.residual_sdp(x, self.sdp(q, q, q, attn_mask=mask)[0])) # MLP layer with residual branch and normalization return self.norm_mlp(self.residual_mlp(x, self.mlp(x))) @@ -399,7 +395,7 @@ def __init__(self, input_quant, output_quant, return_quant_tensor): # Quantize the outputs after adding input and positional encoding output_quant=output_quant, # Returns quantization information to the next layer - return_quant_tensor=return_quant_tensor + return_quant_tensor=return_quant_tensor, ) # Forward pass adding positional encoding to the input tensor @@ -426,14 +422,7 @@ def forward(self, x): # Quantized learned positional encoding layer class QuantLearnedPositionalEncoding(torch.nn.Module): # Initializes the model and registers the module parameters - def __init__( - self, - seq_len, - emb_dim, - input_quant, - output_quant, - return_quant_tensor - ): + def __init__(self, seq_len, emb_dim, input_quant, output_quant, return_quant_tensor): # Initialize the PyTorch Module superclass super().__init__() # Adds the quantized input and positional encoding @@ -444,7 +433,7 @@ def __init__( # Quantize the outputs after adding input and positional encoding output_quant=output_quant, # Returns quantization information to the next layer - return_quant_tensor=return_quant_tensor + return_quant_tensor=return_quant_tensor, ) # Register a parameter tensor representing the not quantized positional # encoding @@ -467,7 +456,7 @@ def forward(self, x): # Lazy version of the learned encoding not requiring input dimensions at # initialization, inferring these at the 
first forward pass class LazyQuantLearnedPositionalEncoding( - torch.nn.modules.lazy.LazyModuleMixin, QuantLearnedPositionalEncoding # noqa + torch.nn.modules.lazy.LazyModuleMixin, QuantLearnedPositionalEncoding # noqa ): # Once initialized, this will become a QuantLearnedPositionalEncoding as # defined above @@ -520,7 +509,7 @@ def __init__(self, input_quant, output_quant, return_quant_tensor): # Quantize the outputs after adding input and positional encoding output_quant=output_quant, # Returns quantization information to the next layer - return_quant_tensor=return_quant_tensor + return_quant_tensor=return_quant_tensor, ) # Forward pass adding positional encoding to the input tensor @@ -530,9 +519,7 @@ def forward(self, x): _, seq, emb = x.shape # Binary positional encoding fills the embedding dimension with the bit # pattern corresponding to the position in the sequence - pos = torch.as_tensor([ - [(n & (1 << bit)) >> bit for bit in range(emb)] for n in range(seq) - ]) + pos = torch.as_tensor([[(n & (1 << bit)) >> bit for bit in range(emb)] for n in range(seq)]) # Move the encoding tensor to the same device as the input tensor pos = pos.to(x.device, dtype=x.dtype) # Add the quantized encoding tp the quantized input @@ -542,28 +529,22 @@ def forward(self, x): # Gets the positional encoding layer from configuration key, quantizers and # shape -def get_positional_encoding( - key, input_quant, output_quant, return_quant_tensor -): +def get_positional_encoding(key, input_quant, output_quant, return_quant_tensor): # Dictionary mapping keys to supported normalization layer implementations masks = { # No positional encoding - "none": QuantIdentity( - act_quant=input_quant, return_quant_tensor=return_quant_tensor - ), + "none": QuantIdentity(act_quant=input_quant, return_quant_tensor=return_quant_tensor), # Fixed, sinusoidal positional encoding according to Vaswani et al. 
with # added quantizers "sinusoidal": QuantSinusoidalPositionalEncoding( input_quant, output_quant, return_quant_tensor ), # Fixed, binary positional encoding with quantizers - "binary": QuantBinaryPositionalEncoding( - input_quant, output_quant, return_quant_tensor - ), + "binary": QuantBinaryPositionalEncoding(input_quant, output_quant, return_quant_tensor), # Learned positional encoding with quantizers "learned": LazyQuantLearnedPositionalEncoding( input_quant, output_quant, return_quant_tensor - ) + ), } # Select the positional encoding type by key return masks[key] @@ -583,31 +564,31 @@ def unpack_from_quant(tensor: torch.Tensor | QuantTensor): class DummyTransformer(torch.nn.Module): # Initializes the model and registers the module parameters def __init__( - self, - # Number of layers of attention blocks - num_layers, - # Number of attention heads per block - num_heads, - # Size of embedding dimension going into/out of the attention block - emb_dim, - # Size of MLP dimension in each attention block - mlp_dim, - # Length of the input sequence, i.e., context size - seq_len, - # Enables bias term added to Linear layers - bias, - # Quantization bit-width: For now all layers are quantized to the - # same bit-width - bits, - # Type of normalization layer to use in the transformer blocks - # Options are: layer-norm, batch-norm and none - norm="none", - # Type of attention mask to use - # Options are: none, causal or const - mask="none", - # Type of positional encoding to use at the input - # Options are: none, sinusoidal, binary, learned - positional_encoding="none" + self, + # Number of layers of attention blocks + num_layers, + # Number of attention heads per block + num_heads, + # Size of embedding dimension going into/out of the attention block + emb_dim, + # Size of MLP dimension in each attention block + mlp_dim, + # Length of the input sequence, i.e., context size + seq_len, + # Enables bias term added to Linear layers + bias, + # Quantization bit-width: For 
now all layers are quantized to the + # same bit-width + bits, + # Type of normalization layer to use in the transformer blocks + # Options are: layer-norm, batch-norm and none + norm="none", + # Type of attention mask to use + # Options are: none, causal or const + mask="none", + # Type of positional encoding to use at the input + # Options are: none, sinusoidal, binary, learned + positional_encoding="none", ): # Initialize the PyTorch Module superclass super().__init__() @@ -623,15 +604,16 @@ def __init__( # bit-width as the input output_quant=None, # Pass quantization information on to the next layer - return_quant_tensor=True + return_quant_tensor=True, ) # Sequence of num_layers transformer encoder blocks - self.encoder = torch.nn.Sequential(*[ - TransformerBlock( - num_heads, emb_dim, mlp_dim, seq_len, bias, norm, mask, bits - ) for _ in range(num_layers) - ]) + self.encoder = torch.nn.Sequential( + *[ + TransformerBlock(num_heads, emb_dim, mlp_dim, seq_len, bias, norm, mask, bits) + for _ in range(num_layers) + ] + ) # Model forward pass taking an input sequence and returning a single set of # class probabilities @@ -642,7 +624,9 @@ def forward(self, x): # single output from the model. 
return unpack_from_quant(self.encoder(self.pos(x))) -### ADAPTED FROM export.py + +# ADAPTED FROM export.py + # Check whether a layer is a normalization layer of some supported type def is_norm_layer(module): @@ -672,21 +656,18 @@ def patch_non_affine_norms(model: torch.nn.Module): # noqa: Shadows model if hasattr(module, "running_var"): # Patch the affine bias by all 1 tensor of the same shape, # type and device as the running variance - module.weight = torch.nn.Parameter( - torch.ones_like(module.running_var) - ) + module.weight = torch.nn.Parameter(torch.ones_like(module.running_var)) # Check whether affine bias parameters are missing if hasattr(module, "bias") and module.bias is None: # There need to be running statistics to patch the scales if hasattr(module, "running_mean"): # Patch the affine bias by all 0 tensor of the same shape, # type and device as the running mean - module.bias = torch.nn.Parameter( - torch.zeros_like(module.running_var) - ) + module.bias = torch.nn.Parameter(torch.zeros_like(module.running_var)) # Return the patched model container return model + template_folding_yaml = """ # Per operator type default configurations defaults: @@ -780,30 +761,31 @@ def patch_non_affine_norms(model: torch.nn.Module): # noqa: Shadows model # ... """ + class bench_transformer(bench): def step_export_onnx(self, output_onnx_path): # Generates a dummy transformer block, # not used for actual models (RadioML, GPT, etc.) 
# Load the parameters file - #params = dvc.api.params_show("params.yaml") + # params = dvc.api.params_show("params.yaml") # Seed all RNGs seed(self.params["seed"]) # Make PyTorch behave deterministically if possible torch.use_deterministic_algorithms(mode=True, warn_only=True) # Create a model instance from the configuration parameters - #model = DummyTransformer(**params["model"]) + # model = DummyTransformer(**params["model"]) model = DummyTransformer( - num_layers = self.params["model_num_layers"], - num_heads = self.params["model_num_heads"], - emb_dim = self.params["model_emb_dim"], - mlp_dim = self.params["model_mlp_dim"], - seq_len = self.params["model_seq_len"], - bias = self.params["model_bias"], - bits = self.params["model_bits"], - norm = self.params["model_norm"], - mask = self.params["model_mask"], - positional_encoding = self.params["model_positional_encoding"], + num_layers=self.params["model_num_layers"], + num_heads=self.params["model_num_heads"], + emb_dim=self.params["model_emb_dim"], + mlp_dim=self.params["model_mlp_dim"], + seq_len=self.params["model_seq_len"], + bias=self.params["model_bias"], + bits=self.params["model_bits"], + norm=self.params["model_norm"], + mask=self.params["model_mask"], + positional_encoding=self.params["model_positional_encoding"], ) # Get the configured sequence length and embedding dimension to generate @@ -813,7 +795,7 @@ def step_export_onnx(self, output_onnx_path): with torch.no_grad(): # Check whether GPU training is available and select the appropriate # device - device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') + device = torch.device("cuda" if torch.cuda.is_available() else "cpu") # Move the model to the training device model = model.to(device) # Multiple passes of calibration might be necessary for larger/deep @@ -840,13 +822,11 @@ def step_export_onnx(self, output_onnx_path): self.build_inputs["input_npy_path"] = "inp.npy" self.build_inputs["output_npy_path"] = "out.npy" # Export the 
model graph to QONNX - #export_qonnx(model, (x,), "attention.onnx", **self.params["export"]) - export_qonnx(model, (x,), output_onnx_path, - opset_version = 14, - do_constant_folding = True) + # export_qonnx(model, (x,), "attention.onnx", **self.params["export"]) + export_qonnx(model, (x,), output_onnx_path, opset_version=14, do_constant_folding=True) def step_build_setup(self): - #with open("params.yaml") as file: + # with open("params.yaml") as file: # params = yaml.safe_load(file) # Seed all RNGs seed(self.params["seed"]) @@ -863,41 +843,38 @@ def step_build_setup(self): else: # for GPTs (why is this different?) model = ModelWrapper(self.build_inputs["onnx_path"]) - _, seq_len, emb_dim = model.get_tensor_shape("/emb_add/input_quant/export_handler/Quant_output_0") + _, seq_len, emb_dim = model.get_tensor_shape( + "/emb_add/input_quant/export_handler/Quant_output_0" + ) # Read the input value range information for the dataset from the parameters # Note: Consider calibrating this on the fly from the dataset - value_range = [ -100, +100 ] # params["build"]["range"] # TODO: make configurable? + value_range = [-100, +100] # params["build"]["range"] # TODO: make configurable? 
input_range = tuple(np.array([value_range]).T) # Construct the seed range information of the input tensor range_info = RangeInfo(shape=(1, seq_len, emb_dim), range=input_range) - + # Prepare config files # TODO: make configurable - # TODO: log intermediate files such as inp.npy, folding.yaml, or specialize_layers.jon as artifacts, maybe create in unique temp dirs + # TODO: log intermediate files such as inp.npy, folding.yaml, + # or specialize_layers.jon as artifacts, maybe create in unique temp dirs specialize_layers_dict = { - "Defaults": { - "preferred_impl_style": ["rtl", ["MVAU", "Thresholding"]] - }, - "": { - "preferred_impl_style": "" - } + "Defaults": {"preferred_impl_style": ["rtl", ["MVAU", "Thresholding"]]}, + "": {"preferred_impl_style": ""}, } with open("specialize_layers.json", "w") as f: - json.dump(specialize_layers_dict, f, indent=2) + json.dump(specialize_layers_dict, f, indent=2) with open("folding.yaml", "w") as f: - f.write(template_folding_yaml) - + f.write(template_folding_yaml) # Create a configuration for building the scaled dot-product attention # operator to a hardware accelerator cfg = build_cfg.DataflowBuildConfig( - folding_config_file = "folding.yaml", - specialize_layers_config_file = "specialize_layers.json", - standalone_thresholds = True, - max_multithreshold_bit_width = 16, - mvau_wwidth_max = 2048, - + folding_config_file="folding.yaml", + specialize_layers_config_file="specialize_layers.json", + standalone_thresholds=True, + max_multithreshold_bit_width=16, + mvau_wwidth_max=2048, verify_steps=[ # Verify the model after converting to the FINN onnx dialect build_cfg.VerificationStepType.QONNX_TO_FINN_PYTHON, @@ -908,7 +885,8 @@ def step_build_setup(self): # converting to HLS build_cfg.VerificationStepType.TIDY_UP_PYTHON, # Verify the model after generating C++ HLS and applying folding - #build_cfg.VerificationStepType.FOLDED_HLS_CPPSIM, #only inserted if live FIFO-sizing is off + # only inserted if live FIFO-sizing is off: + # 
build_cfg.VerificationStepType.FOLDED_HLS_CPPSIM, # No RTL Simulation support for now ], # File with test inputs for verification @@ -963,17 +941,17 @@ def step_build_setup(self): # model before creating the stitched IP # Note: end-to-end verification of the stitched IP in RTL simulation # is still not possible due to missing float IPs - #node_by_node_cppsim, #only inserted if live FIFO-sizing is off + # node_by_node_cppsim, #only inserted if live FIFO-sizing is off # Only for debugging for now, does not work if "vivado" style # StreamingFIFOs are used # node_by_node_rtlsim, "step_create_stitched_ip", # "step_measure_rtlsim_performance", # not possible due to float components - "step_out_of_context_synthesis", # for synthesis results (e.g. utilization) - "step_synthesize_bitfile", + "step_out_of_context_synthesis", # for synthesis results (e.g. utilization) + "step_synthesize_bitfile", "step_make_driver", "step_deployment_package", - ] + ], ) # TESTING custom vs live FIFO-sizing @@ -981,14 +959,16 @@ def step_build_setup(self): # insert default FIFO-sizing step (behind step_generate_estimate_reports) for i in range(len(cfg.steps)): if cfg.steps[i] == "step_generate_estimate_reports": - cfg.steps.insert(i+1, "step_set_fifo_depths") + cfg.steps.insert(i + 1, "step_set_fifo_depths") else: # insert Christoph's custom FIFO-sizing step (behind step_hw_ipgen) for i in range(len(cfg.steps)): if cfg.steps[i] == "step_hw_ipgen": - cfg.steps.insert(i+1, set_fifo_depths(seq_len, emb_dim, uram_threshold=seq_len)) + cfg.steps.insert( + i + 1, set_fifo_depths(seq_len, emb_dim, uram_threshold=seq_len) + ) # also enable cppsim, which doesn't work with virtual FIFOs - cfg.steps.insert(i+2, node_by_node_cppsim) + cfg.steps.insert(i + 2, node_by_node_cppsim) cfg.verify_steps.append(build_cfg.VerificationStepType.FOLDED_HLS_CPPSIM) return cfg diff --git a/src/finn/benchmarking/templates.py b/src/finn/benchmarking/templates.py index c8bf944380..44c2ebced8 100644 --- 
a/src/finn/benchmarking/templates.py +++ b/src/finn/benchmarking/templates.py @@ -1,5 +1,6 @@ # Template strings for benchmarking +# flake8: noqa # power report scripting based on Lucas Reuter: template_open = """ diff --git a/src/finn/benchmarking/util.py b/src/finn/benchmarking/util.py index 23ecc0a984..1e08bd2501 100644 --- a/src/finn/benchmarking/util.py +++ b/src/finn/benchmarking/util.py @@ -1,8 +1,10 @@ # Utility functions for benchmarking -import os, shutil import json +import os +import shutil import xml.etree.ElementTree as ET + def _find_rows_and_headers(table): rows = table.findall("tablerow") headers = [] @@ -13,6 +15,7 @@ def _find_rows_and_headers(table): break return (rows, headers) + def summarize_table(table): table_summary = {} table_summary["headers"] = [] @@ -38,6 +41,7 @@ def summarize_table(table): return table_summary + def summarize_section(section): section_summary = {} section_summary["tables"] = [] @@ -54,6 +58,7 @@ def summarize_section(section): return section_summary + def power_xml_to_dict(xml_path): tree = ET.parse(xml_path) root = tree.getroot() @@ -65,6 +70,7 @@ def power_xml_to_dict(xml_path): return result + def delete_dir_contents(dir): for filename in os.listdir(dir): file_path = os.path.join(dir, filename) @@ -74,7 +80,8 @@ def delete_dir_contents(dir): elif os.path.isdir(file_path): shutil.rmtree(file_path) except Exception as e: - print('Failed to delete %s. Reason: %s' % (file_path, e)) + print("Failed to delete %s. 
Reason: %s" % (file_path, e)) + def merge_dicts(a: dict, b: dict): for key in b: @@ -87,6 +94,7 @@ def merge_dicts(a: dict, b: dict): a[key] = b[key] return a + def merge_logs(log_a, log_b, log_out): # merges json log (list of nested dicts) b into a, not vice versa (TODO) @@ -98,8 +106,8 @@ def merge_logs(log_a, log_b, log_out): for idx, run_a in enumerate(a): for run_b in b: if run_a["run_id"] == run_b["run_id"]: - #a[idx] |= run_b # requires Python >= 3.9 - #a[idx] = {**run_a, **run_b} + # a[idx] |= run_b # requires Python >= 3.9 + # a[idx] = {**run_a, **run_b} a[idx] = merge_dicts(run_a, run_b) break diff --git a/src/finn/builder/custom_step_library/conv1d.py b/src/finn/builder/custom_step_library/conv1d.py index 5545f66536..f6de8edaae 100644 --- a/src/finn/builder/custom_step_library/conv1d.py +++ b/src/finn/builder/custom_step_library/conv1d.py @@ -1,9 +1,10 @@ from qonnx.core.modelwrapper import ModelWrapper -from finn.builder.build_dataflow_config import DataflowBuildConfig from qonnx.transformation.change_3d_tensors_to_4d import Change3DTo4DTensors from qonnx.transformation.general import GiveUniqueNodeNames + import finn.transformation.fpgadataflow.convert_to_hw_layers as to_hw import finn.transformation.streamline.absorb as absorb +from finn.builder.build_dataflow_config import DataflowBuildConfig def step_pre_streamline(model: ModelWrapper, cfg: DataflowBuildConfig): @@ -11,6 +12,7 @@ def step_pre_streamline(model: ModelWrapper, cfg: DataflowBuildConfig): model = model.transform(absorb.AbsorbScalarMulAddIntoTopK()) return model + def step_convert_final_layers(model: ModelWrapper, cfg: DataflowBuildConfig): model = model.transform(to_hw.InferChannelwiseLinearLayer()) model = model.transform(to_hw.InferLabelSelectLayer()) diff --git a/src/finn/builder/custom_step_library/mobilenet.py b/src/finn/builder/custom_step_library/mobilenet.py index 6a2d8053b2..0c251ad299 100644 --- a/src/finn/builder/custom_step_library/mobilenet.py +++ 
b/src/finn/builder/custom_step_library/mobilenet.py @@ -1,12 +1,7 @@ -from finn.benchmarking.bench_base import bench from qonnx.core.modelwrapper import ModelWrapper from qonnx.transformation.change_datalayout import ChangeDataLayoutQuantAvgPool2d from qonnx.transformation.double_to_single_float import DoubleToSingleFloat -from qonnx.transformation.general import ( - ApplyConfig, - GiveReadableTensorNames, - GiveUniqueNodeNames, -) +from qonnx.transformation.general import ApplyConfig, GiveReadableTensorNames, GiveUniqueNodeNames from qonnx.transformation.infer_data_layouts import InferDataLayouts from qonnx.transformation.infer_datatypes import InferDataTypes from qonnx.transformation.infer_shapes import InferShapes @@ -116,4 +111,4 @@ def step_mobilenet_convert_to_hw_layers_separate_th(model: ModelWrapper, cfg: Da model = model.transform(InferShapes()) model = model.transform(GiveUniqueNodeNames()) model = model.transform(GiveReadableTensorNames()) - return model \ No newline at end of file + return model diff --git a/src/finn/builder/custom_step_library/resnet.py b/src/finn/builder/custom_step_library/resnet.py index a4082b1adf..3e1c61063b 100644 --- a/src/finn/builder/custom_step_library/resnet.py +++ b/src/finn/builder/custom_step_library/resnet.py @@ -34,7 +34,6 @@ from qonnx.transformation.double_to_single_float import DoubleToSingleFloat from qonnx.transformation.fold_constants import FoldConstants from qonnx.transformation.general import ( - ApplyConfig, ConvertDivToMul, ConvertSubToAdd, GiveReadableTensorNames, @@ -52,7 +51,7 @@ from qonnx.transformation.remove import RemoveIdentityOps import finn.transformation.fpgadataflow.convert_to_hw_layers as to_hw -from finn.builder.build_dataflow_config import DataflowBuildConfig, ShellFlowType +from finn.builder.build_dataflow_config import DataflowBuildConfig from finn.transformation.move_reshape import RemoveCNVtoFCFlatten from finn.transformation.streamline.absorb import ( Absorb1BitMulIntoConv, diff --git 
a/src/finn/builder/custom_step_library/transformer.py b/src/finn/builder/custom_step_library/transformer.py index 5b0d39c756..79cfa29353 100644 --- a/src/finn/builder/custom_step_library/transformer.py +++ b/src/finn/builder/custom_step_library/transformer.py @@ -4,6 +4,7 @@ # Copies (deep-copies) python objects import copy +import json # Numpy for loading and comparing the verification input/output import numpy as np @@ -11,8 +12,6 @@ # YAML for loading experiment configurations import yaml -import json - # QONNX quantization data types from qonnx.core.datatype import DataType @@ -113,10 +112,7 @@ from finn.transformation.fpgadataflow.set_exec_mode import SetExecMode # Splitting and removing of FIFOs from the model graph -from finn.transformation.fpgadataflow.set_fifo_depths import ( - RemoveShallowFIFOs, - SplitLargeFIFOs, -) +from finn.transformation.fpgadataflow.set_fifo_depths import RemoveShallowFIFOs, SplitLargeFIFOs # Graph transformation setting the folding, i.e., parallelization configuration from finn.transformation.fpgadataflow.set_folding import SetFolding @@ -130,15 +126,12 @@ # Folds quantizers into weight tensor initializers, needed for lowering # convolutions to MatMuls from finn.transformation.qonnx.fold_quant_weights import FoldQuantWeights -from finn.transformation.qonnx.quant_act_to_multithreshold import ( - default_filter_function_generator, -) +from finn.transformation.qonnx.quant_act_to_multithreshold import default_filter_function_generator # Cleanup transformation getting rid of 3d data layout from finn.transformation.squeeze import Squeeze from finn.transformation.streamline.absorb import ( AbsorbAddIntoMultiThreshold, - AbsorbMulIntoMultiThreshold, AbsorbSignBiasIntoMultiThreshold, ) @@ -148,14 +141,10 @@ # FINN streamlining transformations removing nodes without real effect from the # graph -from finn.transformation.streamline.remove import ( - RemoveIdentityReshape, - RemoveIdentityTranspose, -) +from 
finn.transformation.streamline.remove import RemoveIdentityReshape, RemoveIdentityTranspose # FINN streamlining transformations reordering the graph from finn.transformation.streamline.reorder import ( - MoveAddPastMul, MoveMulPastAdd, MoveSqueezePastMatMul, MoveSqueezePastMultiThreshold, diff --git a/src/finn/qnn-data/templates/driver/validate.py b/src/finn/qnn-data/templates/driver/validate.py index 16f1e7a029..0e2bc27114 100644 --- a/src/finn/qnn-data/templates/driver/validate.py +++ b/src/finn/qnn-data/templates/driver/validate.py @@ -27,14 +27,15 @@ # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. import argparse -import os +import json import numpy as np -from PIL import Image +import os from dataset_loading import FileQueue, ImgQueue -import json -from pynq import PL from driver import io_shape_dict from driver_base import FINNExampleOverlay +from PIL import Image +from pynq import PL + def img_resize(img, size): w, h = img.size @@ -49,13 +50,15 @@ def img_resize(img, size): ow = int(size * w / h) return img.resize((ow, oh), Image.BILINEAR) + def img_center_crop(img, size): crop_height, crop_width = (size, size) image_width, image_height = img.size - crop_top = int(round((image_height - crop_height) / 2.)) - crop_left = int(round((image_width - crop_width) / 2.)) + crop_top = int(round((image_height - crop_height) / 2.0)) + crop_left = int(round((image_width - crop_width) / 2.0)) return img.crop((crop_left, crop_top, crop_left + crop_width, crop_top + crop_height)) + def pre_process(img_np): img = Image.fromarray(img_np.astype(np.uint8)) img = img_resize(img, 256) @@ -63,9 +66,10 @@ def pre_process(img_np): img = np.array(img, dtype=np.uint8) return img -def setup_dataloader(val_path, label_file_path = None, batch_size=100, n_images = 50000): + +def setup_dataloader(val_path, label_file_path=None, batch_size=100, n_images=50000): if label_file_path is None: - val_folders = [ f.name for f in os.scandir(val_path) if f.is_dir() ] + 
val_folders = [f.name for f in os.scandir(val_path) if f.is_dir()] val_folders = sorted(val_folders) assert len(val_folders) == 1000, "Expected 1000 subfolders in ILSVRC2012 val" files = [] @@ -74,18 +78,19 @@ def setup_dataloader(val_path, label_file_path = None, batch_size=100, n_images current_files = sorted(os.listdir(os.path.join(val_path, folder))) current_files = [os.path.join(folder, file) for file in current_files] files.extend(current_files) - labels.extend([idx]*len(current_files)) + labels.extend([idx] * len(current_files)) files = files[:n_images] else: - files = ['ILSVRC2012_val_{:08d}.JPEG'.format(i) for i in range(1,n_images+1)] + files = ["ILSVRC2012_val_{:08d}.JPEG".format(i) for i in range(1, n_images + 1)] labels = np.loadtxt(label_file_path, dtype=int, usecols=1) file_queue = FileQueue() - file_queue.load_epochs(list(zip(files,labels)), shuffle=False) + file_queue.load_epochs(list(zip(files, labels)), shuffle=False) img_queue = ImgQueue(maxsize=batch_size) img_queue.start_loaders(file_queue, num_threads=1, img_dir=val_path, transform=pre_process) return img_queue + if __name__ == "__main__": parser = argparse.ArgumentParser( description="Validate top-1 accuracy for FINN-generated accelerator" @@ -93,7 +98,9 @@ def setup_dataloader(val_path, label_file_path = None, batch_size=100, n_images parser.add_argument( "--batchsize", help="number of samples for inference", type=int, default=100 ) - parser.add_argument("--dataset", help="dataset to use (mnist, cifar10, cifar100, imagenet)", default="") + parser.add_argument( + "--dataset", help="dataset to use (mnist, cifar10, cifar100, imagenet)", default="" + ) parser.add_argument( "--platform", help="Target platform: zynq-iodma alveo", default="zynq-iodma" ) @@ -154,6 +161,7 @@ def setup_dataloader(val_path, label_file_path = None, batch_size=100, n_images ) elif dataset == "cifar100": from dataset_loading import cifar + trainx, trainy, testx, testy, valx, valy = cifar.load_cifar_data( dataset_root, 
download=True, one_hot=False, cifar10=False ) @@ -184,7 +192,7 @@ def setup_dataloader(val_path, label_file_path = None, batch_size=100, n_images ibuf_normal = test_imgs[i].reshape(driver.ishape_normal()) exp = test_labels[i] obuf_normal = driver.execute(ibuf_normal) - #obuf_normal = obuf_normal.reshape(bsize, -1)[:,0] + # obuf_normal = obuf_normal.reshape(bsize, -1)[:,0] if obuf_normal.shape[1] > 1: obuf_normal = np.argmax(obuf_normal, axis=1) ret = np.bincount(obuf_normal.flatten() == exp.flatten(), minlength=2) @@ -202,7 +210,7 @@ def setup_dataloader(val_path, label_file_path = None, batch_size=100, n_images exp = np.array(lbls) ibuf_normal = imgs.reshape(driver.ishape_normal()) obuf_normal = driver.execute(ibuf_normal) - #obuf_normal = obuf_normal.reshape(bsize, -1)[:,0] + # obuf_normal = obuf_normal.reshape(bsize, -1)[:,0] if obuf_normal.shape[1] > 1: obuf_normal = np.argmax(obuf_normal, axis=1) ret = np.bincount(obuf_normal.flatten() == exp.flatten(), minlength=2) From 3a84a57f8584e669cfa80bdc65465aa52d8a21bb Mon Sep 17 00:00:00 2001 From: Felix Jentzsch Date: Fri, 23 May 2025 16:08:48 +0200 Subject: [PATCH 115/125] Change log level --- src/finn/benchmarking/bench_base.py | 40 ++++++++++++++++------------- 1 file changed, 22 insertions(+), 18 deletions(-) diff --git a/src/finn/benchmarking/bench_base.py b/src/finn/benchmarking/bench_base.py index 5cebe09878..01e42b9c2a 100644 --- a/src/finn/benchmarking/bench_base.py +++ b/src/finn/benchmarking/bench_base.py @@ -3,6 +3,7 @@ import os import shutil import subprocess +import yaml from shutil import copy as shcopy from shutil import copytree @@ -18,8 +19,6 @@ from finn.builder.build_dataflow_config import DataflowBuildConfig from finn.util.basic import alveo_default_platform, alveo_part_map, part_map -# TODO: merge this file into bench.py once most functionality has been moved to builder - def start_test_batch_fast(results_path, project_path, run_target, pairs): # Prepare tcl script @@ -109,7 +108,7 @@ def 
__init__(self, params, task_id, run_id, work_dir, artifacts_dir, save_dir, d # TODO: coordinate with new builder loggin setup # Setup some basic global default configuration - # TODO: are these class members even used anymore? + # TODO: clean up or remove these attributes if "synth_clk_period_ns" in params: self.clock_period_ns = params["synth_clk_period_ns"] else: @@ -136,6 +135,23 @@ def __init__(self, params, task_id, run_id, work_dir, artifacts_dir, save_dir, d else: self.params["shell_flow_type"] = build_cfg.ShellFlowType.VIVADO_ZYNQ + # Load custom (= non build_dataflow_config) parameters from topology-specific .yml + custom_params = [ + "model_dir", # used to setup onnx/npy input + "model_path", # used to setup onnx/npy input + # model-gen parameters, such as seed, simd, pe, etc. + # TODO: separate these more cleanly from builder options + ] + + dut_yaml_name = self.params["dut"] + ".yml" + dut_path = os.path.join(os.path.dirname(__file__), "dut", dut_yaml_name) + if os.path.isfile(dut_path): + with open(dut_path, "r") as f: + dut_cfg = yaml.load(f, Loader=yaml.SafeLoader) + for key in dut_cfg: + if key in custom_params: + self.params[key] = dut_cfg[key] + # Clear FINN tmp build dir before every run print("Clearing FINN BUILD DIR ahead of run") delete_dir_contents(os.environ["FINN_BUILD_DIR"]) @@ -244,15 +260,6 @@ def step_parse_builder_output(self, build_dir): def steps_full_build_flow(self): # Default step sequence for benchmarking a full FINN builder flow - - # LIST OF ADDITIONAL YAML OPTIONS (beyond DataflowBuildConfig) - custom_params = [ - "model_dir", # used to setup onnx/npy input - "model_path", # used to setup onnx/npy input - # model-gen parameters, such as seed, simd, pe, etc. 
- # TODO: separate these from builder options - ] - # MODEL CREATION/IMPORT # TODO: track fixed input onnx models with DVC if "model_dir" in self.params: @@ -281,7 +288,8 @@ def steps_full_build_flow(self): # enable extra performance optimizations (physopt) # TODO: check OMX synth strategy again! cfg.vitis_opt_strategy = build_cfg.VitisOptStrategy.PERFORMANCE_BEST - cfg.verbose = False + cfg.verbose = True + cfg.console_log_level = "ERROR" cfg.enable_build_pdb_debug = False # cfg.stitched_ip_gen_dcp = False # only needed for further manual integration cfg.force_python_rtlsim = False @@ -294,14 +302,10 @@ def steps_full_build_flow(self): # cfg.large_fifo_mem_style # Overwrite build config settings with run-specific YAML build definition + # TODO: warn/error if there are unrecognized options set? for key in self.params: if hasattr(cfg, key): setattr(cfg, key, self.params[key]) - else: - if key not in custom_params: - pass - # TODO: be more strict? support custom extra options like MetaFi uses? - # raise Exception("Unrecognized builder config defined in YAML: %s" % key) # Default of 1M cycles is insufficient for MetaFi (6M) and RN-50 (2.5M) # TODO: make configurable or set on pipeline level? 
From 6054c6b02a7ba4f9aaff94fb21bbf2e401e4166b Mon Sep 17 00:00:00 2001 From: Felix Jentzsch Date: Fri, 23 May 2025 17:50:38 +0200 Subject: [PATCH 116/125] dvc pull before saving dvclive experiments --- ci/.gitlab-bench.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/ci/.gitlab-bench.yml b/ci/.gitlab-bench.yml index b5d17d7fdc..6ddeb11858 100644 --- a/ci/.gitlab-bench.yml +++ b/ci/.gitlab-bench.yml @@ -75,5 +75,7 @@ Result Collection: # Also run on failure of previous tasks to collect partial results - when: always script: + # pulling models seems to be needed for dvclive to save experiments, even though they are not used or modified + - dvc pull - python3.10 ci/collect.py - dvc exp push -f -j 4 -r push git@github.com:eki-project/finn-plus.git From 8dd0c0854df0c121c2c1afe7c62970775d8d0ba2 Mon Sep 17 00:00:00 2001 From: Felix Jentzsch Date: Fri, 23 May 2025 22:16:01 +0200 Subject: [PATCH 117/125] Fix report dir creation --- src/finn/builder/build_dataflow.py | 8 ++++---- src/finn/transformation/fpgadataflow/make_driver.py | 2 +- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/src/finn/builder/build_dataflow.py b/src/finn/builder/build_dataflow.py index 2d38be3ab3..020571a6ad 100644 --- a/src/finn/builder/build_dataflow.py +++ b/src/finn/builder/build_dataflow.py @@ -171,7 +171,7 @@ def build_dataflow_cfg(model_filename, cfg: DataflowBuildConfig): print(f"Final outputs will be generated in {cfg.output_dir}") print(f"Build log is at {cfg.output_dir}/build_dataflow.log") # create the output dir if it doesn't exist - os.makedirs(cfg.output_dir, exist_ok=True) + os.makedirs(os.path.join(cfg.output_dir, "report"), exist_ok=True) # set up logger logpath = os.path.join(cfg.output_dir, "build_dataflow.log") @@ -285,7 +285,7 @@ def build_dataflow_cfg(model_filename, cfg: DataflowBuildConfig): "status": "failed", "tool_version": os.path.basename(os.environ.get("XILINX_VIVADO")), } - with open(cfg.output_dir + "/report/metadata_builder.json", "w") as f: + 
with open(os.path.join(cfg.output_dir, "report/metadata_builder.json"), "w") as f: json.dump(metadata, f, indent=2) return -1 # A user error shouldn't be need to be fixed using PDB @@ -297,7 +297,7 @@ def build_dataflow_cfg(model_filename, cfg: DataflowBuildConfig): "status": "failed", "tool_version": os.path.basename(os.environ.get("XILINX_VIVADO")), } - with open(cfg.output_dir + "/report/metadata_builder.json", "w") as f: + with open(os.path.join(cfg.output_dir, "report/metadata_builder.json"), "w") as f: json.dump(metadata, f, indent=2) return -1 @@ -308,7 +308,7 @@ def build_dataflow_cfg(model_filename, cfg: DataflowBuildConfig): "status": "ok", "tool_version": os.path.basename(os.environ.get("XILINX_VIVADO")), } - with open(cfg.output_dir + "/report/metadata_builder.json", "w") as f: + with open(os.path.join(cfg.output_dir, "report/metadata_builder.json"), "w") as f: json.dump(metadata, f, indent=2) print("Completed successfully") return 0 diff --git a/src/finn/transformation/fpgadataflow/make_driver.py b/src/finn/transformation/fpgadataflow/make_driver.py index 76880cb558..4b1e70369b 100644 --- a/src/finn/transformation/fpgadataflow/make_driver.py +++ b/src/finn/transformation/fpgadataflow/make_driver.py @@ -312,7 +312,7 @@ class MakePYNQDriverIODMA(Transformation): under the runtime_weights/ subfolder of the pynq_driver_dir. 
""" - def __init__(self, platform, validation_datset): + def __init__(self, platform, validation_datset=None): super().__init__() self.platform = platform self.validation_datset = validation_datset From 807950b2a8764b1da3a68cc8ef5b6976b3456e77 Mon Sep 17 00:00:00 2001 From: Felix Jentzsch Date: Mon, 26 May 2025 12:01:49 +0200 Subject: [PATCH 118/125] Use live FIFO sizes for MNv1, RN50 --- models.dvc | 6 +++--- src/finn/benchmarking/dut/mobilenetv1.yml | 2 +- src/finn/benchmarking/dut/resnet50.yml | 2 +- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/models.dvc b/models.dvc index 784500a21f..35b5292128 100644 --- a/models.dvc +++ b/models.dvc @@ -1,6 +1,6 @@ outs: -- md5: 5db49af689e7827c32280837e0c80470.dir - size: 202993533 - nfiles: 40 +- md5: 20c3f996d17ef035c8189c0d0ac44cf6.dir + size: 203029833 + nfiles: 42 hash: md5 path: models diff --git a/src/finn/benchmarking/dut/mobilenetv1.yml b/src/finn/benchmarking/dut/mobilenetv1.yml index bb3b26f436..16a68f4143 100644 --- a/src/finn/benchmarking/dut/mobilenetv1.yml +++ b/src/finn/benchmarking/dut/mobilenetv1.yml @@ -1,5 +1,5 @@ model_path: models/mobilenetv1/mobilenetv1-w4a4_pre_post_tidy_opset-11.onnx -folding_config_file: models/mobilenetv1/ZCU102_folding_config.json +folding_config_file: models/mobilenetv1/ZCU102_folding_config_live_fifo.json specialize_layers_config_file: models/mobilenetv1/ZCU102_specialize_layers.json steps: diff --git a/src/finn/benchmarking/dut/resnet50.yml b/src/finn/benchmarking/dut/resnet50.yml index 3a3211aad1..c8779e5654 100644 --- a/src/finn/benchmarking/dut/resnet50.yml +++ b/src/finn/benchmarking/dut/resnet50.yml @@ -1,5 +1,5 @@ model_path: models/resnet50/resnet50_w1a2_exported.onnx -folding_config_file: models/resnet50/U250_folding_config.json +folding_config_file: models/resnet50/U250_folding_config_live_fifo.json specialize_layers_config_file: models/resnet50/U250_specialize_layers.json vitis_floorplan_file: models/resnet50/floorplan_resnet50.json From 
94abf2c7c795650476b35fc4bbd180451524d9e3 Mon Sep 17 00:00:00 2001 From: Felix Jentzsch Date: Mon, 26 May 2025 14:30:49 +0200 Subject: [PATCH 119/125] Make console and log output more consistent --- src/finn/benchmarking/bench.py | 13 ++- src/finn/builder/build_dataflow.py | 120 +++++++++++----------- src/finn/builder/build_dataflow_config.py | 8 +- 3 files changed, 70 insertions(+), 71 deletions(-) diff --git a/src/finn/benchmarking/bench.py b/src/finn/benchmarking/bench.py index 995b3b565c..765e14e587 100644 --- a/src/finn/benchmarking/bench.py +++ b/src/finn/benchmarking/bench.py @@ -165,13 +165,12 @@ def get_default_session_options_new(): result = bench_object.run() if result == "skipped": log_dict["status"] = "skipped" - print("Run skipped") + print("BENCH RUN SKIPPED") else: log_dict["status"] = "ok" - print("Run successfully completed") except Exception: log_dict["status"] = "failed" - print("Run failed: " + traceback.format_exc()) + print("BENCH RUN FAILED WITH EXCEPTION: " + traceback.format_exc()) exit_code = 1 log_dict["output"] = bench_object.output_dict @@ -183,8 +182,12 @@ def get_default_session_options_new(): with open(builder_log_path, "r") as f: builder_log = json.load(f) if builder_log["status"] == "failed": - print("Run failed (builder reported failure)") + print("BENCH RUN FAILED (BUILDER REPORTED FAILURE)") exit_code = 1 + else: + print("BENCH RUN COMPLETED (BUILDER REPORTED SUCCESS)") + else: + print("BENCH RUN COMPLETED") # log metadata of this run to its own report directory log_path = os.path.join(bench_object.report_dir, "metadata_bench.json") @@ -196,5 +199,5 @@ def get_default_session_options_new(): # save local artifacts of this run (e.g., full build dir, detailed debug info) bench_object.save_local_artifacts_collection() - print("Stopping job") + print("STOPPING JOB") return exit_code diff --git a/src/finn/builder/build_dataflow.py b/src/finn/builder/build_dataflow.py index 020571a6ad..f96f205e72 100644 --- 
a/src/finn/builder/build_dataflow.py +++ b/src/finn/builder/build_dataflow.py @@ -39,8 +39,10 @@ import sys import time from qonnx.core.modelwrapper import ModelWrapper +from rich import print as rprint from rich.console import Console from rich.logging import RichHandler +from rich.traceback import Traceback from finn.builder.build_dataflow_config import DataflowBuildConfig, default_build_dataflow_steps from finn.builder.build_dataflow_steps import build_dataflow_step_lookup @@ -159,21 +161,41 @@ def resolve_step_filename(step_name: str, cfg: DataflowBuildConfig, step_delta: return filename +def log_and_exit(cfg: DataflowBuildConfig, time_per_step: dict = None, exit_code: int = 0): + if exit_code: + print("Build failed") + status = "failed" + else: + print("Build completed successfully") + status = "ok" + + # Generate metadata_builder.json + metadata = { + "status": status, + "tool_version": os.path.basename(os.environ.get("XILINX_VIVADO")), + } + with open(os.path.join(cfg.output_dir, "report/metadata_builder.json"), "w") as f: + json.dump(metadata, f, indent=2) + + # Generate time_per_step.json + if time_per_step is not None: + time_per_step["total_build_time"] = sum(time_per_step.values()) + with open(os.path.join(cfg.output_dir, "report/time_per_step.json"), "w") as f: + json.dump(time_per_step, f, indent=2) + + return exit_code + + def build_dataflow_cfg(model_filename, cfg: DataflowBuildConfig): """Best-effort build a dataflow accelerator using the given configuration. 
:param model_filename: ONNX model filename to build :param cfg: Build configuration """ - finn_build_dir = os.environ["FINN_BUILD_DIR"] - - print(f"Intermediate outputs will be generated in {finn_build_dir}") - print(f"Final outputs will be generated in {cfg.output_dir}") - print(f"Build log is at {cfg.output_dir}/build_dataflow.log") - # create the output dir if it doesn't exist + # Create the output (report) dir if it doesn't exist os.makedirs(os.path.join(cfg.output_dir, "report"), exist_ok=True) - # set up logger + # Set up logger logpath = os.path.join(cfg.output_dir, "build_dataflow.log") if cfg.verbose: logging.basicConfig( @@ -195,15 +217,16 @@ def build_dataflow_cfg(model_filename, cfg: DataflowBuildConfig): log = logging.getLogger("build_dataflow") - # mirror stdout and stderr to log + # Mirror stdout and stderr to log sys.stdout = PrintLogger(log, logging.INFO, sys.stdout) sys.stderr = PrintLogger(log, logging.ERROR, sys.stderr) console = Console(file=sys.stdout.console) + # Set up console logger if cfg.console_log_level != "NONE": - # set up console logger - consoleHandler = RichHandler(show_time=True, show_path=False, console=console) - + consoleHandler = RichHandler( + show_time=True, log_time_format="[%Y-%m-%d %H:%M:%S]", show_path=False, console=console + ) if cfg.console_log_level == "DEBUG": consoleHandler.setLevel(logging.DEBUG) elif cfg.console_log_level == "INFO": @@ -216,9 +239,13 @@ def build_dataflow_cfg(model_filename, cfg: DataflowBuildConfig): consoleHandler.setLevel(logging.CRITICAL) logging.getLogger().addHandler(consoleHandler) - # Setup done, start processing + print(f"Intermediate outputs will be generated in {os.environ['FINN_BUILD_DIR']}") + print(f"Final outputs will be generated in {cfg.output_dir}") + print(f"Build log is at {cfg.output_dir}/build_dataflow.log") + + # Setup done, start build flow try: - # if start_step is specified, override the input model + # If start_step is specified, override the input model if 
cfg.start_step is None: print(f"Building dataflow accelerator from {model_filename}") model = ModelWrapper(model_filename) @@ -240,7 +267,7 @@ def build_dataflow_cfg(model_filename, cfg: DataflowBuildConfig): model = ModelWrapper(intermediate_model_filename) assert type(model) is ModelWrapper - # start processing + # Start processing step_num = 1 time_per_step = dict() build_dataflow_steps = resolve_build_steps(cfg) @@ -249,7 +276,7 @@ def build_dataflow_cfg(model_filename, cfg: DataflowBuildConfig): step_name = transform_step.__name__ print(f"Running step: {step_name} [{step_num}/{len(build_dataflow_steps)}]") - # run the step + # Run the step step_start = time.time() model = transform_step(model, cfg) step_end = time.time() @@ -263,55 +290,24 @@ def build_dataflow_cfg(model_filename, cfg: DataflowBuildConfig): step_num += 1 except KeyboardInterrupt: print("KeyboardInterrupt detected. Aborting...") - print("Build failed") - return -1 + return log_and_exit(cfg, time_per_step, -1) except (Exception, FINNError) as e: - # Print full traceback if we are on debug log level - # or encountered a non-user error - print_full_traceback = True - if issubclass(type(e), FINNUserError) and log.level != logging.DEBUG: - print_full_traceback = False - - extype, value, tb = sys.exc_info() - if print_full_traceback: - # print exception info and traceback - log.error("FINN Internal compiler error:") - console.print_exception(show_locals=False) + if issubclass(type(e), FINNUserError): + # Handle FINN USER ERROR + log.error(f"FINN ERROR: {e}") else: - console.print(f"[bold red]FINN Error: [/bold red]{e}") - log.error(f"{e}") - print("Build failed") - metadata = { - "status": "failed", - "tool_version": os.path.basename(os.environ.get("XILINX_VIVADO")), - } - with open(os.path.join(cfg.output_dir, "report/metadata_builder.json"), "w") as f: - json.dump(metadata, f, indent=2) - return -1 # A user error shouldn't be need to be fixed using PDB - - # start postmortem debug if configured - 
if cfg.enable_build_pdb_debug: - pdb.post_mortem(tb) - print("Build failed") - metadata = { - "status": "failed", - "tool_version": os.path.basename(os.environ.get("XILINX_VIVADO")), - } - with open(os.path.join(cfg.output_dir, "report/metadata_builder.json"), "w") as f: - json.dump(metadata, f, indent=2) - return -1 - - time_per_step["total_build_time"] = sum(time_per_step.values()) - with open(os.path.join(cfg.output_dir, "report/time_per_step.json"), "w") as f: - json.dump(time_per_step, f, indent=2) - metadata = { - "status": "ok", - "tool_version": os.path.basename(os.environ.get("XILINX_VIVADO")), - } - with open(os.path.join(cfg.output_dir, "report/metadata_builder.json"), "w") as f: - json.dump(metadata, f, indent=2) - print("Completed successfully") - return 0 + # Handle remaining errors (= FINN INTERNAL COMPILER ERROR) + log.error(f"FINN INTERNAL COMPILER ERROR: {e}") + + # Print traceback for internal errors or if in debug mode + if not issubclass(type(e), FINNUserError) or log.level == logging.DEBUG: + rprint(Traceback(show_locals=False)) + # Start postmortem debug if configured + if cfg.enable_build_pdb_debug: + pdb.post_mortem(e.__traceback__) + + return log_and_exit(cfg, time_per_step, -1) + return log_and_exit(cfg, time_per_step, 0) def build_dataflow_directory(path_to_cfg_dir: str): diff --git a/src/finn/builder/build_dataflow_config.py b/src/finn/builder/build_dataflow_config.py index b14bcab1d4..57204c5745 100644 --- a/src/finn/builder/build_dataflow_config.py +++ b/src/finn/builder/build_dataflow_config.py @@ -350,14 +350,14 @@ class DataflowBuildConfig(DataClassJSONMixin, DataClassYAMLMixin): #: Whether pdb postmortem debuggig will be launched when the build fails enable_build_pdb_debug: Optional[bool] = False - #: When True, additional verbose information will be written to the log file. - #: Otherwise, these additional information will be suppressed. + #: When True, additional information (level = DEBUG) will be written to the log file. 
+ #: Otherwise, this additional information will be suppressed (level = INFO). verbose: Optional[bool] = False #: Log level to be used on the command line for finn-plus internal logging. - #: This is different from the log level used for the build process, + #: This is different from the log level used for build_dataflow.log, #: which is controlled using the verbose flag. - console_log_level: Optional[LogLevel] = LogLevel.NONE + console_log_level: Optional[LogLevel] = LogLevel.ERROR #: If given, only run the steps in the list. If not, run default steps. #: See `default_build_dataflow_steps` for the default list of steps. From 9899d542efbee4756842d2b01ce15e5f47539680 Mon Sep 17 00:00:00 2001 From: Felix Jentzsch Date: Mon, 26 May 2025 16:25:51 +0200 Subject: [PATCH 120/125] More verbose benchmarking logging --- src/finn/benchmarking/bench.py | 25 +++++++++++++++++-------- 1 file changed, 17 insertions(+), 8 deletions(-) diff --git a/src/finn/benchmarking/bench.py b/src/finn/benchmarking/bench.py index 765e14e587..5f29959712 100644 --- a/src/finn/benchmarking/bench.py +++ b/src/finn/benchmarking/bench.py @@ -134,8 +134,9 @@ def get_default_session_options_new(): # Run benchmark # TODO: integrate this loop (especially status logging) into the bench class - # TODO: log stdout of individual tasks of the job array into seperate files as artifacts - # (GitLab web interface is not readable), coordinate with new logging + successful_runs = [] + skipped_runs = [] + failed_runs = [] for run, run_id in enumerate(selected_runs): print( "Starting run %d/%d (id %d of %d total runs)" @@ -165,12 +166,14 @@ def get_default_session_options_new(): result = bench_object.run() if result == "skipped": log_dict["status"] = "skipped" - print("BENCH RUN SKIPPED") + print("BENCH RUN %d SKIPPED" % run_id) + skipped_runs.append(run_id) else: log_dict["status"] = "ok" except Exception: log_dict["status"] = "failed" - print("BENCH RUN FAILED WITH EXCEPTION: " + traceback.format_exc()) + 
print("BENCH RUN %d FAILED WITH EXCEPTION: %s" % (run_id, traceback.format_exc())) + failed_runs.append(run_id) exit_code = 1 log_dict["output"] = bench_object.output_dict @@ -182,12 +185,15 @@ def get_default_session_options_new(): with open(builder_log_path, "r") as f: builder_log = json.load(f) if builder_log["status"] == "failed": - print("BENCH RUN FAILED (BUILDER REPORTED FAILURE)") + print("BENCH RUN %d FAILED (BUILDER REPORTED FAILURE)" % run_id) + failed_runs.append(run_id) exit_code = 1 else: - print("BENCH RUN COMPLETED (BUILDER REPORTED SUCCESS)") + print("BENCH RUN %d COMPLETED (BUILDER REPORTED SUCCESS)" % run_id) + successful_runs.append(run_id) else: - print("BENCH RUN COMPLETED") + print("BENCH RUN %d COMPLETED" % run_id) + successful_runs.append(run_id) # log metadata of this run to its own report directory log_path = os.path.join(bench_object.report_dir, "metadata_bench.json") @@ -199,5 +205,8 @@ def get_default_session_options_new(): # save local artifacts of this run (e.g., full build dir, detailed debug info) bench_object.save_local_artifacts_collection() - print("STOPPING JOB") + print("STOPPING JOB %d (of %d total jobs)" % (task_id, task_count)) + print("JOB %d SUCCESSFUL RUNS: %s" % (task_id, successful_runs)) + print("JOB %d SKIPPED RUNS: %s" % (task_id, skipped_runs)) + print("JOB %d FAILED RUNS: %s" % (task_id, failed_runs)) return exit_code From 83a328d79cb0035a4f7cc015e6ed1761d43142a3 Mon Sep 17 00:00:00 2001 From: Felix Jentzsch Date: Mon, 26 May 2025 17:56:57 +0200 Subject: [PATCH 121/125] Fix paths to moved report files --- notebooks/advanced/4_advanced_builder_settings.ipynb | 4 ++-- .../cybersecurity/3-build-accelerator-with-finn.ipynb | 4 ++-- src/finn/builder/build_dataflow.py | 4 ++++ tests/end2end/test_end2end_cybsec_mlp.py | 4 ++-- tests/fpgadataflow/test_fifosizing.py | 2 +- tests/util/test_build_dataflow.py | 6 +++--- 6 files changed, 14 insertions(+), 10 deletions(-) diff --git 
a/notebooks/advanced/4_advanced_builder_settings.ipynb b/notebooks/advanced/4_advanced_builder_settings.ipynb index 1e544cf513..73ae7f555c 100644 --- a/notebooks/advanced/4_advanced_builder_settings.ipynb +++ b/notebooks/advanced/4_advanced_builder_settings.ipynb @@ -964,7 +964,7 @@ "source": [ "import json\n", "\n", - "with open(build_dir+\"/output_pre_and_post_proc/auto_folding_config.json\", 'r') as json_file:\n", + "with open(build_dir+\"/output_pre_and_post_proc/report/auto_folding_config.json\", 'r') as json_file:\n", " folding_config = json.load(json_file)\n", "\n", "print(json.dumps(folding_config, indent=1))" @@ -1035,7 +1035,7 @@ "metadata": {}, "outputs": [], "source": [ - "with open(build_dir+\"/output_pre_and_post_proc/auto_folding_config.json\", 'r') as json_file:\n", + "with open(build_dir+\"/output_pre_and_post_proc/report/auto_folding_config.json\", 'r') as json_file:\n", " folding_config = json.load(json_file)\n", "\n", "# Set all ram_style to LUT RAM\n", diff --git a/notebooks/end2end_example/cybersecurity/3-build-accelerator-with-finn.ipynb b/notebooks/end2end_example/cybersecurity/3-build-accelerator-with-finn.ipynb index 7a23a3628e..39ae1dd5f6 100644 --- a/notebooks/end2end_example/cybersecurity/3-build-accelerator-with-finn.ipynb +++ b/notebooks/end2end_example/cybersecurity/3-build-accelerator-with-finn.ipynb @@ -323,7 +323,7 @@ "source": [ "assert os.path.exists(rtlsim_output_dir + \"/report/ooc_synth_and_timing.json\")\n", "assert os.path.exists(rtlsim_output_dir + \"/report/rtlsim_performance.json\")\n", - "assert os.path.exists(rtlsim_output_dir + \"/final_hw_config.json\")" + "assert os.path.exists(rtlsim_output_dir + \"/report/final_hw_config.json\")" ] }, { @@ -410,7 +410,7 @@ "metadata": {}, "outputs": [], "source": [ - "! cat {rtlsim_output_dir}/final_hw_config.json" + "! 
cat {rtlsim_output_dir}/report/final_hw_config.json" ] }, { diff --git a/src/finn/builder/build_dataflow.py b/src/finn/builder/build_dataflow.py index f96f205e72..91dec71140 100644 --- a/src/finn/builder/build_dataflow.py +++ b/src/finn/builder/build_dataflow.py @@ -292,6 +292,10 @@ def build_dataflow_cfg(model_filename, cfg: DataflowBuildConfig): print("KeyboardInterrupt detected. Aborting...") return log_and_exit(cfg, time_per_step, -1) except (Exception, FINNError) as e: + # Re-raise exception if we are in a PyTest session so we don't miss it + if "PYTEST_CURRENT_TEST" in os.environ: + raise + if issubclass(type(e), FINNUserError): # Handle FINN USER ERROR log.error(f"FINN ERROR: {e}") diff --git a/tests/end2end/test_end2end_cybsec_mlp.py b/tests/end2end/test_end2end_cybsec_mlp.py index 4770066117..cf75fd273b 100644 --- a/tests/end2end/test_end2end_cybsec_mlp.py +++ b/tests/end2end/test_end2end_cybsec_mlp.py @@ -165,8 +165,8 @@ def test_end2end_cybsec_mlp_build(self): ) build.build_dataflow_cfg(model_file, cfg) # check the generated files - assert os.path.isfile(output_dir + "/time_per_step.json") - assert os.path.isfile(output_dir + "/final_hw_config.json") + assert os.path.isfile(output_dir + "/report/time_per_step.json") + assert os.path.isfile(output_dir + "/report/final_hw_config.json") assert os.path.isfile(output_dir + "/template_specialize_layers_config.json") assert os.path.isfile(output_dir + "/driver/driver.py") est_cycles_report = output_dir + "/report/estimate_layer_cycles.json" diff --git a/tests/fpgadataflow/test_fifosizing.py b/tests/fpgadataflow/test_fifosizing.py index bb89e8ab84..97686235d0 100644 --- a/tests/fpgadataflow/test_fifosizing.py +++ b/tests/fpgadataflow/test_fifosizing.py @@ -95,7 +95,7 @@ def test_fifosizing_linear(method, topology): cfg_cmp.auto_fifo_depths = False cfg_cmp.target_fps = None cfg_cmp.generate_outputs = [build_cfg.DataflowOutputType.STITCHED_IP] - cfg_cmp.folding_config_file = tmp_output_dir + 
"/final_hw_config.json" + cfg_cmp.folding_config_file = tmp_output_dir + "/report/final_hw_config.json" build.build_dataflow_cfg(tmp_output_dir_cmp + "/model.onnx", cfg_cmp) model0 = ModelWrapper(tmp_output_dir + "/intermediate_models/step_create_stitched_ip.onnx") diff --git a/tests/util/test_build_dataflow.py b/tests/util/test_build_dataflow.py index 18f574bc8f..65d1942bed 100644 --- a/tests/util/test_build_dataflow.py +++ b/tests/util/test_build_dataflow.py @@ -48,9 +48,9 @@ def test_end2end_build_dataflow_directory(): build_dataflow_directory(target_dir) # check the generated files output_dir = target_dir + "/output_tfc_w1a1_Pynq-Z1" - assert os.path.isfile(output_dir + "/time_per_step.json") - assert os.path.isfile(output_dir + "/auto_folding_config.json") - assert os.path.isfile(output_dir + "/final_hw_config.json") + assert os.path.isfile(output_dir + "/report/time_per_step.json") + assert os.path.isfile(output_dir + "/report/auto_folding_config.json") + assert os.path.isfile(output_dir + "/report/final_hw_config.json") assert os.path.isfile(output_dir + "/template_specialize_layers_config.json") assert os.path.isfile(output_dir + "/stitched_ip/ip/component.xml") assert os.path.isfile(output_dir + "/driver/driver.py") From b8c9f74c39442efc412cb81e09def97f87f9ed57 Mon Sep 17 00:00:00 2001 From: Felix Jentzsch Date: Tue, 27 May 2025 17:29:33 +0200 Subject: [PATCH 122/125] Increase logging robustness, verbosity --- src/finn/benchmarking/bench.py | 40 +++++++++-- src/finn/benchmarking/bench_base.py | 3 - src/finn/builder/build_dataflow.py | 101 ++++++++++++++++------------ 3 files changed, 92 insertions(+), 52 deletions(-) diff --git a/src/finn/benchmarking/bench.py b/src/finn/benchmarking/bench.py index 5f29959712..1a478a466c 100644 --- a/src/finn/benchmarking/bench.py +++ b/src/finn/benchmarking/bench.py @@ -2,6 +2,7 @@ import json import onnxruntime as ort import os +import sys import time import traceback import yaml @@ -19,6 +20,24 @@ dut["transformer"] = 
bench_transformer +class PrefixPrinter(object): + """ + Create a custom stream handler that adds a prefix + """ + + def __init__(self, prefix, originalstream): + self.console = originalstream + self.prefix = prefix + self.linebuf = "" + + def write(self, buf): + for line in buf.rstrip().splitlines(): + self.console.write(f"[{self.prefix}] " + line + "\n") + + def flush(self): + self.console.flush() + + def start_bench_run(config_name): exit_code = 0 # Attempt to work around onnxruntime issue on Slurm-managed clusters: @@ -130,21 +149,23 @@ def get_default_session_options_new(): while idx < total_runs: selected_runs.append(idx) idx = idx + task_count - print("This job will perform %d out of %d total runs" % (len(selected_runs), total_runs)) + print( + "STARTING JOB %d. IT WILL PERFORM %d OUT OF %d TOTAL RUNS" + % (task_id, len(selected_runs), total_runs) + ) # Run benchmark - # TODO: integrate this loop (especially status logging) into the bench class successful_runs = [] skipped_runs = [] failed_runs = [] for run, run_id in enumerate(selected_runs): print( - "Starting run %d/%d (id %d of %d total runs)" + "STARTING RUN %d/%d (ID %d OF %d TOTAL RUNS)" % (run + 1, len(selected_runs), run_id, total_runs) ) params = config_expanded[run_id] - print("Run parameters: %s" % (str(params))) + print("RUN %d PARAMETERS: %s" % (run_id, str(params))) log_dict = {"run_id": run_id, "task_id": task_id, "params": params} @@ -159,11 +180,18 @@ def get_default_session_options_new(): # expect DUT-specific YAML definition instead bench_object = bench(params, task_id, run_id, work_dir, artifacts_dir, save_dir) else: - print("ERROR: no DUT specified") + print("ERROR: NO DUT SPECIFIED") return 1 + # Wrap stdout/stderr with an additional prefix to identify the run in the live console + original_stdout = sys.stdout + original_stderr = sys.stderr + sys.stdout = PrefixPrinter("RUN %d (%s)" % (run_id, params["dut"]), sys.stdout) + sys.stderr = PrefixPrinter("RUN %d (%s)" % (run_id, 
params["dut"]), sys.stderr) try: result = bench_object.run() + sys.stdout = original_stdout + sys.stderr = original_stderr if result == "skipped": log_dict["status"] = "skipped" print("BENCH RUN %d SKIPPED" % run_id) @@ -171,6 +199,8 @@ def get_default_session_options_new(): else: log_dict["status"] = "ok" except Exception: + sys.stdout = original_stdout + sys.stderr = original_stderr log_dict["status"] = "failed" print("BENCH RUN %d FAILED WITH EXCEPTION: %s" % (run_id, traceback.format_exc())) failed_runs.append(run_id) diff --git a/src/finn/benchmarking/bench_base.py b/src/finn/benchmarking/bench_base.py index 01e42b9c2a..e0bea7ee13 100644 --- a/src/finn/benchmarking/bench_base.py +++ b/src/finn/benchmarking/bench_base.py @@ -104,9 +104,6 @@ def __init__(self, params, task_id, run_id, work_dir, artifacts_dir, save_dir, d self.save_dir = save_dir self.debug = debug - # TODO: setup a logger so output can go to console (with task id prefix) - # TODO: coordinate with new builder loggin setup - # Setup some basic global default configuration # TODO: clean up or remove these attributes if "synth_clk_period_ns" in params: diff --git a/src/finn/builder/build_dataflow.py b/src/finn/builder/build_dataflow.py index 91dec71140..7760cdbae7 100644 --- a/src/finn/builder/build_dataflow.py +++ b/src/finn/builder/build_dataflow.py @@ -161,41 +161,12 @@ def resolve_step_filename(step_name: str, cfg: DataflowBuildConfig, step_delta: return filename -def log_and_exit(cfg: DataflowBuildConfig, time_per_step: dict = None, exit_code: int = 0): - if exit_code: - print("Build failed") - status = "failed" - else: - print("Build completed successfully") - status = "ok" - - # Generate metadata_builder.json - metadata = { - "status": status, - "tool_version": os.path.basename(os.environ.get("XILINX_VIVADO")), - } - with open(os.path.join(cfg.output_dir, "report/metadata_builder.json"), "w") as f: - json.dump(metadata, f, indent=2) - - # Generate time_per_step.json - if time_per_step is not 
None: - time_per_step["total_build_time"] = sum(time_per_step.values()) - with open(os.path.join(cfg.output_dir, "report/time_per_step.json"), "w") as f: - json.dump(time_per_step, f, indent=2) - - return exit_code - - -def build_dataflow_cfg(model_filename, cfg: DataflowBuildConfig): - """Best-effort build a dataflow accelerator using the given configuration. - - :param model_filename: ONNX model filename to build - :param cfg: Build configuration - """ - # Create the output (report) dir if it doesn't exist - os.makedirs(os.path.join(cfg.output_dir, "report"), exist_ok=True) - - # Set up logger +def setup_logging(cfg: DataflowBuildConfig): + # Set up global logger, the force=True has the following effects: + # - If multiple build are run in a row, the log file will be re-created for each, + # which is needed if the file was deleted/moved or the output dir changed + # - In a PyTest session, this logger will replace the PyTest log handlers, so logs + # (+ captured warnings!) will end up in the log file instead of being collected by PyTest logpath = os.path.join(cfg.output_dir, "build_dataflow.log") if cfg.verbose: logging.basicConfig( @@ -203,6 +174,7 @@ def build_dataflow_cfg(model_filename, cfg: DataflowBuildConfig): format="[%(asctime)s]%(levelname)s: %(pathname)s:%(lineno)d: %(message)s", filename=logpath, filemode="w", + force=True, ) else: logging.basicConfig( @@ -210,19 +182,21 @@ def build_dataflow_cfg(model_filename, cfg: DataflowBuildConfig): format="[%(asctime)s]%(levelname)s: %(message)s", filename=logpath, filemode="w", + force=True, ) - # Capture all warnings.warn calls of qonnx,... + # Capture all warnings.warn calls of qonnx, ... 
logging.captureWarnings(True) - log = logging.getLogger("build_dataflow") - # Mirror stdout and stderr to log - sys.stdout = PrintLogger(log, logging.INFO, sys.stdout) - sys.stderr = PrintLogger(log, logging.ERROR, sys.stderr) + log = logging.getLogger("build_dataflow") + if not isinstance(sys.stdout, PrintLogger): + # Prevent redirecting stdout/stderr multiple times + sys.stdout = PrintLogger(log, logging.INFO, sys.stdout) + sys.stderr = PrintLogger(log, logging.ERROR, sys.stderr) console = Console(file=sys.stdout.console) - # Set up console logger + # Mirror a configurable log level to console (default = ERROR) if cfg.console_log_level != "NONE": consoleHandler = RichHandler( show_time=True, log_time_format="[%Y-%m-%d %H:%M:%S]", show_path=False, console=console @@ -239,6 +213,45 @@ def build_dataflow_cfg(model_filename, cfg: DataflowBuildConfig): consoleHandler.setLevel(logging.CRITICAL) logging.getLogger().addHandler(consoleHandler) + return log + + +def exit_buildflow(cfg: DataflowBuildConfig, time_per_step: dict = None, exit_code: int = 0): + if exit_code: + print("Build failed") + status = "failed" + else: + print("Build completed successfully") + status = "ok" + + # Generate metadata_builder.json + metadata = { + "status": status, + "tool_version": os.path.basename(os.environ.get("XILINX_VIVADO")), + } + with open(os.path.join(cfg.output_dir, "report/metadata_builder.json"), "w") as f: + json.dump(metadata, f, indent=2) + + # Generate time_per_step.json + if time_per_step is not None: + time_per_step["total_build_time"] = sum(time_per_step.values()) + with open(os.path.join(cfg.output_dir, "report/time_per_step.json"), "w") as f: + json.dump(time_per_step, f, indent=2) + + return exit_code + + +def build_dataflow_cfg(model_filename, cfg: DataflowBuildConfig): + """Best-effort build a dataflow accelerator using the given configuration. 
+ + :param model_filename: ONNX model filename to build + :param cfg: Build configuration + """ + log = setup_logging(cfg) + + # Create the output (report) dir if it doesn't exist + os.makedirs(os.path.join(cfg.output_dir, "report"), exist_ok=True) + print(f"Intermediate outputs will be generated in {os.environ['FINN_BUILD_DIR']}") print(f"Final outputs will be generated in {cfg.output_dir}") print(f"Build log is at {cfg.output_dir}/build_dataflow.log") @@ -290,7 +303,7 @@ def build_dataflow_cfg(model_filename, cfg: DataflowBuildConfig): step_num += 1 except KeyboardInterrupt: print("KeyboardInterrupt detected. Aborting...") - return log_and_exit(cfg, time_per_step, -1) + return exit_buildflow(cfg, time_per_step, -1) except (Exception, FINNError) as e: # Re-raise exception if we are in a PyTest session so we don't miss it if "PYTEST_CURRENT_TEST" in os.environ: @@ -310,8 +323,8 @@ def build_dataflow_cfg(model_filename, cfg: DataflowBuildConfig): if cfg.enable_build_pdb_debug: pdb.post_mortem(e.__traceback__) - return log_and_exit(cfg, time_per_step, -1) - return log_and_exit(cfg, time_per_step, 0) + return exit_buildflow(cfg, time_per_step, -1) + return exit_buildflow(cfg, time_per_step, 0) def build_dataflow_directory(path_to_cfg_dir: str): From c6cce9877333f1543df32d58d064a8554833b952 Mon Sep 17 00:00:00 2001 From: Felix Jentzsch Date: Tue, 27 May 2025 18:19:50 +0200 Subject: [PATCH 123/125] Switch RN-50 to U280 --- ci/cfg/regression_extended.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ci/cfg/regression_extended.yml b/ci/cfg/regression_extended.yml index f40c11ab11..a95dfa06d8 100644 --- a/ci/cfg/regression_extended.yml +++ b/ci/cfg/regression_extended.yml @@ -2,7 +2,7 @@ # ResNet-50 { "dut": ["resnet50"], - "board": ["U250"], + "board": ["U280"], "synth_clk_period_ns": [4], "rtlsim_batch_size": [3], # no deployment package because Alveo deployment is not yet supported by CI From dbfd95509d673663c39361ed09a6ba3caa919583 Mon Sep 17 
00:00:00 2001 From: Felix Jentzsch Date: Tue, 27 May 2025 19:31:29 +0200 Subject: [PATCH 124/125] Extend launch_process_helper and use it in more places --- src/finn/core/rtlsim_exec.py | 12 +- .../fpgadataflow/create_stitched_ip.py | 19 ++-- .../fpgadataflow/make_zynq_proj.py | 26 +++-- .../fpgadataflow/vitis_build.py | 53 ++++----- src/finn/util/basic.py | 104 ++++++++++-------- src/finn/util/hls.py | 13 +-- 6 files changed, 115 insertions(+), 112 deletions(-) diff --git a/src/finn/core/rtlsim_exec.py b/src/finn/core/rtlsim_exec.py index 61f2762039..46616599cb 100644 --- a/src/finn/core/rtlsim_exec.py +++ b/src/finn/core/rtlsim_exec.py @@ -28,8 +28,8 @@ import numpy as np import os -import sys from qonnx.custom_op.registry import getCustomOp +from subprocess import CalledProcessError from finn.util.basic import ( get_liveness_threshold_cycles, @@ -39,6 +39,7 @@ ) from finn.util.data_packing import npy_to_rtlsim_input, rtlsim_output_to_npy from finn.util.deps import get_deps_path +from finn.util.exception import FINNError from finn.util.logging import log try: @@ -294,11 +295,12 @@ def rtlsim_exec_cppxsi( # write compilation command to a file for easy re-running/debugging with open(sim_base + "/compile_rtlsim.sh", "w") as f: f.write(" ".join(build_cmd)) - stdout, stderr = launch_process_helper(build_cmd, cwd=sim_base) + try: + launch_process_helper(build_cmd, cwd=sim_base, print_stdout=False) + except CalledProcessError: + raise FINNError("Failed to compile rtlsim executable") if not os.path.isfile(sim_base + "/rtlsim_xsi"): - print(stdout) - print(stderr, file=sys.stderr) - raise RuntimeError("Failed to compile rtlsim executable") + raise FINNError("Failed to compile rtlsim executable") # launch the rtlsim executable # important to specify LD_LIBRARY_PATH here for XSI to work correctly diff --git a/src/finn/transformation/fpgadataflow/create_stitched_ip.py b/src/finn/transformation/fpgadataflow/create_stitched_ip.py index 7a8d38182d..39bed71c82 100644 --- 
a/src/finn/transformation/fpgadataflow/create_stitched_ip.py +++ b/src/finn/transformation/fpgadataflow/create_stitched_ip.py @@ -30,14 +30,15 @@ import json import multiprocessing as mp import os -import subprocess from qonnx.custom_op.registry import getCustomOp from qonnx.transformation.base import Transformation from qonnx.util.basic import get_num_default_workers from shutil import copytree +from subprocess import CalledProcessError from finn.transformation.fpgadataflow.replace_verilog_relpaths import ReplaceVerilogRelPaths -from finn.util.basic import make_build_dir +from finn.util.basic import launch_process_helper, make_build_dir +from finn.util.exception import FINNError from finn.util.fpgadataflow import is_hls_node, is_rtl_node from finn.util.logging import log @@ -633,14 +634,12 @@ def apply(self, model): f.write("vivado -mode batch -source make_project.tcl\n") f.write("cd {}\n".format(working_dir)) bash_command = ["bash", make_project_sh] - process_compile = subprocess.Popen( - bash_command, stdout=subprocess.PIPE, stderr=subprocess.PIPE - ) - (_, stderr_data) = process_compile.communicate() - stderr_stripped = stderr_data.decode().strip() - if stderr_stripped != "" and stderr_stripped is not None: - log.critical(stderr_stripped) # Decode bytes and log as critical + try: + launch_process_helper(bash_command, print_stdout=False) + except CalledProcessError: + # Check success manually by looking for wrapper HDL + pass # wrapper may be created in different location depending on Vivado version if not os.path.isfile(wrapper_filename): @@ -649,7 +648,7 @@ def apply(self, model): if os.path.isfile(wrapper_filename_alt): model.set_metadata_prop("wrapper_filename", wrapper_filename_alt) else: - raise Exception( + raise FINNError( """CreateStitchedIP failed, no wrapper HDL found under %s or %s. 
Please check logs under the parent directory.""" % (wrapper_filename, wrapper_filename_alt) diff --git a/src/finn/transformation/fpgadataflow/make_zynq_proj.py b/src/finn/transformation/fpgadataflow/make_zynq_proj.py index 59d4293323..e280fba016 100644 --- a/src/finn/transformation/fpgadataflow/make_zynq_proj.py +++ b/src/finn/transformation/fpgadataflow/make_zynq_proj.py @@ -29,13 +29,13 @@ import math import os -import subprocess from qonnx.core.modelwrapper import ModelWrapper from qonnx.custom_op.registry import getCustomOp from qonnx.transformation.base import Transformation from qonnx.transformation.general import GiveReadableTensorNames, GiveUniqueNodeNames from qonnx.transformation.infer_data_layouts import InferDataLayouts from shutil import copy +from subprocess import CalledProcessError from finn.transformation.fpgadataflow.create_dataflow_partition import CreateDataflowPartition from finn.transformation.fpgadataflow.create_stitched_ip import CreateStitchedIP @@ -47,9 +47,14 @@ from finn.transformation.fpgadataflow.instrumentation import GenerateInstrumentationIP from finn.transformation.fpgadataflow.prepare_ip import PrepareIP from finn.transformation.fpgadataflow.specialize_layers import SpecializeLayers -from finn.util.basic import make_build_dir, pynq_native_port_width, pynq_part_map +from finn.util.basic import ( + launch_process_helper, + make_build_dir, + pynq_native_port_width, + pynq_part_map, +) from finn.util.deps import get_deps_path -from finn.util.logging import log +from finn.util.exception import FINNError from . 
import templates @@ -399,16 +404,15 @@ def apply(self, model): # call the synthesis script bash_command = ["bash", synth_project_sh] - process_compile = subprocess.Popen( - bash_command, stdout=subprocess.PIPE, stderr=subprocess.PIPE - ) - _, stderr_data = process_compile.communicate() - stderr_stripped = stderr_data.decode().strip() - if stderr_stripped != "" and stderr_stripped is not None: - log.critical(stderr_stripped) # Decode bytes and log as critical + try: + launch_process_helper(bash_command, print_stdout=False) + except CalledProcessError: + # Check success manually by looking for bitfile + pass + bitfile_name = vivado_pynq_proj_dir + "/finn_zynq_link.runs/impl_1/top_wrapper.bit" if not os.path.isfile(bitfile_name): - raise Exception( + raise FINNError( "Synthesis failed, no bitfile found. Check logs under %s" % vivado_pynq_proj_dir ) deploy_bitfile_name = vivado_pynq_proj_dir + "/resizer.bit" diff --git a/src/finn/transformation/fpgadataflow/vitis_build.py b/src/finn/transformation/fpgadataflow/vitis_build.py index 222c9c2336..1c5a5eff91 100644 --- a/src/finn/transformation/fpgadataflow/vitis_build.py +++ b/src/finn/transformation/fpgadataflow/vitis_build.py @@ -29,7 +29,6 @@ import json import os -import subprocess from qonnx.core.modelwrapper import ModelWrapper from qonnx.custom_op.registry import getCustomOp from qonnx.transformation.base import Transformation @@ -38,6 +37,7 @@ GiveUniqueNodeNames, RemoveUnusedTensors, ) +from subprocess import CalledProcessError from finn.builder.build_dataflow_config import FpgaMemoryType, VitisOptStrategy from finn.transformation.fpgadataflow.create_dataflow_partition import CreateDataflowPartition @@ -49,8 +49,8 @@ from finn.transformation.fpgadataflow.insert_iodma import InsertIODMA from finn.transformation.fpgadataflow.prepare_ip import PrepareIP from finn.transformation.fpgadataflow.specialize_layers import SpecializeLayers -from finn.util.basic import make_build_dir -from finn.util.logging import log +from 
finn.util.basic import launch_process_helper, make_build_dir +from finn.util.exception import FINNError from . import templates @@ -142,16 +142,14 @@ def apply(self, model): f.write("vivado -mode batch -source gen_xo.tcl\n") f.write("cd {}\n".format(working_dir)) bash_command = ["bash", package_xo_sh] - process_compile = subprocess.Popen( - bash_command, stdout=subprocess.PIPE, stderr=subprocess.PIPE - ) - _, stderr_data = process_compile.communicate() - stderr_stripped = stderr_data.decode().strip() - if stderr_stripped != "" and stderr_stripped is not None: - log.critical(stderr_stripped) # Decode bytes and log as critical - assert os.path.isfile(xo_path), ( - "Vitis .xo file not created, check logs under %s" % vivado_proj_dir - ) + try: + launch_process_helper(bash_command, print_stdout=False) + except CalledProcessError: + # Check success manually by looking for .xo file + pass + if not os.path.isfile(xo_path): + raise FINNError("Vitis .xo file not created, check logs under %s" % vivado_proj_dir) + return (model, False) @@ -327,18 +325,17 @@ def apply(self, model): ) f.write("cd {}\n".format(working_dir)) bash_command = ["bash", script] - process_compile = subprocess.Popen( - bash_command, stdout=subprocess.PIPE, stderr=subprocess.PIPE - ) - _, stderr_data = process_compile.communicate() - stderr_stripped = stderr_data.decode().strip() - if stderr_stripped != "" and stderr_stripped is not None: - log.critical(stderr_stripped) # Decode bytes and log as critical - # TODO rename xclbin appropriately here? + + try: + launch_process_helper(bash_command, print_stdout=False) + except CalledProcessError: + # Check success manually by looking for .xclbin file + pass xclbin = link_dir + "/a.xclbin" - assert os.path.isfile(xclbin), ( - "Vitis .xclbin file not created, check logs under %s" % link_dir - ) + if not os.path.isfile(xclbin): + raise FINNError("Vitis .xclbin file not created, check logs under %s" % link_dir) + + # TODO rename xclbin appropriately here? 
model.set_metadata_prop("bitfile", xclbin) # run Vivado to gen xml report @@ -350,13 +347,7 @@ def apply(self, model): f.write("vivado -mode batch -source %s\n" % (link_dir + "/gen_report_xml.tcl")) f.write("cd {}\n".format(working_dir)) bash_command = ["bash", gen_rep_xml_sh] - process_genxml = subprocess.Popen( - bash_command, stdout=subprocess.PIPE, stderr=subprocess.PIPE - ) - _, stderr_data = process_genxml.communicate() - stderr_stripped = stderr_data.decode().strip() - if stderr_stripped != "" and stderr_stripped is not None: - log.critical(stderr_stripped) # Decode bytes and log as critical + launch_process_helper(bash_command, print_stdout=False) # filename for the synth utilization report synth_report_filename = link_dir + "/synth_report.xml" model.set_metadata_prop("vivado_synth_rpt", synth_report_filename) diff --git a/src/finn/util/basic.py b/src/finn/util/basic.py index acb8bb1303..7f7e658146 100644 --- a/src/finn/util/basic.py +++ b/src/finn/util/basic.py @@ -151,6 +151,65 @@ def make_build_dir(prefix: str = "", return_as_path: bool = False) -> str | Path return str(tmpdir) +def launch_process_helper(args, proc_env=None, cwd=None, print_stdout=True): + """Helper function to launch a process in a way that facilitates logging + stdout/stderr with Python loggers. 
+ Returns (cmd_out, cmd_err) if successful, raises CalledProcessError otherwise.""" + process = subprocess.run(args, capture_output=True, env=proc_env, cwd=cwd, text=True) + cmd_out = process.stdout.strip() + cmd_err = process.stderr.strip() + + # Handle stdout + if cmd_out: + if print_stdout is True: + log.info(cmd_out) + else: + # Print with DEBUG level regardless + log.debug(cmd_out) + + # Handle stderr, depending on return code + if process.returncode == 0: + # Process completed successfully, log stderr only as WARNING + if cmd_err: + log.warning(cmd_err) + else: + # Process failed, log stderr as ERROR + if cmd_err: + log.error(cmd_err) + + # Log additional ERROR message + if isinstance(args, list): + cmd = " ".join(args) + else: + cmd = args + log.error(f"Launched process returned non-zero exit code ({process.returncode}): {cmd}") + + # Raise CalledProcessError for non-zero return code + process.check_returncode() + return (cmd_out, cmd_err) + + +def which(program): + "Python equivalent of the shell cmd 'which'." 
+ + # source: + # https://stackoverflow.com/questions/377017/test-if-executable-exists-in-python + def is_exe(fpath): + return os.path.isfile(fpath) and os.access(fpath, os.X_OK) + + fpath, fname = os.path.split(program) + if fpath: + if is_exe(program): + return program + else: + for path in os.environ["PATH"].split(os.pathsep): + exe_file = os.path.join(path, program) + if is_exe(exe_file): + return exe_file + + return None + + class CppBuilder: """Builds the g++ compiler command to produces the executable of the c++ code in code_gen_dir which is passed to the function build() of this class.""" @@ -194,50 +253,7 @@ def build(self, code_gen_dir): f.write("#!/bin/bash \n") f.write(bash_compile + "\n") bash_command = ["bash", self.compile_script] - process_compile = subprocess.Popen( - bash_command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True - ) - _, stderr_data = process_compile.communicate() - if stderr_data.strip(): - log.critical(stderr_data.strip()) # Decode bytes and log as critical - - -def launch_process_helper(args, proc_env=None, cwd=None, print_stdout=True): - """Helper function to launch a process in a way that facilitates logging - stdout/stderr with Python loggers. - Returns (cmd_out, cmd_err).""" - if proc_env is None: - proc_env = os.environ.copy() - with subprocess.Popen( - args, stdout=subprocess.PIPE, stderr=subprocess.PIPE, env=proc_env, cwd=cwd, text=True - ) as proc: - (cmd_out, cmd_err) = proc.communicate() - if cmd_out.strip() and print_stdout is True: - log.info(cmd_out.strip()) - if cmd_err.strip(): - log.critical(cmd_err.strip()) - return (cmd_out, cmd_err) - - -def which(program): - "Python equivalent of the shell cmd 'which'." 
- - # source: - # https://stackoverflow.com/questions/377017/test-if-executable-exists-in-python - def is_exe(fpath): - return os.path.isfile(fpath) and os.access(fpath, os.X_OK) - - fpath, fname = os.path.split(program) - if fpath: - if is_exe(program): - return program - else: - for path in os.environ["PATH"].split(os.pathsep): - exe_file = os.path.join(path, program) - if is_exe(exe_file): - return exe_file - - return None + launch_process_helper(bash_command, print_stdout=False) mem_primitives_versal = { diff --git a/src/finn/util/hls.py b/src/finn/util/hls.py index b1b88dbafe..dc153c0f52 100644 --- a/src/finn/util/hls.py +++ b/src/finn/util/hls.py @@ -27,10 +27,7 @@ # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. import os -import subprocess - -from finn.util.basic import which -from finn.util.logging import log +from finn.util.basic import launch_process_helper, which class CallHLS: @@ -65,10 +62,4 @@ def build(self, code_gen_dir): f.write("cd {}\n".format(working_dir)) f.close() bash_command = ["bash", self.ipgen_script] - process_compile = subprocess.Popen( - bash_command, stdout=subprocess.PIPE, stderr=subprocess.PIPE - ) - _, stderr_data = process_compile.communicate() - stderr_stripped = stderr_data.decode().strip() - if stderr_stripped != "" and stderr_stripped is not None: - log.critical(stderr_stripped) # Decode bytes and log as critical + launch_process_helper(bash_command, print_stdout=False) From 8abf5fe7a5ce3fca76f3bc4c7eb91e553f348eb8 Mon Sep 17 00:00:00 2001 From: Felix Jentzsch Date: Wed, 28 May 2025 10:33:58 +0200 Subject: [PATCH 125/125] Fix build dir creation --- src/finn/builder/build_dataflow.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/finn/builder/build_dataflow.py b/src/finn/builder/build_dataflow.py index 7760cdbae7..2184531443 100644 --- a/src/finn/builder/build_dataflow.py +++ b/src/finn/builder/build_dataflow.py @@ -247,11 +247,11 @@ def build_dataflow_cfg(model_filename, cfg: 
DataflowBuildConfig): :param model_filename: ONNX model filename to build :param cfg: Build configuration """ - log = setup_logging(cfg) - # Create the output (report) dir if it doesn't exist os.makedirs(os.path.join(cfg.output_dir, "report"), exist_ok=True) + log = setup_logging(cfg) + print(f"Intermediate outputs will be generated in {os.environ['FINN_BUILD_DIR']}") print(f"Final outputs will be generated in {cfg.output_dir}") print(f"Build log is at {cfg.output_dir}/build_dataflow.log")