From 0aec7f047493a693f287d2ac09cca74f32e95d86 Mon Sep 17 00:00:00 2001 From: Felix Jentzsch Date: Wed, 29 Jan 2025 12:34:54 +0000 Subject: [PATCH 001/125] Check-in core benchmarking code --- .gitlab-ci.yml | 84 ++ benchmarking/bench-ci.yml | 53 + benchmarking/bench.py | 180 +++ benchmarking/bench_base.py | 933 +++++++++++++++ benchmarking/bench_rtl_swg.py | 403 +++++++ benchmarking/cfg/fifosizing_test.json | 21 + benchmarking/cfg/metafi_fifosizing_test.json | 17 + benchmarking/cfg/mvau_test.json | 29 + .../cfg/resnet50_fifosizing_test.json | 19 + benchmarking/cfg/transformer_gpt_all.json | 22 + benchmarking/cfg/transformer_radioml_all.json | 7 + benchmarking/cfg/transformer_sweep.json | 92 ++ benchmarking/cfg/transformer_test.json | 20 + benchmarking/collect.py | 90 ++ benchmarking/dut/fifosizing.py | 576 +++++++++ benchmarking/dut/mvau.py | 295 +++++ benchmarking/dut/resnet50_custom_steps.py | 252 ++++ benchmarking/dut/transformer.py | 1046 +++++++++++++++++ benchmarking/dut/transformer_custom_steps.py | 878 ++++++++++++++ benchmarking/dut/transformer_gpt.py | 348 ++++++ benchmarking/dut/transformer_radioml.py | 336 ++++++ benchmarking/harness/sink/ip/component.xml | 256 ++++ .../harness/sink/ip/src/harness_sink.v | 39 + .../sink/ip/xgui/harness_sink_v1_0.tcl | 25 + benchmarking/harness/vector_xor.v | 32 + benchmarking/templates.py | 213 ++++ benchmarking/util.py | 87 ++ 27 files changed, 6353 insertions(+) create mode 100644 .gitlab-ci.yml create mode 100644 benchmarking/bench-ci.yml create mode 100644 benchmarking/bench.py create mode 100644 benchmarking/bench_base.py create mode 100644 benchmarking/bench_rtl_swg.py create mode 100644 benchmarking/cfg/fifosizing_test.json create mode 100644 benchmarking/cfg/metafi_fifosizing_test.json create mode 100644 benchmarking/cfg/mvau_test.json create mode 100644 benchmarking/cfg/resnet50_fifosizing_test.json create mode 100644 benchmarking/cfg/transformer_gpt_all.json create mode 100644 
benchmarking/cfg/transformer_radioml_all.json create mode 100644 benchmarking/cfg/transformer_sweep.json create mode 100644 benchmarking/cfg/transformer_test.json create mode 100644 benchmarking/collect.py create mode 100644 benchmarking/dut/fifosizing.py create mode 100644 benchmarking/dut/mvau.py create mode 100644 benchmarking/dut/resnet50_custom_steps.py create mode 100644 benchmarking/dut/transformer.py create mode 100644 benchmarking/dut/transformer_custom_steps.py create mode 100644 benchmarking/dut/transformer_gpt.py create mode 100644 benchmarking/dut/transformer_radioml.py create mode 100644 benchmarking/harness/sink/ip/component.xml create mode 100644 benchmarking/harness/sink/ip/src/harness_sink.v create mode 100644 benchmarking/harness/sink/ip/xgui/harness_sink_v1_0.tcl create mode 100644 benchmarking/harness/vector_xor.v create mode 100644 benchmarking/templates.py create mode 100644 benchmarking/util.py diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml new file mode 100644 index 0000000000..ebfa2f6f88 --- /dev/null +++ b/.gitlab-ci.yml @@ -0,0 +1,84 @@ +stages: + - update + - build + - load_deps + - test + - trigger_benchmarks + +variables: + PIPELINE_NAME: + description: "Optional name to better identify this pipeline" + value: "" + CPU_CORES: + description: "Select number of CPU cores and test workers" + value: "8" + PARALLEL_JOBS: + description: "Number of parallel Slurm array jobs per CI job" + value: "2" + SLURM_TIMEOUT: + description: "Timeout" + value: "2-0" # [days-hours] + MANUAL_CFG_PATH: + description: "Use this config file instead of configs stored in the repo. 
Path must be accessible to runner" + value: "" + SLURM_PARTITION: + description: "Slurm partition (e.g., normal, largemem, fpga, gpu)" + value: "normal" + SLURM_QOS: + description: "Optional QoS option (include --qos, e.g., --qos express)" + value: "" + FINN_XILINX_VERSION: + value: "2022.2" + +workflow: + name: '$PIPELINE_NAME' + +Fetch Repos: + id_tokens: + CI_JOB_JWT: + aud: https://git.uni-paderborn.de + stage: load_deps + tags: + - login + cache: + key: $CI_COMMIT_SHA + paths: + - deps + script: + - ./fetch-repos.sh + +Bench (Manual): + stage: trigger_benchmarks + rules: + - if: $MANUAL_CFG_PATH != "" + trigger: + include: benchmarking/bench-ci.yml + strategy: depend + forward: + pipeline_variables: true + variables: + BENCH_CFG: "manual" + +Bench: + stage: trigger_benchmarks + rules: + - if: $MANUAL_CFG_PATH == "" + trigger: + include: benchmarking/bench-ci.yml + strategy: depend + forward: + pipeline_variables: true + parallel: + matrix: + - BENCH_CFG: [mvau_test] + +#dev: mvau_test +#fifo: fifosizing_test, metafi_fifosizing_test, resnet50_fifosizing_test +#transformer: transformer_test, transformer_radioml_all + +#TODO: introduce result collect job on parent level for easier visualization/excel interfacing +#TODO: more control via (optional) variables +#TODO: move power measurement from polling-based script to its own job/runner +#TODO: ensure a freshly initialized workdir on job/runner level (e.g. created directories seem to stay there) +#TODO: (optionally) save ALL build artifacts/logs/temporary files to artifacts or PFS for debugging (maybe via Jacamar feature of setting individual persistent workdirs?) 
+#TODO: fix clock frequency discrepancies between setting, synth, and driver \ No newline at end of file diff --git a/benchmarking/bench-ci.yml b/benchmarking/bench-ci.yml new file mode 100644 index 0000000000..f50bd1d3f8 --- /dev/null +++ b/benchmarking/bench-ci.yml @@ -0,0 +1,53 @@ +stages: + - synth + - measure + - collect + +variables: + BENCH_CFG: + description: "Select config, usually provided by parent pipeline" + value: "" + +workflow: + name: "bench_$BENCH_CFG" + +FINN Build: + id_tokens: + CI_JOB_JWT: + aud: https://git.uni-paderborn.de + stage: synth + variables: + SCHEDULER_PARAMETERS: "-A $PROJECT_ACCOUNT -p $SLURM_PARTITION -t $SLURM_TIMEOUT $SLURM_QOS --nodes 1 --ntasks 1 --cpus-per-task $CPU_CORES --mem 128G --array 0-$( expr $PARALLEL_JOBS - 1 )" + PYTEST_PARALLEL: "$CPU_CORES" + FINN_SINGULARITY: "$PATH_SINGULARITY_IMG/xilinx/finn_dev.sif" + before_script: + - cp -dfR .. $PATH_WORKDIR # Copy to working directory (e.g. RAMdisk) + - cd $PATH_WORKDIR/finn-plus + - module load system singularity + script: + - ./run-docker.sh python benchmarking/bench.py $BENCH_CFG + cache: + key: $CI_COMMIT_SHA + policy: pull + paths: + - deps + artifacts: + name: "bench_artifacts" + when: always + paths: + - bench_artifacts/ + +Result Collection: + id_tokens: + CI_JOB_JWT: + aud: https://git.uni-paderborn.de + stage: collect + tags: + - image_build + script: + - python benchmarking/collect.py bench_artifacts/tasks_output bench_results.json + artifacts: + name: "bench_results" + when: always + paths: + - bench_results.json diff --git a/benchmarking/bench.py b/benchmarking/bench.py new file mode 100644 index 0000000000..77f62bd775 --- /dev/null +++ b/benchmarking/bench.py @@ -0,0 +1,180 @@ +import itertools +import sys +import os +import json +import time +import traceback +import onnxruntime as ort + +from dut.mvau import bench_mvau +from dut.transformer import bench_transformer +from dut.transformer_radioml import bench_transformer_radioml +from dut.transformer_gpt 
import bench_transformer_gpt +from dut.fifosizing import bench_fifosizing, bench_metafi_fifosizing, bench_resnet50_fifosizing + + +def main(config_name): + exit_code = 0 + # Attempt to work around onnxruntime issue on Slurm-managed clusters: + # See https://github.com/microsoft/onnxruntime/issues/8313 + # This seems to happen only when assigned CPU cores are not contiguous + _default_session_options = ort.capi._pybind_state.get_default_session_options() + def get_default_session_options_new(): + _default_session_options.inter_op_num_threads = 1 + _default_session_options.intra_op_num_threads = 1 + return _default_session_options + ort.capi._pybind_state.get_default_session_options = get_default_session_options_new + + # Gather job array info + job_id = int(os.environ["SLURM_JOB_ID"]) + #TODO: allow portable execution on any platform by making as many env vars as possible optional + print("Job launched with ID: %d" % (job_id)) + try: + array_id = int(os.environ["SLURM_ARRAY_JOB_ID"]) + task_id = int(os.environ["SLURM_ARRAY_TASK_ID"]) + task_count = int(os.environ["SLURM_ARRAY_TASK_COUNT"]) + print( + "Launched as job array (Array ID: %d, Task ID: %d, Task count: %d)" + % (array_id, task_id, task_count) + ) + except KeyError: + array_id = job_id + task_id = 0 + task_count = 1 + print("Launched as single job") + + # Prepare result directory + # experiment_dir = os.environ.get("EXPERIMENT_DIR") # original experiment dir (before potential copy to ramdisk) + experiment_dir = os.environ.get("CI_PROJECT_DIR") + + artifacts_dir = os.path.join(experiment_dir, "bench_artifacts") + print("Collecting results in path: %s" % artifacts_dir) + os.makedirs(os.path.join(artifacts_dir, "tasks_output"), exist_ok=True) + log_path = os.path.join(artifacts_dir, "tasks_output", "task_%d.json" % (task_id)) + + # save dir for saving bitstreams (and optionally full build artifacts for debugging (TODO)) + # TODO: make this more configurable or switch to job/artifact based power measurement + 
if job_id == 0: + #DEBUG mode + save_dir = experiment_dir + "_save" + else: + save_dir = os.path.join("/scratch/hpc-prf-radioml/felix/jobs/", + "CI_" + os.environ.get("CI_PIPELINE_IID") + "_" + os.environ.get("CI_PIPELINE_NAME")) + print("Saving additional artifacts in path: %s" % save_dir) + os.makedirs(save_dir, exist_ok=True) + + # Gather benchmarking configs + if config_name == "manual": + configs_path, config_select = os.path.split(os.environ.get("MANUAL_CFG_PATH")) + else: + configs_path = os.path.join(os.path.dirname(__file__), "cfg") + config_select = config_name + ".json" + + # Load config + config_path = os.path.join(configs_path, config_select) + print("Loading config %s" % (config_path)) + if os.path.exists(config_path): + with open(config_path, "r") as f: + config = json.load(f) + else: + print("ERROR: config file not found") + return + + # Expand all specified config combinations (gridsearch) + config_expanded = [] + for param_set in config: + param_set_expanded = list( + dict(zip(param_set.keys(), x)) for x in itertools.product(*param_set.values()) + ) + config_expanded.extend(param_set_expanded) + + # Save config (only first job of array) for logging purposes + if task_id == 0: + with open(os.path.join(artifacts_dir, "bench_config.json"), "w") as f: + json.dump(config, f, indent=2) + with open(os.path.join(artifacts_dir, "bench_config_exp.json"), "w") as f: + json.dump(config_expanded, f, indent=2) + + # Determine which runs this job will work on + total_runs = len(config_expanded) + if total_runs <= task_count: + if task_id < total_runs: + selected_runs = [task_id] + else: + return + else: + selected_runs = [] + idx = task_id + while idx < total_runs: + selected_runs.append(idx) + idx = idx + task_count + print("This job will perform %d out of %d total runs" % (len(selected_runs), total_runs)) + + # Run benchmark + # TODO: integrate this loop (especially status logging) into the bench class + # TODO: log additional info as artifact or directly into 
info section of json (e.g. dut, versions, date) + # TODO: log stdout of individual tasks of the job array into seperate files as artifacts (GitLab web interface is not readable) + log = [] + for run, run_id in enumerate(selected_runs): + print( + "Starting run %d/%d (id %d of %d total runs)" + % (run + 1, len(selected_runs), run_id, total_runs) + ) + + params = config_expanded[run_id] + print("Run parameters: %s" % (str(params))) + + log_dict = {"run_id": run_id, "task_id": task_id, "params": params} + + # Determine which DUT to run TODO: do this lookup more generically? + # give bench subclass name directly in config? + if config_select.startswith("mvau"): + bench_object = bench_mvau(params, task_id, run_id, artifacts_dir, save_dir) + elif config_select.startswith("transformer_radioml"): + bench_object = bench_transformer_radioml(params, task_id, run_id, artifacts_dir, save_dir) + elif config_select.startswith("transformer_gpt"): + bench_object = bench_transformer_gpt(params, task_id, run_id, artifacts_dir, save_dir) + elif config_select.startswith("transformer"): + bench_object = bench_transformer(params, task_id, run_id, artifacts_dir, save_dir) + elif config_select.startswith("fifosizing"): + bench_object = bench_fifosizing(params, task_id, run_id, artifacts_dir, save_dir) + elif config_select.startswith("metafi_fifosizing"): + bench_object = bench_metafi_fifosizing(params, task_id, run_id, artifacts_dir, save_dir) + elif config_select.startswith("resnet50_fifosizing"): + bench_object = bench_resnet50_fifosizing(params, task_id, run_id, artifacts_dir, save_dir) + else: + print("ERROR: unknown DUT specified") + + start_time = time.time() + try: + bench_object.run() + output_dict = bench_object.output_dict + if output_dict is None: + output_dict = {} + log_dict["status"] = "skipped" + print("Run skipped") + else: + log_dict["status"] = "ok" + print("Run completed") + except Exception: + output_dict = {} + log_dict["status"] = "failed" + print("Run failed: " + 
traceback.format_exc()) + exit_code = 1 + + log_dict["total_time"] = int(time.time() - start_time) + log_dict["output"] = output_dict + log.append(log_dict) + # overwrite output log file every time to allow early abort + with open(log_path, "w") as f: + json.dump(log, f, indent=2) + + # save local artifacts of this run (e.g., detailed debug info) + bench_object.save_local_artifacts_collection() + print("Stopping job") + return exit_code + #TODO: add additional exit codes (e.g. when some verification within the run failed)? + +if __name__ == "__main__": + exit_code = main(sys.argv[1]) + sys.exit(exit_code) diff --git a/benchmarking/bench_base.py b/benchmarking/bench_base.py new file mode 100644 index 0000000000..5c191d911f --- /dev/null +++ b/benchmarking/bench_base.py @@ -0,0 +1,933 @@ +import itertools +import os +import subprocess +import copy +import json +import time +import traceback +import glob +from shutil import copy as shcopy +from shutil import copytree +import finn.core.onnx_exec as oxe +from qonnx.custom_op.registry import getCustomOp +from qonnx.transformation.base import Transformation +from finn.transformation.fpgadataflow.compile_cppsim import CompileCppSim +from finn.transformation.fpgadataflow.create_stitched_ip import CreateStitchedIP +from finn.transformation.fpgadataflow.hlssynth_ip import HLSSynthIP +from finn.transformation.fpgadataflow.prepare_cppsim import PrepareCppSim +from finn.transformation.fpgadataflow.prepare_ip import PrepareIP +from finn.transformation.fpgadataflow.prepare_rtlsim import PrepareRTLSim +from finn.transformation.fpgadataflow.set_exec_mode import SetExecMode +from finn.transformation.fpgadataflow.synth_ooc import SynthOutOfContext +from finn.analysis.fpgadataflow.exp_cycles_per_layer import exp_cycles_per_layer +from finn.analysis.fpgadataflow.hls_synth_res_estimation import hls_synth_res_estimation +from finn.analysis.fpgadataflow.res_estimation import res_estimation +from 
finn.transformation.fpgadataflow.make_zynq_proj import collect_ip_dirs +from finn.util.basic import make_build_dir, pynq_native_port_width, part_map +from templates import template_open, template_single_test, template_sim_power, template_switching_simulation_tb, zynq_harness_template +from util import summarize_table, summarize_section, power_xml_to_dict, prepare_inputs, delete_dir_contents +from finn.transformation.fpgadataflow.replace_verilog_relpaths import ( + ReplaceVerilogRelPaths, +) +from qonnx.util.basic import ( + gen_finn_dt_tensor, + roundup_to_integer_multiple, +) +from finn.analysis.fpgadataflow.post_synth_res import post_synth_res +from qonnx.core.modelwrapper import ModelWrapper +from finn.builder.build_dataflow_config import DataflowBuildConfig +import pandas as pd +import onnxruntime as ort + +class MakeZYNQHarnessProject(Transformation): + """Based on MakeZYNQProject transformation, but integrates IP into test harness instead of DMA shell.""" + + def __init__(self, platform, output_dir, dut_duplication=1, clock_period_ns=10): + super().__init__() + self.platform = platform + self.output_dir = output_dir + self.dut_duplication = dut_duplication + self.clock_period_ns = clock_period_ns + + def apply(self, model): + # create a config file and empty list of xo files + config = [] + idma_idx = 0 + odma_idx = 0 + aximm_idx = 0 + axilite_idx = 0 + global_clk_ns = 0 + + # assume single stitched-ip (previously dataflowpartition) as DUT + # assume single primary input/output + input_tensor = model.graph.input[0] + output_tensor = model.graph.output[0] + input_node_inst = getCustomOp(model.find_consumer(input_tensor.name)) + output_node_inst = getCustomOp(model.find_producer(output_tensor.name)) + instream_width = input_node_inst.get_instream_width_padded() + outstream_width = output_node_inst.get_outstream_width_padded() + + # assert node.op_type == "StreamingDataflowPartition", "Invalid link graph" + # sdp_node = getCustomOp(node) + # 
dataflow_model_filename = sdp_node.get_nodeattr("model") + # kernel_model = ModelWrapper(dataflow_model_filename) + kernel_model = model + + ipstitch_path = kernel_model.get_metadata_prop("vivado_stitch_proj") + if ipstitch_path is None or (not os.path.isdir(ipstitch_path)): + raise Exception("No stitched IPI design found, apply CreateStitchedIP first.") + + vivado_stitch_vlnv = kernel_model.get_metadata_prop("vivado_stitch_vlnv") + if vivado_stitch_vlnv is None: + raise Exception("No vlnv found, apply CreateStitchedIP first.") + + ip_dirs = ["list"] + ip_dirs += collect_ip_dirs(kernel_model, ipstitch_path) + ip_dirs.append("$::env(FINN_ROOT)/benchmarking/harness/sink/ip") + ip_dirs_str = "[%s]" % (" ".join(ip_dirs)) + config.append( + "set_property ip_repo_paths " + "[concat [get_property ip_repo_paths [current_project]] %s] " + "[current_project]" % ip_dirs_str + ) + config.append("update_ip_catalog -rebuild -scan_changes") + config.append( + "import_files -fileset sources_1 -norecurse $::env(FINN_ROOT)/benchmarking/harness/vector_xor.v" + ) + + # get metadata property clk_ns to calculate clock frequency + clk_ns = float(kernel_model.get_metadata_prop("clk_ns")) + if clk_ns > global_clk_ns: + global_clk_ns = clk_ns + + ifnames = eval(kernel_model.get_metadata_prop("vivado_stitch_ifnames")) + + # instantiate DUT, TODO: switch to wrapper verilog file for (multiple-) DUT instantiation + for id in range(self.dut_duplication): + dut_instance_name = "finn_design_%d" % id + config.append( + "create_bd_cell -type ip -vlnv %s %s" % (vivado_stitch_vlnv, dut_instance_name) + ) + # sdp_node.set_nodeattr("instance_name", instance_names[node.name]) + config.append( + "connect_bd_net [get_bd_pins %s/ap_clk] [get_bd_pins axi_interconnect_0/aclk]" + % dut_instance_name + ) + config.append( + "connect_bd_net [get_bd_pins %s/ap_rst_n] [get_bd_pins axi_interconnect_0/aresetn]" + % dut_instance_name + ) + + # instantiate input harness + if instream_width > 8192: + print("ERROR: DUT 
input stream width > 8192") + raise Exception("ERROR: DUT input stream width > 8192") + elif instream_width > 4096: + num_sources = 8 + source_width = roundup_to_integer_multiple(instream_width / 8, 8) + elif instream_width > 2048: + num_sources = 4 + source_width = roundup_to_integer_multiple(instream_width / 4, 8) + elif instream_width > 1024: + num_sources = 2 + source_width = roundup_to_integer_multiple(instream_width / 2, 8) + else: + num_sources = 1 + source_width = instream_width + + if self.dut_duplication > 1: + if num_sources > 1: + print("ERROR: DUT duplication with >1024 stream width not supported!") + raise Exception("ERROR: DUT duplication with >1024 stream width not supported!") + + num_sources = self.dut_duplication # one source per DUT instance + seed = 0xABCD + for id in range(num_sources): + config.append( + "create_bd_cell -type ip -vlnv xilinx.com:ip:axi_traffic_gen:3.0 axi_traffic_gen_%d" + % id + ) + config.append( + "set_property -dict [list \ + CONFIG.C_ATG_MODE {AXI4-Stream} \ + CONFIG.C_ATG_STREAMING_MAX_LEN_BITS {1} \ + CONFIG.C_AXIS_SPARSE_EN {false} \ + CONFIG.C_AXIS_TDATA_WIDTH {%d} \ + CONFIG.C_AXIS_TDEST_WIDTH {0} \ + CONFIG.C_AXIS_TID_WIDTH {0} \ + CONFIG.C_AXIS_TUSER_WIDTH {0} \ + CONFIG.STRM_DATA_SEED {%s} \ + ] [get_bd_cells axi_traffic_gen_%d]" + % (source_width, "0x{:04X}".format(seed), id) + ) + config.append( + "connect_bd_net [get_bd_pins axi_traffic_gen_%d/s_axi_aclk] [get_bd_pins axi_interconnect_0/aclk]" + % id + ) + config.append( + "connect_bd_net [get_bd_pins axi_traffic_gen_%d/s_axi_aresetn] [get_bd_pins axi_interconnect_0/aresetn]" + % id + ) + seed = seed + 99 + + config.append( + "connect_bd_intf_net [get_bd_intf_pins axi_traffic_gen_%d/M_AXIS_MASTER] [get_bd_intf_pins finn_design_%d/s_axis_0]" + % (id, id) + ) + + else: + seed = 0xABCD + for id in range(num_sources): + config.append( + "create_bd_cell -type ip -vlnv xilinx.com:ip:axi_traffic_gen:3.0 axi_traffic_gen_%d" + % id + ) + config.append( + "set_property 
-dict [list \ + CONFIG.C_ATG_MODE {AXI4-Stream} \ + CONFIG.C_ATG_STREAMING_MAX_LEN_BITS {1} \ + CONFIG.C_AXIS_SPARSE_EN {false} \ + CONFIG.C_AXIS_TDATA_WIDTH {%d} \ + CONFIG.C_AXIS_TDEST_WIDTH {0} \ + CONFIG.C_AXIS_TID_WIDTH {0} \ + CONFIG.C_AXIS_TUSER_WIDTH {0} \ + CONFIG.STRM_DATA_SEED {%s} \ + ] [get_bd_cells axi_traffic_gen_%d]" + % (source_width, "0x{:04X}".format(seed), id) + ) + config.append( + "connect_bd_net [get_bd_pins axi_traffic_gen_%d/s_axi_aclk] [get_bd_pins axi_interconnect_0/aclk]" + % id + ) + config.append( + "connect_bd_net [get_bd_pins axi_traffic_gen_%d/s_axi_aresetn] [get_bd_pins axi_interconnect_0/aresetn]" + % id + ) + config.append( + "connect_bd_net [get_bd_pins finn_design_0/s_axis_0_tready] [get_bd_pins axi_traffic_gen_%d/m_axis_1_tready]" + % id + ) + seed = seed + 99 + + if num_sources > 1: + config.append( + "create_bd_cell -type ip -vlnv xilinx.com:ip:xlconcat:2.1 xlconcat_tdata" + ) + config.append( + "set_property CONFIG.NUM_PORTS {%d} [get_bd_cells xlconcat_tdata]" % num_sources + ) + + for id in range(num_sources): + config.append( + "connect_bd_net [get_bd_pins xlconcat_tdata/In%d] [get_bd_pins axi_traffic_gen_%d/m_axis_1_tdata]" + % (id, id) + ) + + config.append( + "connect_bd_net [get_bd_pins finn_design_0/s_axis_0_tdata] [get_bd_pins xlconcat_tdata/dout]" + ) + else: + config.append( + "connect_bd_net [get_bd_pins finn_design_0/s_axis_0_tdata] [get_bd_pins axi_traffic_gen_0/m_axis_1_tdata]" + ) + + # only connect valid from source 0 to DUT + config.append( + "connect_bd_net [get_bd_pins finn_design_0/s_axis_0_tvalid] [get_bd_pins axi_traffic_gen_0/m_axis_1_tvalid]" + ) + + # instantiate output harness + for id in range(self.dut_duplication): + config.append( + "create_bd_cell -type ip -vlnv xilinx.com:user:harness_sink:1.0 sink_%d" % id + ) + config.append( + "set_property -dict [list CONFIG.STREAM_WIDTH {%d}] [get_bd_cells sink_%d]" + % (outstream_width, id) + ) + config.append( + "connect_bd_intf_net [get_bd_intf_pins 
sink_%d/s_axis_0] [get_bd_intf_pins finn_design_%d/m_axis_0]" + % (id, id) + ) + + # GPIO control (TODO: connect interrupt) + config.append("create_bd_cell -type ip -vlnv xilinx.com:ip:axi_gpio:2.0 axi_gpio_0") + config.append( + "set_property -dict [list \ + CONFIG.C_ALL_INPUTS {0} \ + CONFIG.C_GPIO_WIDTH {5} \ + CONFIG.C_INTERRUPT_PRESENT {1} \ + ] [get_bd_cells axi_gpio_0]" + ) + config.append( + "connect_bd_intf_net [get_bd_intf_pins axi_gpio_0/S_AXI] " + "[get_bd_intf_pins axi_interconnect_0/M%02d_AXI]" % (axilite_idx) + ) + config.append("assign_axi_addr_proc axi_gpio_0/S_AXI") + axilite_idx += 1 + config.append("create_bd_cell -type ip -vlnv xilinx.com:ip:xlslice:1.0 xlslice_0") + config.append("create_bd_cell -type ip -vlnv xilinx.com:ip:xlslice:1.0 xlslice_1") + config.append("create_bd_cell -type ip -vlnv xilinx.com:ip:xlslice:1.0 xlslice_2") + config.append( + "set_property -dict [list \ + CONFIG.DIN_FROM {0} \ + CONFIG.DIN_TO {0} \ + CONFIG.DIN_WIDTH {5} \ + ] [get_bd_cells xlslice_0]" + ) + config.append( + "set_property -dict [list \ + CONFIG.DIN_FROM {1} \ + CONFIG.DIN_TO {1} \ + CONFIG.DIN_WIDTH {5} \ + ] [get_bd_cells xlslice_1]" + ) + config.append( + "set_property -dict [list \ + CONFIG.DIN_FROM {2} \ + CONFIG.DIN_TO {2} \ + CONFIG.DIN_WIDTH {5} \ + ] [get_bd_cells xlslice_2]" + ) + config.append("create_bd_cell -type ip -vlnv xilinx.com:ip:xlconcat:2.1 xlconcat_0") + config.append( + "set_property -dict [list CONFIG.IN1_WIDTH.VALUE_SRC USER CONFIG.IN2_WIDTH.VALUE_SRC USER CONFIG.IN0_WIDTH.VALUE_SRC USER] [get_bd_cells xlconcat_0]" + ) + config.append( + "set_property -dict [list \ + CONFIG.IN0_WIDTH {3} \ + CONFIG.NUM_PORTS {3} \ + ] [get_bd_cells xlconcat_0]" + ) + config.append("create_bd_cell -type ip -vlnv xilinx.com:ip:xlconstant:1.1 xlconstant_0") + config.append( + "set_property -dict [list \ + CONFIG.CONST_VAL {0} \ + CONFIG.CONST_WIDTH {3} \ + ] [get_bd_cells xlconstant_0]" + ) + config.append( + """ + connect_bd_net [get_bd_pins 
xlslice_0/Din] [get_bd_pins axi_gpio_0/gpio_io_o] + connect_bd_net [get_bd_pins xlslice_1/Din] [get_bd_pins axi_gpio_0/gpio_io_o] + connect_bd_net [get_bd_pins xlslice_2/Din] [get_bd_pins axi_gpio_0/gpio_io_o] + connect_bd_net [get_bd_pins xlconstant_0/dout] [get_bd_pins xlconcat_0/In0] + connect_bd_net [get_bd_pins axi_gpio_0/gpio_io_i] [get_bd_pins xlconcat_0/dout] + """ + ) + if self.dut_duplication > 1: + config.append("create_bd_cell -type ip -vlnv xilinx.com:ip:xlconcat:2.1 xlconcat_valid") + config.append( + "set_property CONFIG.NUM_PORTS {%d} [get_bd_cells xlconcat_valid]" + % self.dut_duplication + ) + config.append( + "create_bd_cell -type ip -vlnv xilinx.com:ip:xlconcat:2.1 xlconcat_checksum" + ) + config.append( + "set_property CONFIG.NUM_PORTS {%d} [get_bd_cells xlconcat_checksum]" + % self.dut_duplication + ) + + config.append("create_bd_cell -type module -reference vector_xor vector_xor_valid") + config.append( + "set_property CONFIG.WIDTH {%d} [get_bd_cells vector_xor_valid]" + % self.dut_duplication + ) + config.append("create_bd_cell -type module -reference vector_xor vector_xor_checksum") + config.append( + "set_property CONFIG.WIDTH {%d} [get_bd_cells vector_xor_checksum]" + % self.dut_duplication + ) + + config.append( + "connect_bd_net [get_bd_pins vector_xor_valid/in_data] [get_bd_pins xlconcat_valid/dout]" + ) + config.append( + "connect_bd_net [get_bd_pins vector_xor_checksum/in_data] [get_bd_pins xlconcat_checksum/dout]" + ) + config.append( + "connect_bd_net [get_bd_pins vector_xor_valid/out_data] [get_bd_pins xlconcat_0/In1]" + ) + config.append( + "connect_bd_net [get_bd_pins vector_xor_checksum/out_data] [get_bd_pins xlconcat_0/In2]" + ) + for id in range(self.dut_duplication): + config.append( + "connect_bd_net [get_bd_pins sink_%d/valid] [get_bd_pins xlconcat_valid/In%d]" + % (id, id) + ) + config.append( + "connect_bd_net [get_bd_pins sink_%d/checksum] [get_bd_pins xlconcat_checksum/In%d]" + % (id, id) + ) + else: + 
config.append("connect_bd_net [get_bd_pins sink_0/valid] [get_bd_pins xlconcat_0/In1]") + config.append( + "connect_bd_net [get_bd_pins sink_0/checksum] [get_bd_pins xlconcat_0/In2]" + ) + for id in range(self.dut_duplication): + config.append( + "connect_bd_net [get_bd_pins xlslice_2/Dout] [get_bd_pins sink_%d/enable]" % id + ) + for id in range(num_sources): + config.append( + "connect_bd_net [get_bd_pins xlslice_0/Dout] [get_bd_pins axi_traffic_gen_%d/core_ext_start]" + % id + ) + config.append( + "connect_bd_net [get_bd_pins xlslice_1/Dout] [get_bd_pins axi_traffic_gen_%d/core_ext_stop]" + % id + ) + + # create a temporary folder for the project + vivado_pynq_proj_dir = make_build_dir(prefix="vivado_zynq_proj_") + model.set_metadata_prop("vivado_pynq_proj", vivado_pynq_proj_dir) + + fclk_mhz = int(1 / (global_clk_ns * 0.001)) + + # create a TCL recipe for the project + ipcfg = vivado_pynq_proj_dir + "/ip_config.tcl" + config = "\n".join(config) + "\n" + with open(ipcfg, "w") as f: + f.write( + zynq_harness_template + % ( + fclk_mhz, + axilite_idx, + aximm_idx, + self.platform, + part_map[self.platform], + config, + ) + ) + + # create a TCL recipe for the project + synth_project_sh = vivado_pynq_proj_dir + "/synth_project.sh" + working_dir = os.environ["PWD"] + with open(synth_project_sh, "w") as f: + f.write("#!/bin/bash \n") + f.write("cd {}\n".format(vivado_pynq_proj_dir)) + f.write("vivado -mode batch -source %s\n" % ipcfg) + f.write("cd {}\n".format(working_dir)) + + # call the synthesis script + bash_command = ["bash", synth_project_sh] + process_compile = subprocess.Popen(bash_command, stdout=subprocess.PIPE) + process_compile.communicate() + + # collect results + os.makedirs(self.output_dir, exist_ok=True) + + bitfile_name = vivado_pynq_proj_dir + "/finn_zynq_link.runs/impl_1/top_wrapper.bit" + if not os.path.isfile(bitfile_name): + raise Exception( + "Synthesis failed, no bitfile found. 
Check logs under %s" % vivado_pynq_proj_dir + ) + hwh_name = vivado_pynq_proj_dir + "/finn_zynq_link.gen/sources_1/bd/top/hw_handoff/top.hwh" + if not os.path.isfile(hwh_name): + raise Exception( + "Synthesis failed, no hwh file found. Check logs under %s" % vivado_pynq_proj_dir + ) + synth_report_name = vivado_pynq_proj_dir + "/synth_report.xml" + model.set_metadata_prop("vivado_synth_rpt", synth_report_name) + model.set_metadata_prop("bitfile", bitfile_name) + model.set_metadata_prop("hw_handoff", hwh_name) + + shcopy(bitfile_name, self.output_dir) + shcopy(hwh_name, self.output_dir) + shcopy(synth_report_name, self.output_dir) + + post_synth_resources = model.analysis(post_synth_res) + with open(self.output_dir + "/post_synth_resources.json", "w") as f: + json.dump(post_synth_resources, f, indent=2) + + timing_rpt = ("%s/finn_zynq_link.runs/impl_1/top_wrapper_timing_summary_routed.rpt"% vivado_pynq_proj_dir) + shcopy(timing_rpt, self.output_dir + "/post_route_timing.rpt") + return (model, False) + +def step_synth_harness(model: ModelWrapper, cfg: DataflowBuildConfig): + # Build step version of above transformation (used for full builds) + model = model.transform(MakeZYNQHarnessProject( + platform=cfg.board, + output_dir=os.path.join(cfg.output_dir, "harness"), + #dut_duplication=dut_duplication, #TODO: enable for full builds + clock_period_ns=cfg.synth_clk_period_ns + )) + return model + +def start_test_batch_fast(results_path, project_path, run_target, pairs): + # Prepare tcl script + script = template_open.replace("$PROJ_PATH$", project_path) + # script = script.replace("$PERIOD$", period) + script = script.replace("$RUN$", run_target) + for toggle_rate, static_prob in pairs: + script = script + template_single_test + script = script.replace("$TOGGLE_RATE$", str(toggle_rate)) + script = script.replace("$STATIC_PROB$", str(static_prob)) + # script = script.replace("$SWITCH_TARGET$", switch_target) + script = script.replace("$REPORT_PATH$", results_path) + 
script = script.replace("$REPORT_NAME$", f"{toggle_rate}_{static_prob}") + with open(os.getcwd() + "/power_report.tcl", "w") as tcl_file: + tcl_file.write(script) + + # Prepare bash script + bash_script = os.getcwd() + "/report_power.sh" + with open(bash_script, "w") as script: + script.write("#!/bin/bash \n") + script.write(f"vivado -mode batch -source {os.getcwd()}/power_report.tcl\n") + + # Run script + sub_proc = subprocess.Popen(["bash", bash_script]) + sub_proc.communicate() + + # Parse results + for toggle_rate, static_prob in pairs: + power_report_dict = power_xml_to_dict(f"{results_path}/{toggle_rate}_{static_prob}.xml") + power_report_json = f"{results_path}/{toggle_rate}_{static_prob}.json" + with open(power_report_json, "w") as json_file: + json_file.write(json.dumps(power_report_dict, indent=2)) + + +def sim_power_report(results_path, project_path, in_width, out_width, dtype_width, sim_duration_ns): + # Prepare tcl script + script = template_open.replace("$PROJ_PATH$", project_path) + script = script.replace("$RUN$", "impl_1") + script = script + template_sim_power + script = script.replace("$TB_FILE_PATH$", os.getcwd() + "/switching_simulation_tb.v") + script = script.replace("$SAIF_FILE_PATH$", os.getcwd() + "/switching.saif") + script = script.replace("$SIM_DURATION_NS$", str(int(sim_duration_ns))) + script = script.replace("$REPORT_PATH$", results_path) + script = script.replace("$REPORT_NAME$", f"sim") + with open(os.getcwd() + "/power_report.tcl", "w") as tcl_file: + tcl_file.write(script) + + # Prepare testbench + testbench = template_switching_simulation_tb.replace("$INSTREAM_WIDTH$", str(in_width)) + testbench = testbench.replace("$OUTSTREAM_WIDTH$", str(out_width)) + testbench = testbench.replace("$DTYPE_WIDTH$", str(dtype_width)) + testbench = testbench.replace( + "$RANDOM_FUNCTION$", "$urandom_range(0, {max})".format(max=2**dtype_width - 1) + ) + with open(os.getcwd() + "/switching_simulation_tb.v", "w") as tb_file: + 
class bench():
    """Base class for a single benchmarking job.

    Subclasses provide the DUT-specific pieces (step_make_model,
    step_export_onnx, step_build, run); this base class provides the common
    estimate/HLS/rtlsim/synthesis/power steps and artifact bookkeeping.

    Fixes vs. the original check-in:
    * step_sim_power now logs ``power_dyn_<name>`` (with underscore),
      consistent with the keys written by step_synthesis.
    * the step_export_onnx stub now accepts the export path it is called
      with from steps_full_build_flow.
    """

    def __init__(self, params, task_id, run_id, artifacts_dir, save_dir, debug=True):
        """Set up directories and defaults for one benchmark run.

        params: dict of benchmark parameters (one point of the sweep).
        task_id/run_id: identifiers of the CI task and run within it.
        artifacts_dir: directory for CI-collected artifacts.
        save_dir: directory for locally saved artifacts (bitstreams, logs).
        debug: if True, also collect the FINN build and working dirs.
        """
        super().__init__()
        self.params = params
        self.task_id = task_id
        self.run_id = run_id
        self.artifacts_dir = artifacts_dir
        self.save_dir = save_dir
        self.debug = debug

        # TODO: setup a logger so output can go to console (with task id prefix)
        # and log simultaneously

        # General configuration
        # TODO: do not allow multiple targets in a single bench job due to measurement?
        self.board = params.get("board", "RFSoC2x2")

        if "part" in params:
            self.part = params["part"]
        elif self.board in part_map:
            self.part = part_map[self.board]
        else:
            raise Exception("No part specified for board %s" % self.board)

        self.clock_period_ns = params.get("clock_period_ns", 10)

        # Clear FINN tmp build dir before every run (to avoid excessive ramdisk
        # usage and duplicate debug artifacts)
        print("Clearing FINN BUILD DIR ahead of run")
        delete_dir_contents(os.environ["FINN_BUILD_DIR"])

        # Initialize output directories (might exist from other runs of the same job)
        self.artifacts_dir_models = os.path.join(self.artifacts_dir, "models")
        os.makedirs(self.artifacts_dir_models, exist_ok=True)
        self.artifacts_dir_power = os.path.join(
            self.artifacts_dir, "power_vivado", "run_%d" % (self.run_id)
        )
        os.makedirs(self.artifacts_dir_power, exist_ok=True)

        self.save_dir_bitstreams = os.path.join(self.save_dir, "bitstreams")
        os.makedirs(self.save_dir_bitstreams, exist_ok=True)

        # Intermediate models saved between steps
        # TODO: create setter functions for intermediate models or other artifacts
        # that log them to gitlab artifacts or local dir automatically
        self.model_initial = None
        self.model_step_hls = None
        self.model_step_synthesis = None

        # Dictionary collecting all benchmark results
        self.output_dict = {}

        # Inputs (e.g., ONNX model, golden I/O pair, folding config, etc.)
        # for custom FINN build flow
        self.build_inputs = {}

        # Tuples of (name, source path) to save as local artifacts upon run
        # completion or fail by exception
        self.local_artifacts_collection = []
        if self.debug:
            # Save entire FINN build dir and working dir
            # TODO: add option to only save upon exception (in FINN builder or
            # benchmarking infrastructure)
            self.local_artifacts_collection.append(("finn_tmp", os.environ["FINN_BUILD_DIR"]))
            self.local_artifacts_collection.append(("finn_cwd", os.environ["FINN_ROOT"]))

    def save_artifact(self, name, source_path):
        """Copy a file or directory into the CI artifacts dir under name/run_<id>."""
        target_path = os.path.join(self.artifacts_dir, name, "run_%d" % (self.run_id))
        os.makedirs(target_path, exist_ok=True)
        if os.path.isdir(source_path):
            copytree(source_path, target_path, dirs_exist_ok=True)
        else:
            shcopy(source_path, target_path)

    def save_local_artifact(self, name, source_path):
        """Copy a file or directory into the local save dir under name/run_<id>."""
        target_path = os.path.join(self.save_dir, name, "run_%d" % (self.run_id))
        os.makedirs(target_path, exist_ok=True)
        if os.path.isdir(source_path):
            copytree(source_path, target_path, dirs_exist_ok=True)
        else:
            shcopy(source_path, target_path)

    def save_local_artifacts_collection(self):
        """Flush the queued (name, path) artifacts; call on success OR failure."""
        for (name, source_path) in self.local_artifacts_collection:
            self.save_local_artifact(name, source_path)

    def step_make_model(self):
        # may be implemented in subclass
        pass

    def step_export_onnx(self, onnx_path=None):
        # may be implemented in subclass
        # NOTE: steps_full_build_flow calls this with the export path; the
        # original stub took no argument and would have raised TypeError.
        pass

    def step_build(self):
        # may be implemented in subclass
        pass

    def run(self):
        # must be implemented in subclass
        pass

    def step_finn_estimate(self):
        """Gather FINN analytical resource/cycle estimates into output_dict."""
        print("Gathering FINN estimates")

        model = self.model_initial
        finn_resources_model = res_estimation(model, fpgapart=self.part)
        finn_cycles_model = model.analysis(exp_cycles_per_layer)
        # NOTE(review): self.target_node is expected to be set by the subclass
        # before this step runs — confirm.
        if self.target_node:
            node = model.get_nodes_by_op_type(self.target_node)[0]
            finn_resources = finn_resources_model[node.name]
            finn_cycles = finn_cycles_model[node.name]
        else:
            finn_resources = finn_resources_model  # TODO: aggregate?
            finn_cycles = 0  # TODO: aggregate or drop
        finn_estimates = finn_resources
        finn_estimates["CYCLES"] = finn_cycles
        self.output_dict["finn_estimates"] = finn_estimates

    def step_hls(self):
        """Run Vitis HLS synthesis and record HLS resource estimates."""
        start_time = time.time()
        print("Performing Vitis HLS synthesis")
        model = self.model_initial
        model = model.transform(PrepareIP(self.part, self.clock_period_ns))
        model = model.transform(HLSSynthIP())

        hls_resources_model = model.analysis(hls_synth_res_estimation)
        if self.target_node:
            node = model.get_nodes_by_op_type(self.target_node)[0]
            hls_resources = hls_resources_model[node.name]
        else:
            hls_resources = hls_resources_model  # TODO: aggregate?
        self.output_dict["hls_estimates"] = hls_resources
        self.output_dict["hls_time"] = int(time.time() - start_time)

        self.model_step_hls = copy.deepcopy(model)

    def step_rtlsim(self):
        """Run one-sample Verilator RTL simulation and record the cycle count."""
        start_time = time.time()
        print("Performing Verilator RTL simulation (n=1)")
        # Prepare
        model = self.model_step_hls
        model = model.transform(SetExecMode("rtlsim"))
        model = model.transform(PrepareRTLSim())
        # Generate input data
        input_tensor = model.graph.input[0]
        input_shape = model.get_tensor_shape(input_tensor.name)
        input_dtype = model.get_tensor_datatype(input_tensor.name)
        x = gen_finn_dt_tensor(input_dtype, input_shape)
        input_dict = prepare_inputs(x, input_dtype, None)  # TODO: fix Bipolar conversion case
        # Run (output correctness is not checked here)
        # TODO: add functional verification throughout benchmarking steps
        oxe.execute_onnx(model, input_dict)["outp"]
        # Log result
        node = model.get_nodes_by_op_type("MVAU_hls")[0]
        inst = getCustomOp(node)
        rtlsim_cycles = inst.get_nodeattr("cycles_rtlsim")
        self.output_dict["rtlsim_cycles"] = rtlsim_cycles
        self.output_dict["rtlsim_time"] = int(time.time() - start_time)

    def step_synthesis(self):
        """Run stitched-IP out-of-context Vivado synthesis plus vectorless
        power estimation at several toggle rates."""
        # TODO: avoid duplicate synthesis by using shell build also for
        # post_synth_resources and power sim?
        # TODO: check OMX synth strategy again!
        start_time = time.time()
        print("Performing Vivado (stitched-ip, out-of-context) synthesis")
        model = self.model_step_hls
        model = model.transform(ReplaceVerilogRelPaths())
        model = model.transform(CreateStitchedIP(self.part, self.clock_period_ns))
        model = model.transform(
            SynthOutOfContext(part=self.part, clk_period_ns=self.clock_period_ns)
        )
        # NOTE: metadata prop is a dict repr produced by FINN itself
        ooc_synth_results = eval(model.get_metadata_prop("res_total_ooc_synth"))

        start_test_batch_fast(
            results_path=self.artifacts_dir_power,
            project_path=os.path.join(
                ooc_synth_results["vivado_proj_folder"], "vivadocompile", "vivadocompile.xpr"
            ),
            run_target="impl_1",
            pairs=[(25, 0.5), (50, 0.5), (75, 0.5)],
        )

        # Log most important power results directly (refer to detailed logs for more)
        for reportname in ["25_0.5", "50_0.5", "75_0.5"]:
            with open(os.path.join(self.artifacts_dir_power, "%s.json" % reportname), "r") as f:
                report = json.load(f)
                power = float(report["Summary"]["tables"][0]["Total On-Chip Power (W)"][0])
                power_dyn = float(report["Summary"]["tables"][0]["Dynamic (W)"][0])
                ooc_synth_results["power_%s" % reportname] = power
                ooc_synth_results["power_dyn_%s" % reportname] = power_dyn

        self.output_dict["ooc_synth"] = ooc_synth_results
        self.output_dict["ooc_synth_time"] = int(time.time() - start_time)

        # Save model for logging purposes
        model.save(
            os.path.join(self.artifacts_dir_models, "model_%d_synthesis.onnx" % (self.run_id))
        )
        self.model_step_synthesis = copy.deepcopy(model)

    def step_sim_power(self):
        """Run a switching simulation for an accurate power report.

        Requires step_synthesis to have populated output_dict["ooc_synth"].
        """
        start_time = time.time()
        if "ooc_synth" not in self.output_dict:
            # best-effort warning; the KeyError below will surface the problem
            print("ERROR: step_sim_power requires step_synthesis")
        print("Performing Vivado simulation for power report")
        if "rtlsim_cycles" in self.output_dict:
            sim_duration_ns = self.output_dict["rtlsim_cycles"] * 3 * self.clock_period_ns
        else:
            sim_duration_ns = (
                self.output_dict["finn_estimates"]["CYCLES"] * 3 * self.clock_period_ns
            )

        model = self.model_step_synthesis
        input_tensor = model.graph.input[0]
        output_tensor = model.graph.output[0]
        input_node_inst = getCustomOp(model.find_consumer(input_tensor.name))
        output_node_inst = getCustomOp(model.find_producer(output_tensor.name))
        sim_power_report(
            results_path=self.artifacts_dir_power,
            project_path=os.path.join(
                self.output_dict["ooc_synth"]["vivado_proj_folder"],
                "vivadocompile",
                "vivadocompile.xpr",
            ),
            in_width=input_node_inst.get_instream_width(),
            out_width=output_node_inst.get_outstream_width(),
            dtype_width=model.get_tensor_datatype(input_tensor.name).bitwidth(),
            sim_duration_ns=sim_duration_ns,
        )

        # Log most important power results directly (refer to detailed logs for more)
        for reportname in ["sim"]:
            with open(os.path.join(self.artifacts_dir_power, "%s.json" % reportname), "r") as f:
                report = json.load(f)
                power = float(report["Summary"]["tables"][0]["Total On-Chip Power (W)"][0])
                power_dyn = float(report["Summary"]["tables"][0]["Dynamic (W)"][0])
                self.output_dict["power_%s" % reportname] = power
                # FIX: key was "power_dyn%s" (missing underscore), inconsistent
                # with the "power_dyn_<name>" keys written by step_synthesis
                self.output_dict["power_dyn_%s" % reportname] = power_dyn

        self.output_dict["sim_power_time"] = int(time.time() - start_time)

    def step_synth_power(self):
        """Synthesize with the test harness for on-hardware power measurement."""
        start_time = time.time()
        if self.model_step_hls is None:
            # best-effort warning; the transform below will fail on None
            print("ERROR: step_synth_power requires step_hls")
        print("Performing Vivado synthesis with test harness integration for power measurement")

        dut_duplication = self.params.get("dut_duplication", 1)

        model = self.model_step_hls.transform(ReplaceVerilogRelPaths())
        model = model.transform(CreateStitchedIP(self.part, self.clock_period_ns))

        build_dir = "temp_output_harness_build"
        # TODO: replace hold harness with new instr wrapper implementation
        # TODO: if synth fails this could contain stale bitstreams which will be power tested
        model = model.transform(
            MakeZYNQHarnessProject(
                platform=self.board,
                output_dir=build_dir,
                dut_duplication=dut_duplication,
                clock_period_ns=self.clock_period_ns,
            )
        )

        # COPY bitstreams and other outputs
        # TODO: integrate better (e.g. as artifact) and remove redundant copy
        # TODO: make this more configurable or switch to job/artifact based power measurement
        shcopy(
            os.path.join(build_dir, "top_wrapper.bit"),
            os.path.join(self.save_dir_bitstreams, "run_%d.bit" % self.run_id),
        )
        shcopy(
            os.path.join(build_dir, "top.hwh"),
            os.path.join(self.save_dir_bitstreams, "run_%d.hwh" % self.run_id),
        )
        shcopy(
            os.path.join(build_dir, "synth_report.xml"),
            os.path.join(self.save_dir_bitstreams, "run_%d.xml" % self.run_id),
        )
        # frequency in MHz derived from the clock period in ns
        clock_freq_mhz = int(1.0 / self.clock_period_ns * 1000.0)
        measurement_settings = {"freq_mhz": clock_freq_mhz}
        with open(
            os.path.join(self.save_dir_bitstreams, "run_%d_settings.json" % self.run_id), "w"
        ) as f:
            json.dump(measurement_settings, f, indent=2)

        self.output_dict["synth_power_time"] = int(time.time() - start_time)

        # Save model for logging purposes
        model.save(
            os.path.join(self.artifacts_dir_models, "model_%d_synth_power.onnx" % (self.run_id))
        )

    def step_parse_builder_output(self, build_dir):
        """Parse selected reports/logs of a full FINN builder flow into output_dict."""
        # COPY bitstreams and other outputs
        # TODO: integrate better (e.g. as artifact) and remove redundant copy
        # TODO: make this more configurable or switch to job/artifact based power measurement
        # TODO: make compatible to new instr wrapper (or however we generate these outputs)
        shcopy(
            os.path.join(build_dir, "harness/top_wrapper.bit"),
            os.path.join(self.save_dir_bitstreams, "run_%d.bit" % self.run_id),
        )
        shcopy(
            os.path.join(build_dir, "harness/top.hwh"),
            os.path.join(self.save_dir_bitstreams, "run_%d.hwh" % self.run_id),
        )
        shcopy(
            os.path.join(build_dir, "harness/synth_report.xml"),
            os.path.join(self.save_dir_bitstreams, "run_%d.xml" % self.run_id),
        )
        clock_freq_mhz = int(1.0 / self.clock_period_ns * 1000.0)
        measurement_settings = {"freq_mhz": clock_freq_mhz}
        with open(
            os.path.join(self.save_dir_bitstreams, "run_%d_settings.json" % self.run_id), "w"
        ) as f:
            json.dump(measurement_settings, f, indent=2)

        # CHECK FOR VERIFICATION STEP SUCCESS
        # Collect all verification output filenames
        outputs = glob.glob(os.path.join(build_dir, "verification_output/*.npy"))
        # Extract the verification status for each verification output by matching
        # to the SUCCESS string contained in the filename
        # NOTE(review): an empty outputs list yields all([]) == True, i.e.
        # "success" with zero verification outputs — confirm this is intended.
        status = all([out.split("_")[-1].split(".")[0] == "SUCCESS" for out in outputs])

        # Construct a dictionary reporting the verification status as string
        self.output_dict["builder_verification"] = {
            "verification": {True: "success", False: "fail"}[status]
        }
        # TODO: mark job as failed if verification fails

        # PARSE LOGS
        report_path = os.path.join(build_dir, "harness/post_synth_resources.json")
        # TODO: check multiple possible sources for this log (e.g. if OOC synth
        # or Zynbuild was run)
        report_filter = "(top)"
        with open(report_path) as fh:
            # Load the JSON formatted report
            report = pd.read_json(fh, orient="index")
            # Filter the reported rows according to some regex filter rule
            report = report.filter(regex=report_filter, axis="rows")
            # Generate a summary of the total resources
            summary = report.sum()

        # TODO: parse finn estimates, hls estimates, step times, (rtlsim n=1, n=100)
        # TODO: add vivado latency simulation for special transformer case

        self.output_dict["builder"] = summary.to_dict()

    def steps_simple_model_flow(self):
        """Default step sequence for a simple model (mostly single custom_ops)."""
        do_hls = self.params.get("do_hls", False)
        do_rtlsim = self.params.get("do_rtlsim", False)
        do_synthesis = self.params.get("do_synthesis", False)
        do_sim_power = self.params.get("do_sim_power", False)
        do_synth_power = self.params.get("do_synth_power", False)

        # Perform steps
        model, dut_info = self.step_make_model()

        # Save model for logging purposes
        # TODO: benchmarking infrastructure could be integrated deeper into ONNX IR
        # and FINN custom_op/transformation infrastructure. E.g. parameters and paths
        # could be stored as onnx attributes and benchmarking steps as generic or
        # specialized custom_op transformations
        model.save(os.path.join(self.artifacts_dir_models, "model_%d_initial.onnx" % (self.run_id)))

        # Save model for use in other steps
        self.model_initial = model

        # Log dict reported by DUT-specific scripts to overall result dict.
        # E.g. this could contain SIMD/PE derived from folding factors or weight
        # distribution information
        self.output_dict["info"] = dut_info

        self.step_finn_estimate()

        if do_hls:
            self.step_hls()
        if do_rtlsim:
            self.step_rtlsim()
        if do_synthesis:
            self.step_synthesis()
        if do_sim_power:
            self.step_sim_power()
        if do_synth_power:
            self.step_synth_power()

    def steps_full_build_flow(self):
        """Default step sequence for benchmarking a full FINN builder flow."""
        # Use a temporary dir for buildflow-related files (next to FINN_BUILD_DIR).
        # Ensure it exists but is empty (clear potential artifacts from previous runs)
        tmp_buildflow_dir = os.path.join(os.environ["PATH_WORKDIR"], "buildflow")
        os.makedirs(tmp_buildflow_dir, exist_ok=True)
        delete_dir_contents(tmp_buildflow_dir)
        self.build_inputs["build_dir"] = os.path.join(tmp_buildflow_dir, "build_output")
        os.makedirs(self.build_inputs["build_dir"], exist_ok=True)
        self.local_artifacts_collection.append(("build_output", self.build_inputs["build_dir"]))

        if "model_dir" in self.params:
            # input ONNX model and verification input/output pairs are provided
            model_dir = self.params["model_dir"]
            self.build_inputs["onnx_path"] = os.path.join(model_dir, "model.onnx")
            self.build_inputs["input_npy_path"] = os.path.join(model_dir, "inp.npy")
            self.build_inputs["output_npy_path"] = os.path.join(model_dir, "out.npy")
        elif "model_path" in self.params:
            self.build_inputs["onnx_path"] = self.params["model_path"]
        else:
            # input ONNX model (+ optional I/O pair for verification) will be generated
            self.build_inputs["onnx_path"] = os.path.join(tmp_buildflow_dir, "model_export.onnx")
            self.step_export_onnx(self.build_inputs["onnx_path"])
            self.save_local_artifact("model_step_export", self.build_inputs["onnx_path"])

        # Optional extra build inputs, forwarded verbatim when present
        for key in ("folding_path", "specialize_path", "floorplan_path"):
            if key in self.params:
                self.build_inputs[key] = self.params[key]

        self.step_build()

        self.step_parse_builder_output(self.build_inputs["build_dir"])
def make_single_im2col_modelwrapper(k, ifm_ch, ifm_dim, ofm_dim, stride, dilation, idt):
    """Build a one-node QONNX model containing a generic Im2Col op.

    All geometry arguments are (height, width) pairs; idt is the FINN
    DataType of the input (the output uses the same type).
    """
    k_h, k_w = k
    ifm_dim_h, ifm_dim_w = ifm_dim
    stride_h, stride_w = stride
    dilation_h, dilation_w = dilation
    ofm_dim_h, ofm_dim_w = ofm_dim

    odt = idt
    inp = helper.make_tensor_value_info(
        "inp", TensorProto.FLOAT, [1, ifm_dim_h, ifm_dim_w, ifm_ch]
    )
    outp = helper.make_tensor_value_info(
        "outp", TensorProto.FLOAT, [1, ofm_dim_h, ofm_dim_w, k_h * k_w * ifm_ch]
    )

    im2col_node = helper.make_node(
        "Im2Col",
        ["inp"],
        ["outp"],
        domain="finn.custom_op.general",
        stride=[stride_h, stride_w],
        kernel_size=[k_h, k_w],
        input_shape=str((1, ifm_dim_h, ifm_dim_w, ifm_ch)),
        dilations=[dilation_h, dilation_w],
        pad_amount=[0, 0, 0, 0],
        pad_value=0,
    )
    graph = helper.make_graph(
        nodes=[im2col_node], name="im2col_graph", inputs=[inp], outputs=[outp]
    )

    model = helper.make_model(graph, producer_name="im2col-model")
    model = ModelWrapper(model)

    model.set_tensor_datatype("inp", idt)
    model.set_tensor_datatype("outp", odt)

    return model


def make_single_slidingwindow_modelwrapper(
    type,
    k,
    ifm_ch,
    ifm_dim,
    ofm_dim,
    simd,
    m,
    parallel_window,
    stride,
    dilation,
    idt,
    dw=0,
    ram_style="auto",
):
    """Build a one-node QONNX model containing a ConvolutionInputGenerator
    variant; ``type`` selects the op (e.g. ConvolutionInputGenerator_rtl).

    NOTE: the parameter name ``type`` shadows the builtin but is kept for
    interface compatibility with existing callers.
    """
    k_h, k_w = k
    ifm_dim_h, ifm_dim_w = ifm_dim
    stride_h, stride_w = stride
    dilation_h, dilation_w = dilation
    ofm_dim_h, ofm_dim_w = ofm_dim

    odt = idt
    inp = helper.make_tensor_value_info(
        "inp", TensorProto.FLOAT, [1, ifm_dim_h, ifm_dim_w, ifm_ch]
    )
    outp = helper.make_tensor_value_info(
        "outp", TensorProto.FLOAT, [1, ofm_dim_h, ofm_dim_w, k_h * k_w * ifm_ch]
    )

    SlidingWindow_node = helper.make_node(
        type,
        ["inp"],
        ["outp"],
        domain="finn.custom_op.fpgadataflow",
        backend="fpgadataflow",
        ConvKernelDim=[k_h, k_w],
        IFMChannels=ifm_ch,
        IFMDim=[ifm_dim_h, ifm_dim_w],
        OFMDim=[ofm_dim_h, ofm_dim_w],
        SIMD=simd,
        M=m,
        parallel_window=parallel_window,
        Stride=[stride_h, stride_w],
        Dilation=[dilation_h, dilation_w],
        inputDataType=idt.name,
        outputDataType=odt.name,
        depthwise=dw,
        ram_style=ram_style,
    )
    graph = helper.make_graph(
        nodes=[SlidingWindow_node],
        name="slidingwindow_graph",
        inputs=[inp],
        outputs=[outp],
    )

    model = helper.make_model(graph, producer_name="slidingwindow-model")
    model = ModelWrapper(model)

    model.set_tensor_datatype("inp", idt)
    model.set_tensor_datatype("outp", odt)

    # DEBUG
    # swg_node = model.get_nodes_by_op_type("ConvolutionInputGenerator_rtl")[0]
    # swg_inst = getCustomOp(swg_node)
    # swg_inst.set_nodeattr("rtlsim_trace", "/workspace/finn/finn-rtllib/swg/swg_test_trace.vcd")

    return model


def prepare_inputs(input_tensor):
    """Wrap the input tensor in the execution input dict expected by oxe."""
    return {"inp": input_tensor}


def bench_rtl_swg(params, task_id, run_id, results_dir):
    """Benchmark the RTL sliding-window generator for one parameter point.

    Returns a dict of estimates/measurements, or None when the parameter
    combination is skipped as invalid/redundant.

    FIX vs. original: the HLS-comparison squareness check overwrote
    ``is_square`` on every loop iteration, so effectively only the dilation
    pair was compared; now all geometry pairs are checked via all().
    """
    # Read params
    idt = params["idt"]
    k = params["k"]
    ifm_dim = params["ifm_dim"]
    ifm_ch = params["ifm_ch"]
    stride = params["stride"]
    dilation = params["dilation"]
    dw = params["dw"]
    simd = params["simd"]
    m = params["m"]
    parallel_window = params["parallel_window"]
    flip = params["flip"]
    ram_style = params["ram_style"]

    only_estimates = params["only_estimates"]
    skip_rtlsim = params["skip_rtlsim"]
    skip_synth = params["skip_synth"]
    synthesize_hls_comparison = params["synthesize_hls_comparison"]

    output_dict = {}

    # convert string to FINN DataType
    idt = DataType[idt]

    if flip:
        # flipped run is redundant when the geometry is fully symmetric
        if (
            ifm_dim[0] == ifm_dim[1]
            and k[0] == k[1]
            and stride[0] == stride[1]
            and dilation[0] == dilation[1]
        ):
            return
        k = k[::-1]
        ifm_dim = ifm_dim[::-1]
        stride = stride[::-1]
        dilation = dilation[::-1]

    k_h, k_w = k
    ifm_dim_h, ifm_dim_w = ifm_dim
    stride_h, stride_w = stride
    dilation_h, dilation_w = dilation

    kernel_width = (k_w - 1) * dilation_w + 1  # incl. dilation
    kernel_height = (k_h - 1) * dilation_h + 1  # incl. dilation

    # inter-dependent test parameters
    if simd == "ifm_ch":
        simd = ifm_ch

    # skip conditions (invalid or unsupported parameter combinations)
    if simd > ifm_ch:
        return
    if ifm_ch % simd != 0:
        return
    if kernel_width > ifm_dim_w or stride_w > ifm_dim_w:
        return
    if kernel_height > ifm_dim_h or stride_h > ifm_dim_h:
        return
    if (k_h == 1 and (stride_h != 1 or dilation_h != 1)) or (
        k_w == 1 and (stride_w != 1 or dilation_w != 1)
    ):
        return
    if k_h == 1 and k_w == 1 and simd != ifm_ch:
        return
    if parallel_window and simd != ifm_ch:
        return
    if not parallel_window and m > 1:
        return

    ofm_dim_h = compute_conv_output_dim(ifm_dim_h, k_h, stride_h, 0, dilation_h)
    ofm_dim_w = compute_conv_output_dim(ifm_dim_w, k_w, stride_w, 0, dilation_w)
    ofm_dim = [ofm_dim_h, ofm_dim_w]

    x = gen_finn_dt_tensor(idt, (1, ifm_dim_h, ifm_dim_w, ifm_ch))
    model = make_single_slidingwindow_modelwrapper(
        type="ConvolutionInputGenerator_rtl",
        k=k,
        ifm_ch=ifm_ch,
        ifm_dim=ifm_dim,
        ofm_dim=ofm_dim,
        simd=simd,
        m=m,
        parallel_window=parallel_window,
        stride=stride,
        dilation=dilation,
        idt=idt,
        dw=dw,
        ram_style=ram_style,
    )

    model = model.transform(SetExecMode("rtlsim"))
    model = model.transform(GiveUniqueNodeNames())
    if not only_estimates:
        model = model.transform(PrepareIP("xczu7ev-ffvc1156-2-e", 5))
        model = model.transform(PrepareRTLSim())

    node = model.get_nodes_by_op_type("ConvolutionInputGenerator_rtl")[0]
    inst = getCustomOp(node)

    # FINN analytical estimates
    exp_cycles_dict = model.analysis(exp_cycles_per_layer)
    exp_cycles = exp_cycles_dict[node.name]
    exp_res_dict = model.analysis(res_estimation)
    exp_res = exp_res_dict[node.name]

    output_dict["est_Cycles"] = exp_cycles
    output_dict["est_LUT"] = exp_res["LUT"]
    output_dict["est_BRAM"] = exp_res["BRAM_18K"] * 0.5
    output_dict["est_URAM"] = exp_res["URAM"]

    if only_estimates:
        return output_dict

    if not skip_rtlsim:
        # prepare input data
        input_dict = prepare_inputs(x)
        # execute model
        oxe.execute_onnx(model, input_dict)["outp"]

        cycles_rtlsim = inst.get_nodeattr("cycles_rtlsim")
        output_dict["Cycles"] = cycles_rtlsim
        print("RTLSIM cycles: %d" % cycles_rtlsim)

    if not skip_synth:
        model = model.transform(ReplaceVerilogRelPaths())
        model = model.transform(CreateStitchedIP("xczu7ev-ffvc1156-2-e", 5))
        model = model.transform(SynthOutOfContext(part="xczu7ev-ffvc1156-2-e", clk_period_ns=5))
        # NOTE: metadata prop is a dict repr produced by FINN itself
        ooc_res_dict = eval(model.get_metadata_prop("res_total_ooc_synth"))
        output_dict["LUT"] = ooc_res_dict["LUT"]
        output_dict["BRAM"] = ooc_res_dict["BRAM_18K"] * 0.5 + ooc_res_dict["BRAM_36K"]
        output_dict["URAM"] = ooc_res_dict["URAM"]
        output_dict["WNS"] = ooc_res_dict["WNS"]
        output_dict["Fmax"] = ooc_res_dict["fmax_mhz"]

    ###############################################################
    # HLS COMPARISON:
    if synthesize_hls_comparison:
        output_dict["HLS_compatible"] = "yes"

        # FIX: check all geometry pairs; the original loop overwrote the flag
        # each iteration so only dilation was effectively compared.
        props_to_check = [k, ifm_dim, ofm_dim, stride, dilation]
        is_square = all(prop[0] == prop[1] for prop in props_to_check)

        if not is_square or dilation[0] != 1 or dilation[1] != 1:
            # try 1D HLS ConvInpGen

            # rectangular case not supported
            if ifm_dim[0] == 1:
                if ofm_dim[0] != 1 or k[0] != 1 or stride[0] != 1 or dilation[0] != 1:
                    output_dict["HLS_compatible"] = "no"
            elif ifm_dim[1] == 1:
                if ofm_dim[1] != 1 or k[1] != 1 or stride[1] != 1 or dilation[1] != 1:
                    output_dict["HLS_compatible"] = "no"
            else:
                output_dict["HLS_compatible"] = "no"

            # unsupported parallelization
            if m > 1:
                output_dict["HLS_compatible"] = "no"
            if parallel_window > 0:
                fully_unfolded = simd == ifm_ch
                non_dws = dw == 0
                no_stride = stride_h == 1 and stride_w == 1
                no_dilation = dilation_h == 1 and dilation_w == 1
                supported_ram_style = ram_style in ["auto", "distributed"]
                if not (
                    fully_unfolded and non_dws and no_stride and no_dilation and supported_ram_style
                ):
                    output_dict["HLS_compatible"] = "no"

            # unsupported hyperparams
            if (dilation_h > 1 or dilation_w > 1) and (stride_h > 1 or stride_w > 1):
                output_dict["HLS_compatible"] = "no"
            if (dilation_h > 1 or dilation_w > 1) and dw == 0:
                output_dict["HLS_compatible"] = "no"

            model = make_single_slidingwindow_modelwrapper(
                type="ConvolutionInputGenerator1D",
                k=k,
                ifm_ch=ifm_ch,
                ifm_dim=ifm_dim,
                ofm_dim=ofm_dim,
                simd=simd,
                m=m,
                parallel_window=parallel_window,
                stride=stride,
                dilation=dilation,
                idt=idt,
                dw=dw,
                ram_style=ram_style,
            )
        else:
            # try 2D HLS ConvInpGen

            # unsupported parallelization
            if m > 1 or parallel_window > 0:
                output_dict["HLS_compatible"] = "no"

            model = make_single_slidingwindow_modelwrapper(
                type="ConvolutionInputGenerator",
                k=k,
                ifm_ch=ifm_ch,
                ifm_dim=ifm_dim,
                ofm_dim=ofm_dim,
                simd=simd,
                m=m,
                parallel_window=parallel_window,
                stride=stride,
                dilation=dilation,
                idt=idt,
                dw=dw,
                ram_style=ram_style,
            )

        if output_dict["HLS_compatible"] == "no":
            return output_dict

        # perform usual RTLSIM steps
        model = model.transform(SetExecMode("rtlsim"))
        model = model.transform(GiveUniqueNodeNames())
        model = model.transform(PrepareIP("xczu7ev-ffvc1156-2-e", 5))
        model = model.transform(HLSSynthIP())
        model = model.transform(PrepareRTLSim())

        # extract first results (estimates)
        node_ = model.get_nodes_by_op_type("ConvolutionInputGenerator")
        if len(node_) == 0:
            node_ = model.get_nodes_by_op_type("ConvolutionInputGenerator1D")
        node = node_[0]
        inst = getCustomOp(node)

        exp_cycles_dict = model.analysis(exp_cycles_per_layer)
        exp_cycles = exp_cycles_dict[node.name]
        output_dict["HLS_FINN_est_Cycles"] = exp_cycles

        exp_res_dict = model.analysis(res_estimation)
        exp_res = exp_res_dict[node.name]
        output_dict["HLS_FINN_est_LUT"] = exp_res["LUT"]
        output_dict["HLS_FINN_est_BRAM"] = exp_res["BRAM_18K"] * 0.5
        output_dict["HLS_FINN_est_URAM"] = exp_res["URAM"]

        exp_res_dict_hls = model.analysis(hls_synth_res_estimation)
        exp_res_hls = exp_res_dict_hls[node.name]
        output_dict["HLS_HLS_est_LUT"] = int(exp_res_hls["LUT"])
        output_dict["HLS_HLS_est_BRAM"] = int(exp_res_hls["BRAM_18K"]) * 0.5
        output_dict["HLS_HLS_est_URAM"] = int(exp_res_hls["URAM"])

        # perform rtlsim (for cycle measurement)
        if not skip_rtlsim:
            input_dict = prepare_inputs(x)
            oxe.execute_onnx(model, input_dict)["outp"]
            cycles_rtlsim = inst.get_nodeattr("cycles_rtlsim")
            output_dict["HLS_Cycles"] = cycles_rtlsim

        # perform ooc synthesis (for resource/slack measurement)
        model = model.transform(ReplaceVerilogRelPaths())
        model = model.transform(CreateStitchedIP("xczu7ev-ffvc1156-2-e", 5))
        model = model.transform(SynthOutOfContext(part="xczu7ev-ffvc1156-2-e", clk_period_ns=5))
        ooc_res_dict = eval(model.get_metadata_prop("res_total_ooc_synth"))
        output_dict["HLS_LUT"] = ooc_res_dict["LUT"]
        output_dict["HLS_BRAM"] = ooc_res_dict["BRAM_18K"] * 0.5 + ooc_res_dict["BRAM_36K"]
        output_dict["HLS_URAM"] = ooc_res_dict["URAM"]
        output_dict["HLS_WNS"] = ooc_res_dict["WNS"]
        output_dict["HLS_Fmax"] = ooc_res_dict["fmax_mhz"]

    return output_dict
+1,17 @@ +[ + { + "model_path": ["/pc2/groups/hpc-prf-ekiapp/felix/ci_models/metafi/model.onnx"], + "folding_path": ["/pc2/groups/hpc-prf-ekiapp/felix/ci_models/metafi/auto_folding_config _metaFi_f25.json"], + + "board": ["RFSoC2x2"], + "clock_period_ns": [10], + + "strategy": ["analytical"], + + "rtlsim_n": [10], + "throughput_factor_threshold": [0.9], + "fifo_reduction_skip_threshold": [1024], + "fifo_reduction_factor": [0.5], + "fifo_reduction_throughput_drop_threshold": [0.01] + } + ] \ No newline at end of file diff --git a/benchmarking/cfg/mvau_test.json b/benchmarking/cfg/mvau_test.json new file mode 100644 index 0000000000..0c3abdb574 --- /dev/null +++ b/benchmarking/cfg/mvau_test.json @@ -0,0 +1,29 @@ +[ + { + "idt": ["INT4","INT2"], + "wdt": ["INT4"], + "act": ["INT4"], + + "sparsity_type": ["none"], + "sparsity_amount": [0], + + "nhw": [[1,32,32]], + "mw": [64], + "mh": [64], + "sf": [-1], + "nf": [-1], + "m": [1], + + "mem_mode": ["internal_embedded"], + "ram_style": ["distributed"], + "ram_style_thr": ["distributed"], + + "do_hls": [true], + "do_rtlsim": [true], + "do_synthesis": [true], + "do_sim_power": [true], + "do_synth_power": [true], + + "dut_duplication": [1] + } + ] diff --git a/benchmarking/cfg/resnet50_fifosizing_test.json b/benchmarking/cfg/resnet50_fifosizing_test.json new file mode 100644 index 0000000000..1e85b972da --- /dev/null +++ b/benchmarking/cfg/resnet50_fifosizing_test.json @@ -0,0 +1,19 @@ +[ + { + "model_path": ["/pc2/groups/hpc-prf-ekiapp/felix/ci_models/ResNet-50/resnet50_w1a2_exported.onnx"], + "folding_path": ["/pc2/groups/hpc-prf-ekiapp/felix/ci_models/ResNet-50/U250_folding_config.json"], + "specialize_path": ["/pc2/groups/hpc-prf-ekiapp/felix/ci_models/ResNet-50/U250_specialize_layers.json"], + "floorplan_path": ["/pc2/groups/hpc-prf-ekiapp/felix/ci_models/ResNet-50/floorplan_resnet50.json"], + + "board": ["U250"], + "clock_period_ns": [4], + + "strategy": ["analytical"], + + "rtlsim_n": [2], + 
"throughput_factor_threshold": [0.9], + "fifo_reduction_skip_threshold": [1024], + "fifo_reduction_factor": [0.5], + "fifo_reduction_throughput_drop_threshold": [0.01] + } + ] \ No newline at end of file diff --git a/benchmarking/cfg/transformer_gpt_all.json b/benchmarking/cfg/transformer_gpt_all.json new file mode 100644 index 0000000000..27c426606e --- /dev/null +++ b/benchmarking/cfg/transformer_gpt_all.json @@ -0,0 +1,22 @@ +[ + { + "seed": [12], + "model_dir": ["/pc2/groups/hpc-prf-ekiapp/felix/ci_models/gpt_a"], + "dut_duplication": [1] + }, + { + "seed": [12], + "model_dir": ["/pc2/groups/hpc-prf-ekiapp/felix/ci_models/gpt_b"], + "dut_duplication": [1] + }, + { + "seed": [12], + "model_dir": ["/pc2/groups/hpc-prf-ekiapp/felix/ci_models/gpt_c"], + "dut_duplication": [1] + }, + { + "seed": [12], + "model_dir": ["/pc2/groups/hpc-prf-ekiapp/felix/ci_models/gpt_d"], + "dut_duplication": [1] + } +] diff --git a/benchmarking/cfg/transformer_radioml_all.json b/benchmarking/cfg/transformer_radioml_all.json new file mode 100644 index 0000000000..7dbdc217d7 --- /dev/null +++ b/benchmarking/cfg/transformer_radioml_all.json @@ -0,0 +1,7 @@ +[ + { + "seed": [12], + "model_dir": ["/pc2/groups/hpc-prf-ekiapp/felix/ci_models/radioml_0"], + "dut_duplication": [1] + } +] diff --git a/benchmarking/cfg/transformer_sweep.json b/benchmarking/cfg/transformer_sweep.json new file mode 100644 index 0000000000..d10c4d94ca --- /dev/null +++ b/benchmarking/cfg/transformer_sweep.json @@ -0,0 +1,92 @@ +[ + { + "seed": [12], + + "calibration_passes": [32], + + "model_num_heads": [1], + "model_num_layers": [1], + "model_bias":[true], + "model_emb_dim": [32], + "model_mlp_dim": [1536], + "model_seq_len": [512], + "model_bits": [2], + "model_norm": ["none"], + "model_mask": ["none"], + "model_positional_encoding": ["binary"], + + "dut_duplication": [1] + }, + { + "seed": [12], + + "calibration_passes": [32], + + "model_num_heads": [8], + "model_num_layers": [1], + "model_bias":[true], + 
"model_emb_dim": [256], + "model_mlp_dim": [1536], + "model_seq_len": [512], + "model_bits": [2], + "model_norm": ["none"], + "model_mask": ["none"], + "model_positional_encoding": ["binary"], + + "dut_duplication": [1] + }, + { + "seed": [12], + + "calibration_passes": [32], + + "model_num_heads": [12], + "model_num_layers": [1], + "model_bias":[true], + "model_emb_dim": [384], + "model_mlp_dim": [1536], + "model_seq_len": [512], + "model_bits": [2], + "model_norm": ["none"], + "model_mask": ["none"], + "model_positional_encoding": ["binary"], + + "dut_duplication": [1] + }, + { + "seed": [12], + + "calibration_passes": [32], + + "model_num_heads": [12], + "model_num_layers": [1], + "model_bias":[true], + "model_emb_dim": [96], + "model_mlp_dim": [1536], + "model_seq_len": [512], + "model_bits": [2], + "model_norm": ["none"], + "model_mask": ["none"], + "model_positional_encoding": ["binary"], + + "dut_duplication": [1] + }, + { + "seed": [12], + + "calibration_passes": [32], + + "model_num_heads": [1], + "model_num_layers": [1], + "model_bias":[true], + "model_emb_dim": [32], + "model_mlp_dim": [1536], + "model_seq_len": [512], + "model_bits": [2, 4, 6, 8], + "model_norm": ["none"], + "model_mask": ["none"], + "model_positional_encoding": ["binary"], + + "dut_duplication": [1] + } +] diff --git a/benchmarking/cfg/transformer_test.json b/benchmarking/cfg/transformer_test.json new file mode 100644 index 0000000000..784d96f93d --- /dev/null +++ b/benchmarking/cfg/transformer_test.json @@ -0,0 +1,20 @@ +[ + { + "seed": [12], + + "calibration_passes": [32], + + "model_num_heads": [1], + "model_num_layers": [1], + "model_bias":[true], + "model_emb_dim": [32], + "model_mlp_dim": [192], + "model_seq_len": [64], + "model_bits": [2], + "model_norm": ["none"], + "model_mask": ["none"], + "model_positional_encoding": ["binary"], + + "dut_duplication": [1] + } +] diff --git a/benchmarking/collect.py b/benchmarking/collect.py new file mode 100644 index 0000000000..3bc9aaf04b 
--- /dev/null +++ b/benchmarking/collect.py @@ -0,0 +1,90 @@ +import itertools +import json +import os +import sys +import time + +def merge_dicts(a: dict, b: dict): + for key in b: + if key in a: + if isinstance(a[key], dict) and isinstance(b[key], dict): + merge_dicts(a[key], b[key]) + elif a[key] != b[key]: + raise Exception("ERROR: Dict merge conflict") + else: + a[key] = b[key] + return a + +def consolidate_logs(path, output_filepath): + log = [] + i = 0 + while (i < 1024): + if (os.path.isfile(os.path.join(path,"task_%d.json"%(i)))): + with open(os.path.join(path,"task_%d.json"%(i)), "r") as f: + log_task = json.load(f) + log.extend(log_task) + i = i + 1 + + with open(output_filepath, "w") as f: + json.dump(log, f, indent=2) + +def merge_logs(log_a, log_b, log_out): + # merges json log (list of nested dicts) b into a, not vice versa (TODO) + + with open(log_a, "r") as f: + a = json.load(f) + with open(log_b, "r") as f: + b = json.load(f) + + for idx, run_a in enumerate(a): + for run_b in b: + if run_a["run_id"] == run_b["run_id"]: + #a[idx] |= run_b # requires Python >= 3.9 + #a[idx] = {**run_a, **run_b} + a[idx] = merge_dicts(run_a, run_b) + break + + # also sort by run id + out = sorted(a, key=lambda x: x["run_id"]) + + with open(log_out, "w") as f: + json.dump(out, f, indent=2) + +def wait_for_power_measurements(): + # TODO: detect when no bitstreams are to be measured (e.g. 
for fifosizing) and skip + # TODO: make configurable, relative to some env variable due to different mountint points + bitstreams_path = os.path.join("/mnt/pfs/hpc-prf-radioml/felix/jobs/", + "CI_" + os.environ.get("CI_PIPELINE_IID") + "_" + os.environ.get("CI_PIPELINE_NAME"), + "bitstreams") + + power_log_path = os.path.join("/mnt/pfs/hpc-prf-radioml/felix/jobs/", + "CI_" + os.environ.get("CI_PIPELINE_IID") + "_" + os.environ.get("CI_PIPELINE_NAME"), + "power_measure.json") + + # count bitstreams to measure (can't rely on total number of runs since some of them could've failed) + files = os.listdir(bitstreams_path) + bitstream_count = len(list(filter(lambda x : ".bit" in x, files))) + + log = [] + print("Checking if all bitstreams of pipeline have been measured..") + while(len(log) < bitstream_count): + if os.path.isfile(power_log_path): + with open(power_log_path, "r") as f: + log = json.load(f) + print("Found measurements for %d/%d bitstreams"%(len(log),bitstream_count)) + time.sleep(60) + print("Power measurement complete") + +if __name__ == "__main__": + print("Consolidating synthesis results from all sub-jobs of the array") + consolidate_logs(sys.argv[1], sys.argv[2]) + + # TODO: disabled for now, update accordingly to new runner-based measurement setup + # wait_for_power_measurements() + # power_log_path = os.path.join("/mnt/pfs/hpc-prf-radioml/felix/jobs/", + # "CI_" + os.environ.get("CI_PIPELINE_IID") + "_" + os.environ.get("CI_PIPELINE_NAME"), + # "power_measure.json") + # if os.path.isfile(power_log_path): + # print("Merging power measurement logs with remaining logs") + # merge_logs(sys.argv[2], power_log_path, sys.argv[2]) + print("Done") diff --git a/benchmarking/dut/fifosizing.py b/benchmarking/dut/fifosizing.py new file mode 100644 index 0000000000..46b972deb0 --- /dev/null +++ b/benchmarking/dut/fifosizing.py @@ -0,0 +1,576 @@ +import json +import numpy as np +import os +import shutil +import torch +import copy +from brevitas.export import 
export_qonnx +from onnx import TensorProto, helper +from qonnx.core.datatype import DataType +from qonnx.core.modelwrapper import ModelWrapper +from qonnx.custom_op.general.im2col import compute_conv_output_dim +from qonnx.custom_op.registry import getCustomOp +from qonnx.transformation.general import ( + GiveRandomTensorNames, + GiveReadableTensorNames, + GiveUniqueNodeNames, + GiveUniqueParameterTensors, +) +from qonnx.transformation.infer_data_layouts import InferDataLayouts +from qonnx.transformation.infer_datatypes import InferDataTypes +from qonnx.transformation.infer_shapes import InferShapes +from qonnx.transformation.merge_onnx_models import MergeONNXModels +from qonnx.util.basic import gen_finn_dt_tensor, qonnx_make_model +import finn.builder.build_dataflow as build +import finn.builder.build_dataflow_config as build_cfg +from finn.util.basic import make_build_dir +from util import summarize_table, summarize_section, power_xml_to_dict, prepare_inputs, delete_dir_contents +from finn.util.test import get_trained_network_and_ishape +from finn.util.basic import alveo_default_platform + +from dut.resnet50_custom_steps import ( + step_resnet50_tidy, + step_resnet50_streamline, + step_resnet50_convert_to_hw, + step_resnet50_slr_floorplan, + ) + +from bench_base import bench + +def generate_random_threshold_values( + data_type, num_input_channels, num_steps, narrow=False, per_tensor=False +): + if per_tensor: + num_input_channels = 1 + if narrow: + num_steps -= 1 + + return np.random.randint( + data_type.min(), + data_type.max() + 1, + (num_input_channels, num_steps), + ).astype(np.float32) + + +def sort_thresholds_increasing(thresholds): + return np.sort(thresholds, axis=1) + +def make_conv_building_block(ifm_dim, ch, kernel_size, simd, pe, parallel_window=0): + # hardcoded parameters + idt = DataType["UINT4"] + wdt = DataType["UINT4"] + odt = DataType["UINT4"] + tdt = DataType["UINT32"] + stride = 1 + in_ch = out_ch = ch # input channel = output channel for 
stacking + # pad so that input dim = output dim for stacking (only supports odd kernel_size for now) + pad = int(np.floor(kernel_size / 2)) + + total_pad = 2 * pad + out_feature_dim = compute_conv_output_dim(ifm_dim, kernel_size, stride, total_pad) + weights_shape = [in_ch * kernel_size * kernel_size, out_ch] + thresholds_shape = [1, odt.get_num_possible_values() - 1] + input_shape = [1, ifm_dim, ifm_dim, in_ch] + padding_out_shape = [1, ifm_dim + total_pad, ifm_dim + total_pad, in_ch] + inpgen_out_shape = [1, out_feature_dim, out_feature_dim, in_ch * kernel_size * kernel_size] + output_shape = [1, out_feature_dim, out_feature_dim, out_ch] + + assert input_shape == output_shape, "ERROR: Conv layer dimensions not stackable" + + padding_config = {} + padding_config["domain"] = "finn.custom_op.fpgadataflow.rtl" + padding_config["backend"] = "fpgadataflow" + padding_config["ImgDim"] = [ifm_dim, ifm_dim] + padding_config["NumChannels"] = in_ch + padding_config["SIMD"] = simd + padding_config["Padding"] = [pad, pad, pad, pad] + padding_config["inputDataType"] = idt.name + + inpgen_config = {} + inpgen_config["domain"] = "finn.custom_op.fpgadataflow.rtl" + inpgen_config["backend"] = "fpgadataflow" + inpgen_config["ConvKernelDim"] = [kernel_size, kernel_size] + inpgen_config["IFMChannels"] = in_ch + inpgen_config["IFMDim"] = [ifm_dim + total_pad, ifm_dim + total_pad] + inpgen_config["OFMDim"] = [ifm_dim, ifm_dim] + inpgen_config["inputDataType"] = idt.name + inpgen_config["outputDataType"] = idt.name + inpgen_config["SIMD"] = simd + inpgen_config["parallel_window"] = parallel_window + inpgen_config["Stride"] = [stride, stride] + inpgen_config["Dilation"] = [1, 1] + + mvau_config = {} + mvau_config["domain"] = "finn.custom_op.fpgadataflow.hls" + mvau_config["backend"] = "fpgadataflow" + mvau_config["numInputVectors"] = [1, ifm_dim, ifm_dim] + mvau_config["MW"] = in_ch * kernel_size * kernel_size + mvau_config["MH"] = in_ch + mvau_config["SIMD"] = simd if parallel_window == 
0 else simd * kernel_size * kernel_size + mvau_config["PE"] = pe + mvau_config["resType"] = "lut" + mvau_config["mem_mode"] = "internal_embedded" # internal_decoupled + mvau_config["inputDataType"] = idt.name + mvau_config["weightDataType"] = wdt.name + mvau_config["outputDataType"] = odt.name + + top_in = helper.make_tensor_value_info("top_in", TensorProto.FLOAT, input_shape) + top_out = helper.make_tensor_value_info("top_out", TensorProto.FLOAT, output_shape) + value_info = [ + helper.make_tensor_value_info("weights", TensorProto.FLOAT, weights_shape), + helper.make_tensor_value_info("thresholds", TensorProto.FLOAT, thresholds_shape), + helper.make_tensor_value_info("padding_out", TensorProto.FLOAT, padding_out_shape), + helper.make_tensor_value_info("inpgen_out", TensorProto.FLOAT, inpgen_out_shape), + ] + + modelproto = qonnx_make_model( + helper.make_graph( + name="building_block", + inputs=[top_in], + outputs=[top_out], + value_info=value_info, + nodes=[ + helper.make_node("FMPadding_rtl", ["top_in"], ["padding_out"], **padding_config), + helper.make_node( + "ConvolutionInputGenerator_rtl", + ["padding_out"], + ["inpgen_out"], + **inpgen_config, + ), + helper.make_node( + "MVAU_hls", ["inpgen_out", "weights", "thresholds"], ["top_out"], **mvau_config + ), + ], + ) + ) + + model = ModelWrapper(modelproto) + model.set_tensor_datatype("top_in", idt) + model.set_tensor_layout("top_in", ["N", "H", "W", "C"]) + model.set_tensor_datatype("top_out", odt) + model.set_tensor_datatype("weights", wdt) + model.set_tensor_datatype("thresholds", tdt) + + weights = gen_finn_dt_tensor(wdt, weights_shape) + # TODO: thresholds are all the same + thresholds = generate_random_threshold_values( + tdt, out_ch, odt.get_num_possible_values() - 1, False, True + ) + thresholds = sort_thresholds_increasing(thresholds) + + model.set_initializer("weights", weights) + model.set_initializer("thresholds", thresholds) + + model = model.transform(GiveUniqueNodeNames()) + model = 
model.transform(InferShapes()) + model = model.transform(InferDataTypes()) + + return model + + +def combine_blocks(lb, rb, ifm_dim, ch, pe): + # assumes left branch (lb) and right branch (rb) each have a single (dynamic) input/output with the same shape + # to avoid mix-ups, start by giving all tensors random names + lb = lb.transform(GiveRandomTensorNames()) + rb = rb.transform(GiveRandomTensorNames()) + # erase all node names to avoid conflict + for n in lb.graph.node: + n.name = "" + for n in rb.graph.node: + n.name = "" + + lb_input = lb.graph.input[0] + lb_output = lb.graph.output[0] + rb_input = rb.graph.input[0] + rb_output = rb.graph.output[0] + + top_in = helper.make_tensor_value_info("top_in", TensorProto.FLOAT, [1, ifm_dim, ifm_dim, ch]) + top_out = helper.make_tensor_value_info("top_out", TensorProto.FLOAT, [1, ifm_dim, ifm_dim, ch]) + + dup_config = {} + dup_config["domain"] = "finn.custom_op.fpgadataflow.hls" + dup_config["backend"] = "fpgadataflow" + dup_config["numInputVectors"] = [1, ifm_dim, ifm_dim] + dup_config["NumChannels"] = ch + dup_config["PE"] = pe + dup_config["NumOutputStreams"] = 2 + dup_config["inputDataType"] = lb.get_tensor_datatype(lb_input.name).name + + add_config = {} + add_config["domain"] = "finn.custom_op.fpgadataflow.hls" + add_config["backend"] = "fpgadataflow" + add_config["numInputVectors"] = [1, ifm_dim, ifm_dim] + add_config["NumChannels"] = ch + add_config["PE"] = pe + add_config["inputDataType"] = lb.get_tensor_datatype(lb_output.name).name + + nodes_lb = [node for node in lb.graph.node] + nodes_rb = [node for node in rb.graph.node] + nodes_new = ( + nodes_lb + + nodes_rb + + [ + helper.make_node( + "DuplicateStreams_hls", ["top_in"], [lb_input.name, rb_input.name], **dup_config + ), + helper.make_node( + "AddStreams_hls", [lb_output.name, rb_output.name], ["top_out"], **add_config + ), + ] + ) + + value_info_lb = [x for x in lb.graph.value_info] + value_info_rb = [x for x in rb.graph.value_info] + value_info_new = 
value_info_lb + value_info_rb + [lb_input, lb_output, rb_input, rb_output] + + initializer_lb = [x for x in lb.graph.initializer] + initializer_rb = [x for x in rb.graph.initializer] + initializer_new = initializer_lb + initializer_rb + modelproto = qonnx_make_model( + helper.make_graph( + name="branching_model", + inputs=[top_in], + outputs=[top_out], + value_info=value_info_new, + nodes=nodes_new, + ) + ) + + model = ModelWrapper(modelproto) + model.set_tensor_datatype("top_in", lb.get_tensor_datatype(lb_input.name)) + model.set_tensor_layout("top_in", lb.get_tensor_layout(lb_input.name)) + for i in initializer_new: + model.graph.initializer.append(i) + + # tidy-up + model = model.transform(InferShapes()) + model = model.transform(InferDataTypes()) + model = model.transform(InferDataLayouts()) + model = model.transform(GiveUniqueNodeNames()) + model = model.transform(GiveUniqueParameterTensors()) + model = model.transform(GiveReadableTensorNames()) + return model + +class bench_fifosizing(bench): + def step_export_onnx(self, onnx_export_path): + np.random.seed(0) + tmp_output_dir = make_build_dir("test_fifosizing") + + #TODO: generalize FIFO test so it can be used by other FIFO-related unit tests + # or make into a build flow output product "fifo_report" + #TODO: allow manual folding/fifo config as input + + #TODO: is a scenario possible where reducing depth of a single FIFO at a time is not sufficient for testing tightness? + # e.g. reducing > 1 FIFOs simultaneously does not cause a throughput drop while reducing a single FIFO does? + + #TODO: how to determine rtlsim_n automatically? 
+ + # conv parameters + dim = self.params["dim"] + kernel_size = self.params["kernel_size"] + ch = self.params["ch"] + simd = self.params["simd"] + pe = self.params["pe"] + parallel_window = self.params["parallel_window"] + + lb = None + for i in range(self.params["lb_num_layers"]): + new_block = make_conv_building_block( + dim, ch, kernel_size=kernel_size, simd=simd, pe=pe, parallel_window=parallel_window + ) + lb = new_block if lb is None else lb.transform(MergeONNXModels(new_block)) + lb.save(tmp_output_dir + "/lb.onnx") + + rb = None + for i in range(self.params["rb_num_layers"]): + new_block = make_conv_building_block( + dim, ch, kernel_size=kernel_size, simd=simd, pe=pe, parallel_window=parallel_window + ) + rb = new_block if rb is None else rb.transform(MergeONNXModels(new_block)) + rb.save(tmp_output_dir + "/rb.onnx") + + model = combine_blocks(lb, rb, dim, ch, pe=4) + model.save(onnx_export_path) + + def step_build_setup(self): + # create build config for synthetic test models + + cfg = build_cfg.DataflowBuildConfig( + output_dir = self.build_inputs["build_dir"], + synth_clk_period_ns = self.clock_period_ns, + verbose=False, + # only works with characterization-based FIFO-sizing + auto_fifo_depths=True, + auto_fifo_strategy="characterize", + characteristic_function_strategy=self.params["strategy"], + split_large_fifos=False, + # manual folding + target_fps=None, + # general rtlsim settings + force_python_rtlsim=False, + rtlsim_batch_size=self.params["rtlsim_n"], + shell_flow_type=build_cfg.ShellFlowType.VIVADO_ZYNQ, # TODO: generalize/adapt to new back-end + generate_outputs=[ + build_cfg.DataflowOutputType.ESTIMATE_REPORTS, + build_cfg.DataflowOutputType.STITCHED_IP, + build_cfg.DataflowOutputType.RTLSIM_PERFORMANCE, + ], + ) + + return cfg + + def step_fifotest(self, onnx_path, cfg, build_dir): + log = {} + build.build_dataflow_cfg(onnx_path, cfg) + + # load performance reports + with open(build_dir + "/report/estimate_network_performance.json") as f: + 
est_data = json.load(f) + with open(build_dir + "/report/rtlsim_performance.json") as f: + sim_data = json.load(f) + + # check for deadlock + model_final = ModelWrapper(build_dir + "/intermediate_models/step_create_stitched_ip.onnx") + first_node = getCustomOp(model_final.find_consumer(model_final.graph.input[0].name)) + last_node = getCustomOp(model_final.find_producer(model_final.graph.output[0].name)) + input_txns_expected = np.prod(first_node.get_folded_input_shape()[:-1]) * self.params["rtlsim_n"] + output_txns_expected = np.prod(last_node.get_folded_output_shape()[:-1]) * self.params["rtlsim_n"] + deadlock = sim_data["N_IN_TXNS"] != input_txns_expected or sim_data["N_OUT_TXNS"] != output_txns_expected + log["deadlock"] = deadlock.tolist() + + # check rtlsim throughput + throughput = sim_data["throughput[images/s]"] + stable_throughput = sim_data["stable_throughput[images/s]"] + estimated_throughput = est_data["estimated_throughput_fps"] + throughput_factor = throughput / estimated_throughput + stable_throughput_factor = stable_throughput / estimated_throughput + + # TODO: Take throughput or stable_throughput? 
+ throughput_pass = throughput_factor > self.params["throughput_factor_threshold"] + + log["throughput_pass"] = throughput_pass + log["throughput"] = throughput + log["stable_throughput"] = stable_throughput + log["estimated_throughput"] = estimated_throughput + + # log FIFO sizes for easier inspection + log["fifo_depths"] = {} + log["fifo_sizes"] = {} + total_fifo_size = 0 + for node in model_final.get_nodes_by_op_type("StreamingFIFO_rtl"): + node_inst = getCustomOp(node) + log["fifo_depths"][node.name] = node_inst.get_nodeattr("depth") + log["fifo_sizes"][node.name] = node_inst.get_instream_width() * node_inst.get_nodeattr("depth") + total_fifo_size += log["fifo_sizes"][node.name] + log["total_fifo_size_kB"] = int(total_fifo_size / 8.0 / 1000.0) + + # reduce individual FIFO sizes by some amount and observe throughput drop or deadlock appear + fifo_reduction_pass = [] + log["fifo_reduction_results"] = {} + model_orig = ModelWrapper(build_dir + "/intermediate_models/step_hw_ipgen.onnx") + for node_orig in model_orig.get_nodes_by_op_type("StreamingFIFO_rtl"): + model = copy.deepcopy(model_orig) + node = model.get_node_from_name(node_orig.name) + node_inst = getCustomOp(node) + + # skip shallow FIFOs + # TODO: do we need to consider rounding-up of FIFO depths for impl_style=vivado? 
+ if node_inst.get_nodeattr("depth") <= self.params["fifo_reduction_skip_threshold"]: + log["fifo_reduction_results"][node.name] = "skip" + continue + + # reduce depth of current FIFO and reset generated code + node_inst.set_nodeattr("depth", int(node_inst.get_nodeattr("depth") * self.params["fifo_reduction_factor"])) + node_inst.set_nodeattr("code_gen_dir_ipgen", "") + node_inst.set_nodeattr("ip_path", "") + node_inst.set_nodeattr("ipgen_path", "") + + # save model variation + tmp_output_dir_var = build_dir + "/variations/" + node.name + os.makedirs(tmp_output_dir_var) + model.save(tmp_output_dir_var + "/model.onnx") + + # build again, only re-run necessary steps to save time + cfg.output_dir = tmp_output_dir_var + cfg.steps = ["step_hw_codegen", "step_create_stitched_ip", "step_measure_rtlsim_performance"] + build.build_dataflow_cfg(tmp_output_dir_var + "/model.onnx", cfg) + + # load performance report + with open(tmp_output_dir_var + "/report/rtlsim_performance.json") as f: + sim_data = json.load(f) + + # check for deadlock + model_final = ModelWrapper(tmp_output_dir_var + "/intermediate_models/step_create_stitched_ip.onnx") + first_node = getCustomOp(model_final.find_consumer(model_final.graph.input[0].name)) + last_node = getCustomOp(model_final.find_producer(model_final.graph.output[0].name)) + input_txns_expected = np.prod(first_node.get_folded_input_shape()[:-1]) * self.params["rtlsim_n"] + output_txns_expected = np.prod(last_node.get_folded_output_shape()[:-1]) * self.params["rtlsim_n"] + var_deadlock = sim_data["N_IN_TXNS"] != input_txns_expected or sim_data["N_OUT_TXNS"] != output_txns_expected + + # check rtlsim throughput + var_throughput = sim_data["throughput[images/s]"] + var_stable_throughput = sim_data["stable_throughput[images/s]"] + # TODO: take throughput or stable_throughput? 
+ throughput_drop = (throughput - var_throughput) / throughput + + if var_deadlock: + fifo_reduction_pass.append(True) + log["fifo_reduction_results"][node.name] = 1.0 + elif throughput_drop > self.params["fifo_reduction_throughput_drop_threshold"]: + fifo_reduction_pass.append(True) + log["fifo_reduction_results"][node.name] = throughput_drop + else: + fifo_reduction_pass.append(False) + log["fifo_reduction_results"][node.name] = "fail (no drop)" + + self.output_dict["fifosizing_testresults"] = log + + def step_build(self): + # TODO: rename steps to model three phases: model creation/import, dataflow build, analysis + # dataflow build should be easily swappable and adpaptable to finn-examples + cfg = self.step_build_setup() + cfg.board = self.board + if "folding_path" in self.build_inputs: + cfg.folding_config_file = self.build_inputs["folding_path"] + if "specialize_path" in self.build_inputs: + cfg.specialize_layers_config_file = self.build_inputs["specialize_path"] + self.step_fifotest(self.build_inputs["onnx_path"], cfg, self.build_inputs["build_dir"]) + + def step_parse_builder_output(self, build_dir): + # build output itself is not relevant here (yet) + pass + + def run(self): + self.steps_full_build_flow() + + +# # custom steps +# from custom_steps import ( +# step_extract_absorb_bias, +# step_pre_streamline, +# step_residual_convert_to_hw, +# step_residual_streamline, +# step_residual_tidy, +# step_residual_topo, +# step_set_preferred_impl_style, +# step_convert_final_layers +# ) + +# TODO: put these definitions into separate files/classes so we can use them for other types of benchmaks as well +class bench_metafi_fifosizing(bench_fifosizing): + def step_build_setup(self): + # create build config for MetaFi models + + steps = [ + # step_residual_tidy, + # step_extract_absorb_bias, + # step_residual_topo, + # step_pre_streamline, + # step_residual_streamline, + # step_residual_convert_to_hw, + "step_create_dataflow_partition", + # 
step_set_preferred_impl_style, + "step_specialize_layers", + "step_target_fps_parallelization", + "step_apply_folding_config", + "step_minimize_bit_width", + "step_generate_estimate_reports", + "step_set_fifo_depths", + "step_hw_codegen", + "step_hw_ipgen", + "step_create_stitched_ip", + "step_measure_rtlsim_performance", + "step_out_of_context_synthesis", + "step_synthesize_bitfile", + "step_make_pynq_driver", + "step_deployment_package", + ] + + cfg = build_cfg.DataflowBuildConfig( + output_dir = self.build_inputs["build_dir"], + synth_clk_period_ns = self.clock_period_ns, + steps=steps, + verbose=False, + target_fps=None, #23 + shell_flow_type=build_cfg.ShellFlowType.VIVADO_ZYNQ, # TODO: generalize/adapt to new back-end + #vitis_platform=vitis_platform, + + auto_fifo_depths=False, + split_large_fifos=False, # probably needed #TODO: account for this in FIFO reduction test + + # general rtlsim settings + force_python_rtlsim=False, + rtlsim_batch_size=self.params["rtlsim_n"], + + # folding_config_file=folding_config_file, + # folding_config_file="/home/rz/project/finn-examples/build/vgg10-radioml/folding_config/auto_folding_config.json", + # specialize_layers_config_file = "output_%s_%s" % (model_name, release_platform_name) + "/template_specialize_layers_config.json", + # specialize_layers_config_file = "/home/rz/project/finn-examples/build/vgg10-radioml/specialize_layers_config/template_specialize_layers_config.json", + auto_fifo_strategy="characterize", + characteristic_function_strategy=self.params["strategy"], + #large_fifo_mem_style=build_cfg.LargeFIFOMemStyle.AUTO, + # standalone_thresholds=True, + # enable extra performance optimizations (physopt) + vitis_opt_strategy=build_cfg.VitisOptStrategyCfg.PERFORMANCE_BEST, + generate_outputs=[ + build_cfg.DataflowOutputType.ESTIMATE_REPORTS, + build_cfg.DataflowOutputType.STITCHED_IP, + build_cfg.DataflowOutputType.RTLSIM_PERFORMANCE, + build_cfg.DataflowOutputType.OOC_SYNTH, # not required for FIFO test, include 
for general testing + ], + ) + + # where is this used and why? + cfg.use_conv_rtl = True, # use rtl for conv layers (MVAU cannot use rtl in our model) + + return cfg + + +class bench_resnet50_fifosizing(bench_fifosizing): + def step_build_setup(self): + # create build config for ResNet-50 (based on finn-examples) + + resnet50_build_steps = [ + step_resnet50_tidy, + step_resnet50_streamline, + step_resnet50_convert_to_hw, + "step_create_dataflow_partition", + "step_specialize_layers", + "step_apply_folding_config", + "step_minimize_bit_width", + "step_generate_estimate_reports", + "step_set_fifo_depths", + "step_hw_codegen", + "step_hw_ipgen", + step_resnet50_slr_floorplan, + "step_create_stitched_ip", # was not in finn-examples + "step_measure_rtlsim_performance", # was not in finn-examples + "step_out_of_context_synthesis", # was not in finn-examples + "step_synthesize_bitfile", + "step_make_pynq_driver", + "step_deployment_package", + ] + + cfg = build_cfg.DataflowBuildConfig( + output_dir = self.build_inputs["build_dir"], + synth_clk_period_ns = self.clock_period_ns, + steps=resnet50_build_steps, + shell_flow_type=build_cfg.ShellFlowType.VITIS_ALVEO, # TODO: generalize/adapt to new back-end + auto_fifo_depths=False, + split_large_fifos=True, + vitis_platform=alveo_default_platform[self.board], # TODO: generalize/adapt to new back-end + + # enable extra performance optimizations (physopt) + vitis_opt_strategy=build_cfg.VitisOptStrategyCfg.PERFORMANCE_BEST, + generate_outputs=[ + build_cfg.DataflowOutputType.ESTIMATE_REPORTS, + build_cfg.DataflowOutputType.STITCHED_IP, + build_cfg.DataflowOutputType.RTLSIM_PERFORMANCE, + build_cfg.DataflowOutputType.OOC_SYNTH, # not required for FIFO test, include for general testing + ], + ) + + # non-standard build parameter for custom step + cfg.floorplan_path = self.build_inputs["floorplan_path"] + + return cfg \ No newline at end of file diff --git a/benchmarking/dut/mvau.py b/benchmarking/dut/mvau.py new file mode 100644 
index 0000000000..a41eec694b --- /dev/null +++ b/benchmarking/dut/mvau.py @@ -0,0 +1,295 @@ + +import math +import numpy as np +from onnx import TensorProto, helper +from qonnx.core.datatype import DataType +from qonnx.core.modelwrapper import ModelWrapper +from qonnx.custom_op.registry import getCustomOp +from qonnx.transformation.general import GiveUniqueNodeNames +from qonnx.transformation.infer_datatypes import InferDataTypes +from qonnx.transformation.infer_shapes import InferShapes +from qonnx.util.basic import ( + calculate_matvec_accumulator_range, + gen_finn_dt_tensor, + qonnx_make_model +) +from finn.transformation.fpgadataflow.minimize_accumulator_width import ( + MinimizeAccumulatorWidth, +) +from finn.transformation.fpgadataflow.minimize_weight_bit_width import ( + MinimizeWeightBitWidth, +) +from bench_base import bench + +class bench_mvau(bench): + + def _make_single_mvau_model( + self, + W, + numInputVectors, + pe, + simd, + m, + wdt, + idt, + odt, + T=None, + tdt=None, + mem_mode="const", + ram_style="auto", + ram_style_thresholds="auto", + ): + mw = W.shape[0] + mh = W.shape[1] + + # there are two ways to implement bipolar weights and inputs for + # MatrixVectorActivation: + # - specify their datatypes as such + # - specify their datatypes as BINARY as use binaryXnorMode + if wdt == DataType["BIPOLAR"] and idt == DataType["BIPOLAR"]: + # we'll internally convert weights/inputs to binary and specify the + # datatypes as such, and also set the binaryXnorMode attribute to 1 + export_wdt = DataType["BINARY"] + export_idt = DataType["BINARY"] + binary_xnor_mode = 1 + else: + export_wdt = wdt + export_idt = idt + binary_xnor_mode = 0 + + # numInputVectors for dense = [N] + # numInputVectors for conv = [N, H, W] + inp = helper.make_tensor_value_info("inp", TensorProto.FLOAT, numInputVectors + [mw]) + outp = helper.make_tensor_value_info("outp", TensorProto.FLOAT, numInputVectors + [mh]) + if T is not None: + no_act = 0 + node_inp_list = ["inp", 
"weights", "thresh"] + if odt == DataType["BIPOLAR"]: + actval = 0 + else: + actval = odt.min() + else: + # no thresholds + node_inp_list = ["inp", "weights"] + actval = 0 + no_act = 1 + mvau_node = helper.make_node( + "MVAU_hls", #TODO: add rtl support (configurable as param) + node_inp_list, + ["outp"], + domain="finn.custom_op.fpgadataflow.hls", + backend="fpgadataflow", + MW=mw, + MH=mh, + SIMD=simd, + PE=pe, + M=m, + numInputVectors=numInputVectors, + inputDataType=export_idt.name, + weightDataType=export_wdt.name, + outputDataType=odt.name, + ActVal=actval, + binaryXnorMode=binary_xnor_mode, + noActivation=no_act, + resType="lut", + mem_mode=mem_mode, + ram_style=ram_style, + ram_style_thresholds=ram_style_thresholds, + runtime_writeable_weights=0, + ) + + graph = helper.make_graph(nodes=[mvau_node], name="mvau_graph", inputs=[inp], outputs=[outp]) + model = qonnx_make_model(graph, producer_name="mvau-model") + model = ModelWrapper(model) + + model.set_tensor_datatype("inp", idt) + model.set_tensor_datatype("outp", odt) + model.set_tensor_datatype("weights", wdt) + # model.set_tensor_shape("weights", (channels, 1, k_h, k_w)) from VVAU + if binary_xnor_mode: + # convert bipolar to binary + model.set_initializer("weights", (W + 1) / 2) + else: + model.set_initializer("weights", W) + if T is not None: + model.set_tensor_datatype("thresh", tdt) + model.set_initializer("thresh", T) + + # Minimize weight & accumulator width to obtain realistic resource consumption + # model = model.transform(InferShapes()) + model = model.transform(MinimizeWeightBitWidth()) + model = model.transform(MinimizeAccumulatorWidth()) + model = model.transform(InferDataTypes()) + + return model + + def step_make_model(self): + # Read params + idt = self.params["idt"] + wdt = self.params["wdt"] + act = self.params["act"] + + numInputVectors = self.params["nhw"] + mw = self.params["mw"] + mh = self.params["mh"] + sf = self.params["sf"] + nf = self.params["nf"] + m = self.params["m"] + + 
mem_mode = self.params["mem_mode"] + ram_style = self.params["ram_style"] + ram_style_thr = self.params["ram_style_thr"] + + output_dict = {} + + # convert string to FINN DataType + idt = DataType[idt] + wdt = DataType[wdt] + if act is not None: + act = DataType[act] + + # Determine and log folding + if sf == -1: + sf = mw + simd = mw // sf + if nf == -1: + nf = mh + pe = mh // nf + if mw % simd != 0 or mh % pe != 0: + print("Invalid simd/pe configuration, skipping") + return + if m > 1 and (simd != mw or pe != mh): + print("M > 1 not possible for non-max simd/pe, skipping") + return + output_dict["simd"] = simd + output_dict["pe"] = pe + + # Generate weights + np.random.seed(123456) # TODO: verify or switch to modern numpy random generation + + W = gen_finn_dt_tensor(wdt, (mw, mh)) + + if "sparsity_type" in self.params: + sparsity_type = self.params["sparsity_type"] + else: + sparsity_type = "none" + + if sparsity_type == "none": + if "sparsity_amount" in self.params: + if self.params["sparsity_amount"] > 0: + print("sparsity amount > 0 not applicable for none sparsity, skipping") + return + else: + if self.params["sparsity_amount"] == 0: + print("sparsity amount = 0 not applicable for selected sparsity, skipping") + return + if sparsity_type == "unstructured": + idx = np.random.choice( + mw * mh, size=int(self.params["sparsity_amount"] * mw * mh), replace=False + ) + W = np.reshape(W, -1) + W[idx] = 0.0 + W = np.reshape(W, (mw, mh)) + elif sparsity_type == "rows_random": + idx_mw = np.random.choice(mw, size=int(self.params["sparsity_amount"] * mw), replace=False) + W[idx_mw, :] = 0.0 + elif sparsity_type == "cols_random": + idx_mh = np.random.choice(mh, size=int(self.params["sparsity_amount"] * mh), replace=False) + W[:, idx_mh] = 0.0 + elif sparsity_type == "rows_regular": + if self.params["sparsity_amount"] == 0.25: + idx_mw = np.arange(0, mw, step=4) + elif self.params["sparsity_amount"] == 0.5: + idx_mw = np.arange(0, mw, step=2) + elif 
self.params["sparsity_amount"] == 0.75: + idx_mw = np.concatenate( + (np.arange(0, mw, step=4), np.arange(1, mw, step=4), np.arange(2, mw, step=4)) + ) + else: + print("regular sparsity only applicable for amount 0.25/0.5/0.75, skipping") + return + W[idx_mw, :] = 0.0 + elif sparsity_type == "cols_regular": + if self.params["sparsity_amount"] == 0.25: + idx_mh = np.arange(0, mh, step=4) + elif self.params["sparsity_amount"] == 0.5: + idx_mh = np.arange(0, mh, step=2) + elif self.params["sparsity_amount"] == 0.75: + idx_mh = np.concatenate( + (np.arange(0, mh, step=4), np.arange(1, mh, step=4), np.arange(2, mh, step=4)) + ) + else: + print("regular sparsity only applicable for amount 0.25/0.5/0.75, skipping") + return + W[:, idx_mh] = 0.0 + + else: + print("ERROR: unknown sparsity type") + raise Exception("ERROR: unknown sparsity type") + + # TODO: implement enforce option which prevents naturally occurring sparsity + # params["sparsity_enforce"] + # TODO: implement distribution option which selects between uniform/normal/?? 
+ # params["sparsity_distribution"] + + # log resulting sparsity statistics + # could be higher than selected due to naturally occurring sparsity + num_zeros = (W == 0).sum() + num_ones = (W == 1).sum() + (W == -1).sum() + num_p2 = 0 + for w in np.nditer(W): + if w != 0 and w != 1 and w != -1: + if w > 0: + if math.log2(w).is_integer(): + num_p2 = num_p2 + 1 + else: + if math.log2(-w).is_integer(): + num_p2 = num_p2 + 1 + output_dict["zero_weights"] = round(num_zeros / W.size, 2) + output_dict["easy_weights"] = round((num_zeros + num_ones + num_p2) / W.size, 2) + + # Generate thresholds + if act is None: + # no activation, produce accumulators + T = None + tdt = None + if wdt == DataType["BIPOLAR"] and idt == DataType["BIPOLAR"]: + odt = DataType["UINT32"] + else: + odt = DataType["INT32"] + else: + odt = act + # set range for threshold values according to worst-case accumulator range (not weight value specific) + # this could result in some thresholds being clipped by MinimizeAccumulatorWidth + # lower_range = calculate_matvec_accumulator_range(wdt.min() * np.ones_like(W), idt) + # upper_range = calculate_matvec_accumulator_range(wdt.max() * np.ones_like(W), idt) + # acc_min = min(min(lower_range), min(upper_range)) + # acc_max = max(max(lower_range), max(upper_range)) + # set range for threshold values according to actual accumulator range for the generated weights + (acc_min, acc_max) = calculate_matvec_accumulator_range(W, idt) + n_steps = act.get_num_possible_values() - 1 + T = np.random.randint(acc_min, acc_max - 1, (mh, n_steps)).astype(np.float32) + # provide non-decreasing thresholds + T = np.sort(T, axis=1) + # generate thresholds for activation + if wdt == DataType["BIPOLAR"] and idt == DataType["BIPOLAR"]: + tdt = DataType["UINT32"] + # bias thresholds to be positive + T = np.ceil((T + mw) / 2) + assert (T >= 0).all() + else: + tdt = DataType["INT32"] + + # Create model + model = self._make_single_mvau_model( + W, numInputVectors, pe, simd, m, wdt, idt, 
odt, T, tdt, mem_mode, ram_style, ram_style_thr + ) + model = model.transform(GiveUniqueNodeNames()) + node = model.get_nodes_by_op_type("MVAU_hls")[0] + inst = getCustomOp(node) + + self.target_node = "MVAU_hls" # display results of analysis passes only for the first occurrence of this op type + return model, output_dict + + def run(self): + self.steps_simple_model_flow() diff --git a/benchmarking/dut/resnet50_custom_steps.py b/benchmarking/dut/resnet50_custom_steps.py new file mode 100644 index 0000000000..ddf8b0d0de --- /dev/null +++ b/benchmarking/dut/resnet50_custom_steps.py @@ -0,0 +1,252 @@ +# Copyright (C) 2020-2022, Xilinx, Inc. +# Copyright (C) 2022-2024, Advanced Micro Devices, Inc. +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# * Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# * Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# * Neither the name of FINN nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +from qonnx.core.modelwrapper import ModelWrapper + +from qonnx.transformation.fold_constants import FoldConstants + +from qonnx.transformation.general import ( + ConvertSubToAdd, + ConvertDivToMul, + GiveReadableTensorNames, + GiveUniqueNodeNames, + SortGraph, + RemoveUnusedTensors, + GiveUniqueParameterTensors, + RemoveStaticGraphInputs, + ApplyConfig, +) + +from finn.transformation.streamline.absorb import ( + AbsorbScalarMulAddIntoTopK, + AbsorbAddIntoMultiThreshold, + AbsorbMulIntoMultiThreshold, + FactorOutMulSignMagnitude, + Absorb1BitMulIntoMatMul, + Absorb1BitMulIntoConv, + AbsorbConsecutiveTransposes, + AbsorbTransposeIntoMultiThreshold, +) + +from finn.transformation.streamline.collapse_repeated import ( + CollapseRepeatedAdd, + CollapseRepeatedMul, +) + +from finn.transformation.streamline.reorder import ( + MoveAddPastMul, + MoveScalarMulPastMatMul, + MoveScalarAddPastMatMul, + MoveAddPastConv, + MoveScalarMulPastConv, + MoveScalarLinearPastInvariants, + MoveMaxPoolPastMultiThreshold, +) + +from finn.transformation.streamline.round_thresholds import RoundAndClipThresholds +from finn.transformation.streamline.sign_to_thres import ConvertSignToThres +from qonnx.transformation.batchnorm_to_affine import BatchNormToAffine + +# just for not linear +from finn.transformation.streamline.reorder import ( + MoveLinearPastEltwiseAdd, + MoveLinearPastFork, +) + +from qonnx.transformation.double_to_single_float import DoubleToSingleFloat 
+from qonnx.transformation.remove import RemoveIdentityOps +from qonnx.core.datatype import DataType + +from qonnx.transformation.infer_shapes import InferShapes +from qonnx.transformation.infer_datatypes import InferDataTypes +from qonnx.transformation.infer_data_layouts import InferDataLayouts +from qonnx.transformation.insert_topk import InsertTopK +import finn.transformation.fpgadataflow.convert_to_hw_layers as to_hw +from qonnx.transformation.lower_convs_to_matmul import LowerConvsToMatMul + +from finn.builder.build_dataflow_config import ( + DataflowBuildConfig, + ShellFlowType, +) + +from finn.transformation.move_reshape import RemoveCNVtoFCFlatten + + +def step_resnet50_tidy(model: ModelWrapper, cfg: DataflowBuildConfig): + model = model.transform(GiveUniqueParameterTensors()) + model = model.transform(InferShapes()) + model = model.transform(FoldConstants()) + model = model.transform(RemoveStaticGraphInputs()) + model = model.transform(GiveUniqueNodeNames()) + model = model.transform(GiveReadableTensorNames()) + model = model.transform(InferDataTypes()) + model = model.transform(InsertTopK()) + model = model.transform(InferShapes()) + model = model.transform(GiveUniqueNodeNames()) + model = model.transform(GiveReadableTensorNames()) + model = model.transform(InferDataTypes()) + return model + + +def step_resnet50_streamline_linear(model: ModelWrapper, cfg: DataflowBuildConfig): + streamline_transformations = [ + AbsorbScalarMulAddIntoTopK(), # before MoveAddPastMul to avoid int->float + ConvertSubToAdd(), + ConvertDivToMul(), + RemoveIdentityOps(), + CollapseRepeatedMul(), + BatchNormToAffine(), + ConvertSignToThres(), + MoveAddPastMul(), + MoveScalarAddPastMatMul(), + MoveAddPastConv(), + MoveScalarMulPastMatMul(), + MoveScalarMulPastConv(), + MoveScalarLinearPastInvariants(), + MoveAddPastMul(), + CollapseRepeatedAdd(), + CollapseRepeatedMul(), + AbsorbAddIntoMultiThreshold(), + FactorOutMulSignMagnitude(), + MoveMaxPoolPastMultiThreshold(), + 
AbsorbMulIntoMultiThreshold(), + Absorb1BitMulIntoMatMul(), + Absorb1BitMulIntoConv(), + RoundAndClipThresholds(), + ] + for trn in streamline_transformations: + model = model.transform(trn) + model = model.transform(GiveUniqueNodeNames()) + return model + + +def step_resnet50_streamline_nonlinear(model: ModelWrapper, cfg: DataflowBuildConfig): + streamline_transformations = [ + MoveLinearPastEltwiseAdd(), + MoveLinearPastFork(), + ] + for trn in streamline_transformations: + model = model.transform(trn) + model = model.transform(GiveUniqueNodeNames()) + return model + + +def step_resnet50_streamline(model: ModelWrapper, cfg: DataflowBuildConfig): + for iter_id in range(4): + model = step_resnet50_streamline_linear(model, cfg) + model = step_resnet50_streamline_nonlinear(model, cfg) + + # big loop tidy up + model = model.transform(RemoveUnusedTensors()) + model = model.transform(GiveReadableTensorNames()) + model = model.transform(InferDataTypes()) + model = model.transform(SortGraph()) + + model = model.transform(DoubleToSingleFloat()) + + return model + + +def step_resnet50_convert_to_hw(model: ModelWrapper, cfg: DataflowBuildConfig): + model.set_tensor_datatype(model.graph.input[0].name, DataType["UINT8"]) + model = model.transform(InferDataLayouts()) + model = model.transform(DoubleToSingleFloat()) + model = model.transform(InferDataTypes()) + model = model.transform(SortGraph()) + + to_hw_transformations = [ + to_hw.InferAddStreamsLayer, + LowerConvsToMatMul, + to_hw.InferChannelwiseLinearLayer, + to_hw.InferPool, + AbsorbTransposeIntoMultiThreshold, + RoundAndClipThresholds, + to_hw.InferQuantizedMatrixVectorActivation, + to_hw.InferThresholdingLayer, + AbsorbConsecutiveTransposes, + to_hw.InferConvInpGen, + to_hw.InferDuplicateStreamsLayer, + to_hw.InferLabelSelectLayer, + ] + for trn in to_hw_transformations: + model = model.transform(trn()) + model = model.transform(InferDataLayouts()) + model = model.transform(GiveUniqueNodeNames()) + model = 
model.transform(InferDataTypes()) + + model = model.transform(RemoveCNVtoFCFlatten()) + model = model.transform(GiveReadableTensorNames()) + model = model.transform(RemoveUnusedTensors()) + model = model.transform(SortGraph()) + + return model + + +def step_resnet50_slr_floorplan(model: ModelWrapper, cfg: DataflowBuildConfig): + if cfg.shell_flow_type == ShellFlowType.VITIS_ALVEO: + # previously, we would always ran the finn experimental partitioner on ResNet-50 + # this is now changed and a fixed floorplan is applied + model = model.transform(GiveUniqueNodeNames()) + model = model.transform(ApplyConfig(cfg.floorplan_path)) + print("Fixed SLR floorplanning applied") + + # if you would like to try out the experimental partitioner + # please uncomment the lines (that are not marked as comment) below. + + # import numpy as np + # from finnexperimental.analysis.partitioning import partition + + # comment: apply partitioning of the model, restricting the first and last layer to SLR0 + # default_slr = 0 + # abs_anchors = [(0, [default_slr]), (-1, [default_slr])] + + # comment: increase resource limits to make partitioning feasible, except for SLR0 + # comment: which also has DDR subsystem + # limits = np.array( + # [ + # [0.75, 0.5, 0.7, 0.6, 0.6], + # [1, 0.7, 0.9, 0.8, 0.8], + # [1, 0.7, 0.9, 0.8, 0.8], + # [1, 0.7, 0.9, 0.8, 0.8], + # ] + # ) + # floorplan = partition( + # model, + # cfg.synth_clk_period_ns, + # cfg.board, + # abs_anchors=abs_anchors, + # multivariant=False, + # linear_cuts=True, + # limits=limits, + # )[0] + + # comment: apply floorplan to model + # model = model.transform(ApplyConfig(floorplan)) + # print("SLR floorplanning applied from partitioner") + return model \ No newline at end of file diff --git a/benchmarking/dut/transformer.py b/benchmarking/dut/transformer.py new file mode 100644 index 0000000000..0dc6444a55 --- /dev/null +++ b/benchmarking/dut/transformer.py @@ -0,0 +1,1046 @@ +# Adapted from Christoph's attention-dummy repository + +# 
PyTorch base package: Math and Tensor Stuff +import torch +# Brevitas wrapper around PyTorch tensors adding quantization information +from brevitas.quant_tensor import QuantTensor +# Brevitas: Quantized versions of PyTorch layers +from brevitas.nn import ( + QuantMultiheadAttention, + QuantEltwiseAdd, + QuantIdentity, + QuantLinear, + QuantReLU +) +import os +# Progressbar +from tqdm import trange +import numpy as np +from brevitas.export import export_qonnx +import random +import json +import subprocess +from util import summarize_table, summarize_section, power_xml_to_dict, prepare_inputs, delete_dir_contents +# FINN dataflow builder +import finn.builder.build_dataflow as build +import finn.builder.build_dataflow_config as build_cfg +from finn.builder.build_dataflow_config import AutoFIFOSizingMethod +from bench_base import bench, step_synth_harness + +# Custom build steps required to streamline and convert the attention operator +from dut.transformer_custom_steps import ( + step_tidy_up_pre_attention, + step_tidy_up_post_attention, + step_streamline_attention, + step_streamline_residual, + step_streamline_norms, + step_streamline_positional, + step_convert_attention_to_hw, + step_convert_elementwise_binary_to_hw, + step_convert_lookup_to_hw, + step_replicate_streams, + set_target_parallelization, + set_fifo_depths, + step_apply_folding_config, + node_by_node_rtlsim, + node_by_node_cppsim +) +from performance.platform_build_steps import( + test_step_gen_vitis_xo, + test_step_gen_instrumentation_wrapper, + test_step_gen_instrwrap_sim, + test_step_insert_tlastmarker, + test_step_export_xo, + test_step_build_platform, + test_step_run_instrwrap_sim +) + +### ADAPTED FROM utils.py +# Seeds all relevant random number generators to the same seed for +# reproducibility +def seed(s): + random.seed(s) + np.random.seed(s) + torch.manual_seed(s) + +### ADAPTED FROM model.py +# Derives a weight quantizer from the brevitas bases leaving bit-width and +# signedness configurable 
+def weight_quantizer(bits, _signed=True): + # Brevitas quantizer base classes + from brevitas.quant.base import NarrowIntQuant, MaxStatsScaling + from brevitas.quant.solver import WeightQuantSolver + from brevitas.inject.enum import RestrictValueType + + # Derive a Quantizer from the brevitas bases + class Quantizer(NarrowIntQuant, MaxStatsScaling, WeightQuantSolver): + # Configure the quantization bit-width + bit_width = bits + # Signedness of the quantization output + signed = _signed + # Per tensor quantization, not per channel + scaling_per_output_channel = False + # What is this? Copied from PerTensorFloatScaling* + # Probably restricts the scale to be floating-point? + restrict_scaling_type = RestrictValueType.FP + + # Return the derived quantizer configuration + return Quantizer + + +# Derives a bias quantizer from the brevitas bases leaving bit-width and +# signedness configurable +def bias_quantizer(bits, _signed=True): + # Brevitas quantizer base classes + from brevitas.quant import IntBias + + # Derive a Quantizer from the brevitas bases + class Quantizer(IntBias): + # Configure the quantization bit-width + bit_width = bits + # Signedness of the quantization output + signed = _signed + # Do not require the bit-width to be adjusted to fit the accumulator to + # which the bias is added + requires_input_bit_width = False + + # Return the derived quantizer configuration + return Quantizer + + +# Derives an activation quantizer from the brevitas bases leaving bit-width and +# signedness configurable +def act_quantizer(bits, _signed=True): + # Brevitas quantizer base classes + from brevitas.quant.base import IntQuant, ParamFromRuntimePercentileScaling + from brevitas.quant.solver import ActQuantSolver + from brevitas.inject.enum import RestrictValueType + + # Derive a Quantizer from the brevitas bases + class Quantizer( + IntQuant, ParamFromRuntimePercentileScaling, ActQuantSolver + ): + # Configure the quantization bit-width + bit_width = bits + # Signedness 
of the quantization output + signed = _signed + # Per tensor quantization, not per channel + scaling_per_output_channel = False + # What is this? Copied from PerTensorFloatScaling* + # Probably restricts the scale to be floating-point? + restrict_scaling_type = RestrictValueType.FP + + # Return the derived quantizer configuration + return Quantizer + + +# Gets the normalization layer from configuration key +def get_norm(key, normalized_shape): + # Transposes Sequence and Embedding dimensions + class Transpose(torch.nn.Module): + # Forward pass transposing the feature map + def forward(self, x): # noqa: May be static + # Transpose the last two dimensions of batch x seq x emb layout + return torch.transpose(x, dim0=-1, dim1=-2) + + # Dictionary mapping keys to supported normalization layer implementations + norms = { + # PyTorch default layer normalization. Needs to know the shape of the + # feature map to be normalized + "layer-norm": torch.nn.LayerNorm( + # Note: Disable affine parameters as potential negative scale causes + # streamlining issues later + normalized_shape=normalized_shape, elementwise_affine=False + ), + # PyTorch default 1-dimensional batch normalization. Needs to transpose + # embedding and sequence dimension to normalized over the embedding + # dimension, which is expected to be second. + "batch-norm": torch.nn.Sequential( + # Note: Disable affine parameters as potential negative scale causes + # streamlining issues later + Transpose(), torch.nn.LazyBatchNorm1d(affine=False), Transpose() + ), + # No normalization by a PyTorch built-in identity layer. Should not + # appear in the graph. 
+ "none": torch.nn.Identity() + } + + # Select the normalization layer by key + return norms[key] + + +# Gets the attention mask from configuration key and shape +def get_mask(key, length): + # Dictionary mapping keys to supported attention mask implementations + masks = { + # No attention mask + "none": None, + # Generate the upper triangular mask for causal attention + "causal": torch.nn.Transformer.generate_square_subsequent_mask(length), + # Square matrix with entries randomly set to -inf or 0.0 with 50% + # probability each + "random": torch.where( # noqa: Confused by types? + torch.rand(length, length) > 0.5, -torch.inf, 0.0 + ) + } + # Select the mask type by key + return masks[key] + + +# Single-layer scaled dot-product attention block with MLP and normalization +class TransformerBlock(torch.nn.Module): + # Initializes the model and registers the module parameters + def __init__( + self, num_heads, emb_dim, mlp_dim, seq_len, bias, norm, mask, bits + ): + # Initialize the PyTorch Module superclass + super().__init__() + + # Input quantizer to the scaled dot-product attention operations, shared + # by queries, keys and values inputs. It is important to have this + # quantizer separate and not preceding the fork node of the residual + # branches to avoid consecutive quantizers in the skip branch. + # Note: For some reason it seems not to be possible to use the + # in_proj_input_quant of the attention operator + self.sdp_input_quant = QuantIdentity( + # Quantize at the output + act_quant=act_quantizer(bits, _signed=True), + # Pass quantization information on to the next layer. 
+ return_quant_tensor=True + ) + # Quantized scaled dot-product attention operator + self.sdp = QuantMultiheadAttention( + # Size of the embedding dimension (input and output) + embed_dim=emb_dim, + # Number of attention heads + num_heads=num_heads, + # Enable a bias added to the input and output projections + bias=bias, + # Layout of the inputs: + # Batch x Sequence x Embedding (batch-first, True) + # Sequence x Batch x Embedding (batch-second, False) + batch_first=True, + # If query, key and value input are the same, packed input + # projections use a single, large linear projection to produce + # the actual query, key and value inputs. Otherwise, use + # separate linear projections on each individual input. + packed_in_proj=False, + # Brevitas has this as an unsigned quantizer by default, but + # finn can only handle signed quantizer + attn_output_weights_quant=act_quantizer(bits, _signed=True), + # Insert an additional quantizer in front of the softmax. In our + # finn custom-op, this will be matched to the quantizer + # following the query and key matmul. 
+ # Note: Disable to prevent the quantizer from tripping over -inf + # from the attention mask + softmax_input_quant=None, + # Quantize the input projections weights as configured + in_proj_weight_quant=weight_quantizer(bits, _signed=True), + # Quantize the bias of the input projections as configured + in_proj_bias_quant=bias_quantizer(bits, _signed=True), + # No quantization in front of the input projections as this is + # either done by a standalone quantizer preceding the whole block + in_proj_input_quant=None, + + # Quantize the output projections weights as configured + out_proj_weight_quant=weight_quantizer(bits, _signed=True), + # Quantize the bias of the output projections as configured + out_proj_bias_quant=bias_quantizer(bits, _signed=True), + # Quantize the input to the output projection as configured + out_proj_input_quant=act_quantizer(bits, _signed=True), + + # Quantize the key after projections as configured + k_transposed_quant=act_quantizer(bits, _signed=True), + # Quantize the queries after projections as configured + q_scaled_quant=act_quantizer(bits, _signed=True), + # Quantize the values after projection as configured + v_quant=act_quantizer(bits, _signed=True), + + # No output quantization for now, as stacking multiple layers + # results in multiple multi-thresholds in succession + out_proj_output_quant=None, + + # Return the quantization parameters so the next layer can + # quantize the bias + return_quant_tensor=True + ) + # Residual branch addition skipping over the attention layer + self.residual_sdp = QuantEltwiseAdd( + # Shared input activation quantizer such that the scales at both + # input branches are identical. This allows floating point scale + # factor to be streamlined past the add-node. + input_quant=act_quantizer(bits, _signed=True), + # Disable the output quantizer after the add operation. Output of + # the add will have one more bit than the inputs, which is probably + # fine and does not require re-quantization. 
+ output_quant=None, + # Pass quantization information on to the next layer. + return_quant_tensor=True + ) + # Normalization following the attention layer + self.norm_sdp = torch.nn.Sequential( + # Select the normalization layer implementation + get_norm(key=norm, normalized_shape=emb_dim), + # No quantizer to avoid consecutive quantizer in the MLP residual + # branch. See input quantizer in front of the first MLP layer. + ) + + # Quantized MLP following the scaled dot-product attention + self.mlp = torch.nn.Sequential( + # Quantize the inputs to the MLP block. Placed here to not have this + # at the input of the residual branch. + QuantIdentity( + # Quantize at the output + act_quant=act_quantizer(bits, _signed=True), + # Pass quantization information on to the next layer. + return_quant_tensor=True + ), + # First mlp layer projecting to the mlp dimension + QuantLinear( + # Inputs have the size of the attention embedding dimension + emb_dim, + # Project to the configured mlp dimension, which is typically + # larger than the embedding dimension + mlp_dim, + # Enable the learned bias vector + bias=bias, + # Quantize weights to the same representation as all other + # layers + weight_quant=weight_quantizer(bits, _signed=True), + # Quantize the bias to the same representation as all other + # layers + bias_quant=bias_quantizer(bits, _signed=True), + # No input quantizer as this is directly preceded by a + # standalone quantizer + input_quant=None, + # Not output quantizer as this is directly followed by a + # quantized ReLU activation taking care of quantization + output_quant=None, + # Return the quantization parameters so the next layer can + # quantize the bias + return_quant_tensor=True + ), + # Use the ReLU activation function instead of the more commonly used + # GELU, as the latter is not mapped easily to hardware with FINN + QuantReLU( + # Note: ReLU must be quantized to unsigned representation + act_quant=act_quantizer(bits, _signed=False), + # Return the 
quantization parameters so the next layer can + # quantize the bias + return_quant_tensor=True + ), + # Second mlp layer projecting back to the embedding dimension + QuantLinear( + # Inputs have the configured mlp dimension, which is typically + # larger than the embedding dimension + mlp_dim, + # Project back to the size of the attention embedding dimension + emb_dim, + # Enable the learned bias vector + bias=bias, + # Quantize weights to the same representation as all other + # layers + weight_quant=weight_quantizer(bits, _signed=True), + # Quantize the bias to the same representation as all other + # layers + bias_quant=bias_quantizer(bits, _signed=True), + # No input quantizer as the inputs are already quantized by the + # preceding ReLU layer + input_quant=None, + # Not output quantizer as this is directly followed by a + # quantized element-wise addition taking care of quantization + output_quant=None, + # Pass quantization information on to the next layer. + return_quant_tensor=True + ), + ) + # Residual branch addition skipping over the MLP layer + self.residual_mlp = QuantEltwiseAdd( + # Shared input activation quantizer such that the scales at both + # input branches are identical. This allows floating point scale + # factor to be streamlined past the add-node. + input_quant=act_quantizer(bits, _signed=True), + # Disable the output quantizer after the add operation. Output of + # the add will have one more bit than the inputs, which is probably + # fine and does not require re-quantization. + output_quant=None, + # Pass quantization information on to the next layer. + # Note: Not for the last layer to allow this to be combined with + # standard pytorch calls like .detach() or .numpy(), which are + # not directly available on QuantTensor. 
# Quantized sinusoidal positional encoding layer
class QuantSinusoidalPositionalEncoding(torch.nn.Module):
    """Fixed sinusoidal positional encoding (Vaswani et al.) added to the
    input through a quantized elementwise addition."""

    # Initializes the model and registers the module parameters
    def __init__(self, input_quant, output_quant, return_quant_tensor):
        super().__init__()
        # Quantized addition fusing the input and the positional encoding
        self.add = QuantEltwiseAdd(
            # Quantization applied to the input as well as to the encodings
            input_quant=input_quant,
            # Quantization applied to the sum of input and encoding
            output_quant=output_quant,
            # Pass quantization information on to the next layer
            return_quant_tensor=return_quant_tensor,
        )

    # Forward pass adding positional encoding to the input tensor
    def forward(self, x):
        # Derive the encoding size dynamically from the input shape
        _, seq, emb = x.shape
        # Column vector of sequence positions, shape (seq, 1)
        steps = torch.as_tensor([[n] for n in range(seq)])
        # Frequency/wavelength scale per even embedding index, shape (emb//2,)
        freqs = torch.as_tensor(
            [1e4 ** -(k / emb) for k in range(0, emb, 2)]
        )
        # Interleave sine (even dims) and cosine (odd dims) waves
        encoding = torch.empty(seq, emb)
        encoding[:, 0::2] = torch.sin(freqs * steps)
        encoding[:, 1::2] = torch.cos(freqs * steps)
        # Match device and dtype of the input before the quantized addition
        encoding = encoding.to(x.device, dtype=x.dtype)
        return self.add(x, encoding)


# Quantized learned positional encoding layer
class QuantLearnedPositionalEncoding(torch.nn.Module):
    """Trainable positional encoding table added to the input through a
    quantized elementwise addition."""

    # Initializes the model and registers the module parameters
    def __init__(
        self, seq_len, emb_dim, input_quant, output_quant, return_quant_tensor
    ):
        super().__init__()
        # Quantized addition fusing the input and the positional encoding
        self.add = QuantEltwiseAdd(
            input_quant=input_quant,
            output_quant=output_quant,
            return_quant_tensor=return_quant_tensor,
        )
        # Not-quantized, trainable positional encoding table
        self.pos = torch.nn.Parameter(torch.empty(seq_len, emb_dim))
        self.reset_parameters()

    # Resets/Initializes the positional encoding parameter tensor
    def reset_parameters(self):
        # Standard normal initialization (zero mean, unit std)
        torch.nn.init.normal_(self.pos, mean=0, std=1)

    # Forward pass adding positional encoding to the input tensor
    def forward(self, x):
        return self.add(x, self.pos)


# Lazy version of the learned encoding not requiring input dimensions at
# initialization, inferring these at the first forward pass
class LazyQuantLearnedPositionalEncoding(
    torch.nn.modules.lazy.LazyModuleMixin, QuantLearnedPositionalEncoding  # noqa
):
    # Once materialized this module turns into the eager implementation
    cls_to_become = QuantLearnedPositionalEncoding
    # Encoding table stays uninitialized until the first forward pass
    pos: torch.nn.UninitializedParameter

    # Initializes the model and registers the module parameters
    def __init__(self, input_quant, output_quant, return_quant_tensor):
        # Dimensions are unknown here: pass zeros and immediately replace the
        # parameter by an uninitialized placeholder
        super().__init__(0, 0, input_quant, output_quant, return_quant_tensor)
        self.pos = torch.nn.UninitializedParameter()

    # Resets/Initializes the positional encoding parameter tensor
    def reset_parameters(self):
        # Only delegate once the parameter has actually been materialized
        if not self.has_uninitialized_params():
            super().reset_parameters()

    # Materializes the parameter tensor from a sample input's dimensions
    def initialize_parameters(self, x):
        if self.has_uninitialized_params():
            # No gradient information must accumulate from initialization
            with torch.no_grad():
                _, seq, emb = x.shape
                self.pos.materialize((seq, emb))
                self.reset_parameters()


# Quantized binary positional encoding layer
class QuantBinaryPositionalEncoding(torch.nn.Module):
    """Positional encoding filling the embedding dimension with the bipolar
    bit pattern of the sequence position, added via quantized addition."""

    # Initializes the model and registers the module parameters
    def __init__(self, input_quant, output_quant, return_quant_tensor):
        super().__init__()
        # Quantized addition fusing the input and the positional encoding
        self.add = QuantEltwiseAdd(
            input_quant=input_quant,
            output_quant=output_quant,
            return_quant_tensor=return_quant_tensor,
        )

    # Forward pass adding positional encoding to the input tensor
    def forward(self, x):
        # Derive the encoding size dynamically from the input shape
        _, seq, emb = x.shape
        # Embedding dimension e holds bit e of the sequence position n
        bits = torch.as_tensor([
            [(n >> bit) & 1 for bit in range(emb)] for n in range(seq)
        ])
        # Match device and dtype of the input before the quantized addition
        bits = bits.to(x.device, dtype=x.dtype)
        # Add the quantized encoding to the quantized input
        # Note: Convert encoding to bipolar {-1, +1} representation
        return self.add(x, 2 * bits - 1)


# Gets the positional encoding layer from configuration key, quantizers and
# shape
def get_positional_encoding(
    key, input_quant, output_quant, return_quant_tensor
):
    """Constructs and returns the positional encoding selected by key.

    Note: All variants are instantiated eagerly and only the selected one is
    returned — this intentionally mirrors the original behavior (including
    RNG state consumption by the learned variant's initialization).
    """
    encodings = {
        # No positional encoding: just the input quantizer
        "none": QuantIdentity(
            act_quant=input_quant, return_quant_tensor=return_quant_tensor
        ),
        # Fixed, sinusoidal encoding according to Vaswani et al. with added
        # quantizers
        "sinusoidal": QuantSinusoidalPositionalEncoding(
            input_quant, output_quant, return_quant_tensor
        ),
        # Fixed, binary encoding with quantizers
        "binary": QuantBinaryPositionalEncoding(
            input_quant, output_quant, return_quant_tensor
        ),
        # Learned encoding with quantizers, dimensions inferred lazily
        "learned": LazyQuantLearnedPositionalEncoding(
            input_quant, output_quant, return_quant_tensor
        ),
    }
    return encodings[key]


# Unpacks the standard PyTorch tensor from a brevitas QuantTensor
def unpack_from_quant(tensor: "torch.Tensor | QuantTensor"):
    """Returns the plain tensor wrapped by a QuantTensor, or the input
    unchanged if it already is a plain PyTorch tensor."""
    return tensor.value if isinstance(tensor, QuantTensor) else tensor


# Dummy transformer encoder model
class DummyTransformer(torch.nn.Module):
    """Stack of quantized transformer encoder blocks preceded by a
    configurable positional encoding."""

    # Initializes the model and registers the module parameters
    def __init__(
        self,
        # Number of layers of attention blocks
        num_layers,
        # Number of attention heads per block
        num_heads,
        # Size of embedding dimension going into/out of the attention block
        emb_dim,
        # Size of MLP dimension in each attention block
        mlp_dim,
        # Length of the input sequence, i.e., context size
        seq_len,
        # Enables bias term added to Linear layers
        bias,
        # Quantization bit-width applied uniformly to all layers
        bits,
        # Normalization layer type: layer-norm, batch-norm or none
        norm="none",
        # Attention mask type: none, causal or const
        mask="none",
        # Positional encoding type: none, sinusoidal, binary, learned
        positional_encoding="none"
    ):
        super().__init__()
        # Positional encoding layer at the input
        self.pos = get_positional_encoding(
            key=positional_encoding,
            # Quantize the encoding inputs to the model bit-width
            # NOTE(review): act_quantizer is defined earlier in this file
            input_quant=act_quantizer(bits, _signed=True),
            # No extra output quantizer on the sum
            output_quant=None,
            # Pass quantization information on to the next layer
            return_quant_tensor=True,
        )
        # num_layers identical transformer encoder blocks applied in sequence
        self.encoder = torch.nn.Sequential(*(
            TransformerBlock(
                num_heads, emb_dim, mlp_dim, seq_len, bias, norm, mask, bits
            ) for _ in range(num_layers)
        ))

    # Model forward pass taking an input sequence and returning the encoded
    # sequence as a plain tensor
    def forward(self, x):
        # Add positional encoding, feed through the encoder stack and strip
        # the QuantTensor wrapper so the model yields a single plain output
        return unpack_from_quant(self.encoder(self.pos(x)))

### ADAPTED FROM export.py

# Check whether a layer is a normalization layer of some supported type
def is_norm_layer(module):
    """True for all BatchNorm/InstanceNorm variants (via _NormBase) and for
    LayerNorm, which has a separate implementation."""
    return isinstance(
        module,
        (torch.nn.modules.batchnorm._NormBase, torch.nn.LayerNorm),  # noqa
    )
# Fixes export issues of normalization layers with disabled affine parameters.
# Somehow the export to ONNX trips when it encounters the weight and bias
# tensor to be 'None'.
def patch_non_affine_norms(model: torch.nn.Module):  # noqa: Shadows model
    """Patches missing affine parameters of normalization layers in place.

    For every supported normalization layer (see is_norm_layer) that tracks
    running statistics but has affine parameters disabled, installs an
    all-ones weight and an all-zeros bias so the ONNX export does not trip
    over 'None' parameter tensors.

    Returns the (mutated) model container.
    """
    # Iterate all modules in the model container (module names are not needed)
    for module in model.modules():
        # Only normalization layers might require patching
        if is_norm_layer(module):
            # Check whether affine scale parameters are missing
            if hasattr(module, "weight") and module.weight is None:
                # There need to be running statistics to patch the scales
                if hasattr(module, "running_var"):
                    # Patch the affine scale by an all-1 tensor of the same
                    # shape, type and device as the running variance
                    module.weight = torch.nn.Parameter(
                        torch.ones_like(module.running_var)
                    )
            # Check whether affine bias parameters are missing
            if hasattr(module, "bias") and module.bias is None:
                # There need to be running statistics to patch the bias
                if hasattr(module, "running_mean"):
                    # Patch the affine bias by an all-0 tensor of the same
                    # shape, type and device as the running mean
                    # Fix: previously derived from running_var, which only
                    # worked by coincidence of identical shapes
                    module.bias = torch.nn.Parameter(
                        torch.zeros_like(module.running_mean)
                    )
    # Return the patched model container
    return model

# Folding (parallelization and resource style) template consumed by the FINN
# dataflow builder; written verbatim to folding.yaml by step_build
template_folding_yaml = """
# Per operator type default configurations
defaults:
  # Scaled dot-product attention head implemented via HLS
  ScaledDotProductAttention_hls:
    # Type of memory to be used for internal buffer storage
    # Options: auto, block, distributed, ultra
    ram_style: block
    # Type of memory to be used for threshold storage
    # Options: auto, block, distributed
    ram_style_thresholds: block
    # Type of memory to be used for the attention mask (if present)
    # Options: auto, block, distributed
    ram_style_mask: block
    # Resource type to be used for implementing multiplications/MACs
    # Options: auto, lut or dsp
    mac_resource: lut
  # Addition of two inputs (constants or streamed) implemented via HLS
  ElementwiseAdd_hls:
    # Type of memory to be used for internal buffer storage and/or constant
    # parameter tensors
    # Options: auto, block, distributed, ultra
    ram_style: distributed
  # Matrix vector activation unit implemented via HLS
  MVAU_hls:
    # Resource type to be used for implementing multiplications/MACs
    # Options: auto, lut or dsp
    resType: dsp
    # Memory mode for weight storage
    # Options: internal_embedded, internal_decoupled, external
    mem_mode: internal_decoupled
    # Type of memory to be used for weight storage if "internal_decoupled"
    # Options: auto, block, distributed, ultra
    ram_style: block
    # Type of memory to be used for threshold storage
    # Options: auto, block, distributed
    ram_style_thresholds: block
    # Makes weights writeable through AXI-lite interface at runtime
    runtime_writeable_weights: 0
  # Matrix vector activation unit implemented via RTL
  MVAU_rtl:
    # Resource type to be used for implementing multiplications/MACs
    # Options: auto, lut or dsp
    # Note: RTL MVAU currently does not support LUT-based implementation
    resType: dsp
    # Memory mode for weight storage
    # Options: internal_embedded, internal_decoupled, external
    mem_mode: internal_decoupled
    # Type of memory to be used for weight storage if "internal_decoupled"
    # Options: auto, block, distributed, ultra
    ram_style: block
    # Makes weights writeable through AXI-lite interface at runtime
    runtime_writeable_weights: 0
  # Multi-thresholds implemented via HLS (applies to standalone thresholds)
  Thresholding_hls:
    # Memory mode for threshold storage
    # Options: internal_embedded, internal_decoupled
    mem_mode: internal_decoupled
    # Type of memory to be used for threshold storage if "internal_decoupled"
    # Options: distributed, block
    ram_style: distributed
    # Makes thresholds writeable through AXI-lite interface at runtime
    runtime_writeable_weights: 0
  # Multi-thresholds implemented via RTL (applies to standalone thresholds)
  Thresholding_rtl:
    # Decides to use BRAM, URAM or LUTs for threshold memory, depending on the
    # depth of the thresholds
    # Note: This combination forces "distributed" LUT implementation
    depth_trigger_uram: 2147483647  # "infinity"
    depth_trigger_bram: 2147483647  # "infinity"
    # # Note: This combination forces "block" RAM implementation
    # depth_trigger_uram: 0
    # depth_trigger_bram: 1
    # # Note: This combination forces "ultra" RAM implementation
    # depth_trigger_uram: 1
    # depth_trigger_bram: 0
    # # Note: This combination is equivalent to "auto"
    # depth_trigger_uram: 0
    # depth_trigger_bram: 0
    # Makes thresholds writeable through AXI-lite interface at runtime
    runtime_writeable_weights: 0
  # FIFO implemented via RTL (there is no HLS FIFO implementation in FINN)
  StreamingFIFO_rtl:
    # RTL vs. IPI implementation of FIFOs
    # Options: rtl, vivado
    impl_style: rtl
    # Resource type for FIFOs when impl_style is vivado
    # Options: auto, block, distributed, ultra
    ram_style: distributed
# Individual, named node-specific configurations here
# ...
"""
forces "distributed" LUT implementation + depth_trigger_uram: 2147483647 # "infinity" + depth_trigger_bram: 2147483647 # "infinity" + # # Note: This combination forces "block" RAM implementation + # depth_trigger_uram: 0 + # depth_trigger_bram: 1 + # # Note: This combination forces "ultra" RAM implementation + # depth_trigger_uram: 1 + # depth_trigger_bram: 0 + # # Note: This combination is equivalent to "auto" + # depth_trigger_uram: 0 + # depth_trigger_bram: 0 + # Makes thresholds writeable through AXI-lite interface at runtime + runtime_writeable_weights: 0 + # FIFO implemented via RTL (there is no HLS FIFO implementation in FINN) + StreamingFIFO_rtl: + # RTL vs. IPI implementation of FIFOs + # Options: rtl, vivado + impl_style: rtl + # Resource type for FIFOs when impl_style is vivado + # Options: auto, block, distributed, ultra + ram_style: distributed + # Individual, named node-specific configurations here + # ... +""" + +class bench_transformer(bench): + def step_export_onnx(self, output_onnx_path): + # Load the parameters file + #params = dvc.api.params_show("params.yaml") + # Seed all RNGs + seed(self.params["seed"]) + # Make PyTorch behave deterministically if possible + torch.use_deterministic_algorithms(mode=True, warn_only=True) + # Create a model instance from the configuration parameters + #model = DummyTransformer(**params["model"]) + model = DummyTransformer( + num_layers = self.params["model_num_layers"], + num_heads = self.params["model_num_heads"], + emb_dim = self.params["model_emb_dim"], + mlp_dim = self.params["model_mlp_dim"], + seq_len = self.params["model_seq_len"], + bias = self.params["model_bias"], + bits = self.params["model_bits"], + norm = self.params["model_norm"], + mask = self.params["model_mask"], + positional_encoding = self.params["model_positional_encoding"], + ) + + # Get the configured sequence length and embedding dimension to generate + # test inputs + seq, dim = self.params["model_seq_len"], self.params["model_emb_dim"] + 
# No gradient accumulation for calibration passes required + with torch.no_grad(): + # Check whether GPU training is available and select the appropriate + # device + device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') + # Move the model to the training device + model = model.to(device) + # Multiple passes of calibration might be necessary for larger/deep + # models + for _ in trange(0, self.params["calibration_passes"], desc="calibrating"): + # Pass random data through the model to "calibrate" dummy quantizer. + # Large batch to have more calibration samples. Otherwise, there is + # too much deviation between this calibration and the verification + # samples. + model(torch.rand(128, seq, dim, device=device)) + # Move the model back to the CPU + model = model.cpu() + # Prevent export issue for missing affine normalization parameters + model = patch_non_affine_norms(model) + # Switch model to evaluation mode to have it fixed for export + model = model.eval() + # Sample random input tensor in batch-first layout + x = torch.rand(1, seq, dim) + # Compute attention output + o = model(x) + # Save the input and output data for verification purposes later + # TODO: go via self.build_inputs["input_npy_path"] + np.save("inp.npy", x.detach().numpy()) + np.save("out.npy", o.detach().numpy()) + # Export the model graph to QONNX + #export_qonnx(model, (x,), "attention.onnx", **self.params["export"]) + export_qonnx(model, (x,), output_onnx_path, + opset_version = 14, + do_constant_folding = True) + + def step_build(self): + #with open("params.yaml") as file: + # params = yaml.safe_load(file) + # Seed all RNGs + seed(self.params["seed"]) + # Extract sequence length and embedding dimension from parameters + seq_len, emb_dim = self.params["model_seq_len"], self.params["model_emb_dim"] + + # Prepare config files + # TODO: make configurable + # TODO: log intermediate files such as inp.npy, folding.yaml, or specialize_layers.jon as artifacts, maybe create in unique 
temp dirs + specialize_layers_dict = { + "Defaults": { + "preferred_impl_style": ["rtl", ["MVAU", "Thresholding"]] + }, + "": { + "preferred_impl_style": "" + } + } + with open("specialize_layers.json", "w") as f: + json.dump(specialize_layers_dict, f, indent=2) + with open("folding.yaml", "w") as f: + f.write(template_folding_yaml) + + # Create a configuration for building the scaled dot-product attention + # operator to a hardware accelerator + cfg = build_cfg.DataflowBuildConfig( + # Unpack the build configuration parameters + #**params["build"], + output_dir = self.build_inputs["build_dir"], + stitched_ip_gen_dcp = True, + synth_clk_period_ns = self.clock_period_ns, + board = self.board, + shell_flow_type = "vivado_zynq", #TODO: Alveo support + folding_config_file = "folding.yaml", + specialize_layers_config_file = "specialize_layers.json", + standalone_thresholds = True, + max_multithreshold_bit_width = 16, + mvau_wwidth_max = 2048, + split_large_fifos = True, + + verbose = False, # if True prints stdout and stderr to console instead of build_dataflow.log + enable_build_pdb_debug = False, + + generate_outputs=[ + build_cfg.DataflowOutputType.ESTIMATE_REPORTS, + build_cfg.DataflowOutputType.STITCHED_IP, # required for HarnessBuild, OOC_SYNTH, and RTLSIM + #build_cfg.DataflowOutputType.PYNQ_DRIVER, #TODO: currently broken (assert i_consumer.op_type == "StreamingDataflowPartition"), might be useful for functional verification on hw later + #build_cfg.DataflowOutputType.OOC_SYNTH, # requires stitched-ip, not needed because ZynqBuild/HarnessBuild is performed + #build_cfg.DataflowOutputType.BITFILE, # does not require stitched-ip, not needed because HarnessBuild is performed + #build_cfg.DataflowOutputType.RTLSIM_PERFORMANCE, # not possible due to float components + #build_cfg.DataflowOutputType.DEPLOYMENT_PACKAGE # not needed, just a copy operation + ], + + verify_steps=[ + # Verify the model after converting to the FINN onnx dialect + 
build_cfg.VerificationStepType.QONNX_TO_FINN_PYTHON, + # Verify the model again using python mode after the default + # streamlining step + build_cfg.VerificationStepType.STREAMLINED_PYTHON, + # Verify the model again after tidy up transformations, right before + # converting to HLS + build_cfg.VerificationStepType.TIDY_UP_PYTHON, + # Verify the model after generating C++ HLS and applying folding + build_cfg.VerificationStepType.FOLDED_HLS_CPPSIM, + ], + # File with test inputs for verification + verify_input_npy="inp.npy", + # File with expected test outputs for verification + verify_expected_output_npy="out.npy", + # Save the intermediate model graphs + save_intermediate_models=True, + # Avoid RTL simulation for setting the FIFO sizes + auto_fifo_strategy=AutoFIFOSizingMethod.CHARACTERIZE, + # Do not automatically set FIFO sizes as this requires RTL simulation + # not implemented for the attention operator + auto_fifo_depths=False, + # Build steps to execute + steps=[ + # Need to apply some tidy-up transformations before converting to + # the finn dialect of onnx + step_tidy_up_pre_attention, + # Convert all QONNX Quant nodes to Multithreshold nodes + "step_qonnx_to_finn", + # Tidy up the graph after converting from QONNX to FINN format + # Note: Triggers a verification step + "step_tidy_up", + # Positional encoding needs to be streamlined first with slightly + # different order of certain streamlining transformations to avoid + # weird rounding issue of intermediate results + step_streamline_positional, + # Custom streamlining for models containing attention operators + step_streamline_attention, + # Streamlining of the residual branches + step_streamline_residual, + # Streamline the normalization layers, i.e., transposed batch norm + step_streamline_norms, + # Another round using the default streamlining steps + # Note: Triggers a verification step + "step_streamline", + # New conversion of the scaled dot-product attention pattern + 
step_convert_attention_to_hw, + # Another tidy-up step to remove unnecessary dimensions and + # operations after converting the attention operators to HLS + step_tidy_up_post_attention, + # Convert the elementwise binary operations to hardware operators. + # These include for example adding residual branches and positional + # encoding + step_convert_elementwise_binary_to_hw, + # Convert the Gather layer realizing the input token embedding to + # the FINN hardware implementation, i.e., the Lookup layer + step_convert_lookup_to_hw, + # Properly replicate the stream feeding the query, key and value + # projections + step_replicate_streams, + # Convert most other layers supported by FINN to HW operators + "step_convert_to_hw", + # Specialize HW layer implementations as either HLS or RTL + "step_specialize_layers", + "step_create_dataflow_partition", + # Set the folding configuration to meet the cycles per sequence + # target + set_target_parallelization(seq_len, emb_dim), + # Apply folding configuration, specifying hardware implementation + # details + # Note: This triggers a verification step + step_apply_folding_config, + "step_minimize_bit_width", + # The ScaledDotProductAttention custom op does not define any + # estimates + "step_generate_estimate_reports", + "step_hw_codegen", + "step_hw_ipgen", + # Set the attention- and residual-related FIFO depths insert FIFOs + # and apply folding configuration once again + # Note: Implement all FIFOs with a depth at least as deep as the + # sequence length in URAM. 
+ set_fifo_depths(seq_len, emb_dim, uram_threshold=seq_len), + # Run additional node-by-node verification in RTL simulation of the + # model before creating the stitched IP + # Note: end-to-end verification of the stitched IP in RTL simulation + # is still not possible due to missing float IPs + node_by_node_cppsim, + # Only for debugging for now, does not work if "vivado" style + # StreamingFIFOs are used + # node_by_node_rtlsim, + + test_step_insert_tlastmarker, # required for instrumentation_wrapper + + "step_create_stitched_ip", + + # "step_measure_rtlsim_performance", # not possible due to float components + + step_synth_harness, #TODO: replace with instr wrapper (or port it into this step) + + #"step_out_of_context_synthesis", # for synthesis results (e.g. utilization) + + # normal deployment TODO: replace with instr wrapper (or port it into this step as an option) + #"step_synthesize_bitfile", + #"step_make_pynq_driver", + #"step_deployment_package", + + #test_step_gen_vitis_xo, # preparation step for original instr wrapper integration + #test_step_gen_instrumentation_wrapper, # preparation step for original instr wrapper integration + + #test_step_gen_instrwrap_sim, # preparation step for simulation of original instr wrapper integration + #test_step_run_instrwrap_sim, # simulation with instr wrapper, disabled for now due to extreme runtime + + #test_step_export_xo, # preparation step for original instr wrapper integration + #test_step_build_platform # synthesis with instr wrapper + ] + ) + # Run the build process on the dummy attention operator graph + # TODO: maybe let this function return the cfg only, so it can be modified by bench context + build.build_dataflow_cfg(self.build_inputs["onnx_path"], cfg) + + def run(self): + self.steps_full_build_flow() + + # DEBUG code for live logging of long instr wrapper simulation: + # live_log_dir_path = os.path.join(self.save_dir, "vivado_sim_log", "run_%d" % (self.run_id), "vivado.log") + # 
os.makedirs(os.path.join(self.save_dir, "vivado_sim_log", "run_%d" % (self.run_id)), exist_ok=True) + # sim_output_dir = build_dir + "/instrwrap_sim" + # # Prepare bash script + # bash_script = os.getcwd() + "/run_vivado_sim.sh" + # with open(bash_script, "w") as script: + # script.write("#!/bin/bash\n") + # script.write("cd %s\n"%(sim_output_dir)) + # script.write("vivado -mode batch -source make_instrwrap_sim_proj.tcl &> %s\n"%(live_log_dir_path)) + # # Run script + # print("Running Vivado simulation of instrumentation wrapper") + # sub_proc = subprocess.Popen(["bash", bash_script]) + # sub_proc.communicate() + ####### diff --git a/benchmarking/dut/transformer_custom_steps.py b/benchmarking/dut/transformer_custom_steps.py new file mode 100644 index 0000000000..d28a4c501a --- /dev/null +++ b/benchmarking/dut/transformer_custom_steps.py @@ -0,0 +1,878 @@ +# ADAPTED FROM Christoph's attention-dummy build_steps.py + +# Copies (deep-copies) python objects +import copy +# Numpy for loading and comparing the verification input/output +import numpy as np +# YAML for loading experiment configurations +import yaml +# QONNX wrapper of ONNX model graphs +from qonnx.core.modelwrapper import ModelWrapper +# QONNX quantization data types +from qonnx.core.datatype import DataType +# Converts ONNX graph nodes to QONNX custom-ops if possible +from qonnx.custom_op.registry import getCustomOp +# QONNX graph transformations for renaming and cleaning up +from qonnx.transformation.general import ( + Transformation, + GiveUniqueNodeNames, + GiveReadableTensorNames, + RemoveUnusedTensors, + RemoveStaticGraphInputs, + GiveUniqueParameterTensors, + ConvertDivToMul, + ConvertSubToAdd +) +# Converts BatchNorm operation to affine transformation +from qonnx.transformation.batchnorm_to_affine import BatchNormToAffine +# QONNX graph transformations for inferring datatypes and shapes +from qonnx.transformation.infer_datatypes import InferDataTypes +from qonnx.transformation.infer_shapes import 
InferShapes +from qonnx.transformation.infer_data_layouts import InferDataLayouts +# QONNX cleanup transformations +from qonnx.transformation.remove import RemoveIdentityOps +# Precompute constant output nodes +from qonnx.transformation.fold_constants import FoldConstants +# Streamlining transformation: This is a collection of various transformations +from finn.transformation.streamline import ( + ConvertSignToThres, RoundAndClipThresholds +) +# Fuse/Absorb operations +from finn.transformation.streamline.absorb import ( + AbsorbAddIntoMultiThreshold, + AbsorbSignBiasIntoMultiThreshold, + FactorOutMulSignMagnitude, + AbsorbMulIntoMultiThreshold, + Absorb1BitMulIntoMatMul, + Absorb1BitMulIntoConv +) +# Reorder operations +from finn.transformation.streamline.reorder import ( + MoveMulPastFork, + MoveLinearPastFork, + MoveTransposePastFork, + MoveLinearPastEltwiseAdd, + MoveScalarLinearPastInvariants, + MoveTransposePastEltwise, + MoveMulPastMaxPool, + MoveAddPastMul, + MoveScalarAddPastMatMul, + MoveAddPastConv, + MoveScalarMulPastMatMul, + MoveScalarMulPastConv, +) +# Collapse consecutive operations of the same type +from finn.transformation.streamline.collapse_repeated import ( + CollapseRepeatedMul, + CollapseRepeatedTranspose, + CollapseRepeatedAdd +) +# FINN transformation converting ONNX nodes to hardware custom operators +from finn.transformation.fpgadataflow.convert_to_hw_layers import ( + InferElementwiseBinaryOperation, + InferLookupLayer +) +# Remove some operations without real effect +from finn.transformation.streamline.remove import ( + RemoveIdentityTranspose, + RemoveIdentityReshape +) +# Cleanup transformation getting rid of 3d data layout +from finn.transformation.squeeze import Squeeze +# Detects the attention pattern and converts to hardware custom op +from finn.transformation.fpgadataflow.attention import ( + InferScaledDotProductAttention, + AbsorbMultiThresholdIntoScaledDotProductAttention +) +# Mult-Head Attention support +from 
finn.transformation.fpgadataflow.attention_heads import ( + InferMultiHeads, + MoveSplitMultiHeadsPastMultiThreshold, + UnrollMultiHeadAttention, + MoveMergeMultiHeadsPastMultiThreshold +) +# Stream replication for outputs with multiple consumers +from finn.transformation.fpgadataflow.replicate_stream import ( + InferReplicateStream +) +# Inserts data-width converter and FIFO nodes into the model graph +from finn.transformation.fpgadataflow.insert_dwc import InsertDWC +from finn.transformation.fpgadataflow.insert_fifo import InsertFIFO +# Splitting and removing of FIFOs from the model graph +from finn.transformation.fpgadataflow.set_fifo_depths import ( + RemoveShallowFIFOs, + SplitLargeFIFOs, +) +# Specializes each layer's implementation style: HLS or RTL implementation +from finn.transformation.fpgadataflow.specialize_layers import SpecializeLayers +# FINN dataflow builder configuration +from finn.builder.build_dataflow_config import ( + VerificationStepType, DataflowBuildConfig +) +# Graph transformation setting the folding, i.e., parallelization configuration +from finn.transformation.fpgadataflow.set_folding import SetFolding +# FINN verification after build/graph transformation steps +from finn.builder.build_dataflow_steps import verify_step + +# Transformations preparing the operators for synthesis and simulation +from finn.transformation.fpgadataflow.prepare_cppsim import PrepareCppSim +from finn.transformation.fpgadataflow.compile_cppsim import CompileCppSim +from finn.transformation.fpgadataflow.hlssynth_ip import HLSSynthIP +from finn.transformation.fpgadataflow.prepare_ip import PrepareIP +from finn.transformation.fpgadataflow.set_exec_mode import SetExecMode +from finn.transformation.fpgadataflow.prepare_rtlsim import PrepareRTLSim + +# Execute onnx model graphs from the dataflow parent for verification +from finn.util.test import execute_parent + + +# Composes graph transformations such that each individual transformation as +# well as the whole 
# Composes graph transformations such that each individual transformation as
# well as the whole sequence is applied exhaustively
class ComposedTransformation(Transformation):
    """Runs each transformation until it reaches a fixed point, cleaning up
    the graph in between, and reports whether anything changed so the whole
    sequence can itself be reapplied exhaustively."""

    def __init__(self, transformations: list[Transformation]):
        super().__init__()
        # Sequence of transformations executed in order by apply()
        self.transformations = transformations

    def apply(self, model: ModelWrapper):  # noqa
        # Tracks whether any transformation modified the graph at all
        graph_modified = False
        for transformation in self.transformations:
            # Work on a deep copy to mimic ModelWrapper.transform() behavior
            model = copy.deepcopy(model)
            # Exhaustively reapply this one transformation until it reports
            # no further modification (it runs at least once)
            changed = True
            while changed:
                model, changed = transformation.apply(model)
                graph_modified = graph_modified or changed
            # Built-in cleanup of the ModelWrapper
            model.cleanup()
            # Extra cleanup keeping the graph tidy, names readable/ordered
            # and datatype annotations current after every transformation
            model = model.transform(RemoveIdentityOps())
            model = model.transform(GiveUniqueNodeNames())
            model = model.transform(GiveReadableTensorNames())
            model = model.transform(InferDataTypes())
        # Reporting graph_modified=True makes the caller reapply the whole
        # sequence until a global fixed point is reached
        return model, graph_modified
# Custom Streamlining transformation: Similar to the built-in transformations
# but exhaustively reapplied until none of the transformations can be applied
# anymore.
def Streamline():  # noqa: Uppercase
    """Builds the exhaustive streamlining pipeline; order is significant."""
    return ComposedTransformation([
        ConvertSubToAdd(),
        ConvertDivToMul(),
        BatchNormToAffine(),
        ConvertSignToThres(),
        MoveMulPastMaxPool(),
        AbsorbSignBiasIntoMultiThreshold(),
        MoveScalarLinearPastInvariants(),
        MoveAddPastMul(),
        MoveScalarAddPastMatMul(),
        MoveAddPastConv(),
        MoveScalarMulPastMatMul(),
        MoveScalarMulPastConv(),
        MoveAddPastMul(),
        CollapseRepeatedAdd(),
        CollapseRepeatedMul(),
        MoveMulPastMaxPool(),
        AbsorbAddIntoMultiThreshold(),
        FactorOutMulSignMagnitude(),
        AbsorbMulIntoMultiThreshold(),
        Absorb1BitMulIntoMatMul(),
        Absorb1BitMulIntoConv(),
        RoundAndClipThresholds(),
    ])


# Function running transformations necessary to clean up models containing
# attention operators
def step_tidy_up_pre_attention(model: ModelWrapper, _):
    """Annotates shapes/datatypes and removes graph clutter before any
    attention-specific transformation runs."""
    # Shape/datatype annotations, cleanup and constant folding, followed by
    # removal of redundant shape/layout operations; layout annotations are
    # required by the Quant-to-MultiThreshold conversion to find the correct
    # output channel dimension
    for transformation in (
        InferDataTypes(),  # noqa: Duplicate
        InferShapes(),
        GiveUniqueNodeNames(),
        GiveReadableTensorNames(),
        RemoveStaticGraphInputs(),
        RemoveUnusedTensors(),
        GiveUniqueParameterTensors(),
        FoldConstants(),
        RemoveIdentityReshape(),
        RemoveIdentityTranspose(),
        InferDataLayouts(),
    ):
        model = model.transform(transformation)
    return model
# Variant of streamlining transformations adapted to attention operators
def step_streamline_attention(model: ModelWrapper, cfg: DataflowBuildConfig):
    """Streamlines models containing attention operators."""
    # Standard streamlining enclosed by MoveLinearPastFork: moving linear ops
    # past fork nodes (not part of FINN's standard streamlining) unlocks more
    # streamlining, so the whole pattern is applied exhaustively.
    model = model.transform(ComposedTransformation([
        Streamline(),
        MoveLinearPastFork(),
        Streamline(),
    ]))
    # Optionally verify the streamlined model on sample inputs
    if (VerificationStepType.STREAMLINED_PYTHON in
            cfg._resolve_verification_steps()):  # noqa
        verify_step(
            model, cfg, "streamlined_attention_python", need_parent=False
        )
    # Return the streamlined model
    return model
# Streamlining transformations to be applied to residual branches
def step_streamline_residual(model: ModelWrapper, cfg: DataflowBuildConfig):
    """Streamlines the residual branches of the model."""
    # One round of this pattern is needed per residual block; wrapping it in
    # ComposedTransformation applies it exhaustively, so arbitrarily many
    # consecutive residual blocks are handled.
    model = model.transform(ComposedTransformation([
        # Move scale factors past the elementwise add joining the branches
        MoveLinearPastEltwiseAdd(),
        MoveLinearPastFork(),
        MoveScalarLinearPastInvariants(),
        # Follow up with the normal streamlining flow once again
        Streamline(),
    ]))
    # Optionally verify the streamlined model on sample inputs
    if (VerificationStepType.STREAMLINED_PYTHON in
            cfg._resolve_verification_steps()):  # noqa
        verify_step(
            model, cfg, "streamlined_residual_python", need_parent=False
        )
    # Return the streamlined model
    return model
# Streamlining transformation to be applied to the normalization layers
def step_streamline_norms(model: ModelWrapper, cfg: DataflowBuildConfig):
    """Streamlines the (transposed) normalization layers of the model."""
    # One round of this pattern is needed per normalization block; the
    # ComposedTransformation reapplies it exhaustively for arbitrarily many
    # consecutive blocks.
    model = model.transform(ComposedTransformation([
        # Move transposes past the scale-bias operator of transposed batch
        # normalization so adjacent transposes end up next to each other ...
        MoveTransposePastEltwise(),
        # ... where they can be collapsed ...
        CollapseRepeatedTranspose(),
        # ... and removed once they cancel each other out
        RemoveIdentityTranspose(),
        # Nested exhaustive pattern: transposes may now accumulate in front
        # of fork nodes
        ComposedTransformation([
            MoveTransposePastFork(),
            MoveTransposePastEltwise(),
            CollapseRepeatedTranspose(),
            RemoveIdentityTranspose(),
        ]),
        # Normalization scale and bias may have accumulated in front of
        # transpose or fork nodes by now
        MoveLinearPastEltwiseAdd(),
        MoveLinearPastFork(),
        MoveScalarLinearPastInvariants(),
        # Streamline enclosed by MoveLinearPastFork: moving linear ops past
        # certain fork nodes enables more streamlining transformations
        Streamline(),
        MoveLinearPastFork(),
        Streamline(),
    ]))
    # Optionally verify the streamlined model on sample inputs
    if (VerificationStepType.STREAMLINED_PYTHON in
            cfg._resolve_verification_steps()):  # noqa
        verify_step(model, cfg, "streamlined_norms_python", need_parent=False)
    # Return the streamlined model
    return model
# Streamlining transformation to be applied to the positional encoding layer
def step_streamline_positional(model: ModelWrapper, cfg: DataflowBuildConfig):
    """Streamlines the quantized positional encoding of the model."""
    # The division in front of the quantized positional encoding is exactly
    # the inverse of the multiplication in front of it (these are the matching
    # scale factors of the shared input quantizer): convert it to a
    # multiplication ...
    model = model.transform(ConvertDivToMul())
    # ... so the two quantization scales can be merged here
    model = model.transform(CollapseRepeatedMul())
    # Push scalar multiplications (probably quantizer scale factors) into the
    # branches of a fork
    model = model.transform(MoveMulPastFork())
    # Optionally verify the streamlined model on sample inputs
    if (VerificationStepType.STREAMLINED_PYTHON in
            cfg._resolve_verification_steps()):  # noqa
        verify_step(
            model, cfg, "streamlined_positional_python", need_parent=False
        )
    # Return the streamlined model
    return model


# Function running the InferScaledDotProductAttention transformation
def step_convert_attention_to_hw(model: ModelWrapper, _):
    """Maps attention and multi-head patterns to hardware operators."""
    conversion_steps = [
        # Infer the reshaping of attention heads
        InferMultiHeads(),  # noqa: Duplicate
        # Move the multi-head splitting past the multi thresholds ...
        MoveSplitMultiHeadsPastMultiThreshold(),
        # ... which might enable absorbing adds into thresholds once again
        AbsorbAddIntoMultiThreshold(),
        # Infer the ScaledDotProductAttention custom op
        InferScaledDotProductAttention(),
        # Parallelize attention heads in the onnx graph
        UnrollMultiHeadAttention(),
        # Swap the order of merging the multi heads and applying thresholds
        MoveMergeMultiHeadsPastMultiThreshold(),
        # If applicable, absorb the final thresholds into attention
        AbsorbMultiThresholdIntoScaledDotProductAttention(),
    ]
    for transformation in conversion_steps:
        model = model.transform(transformation)
    # Return the model with attention and multi-heads mapped to hardware
    return model
# Function running the transformations to convert elementwise binary
# operations to their hardware implementations
def step_convert_elementwise_binary_to_hw(model: ModelWrapper, _):
    """Converts elementwise binary operations to hardware operators."""
    # The final (de-quantizing) Mul at the output is deliberately rejected
    return model.transform(InferElementwiseBinaryOperation(
        InferElementwiseBinaryOperation.reject_output_dequant
    ))


# Function running the transformations to convert Gather, i.e., index lookup,
# nodes to their hardware implementations
def step_convert_lookup_to_hw(model: ModelWrapper, _):
    """Converts Gather (index lookup) nodes to hardware Lookup layers."""
    for node in model.graph.node:
        # Only Gather nodes need their index input annotations forced
        if node.op_type != "Gather":
            continue
        # Force the index input to unsigned 64-bit integers for now
        model.set_tensor_datatype(node.input[1], DataType["UINT64"])
        # Force the ONNX container datatype of the index input to float
        value_info = model.get_tensor_valueinfo(node.input[1])
        value_info.type.tensor_type.elem_type = 1
    # Convert the annotated Gather nodes to Lookup layers
    return model.transform(InferLookupLayer())


# Function running the InferReplicateStream transformation
def step_replicate_streams(model: ModelWrapper, _):
    """Properly replicates forked streams, e.g. the stream feeding the query,
    key and value projections."""
    return model.transform(InferReplicateStream())
# Post-processing tidy-up squeezing dimensions and identity operators left
# over from mapping the attention operators
def step_tidy_up_post_attention(model: ModelWrapper, _):
    """Tidies up the model after attention operators were mapped to hardware."""
    # Remove size-1 (single batch) dimensions and identity transposes
    model = model.transform(Squeeze())
    model = model.transform(RemoveIdentityTranspose())
    # Squeezing might enable absorbing adds into thresholds once again ...
    model = model.transform(AbsorbAddIntoMultiThreshold())
    # ... and thresholds into attention (e.g. after a transpose was squeezed)
    model = model.transform(AbsorbMultiThresholdIntoScaledDotProductAttention())
    # Squeezing might enable some more streamlining once again
    model = model.transform(ComposedTransformation([
        # Move scale factors past elementwise add nodes on the residuals
        MoveLinearPastEltwiseAdd(),
        MoveLinearPastFork(),
        MoveScalarLinearPastInvariants(),
        # Do the normal streamlining flow once again
        Streamline(),
    ]))
    # Clean up the names for debugging
    model = model.transform(GiveUniqueNodeNames())
    model = model.transform(GiveReadableTensorNames())
    # Return the tidied up model
    return model


# Custom step for setting the parallelism to meet the target of T^2 cycles per
# sequence
def set_target_parallelization(seq_len: int, emb_dim: int):  # noqa: emb_dim
    """Generates a build step configuring parallelization for a T^2 (i.e.
    seq_len^2) cycles per sequence target."""
    # The wrapping function is a generator; this is the actual build step
    def step_set_target_parallelization(
        model: ModelWrapper, cfg: DataflowBuildConfig
    ):
        # Attention operators are currently not handled by SetFolding, so
        # configure them explicitly here
        for node in model.graph.node:
            if node.op_type == "ScaledDotProductAttention_hls":
                inst = getCustomOp(node)
                # Fully parallel along the embedding dimension, fully
                # sequential along the sequence dimension: T^2 cycles
                inst.set_nodeattr("EmbFold", 1)
                inst.set_nodeattr("SeqFold", seq_len)
        # Built-in folding for all other layers with the T^2 cycles target
        model = model.transform(SetFolding(
            seq_len ** 2, cfg.mvau_wwidth_max, cfg.folding_two_pass_relaxation
        ))
        # TODO: Extract the folding configuration
        # Return the model with configured parallelization
        return model

    # Return the wrapped build step function
    return step_set_target_parallelization
# Applies a configuration dictionary to the model graph: per-op-type options
# under the reserved "defaults" key first, then per-node options keyed by the
# node name (overriding the defaults).
class ApplyConfig(Transformation):
    # Initializes the transformation with the configuration dictionary
    def __init__(self, config):
        # Initialize the transformation base class
        super().__init__()
        # Register the configuration dictionary to be used in apply()
        self.config = config

    # Applies the transform to a whole model graph
    def apply(self, model: ModelWrapper):  # noqa
        # Get the model graph out of the model wrapper object
        graph = model.graph
        # Per-op-type defaults; tolerate configurations without a "defaults"
        # section instead of raising KeyError
        defaults = self.config.get("defaults", {})
        # Iterate all nodes in the graph
        for node in graph.node:
            # "defaults" is reserved as the defaults-section key and must not
            # collide with a node name. Raise instead of assert so the check
            # survives running python with -O.
            if node.name == "defaults":
                raise ValueError("Node has reserved name 'defaults'")
            # Convert this to the custom-op instance for easy access to node
            # attributes
            inst = getCustomOp(node)
            # Apply the per operator type default configurations first ...
            for key, value in defaults.get(node.op_type, {}).items():
                inst.set_nodeattr(key, value)
            # ... then the node-specific configuration, potentially
            # overriding the defaults set above
            for key, value in self.config.get(node.name, {}).items():
                inst.set_nodeattr(key, value)
        # Note: Not considered a graph modification; this does not have to be
        # reapplied multiple times
        return model, False
# Custom build step trying to set appropriate FIFO sizes for the transformer
def set_fifo_depths(
    seq_len: int, emb_dim: int, uram_threshold: int = 32  # noqa: emb_dim
):
    """Generates a build step setting manual FIFO depths for the transformer.

    FIFOs at least uram_threshold deep are implemented via the vivado
    strategy in URAM.
    """
    # The wrapping function is a generator; this is the actual build step
    def step_set_fifo_depths(model: ModelWrapper, cfg: DataflowBuildConfig):
        # First pass: annotate the desired FIFO depths on every node
        for node in model.graph.node:
            inst = getCustomOp(node)
            # Current FIFO depth configuration of this node
            in_depths = inst.get_nodeattr("inFIFODepths")
            out_depths = inst.get_nodeattr("outFIFODepths")
            num_inputs, num_outputs = len(node.input), len(node.output)
            # Expand a single default depth to one entry per input/output to
            # avoid later problems with too few FIFO depths specified
            if in_depths == [2] and num_inputs > 1:
                in_depths = num_inputs * [2]
            if out_depths == [2] and num_outputs > 1:
                out_depths = num_outputs * [2]
            # Attention needs each folded input stream buffered completely
            # TODO: Not exactly sure whether this is always correct or just
            # the worst-case
            if node.op_type == "ScaledDotProductAttention_hls":
                in_depths = [
                    inst.get_number_input_values(i) for i in range(num_inputs)
                ]
                # Note: No special treatment of the output FIFO
            # Residual adds joining two branches must buffer their inputs to
            # avoid deadlocks if one branch runs faster/slower
            if node.op_type == "ElementwiseAdd_hls":
                # Only relevant for join-node operations actually consuming
                # two branches, potentially operating at different rates
                if model.is_join_node(node):
                    # Buffer as many cycles as the T^2 per-sequence target of
                    # the attention operations
                    # TODO: Not exactly sure whether this is always correct
                    # or just the worst-case
                    # TODO: No reliable way yet to tell which branch is the
                    # longer/deeper one, so a buffer cannot be placed only on
                    # the shorter branch
                    in_depths = [seq_len ** 2, seq_len ** 2]
                    # Note: No special treatment of the output FIFO
            # Write back the updated FIFO depths attributes
            inst.set_nodeattr("inFIFODepths", in_depths)
            inst.set_nodeattr("outFIFODepths", out_depths)

        # The following partially mirrors (or even copies from) the built-in
        # step_set_fifo_depths using only manual FIFO depths and our
        # YAML-based folding configuration.

        # Insert data-width converters, then FIFOs between all operators
        # (shallow, depth-2 FIFOs wherever no other depth is specified)
        model = model.transform(InsertDWC())
        model = model.transform(InsertFIFO(create_shallow_fifos=True))
        # Specialize the implementation variant of the newly added FIFO
        # layers and restore readable names
        model = model.transform(
            SpecializeLayers(cfg._resolve_fpga_part())  # noqa: Access _ method
        )
        model = model.transform(GiveUniqueNodeNames())
        model = model.transform(GiveReadableTensorNames())

        # Apply the YAML folding configuration, if one is given
        if cfg.folding_config_file is not None:
            # Load the configuration dictionary from the YAML file
            with open(cfg.folding_config_file, "r") as file:
                config = yaml.safe_load(file)
            # Unique node names are required to match per-node options
            model = model.transform(GiveUniqueNodeNames())
            model = model.transform(ApplyConfig(config))

        # Second pass over the graph to modify the inserted FIFOs
        # Note: This overwrites the folding configuration...
        # TODO: Find a better way to handle this
        for node in model.graph.node:
            if node.op_type == "StreamingFIFO_rtl":
                inst = getCustomOp(node)
                # Non-shallow FIFOs are implemented via the vivado strategy
                # in URAM
                if inst.get_nodeattr("depth") >= uram_threshold:
                    inst.set_nodeattr("impl_style", "vivado")
                    inst.set_nodeattr("ram_style", "ultra")

        # Hardware attributes to be extracted from each node
        hw_attrs = {
            "PE",
            "SIMD",
            "parallel_window",
            "ram_style",
            "ram_style_thresholds",
            "ram_style_mask",
            "depth",
            "impl_style",
            "resType",
            "mac_resource",
            "mem_mode",
            "runtime_writeable_weights",
            "inFIFODepths",
            "outFIFODepths",
            "depth_trigger_uram",
            "depth_trigger_bram",
        }

        # Collect the final configuration from the model graph
        config = {"defaults": {}}
        for node in model.graph.node:
            inst = getCustomOp(node)
            # Prepare the node-specific configuration entry for this node
            config[node.name] = {}
            for key in hw_attrs:
                # Attributes may be absent for some nodes/op-types, which is
                # signaled via AttributeError
                try:
                    config[node.name][key] = inst.get_nodeattr(key)
                except AttributeError:
                    # Can be safely ignored: nothing to restore later
                    pass
            # Drop empty per-node entries from the configuration dictionary
            if not config[node.name]:
                del config[node.name]

        # Persist the collected configuration as YAML for later reuse
        with open(cfg.output_dir + "/final_hw_config.yaml", "w") as file:
            yaml.safe_dump(config, file)

        # FIFO splitting and shallow FIFO removal only after the final config
        # file has been written: these transforms may add/remove FIFOs and
        # would otherwise cause name mismatches when reusing the config
        if cfg.split_large_fifos:
            model = model.transform(SplitLargeFIFOs())
        model = model.transform(RemoveShallowFIFOs())

        # Call PrepareIP and HLSSynthIP again; this only runs for the new
        # nodes (e.g. FIFOs and DWCs)
        model = model.transform(
            PrepareIP(
                cfg._resolve_fpga_part(), cfg._resolve_hls_clk_period()  # noqa
            )
        )
        model = model.transform(HLSSynthIP())

        # Return the model with configured FIFO depths
        return model

    # Return the wrapped build step function
    return step_set_fifo_depths
# Custom step applying our custom format of folding configuration to the graph
def step_apply_folding_config(model: ModelWrapper, cfg: DataflowBuildConfig):
    """Applies the YAML folding configuration and optionally verifies the
    folded model via C++ simulation."""
    # Only applies if a configuration file is given
    if cfg.folding_config_file is not None:
        # Load the configuration dictionary from the YAML file
        with open(cfg.folding_config_file, "r") as file:
            config = yaml.safe_load(file)
        # Unique node names are required to match per-node options
        model = model.transform(GiveUniqueNodeNames())
        # Apply the configuration dictionary to the model graph
        model = model.transform(ApplyConfig(config))
    # If configured, run a verification of the transformed model on some
    # sample inputs
    if (VerificationStepType.FOLDED_HLS_CPPSIM in
            cfg._resolve_verification_steps()):  # noqa
        # Prepare C++ simulation for verification
        model = model.transform(PrepareCppSim())
        model = model.transform(CompileCppSim())
        model = model.transform(SetExecMode("cppsim"))
        # Execute a verification step with inputs from the build config
        verify_step(model, cfg, "folded_hls_cppsim", need_parent=True)
    # Return model with configuration applied
    return model


# Runs a node-by-node Python simulation of the model saving the full execution
# context
# Note: Assumes no execution mode to be set
def node_by_node_python(model: ModelWrapper, cfg: DataflowBuildConfig):
    """Verifies the model via node-by-node Python execution, saving the full
    execution context; returns the original model unmodified."""
    # Save the original model; a copy is used for the simulation
    original = model
    model = copy.deepcopy(model)

    # Load the verification input/output pair
    inp = np.load(cfg.verify_input_npy)  # noqa
    out = np.load(cfg.verify_expected_output_npy)

    # Path to the parent model wrapping the streaming dataflow partition and
    # the wrapped child model, i.e., the inside of the partition
    parent = f"{cfg.output_dir}/intermediate_models/dataflow_parent.onnx"
    # Fix: use a python-specific child model file; previously this wrote to
    # "verify_cppsim.onnx", clobbering the cppsim verification artifact
    child = f"{cfg.output_dir}/intermediate_models/verify_python.onnx"
    # Save the child model prepared for python simulation
    model.save(child)
    # Load the parent model to pass to verification execution
    parent_model = ModelWrapper(parent)

    # Reshape the input/output to match the model
    inp = inp.reshape(parent_model.get_tensor_shape(model.graph.input[0].name))
    out = out.reshape(parent_model.get_tensor_shape(model.graph.output[0].name))

    # Execute the onnx model collecting the full execution context
    context = execute_parent(parent, child, inp, return_full_ctx=True)
    # Extract the output tensor from the execution context
    model_out = context[parent_model.graph.output[0].name]
    # Compare expected against simulated output
    result = {True: "SUCCESS", False: "FAIL"}[
        np.allclose(out, model_out, atol=1e-3)
    ]
    # Save the verification outputs into the configured build directory
    verification_output = f"{cfg.output_dir}/verification_output/"
    np.savez(f"{verification_output}/verify_python_{result}.npz", **context)
    # Return the original, unmodified model
    return original
# Runs a node-by-node C++ simulation of the model saving the full execution
# context
def node_by_node_cppsim(model: ModelWrapper, cfg: DataflowBuildConfig):
    """Verifies the model via node-by-node C++ simulation, saving the full
    execution context; returns the original model unmodified."""
    # Keep the original; all simulation preparation happens on a copy
    original = model
    model = copy.deepcopy(model)
    # Prepare the copy for C++ simulation: set the execution mode, then
    # generate and compile the C++ sources
    model = model.transform(SetExecMode("cppsim"))
    model = model.transform(GiveUniqueNodeNames())
    model = model.transform(PrepareCppSim())
    model = model.transform(CompileCppSim())

    # Verification input/output pair from the build configuration
    inp = np.load(cfg.verify_input_npy)  # noqa
    out = np.load(cfg.verify_expected_output_npy)

    # The parent model wraps the streaming dataflow partition; the child
    # model is the inside of the partition
    parent = f"{cfg.output_dir}/intermediate_models/dataflow_parent.onnx"
    child = f"{cfg.output_dir}/intermediate_models/verify_cppsim.onnx"
    # Save the child model prepared for C++ simulation
    model.save(child)
    # Load the parent model to pass to verification execution
    parent_model = ModelWrapper(parent)

    # Reshape the verification pair to match the model interface
    inp = inp.reshape(parent_model.get_tensor_shape(model.graph.input[0].name))
    out = out.reshape(parent_model.get_tensor_shape(model.graph.output[0].name))

    # Execute the onnx model collecting the full execution context
    context = execute_parent(parent, child, inp, return_full_ctx=True)
    # Extract the output tensor and compare it against the expectation
    model_out = context[parent_model.graph.output[0].name]
    result = "SUCCESS" if np.allclose(out, model_out, atol=1e-3) else "FAIL"
    # Save the verification context into the configured build directory
    verification_output = f"{cfg.output_dir}/verification_output/"
    np.savez(f"{verification_output}/verify_cppsim_{result}.npz", **context)
    # Return the original, unmodified model
    return original
# Runs a node-by-node RTL simulation of the model saving the full execution
# context
def node_by_node_rtlsim(model: ModelWrapper, cfg: DataflowBuildConfig):
    """Verifies the model via node-by-node RTL simulation, saving the full
    execution context; returns the original model unmodified."""
    # Keep the original; all simulation preparation happens on a copy
    original = model
    model = copy.deepcopy(model)
    # Prepare the copy for RTL simulation: set the execution mode, generate
    # and synthesize the IP, then build the simulation model
    model = model.transform(SetExecMode("rtlsim"))
    model = model.transform(GiveUniqueNodeNames())
    model = model.transform(PrepareIP(
        cfg._resolve_fpga_part(), cfg.synth_clk_period_ns)  # noqa
    )
    model = model.transform(HLSSynthIP())
    model = model.transform(PrepareRTLSim())

    # Verification input/output pair from the build configuration
    inp = np.load(cfg.verify_input_npy)  # noqa
    out = np.load(cfg.verify_expected_output_npy)

    # The parent model wraps the streaming dataflow partition; the child
    # model is the inside of the partition
    parent = f"{cfg.output_dir}/intermediate_models/dataflow_parent.onnx"
    child = f"{cfg.output_dir}/intermediate_models/verify_rtlsim.onnx"
    # Save the child model prepared for RTL simulation
    model.save(child)
    # Load the parent model to pass to verification execution
    parent_model = ModelWrapper(parent)

    # Reshape the verification pair to match the model interface
    inp = inp.reshape(parent_model.get_tensor_shape(model.graph.input[0].name))
    out = out.reshape(parent_model.get_tensor_shape(model.graph.output[0].name))

    # Execute the onnx model collecting the full execution context
    context = execute_parent(parent, child, inp, return_full_ctx=True)
    # Extract the output tensor and compare it against the expectation
    model_out = context[parent_model.graph.output[0].name]
    result = "SUCCESS" if np.allclose(out, model_out, atol=1e-3) else "FAIL"
    # Save the verification context into the configured build directory
    verification_output = f"{cfg.output_dir}/verification_output/"
    np.savez(f"{verification_output}/verify_rtlsim_{result}.npz", **context)
    # Return the original, unmodified model
    return original
# Seeds all relevant random number generators (python, numpy, torch) with the
# same value for reproducibility
def seed(s):
    for seed_fn in (random.seed, np.random.seed, torch.manual_seed):
        seed_fn(s)
step_tidy_up_post_attention, + step_streamline_attention, + step_streamline_residual, + step_streamline_norms, + step_streamline_positional, + step_convert_attention_to_hw, + step_convert_elementwise_binary_to_hw, + step_convert_lookup_to_hw, + step_replicate_streams, + set_target_parallelization, + set_fifo_depths, + step_apply_folding_config, + node_by_node_rtlsim, # noqa: Maybe unused, only for debugging + node_by_node_python, # noqa: Maybe unused, only for debugging + node_by_node_cppsim +) +from performance.platform_build_steps import( + test_step_gen_vitis_xo, + test_step_gen_instrumentation_wrapper, + test_step_gen_instrwrap_sim, + test_step_insert_tlastmarker, + test_step_export_xo, + test_step_build_platform, + test_step_run_instrwrap_sim +) + +### ADAPTED FROM utils.py +# Seeds all relevant random number generators to the same seed for +# reproducibility +def seed(s): + random.seed(s) + np.random.seed(s) + torch.manual_seed(s) + +template_folding_yaml = """ +# Per operator type default configurations +defaults: + # Scaled dot-product attention head implemented via HLS + ScaledDotProductAttention_hls: + # Type of memory to be used for internal buffer storage + # Options: auto, block, distributed, ultra + ram_style: block + # Type of memory to be used for threshold storage + # Options: auto, block, distributed + ram_style_thresholds: block + # Type of memory to be used fo the attention mask (if present) + # Options: auto, block, distributed + ram_style_mask: block + # Resource type to be used for implementing multiplications/MACs + # Options: auto, lut or dsp + mac_resource: lut + # Addition of two inputs (constants or streamed) implemented via HLS + ElementwiseAdd_hls: + # Type of memory to be used for internal buffer storage and/or constant + # parameter tensors + # Options: auto, block, distributed, ultra + ram_style: distributed + # Matrix vector activation unit implemented via HLS + MVAU_hls: + # Resource type to be used for implementing 
multiplications/MACs + # Options: auto, lut or dsp + resType: dsp + # Memory mode for weight storage + # Options: internal_embedded, internal_decoupled, external + mem_mode: internal_decoupled + # Type of memory to be used for weight storage if "internal_decoupled" + # Options: auto, block, distributed, ultra + ram_style: block + # Type of memory to be used for threshold storage + # Options: auto, block, distributed + ram_style_thresholds: block + # Makes weights writeable through AXI-lite interface at runtime + runtime_writeable_weights: 0 + # Matrix vector activation unit implemented via RTL + MVAU_rtl: + # Resource type to be used for implementing multiplications/MACs + # Options: auto, lut or dsp + # Note: RTL MVAU currently does not support LUT-based implementation + resType: dsp + # Memory mode for weight storage + # Options: internal_embedded, internal_decoupled, external + mem_mode: internal_decoupled + # Type of memory to be used for weight storage if "internal_decoupled" + # Options: auto, block, distributed, ultra + ram_style: block + # Makes weights writeable through AXI-lite interface at runtime + runtime_writeable_weights: 0 + # Multi-thresholds implemented via HLS (applies to standalone thresholds) + Thresholding_hls: + # Memory mode for threshold storage + # Options: internal_embedded, internal_decoupled + mem_mode: internal_decoupled + # Type of memory to be used for threshold storage if "internal_decoupled" + # Options: distributed, block + ram_style: distributed + # Makes thresholds writeable through AXI-lite interface at runtime + runtime_writeable_weights: 0 + # Multi-thresholds implemented via RTL (applies to standalone thresholds) + Thresholding_rtl: + # Decides to use BRAM, URAM or LUTs for threshold memory, depending on the + # depth of the thresholds + # Note: This combination forces "distributed" LUT implementation + depth_trigger_uram: 2147483647 # "infinity" + depth_trigger_bram: 2147483647 # "infinity" + # # Note: This combination 
forces "block" RAM implementation + # depth_trigger_uram: 0 + # depth_trigger_bram: 1 + # # Note: This combination forces "ultra" RAM implementation + # depth_trigger_uram: 1 + # depth_trigger_bram: 0 + # # Note: This combination is equivalent to "auto" + # depth_trigger_uram: 0 + # depth_trigger_bram: 0 + # Makes thresholds writeable through AXI-lite interface at runtime + runtime_writeable_weights: 0 + # FIFO implemented via RTL (there is no HLS FIFO implementation in FINN) + StreamingFIFO_rtl: + # RTL vs. IPI implementation of FIFOs + # Options: rtl, vivado + impl_style: rtl + # Resource type for FIFOs when impl_style is vivado + # Options: auto, block, distributed, ultra + ram_style: distributed + # Individual, named node-specific configurations here + # ... +""" + +class bench_transformer_gpt(bench): + def step_build(self): + #with open("params.yaml") as file: + # params = yaml.safe_load(file) + # Seed all RNGs + seed(self.params["seed"]) + + # Extract sequence length and embedding dimension from the output of the + # first quantizer in the model + # Note: Embedding and Sequence dimension flip later + model = ModelWrapper(self.build_inputs["onnx_path"]) + _, emb_dim, seq_len = model.get_tensor_shape( + "/emb_add/input_quant/export_handler/Quant_output_0" + ) + + # Prepare config files + # TODO: make configurable + # TODO: log intermediate files such as inp.npy, folding.yaml, or specialize_layers.jon as artifacts, maybe create in unique temp dirs + specialize_layers_dict = { + "Defaults": { + "preferred_impl_style": ["rtl", ["MVAU", "Thresholding"]] + }, + "": { + "preferred_impl_style": "" + } + } + with open("specialize_layers.json", "w") as f: + json.dump(specialize_layers_dict, f, indent=2) + with open("folding.yaml", "w") as f: + f.write(template_folding_yaml) + + #TODO: make configurable instead of hardcoding exception + self.board = "U280" + self.part = "xcu280-fsvh2892-2L-e" + + # Create a configuration for building the scaled dot-product attention + # 
operator to a hardware accelerator + cfg = build_cfg.DataflowBuildConfig( + # Unpack the build configuration parameters + #**params["build"], + output_dir = self.build_inputs["build_dir"], + stitched_ip_gen_dcp = True, + synth_clk_period_ns = self.clock_period_ns, + board = self.board, + shell_flow_type = "vitis_alveo", #TODO: proper Alveo support instead of hardcoding + folding_config_file = "folding.yaml", + specialize_layers_config_file = "specialize_layers.json", + standalone_thresholds = True, + max_multithreshold_bit_width = 16, + mvau_wwidth_max = 2048, + split_large_fifos = True, + + verbose=False, # if True prints stdout and stderr to console instead of build_dataflow.log + + generate_outputs=[ + build_cfg.DataflowOutputType.ESTIMATE_REPORTS, + build_cfg.DataflowOutputType.STITCHED_IP, # required for HarnessBuild, OOC_SYNTH, and RTLSIM + #build_cfg.DataflowOutputType.PYNQ_DRIVER, #TODO: currently broken (assert i_consumer.op_type == "StreamingDataflowPartition"), might be useful for functional verification on hw later + #build_cfg.DataflowOutputType.OOC_SYNTH, # requires stitched-ip, not needed because ZynqBuild/HarnessBuild is performed + #build_cfg.DataflowOutputType.BITFILE, # does not require stitched-ip, not needed because HarnessBuild is performed + #build_cfg.DataflowOutputType.RTLSIM_PERFORMANCE, # not possible due to float components + #build_cfg.DataflowOutputType.DEPLOYMENT_PACKAGE # not needed, just a copy operation + ], + + verify_steps=[ + # Verify the model after converting to the FINN onnx dialect + build_cfg.VerificationStepType.QONNX_TO_FINN_PYTHON, + # Verify the model again using python mode after the default + # streamlining step + build_cfg.VerificationStepType.STREAMLINED_PYTHON, + # Verify the model again after tidy up transformations, right before + # converting to HLS + build_cfg.VerificationStepType.TIDY_UP_PYTHON, + # Verify the model after generating C++ HLS and applying folding + 
build_cfg.VerificationStepType.FOLDED_HLS_CPPSIM, + ], + # File with test inputs for verification + verify_input_npy=self.build_inputs["input_npy_path"], + # File with expected test outputs for verification + verify_expected_output_npy=self.build_inputs["output_npy_path"], + # Save the intermediate model graphs + save_intermediate_models=True, + # Avoid RTL simulation for setting the FIFO sizes + auto_fifo_strategy=AutoFIFOSizingMethod.CHARACTERIZE, + # Do not automatically set FIFO sizes as this requires RTL simulation + # not implemented for the attention operator + auto_fifo_depths=False, + # Build steps to execute + steps=[ + # Need to apply some tidy-up transformations before converting to + # the finn dialect of onnx + step_tidy_up_pre_attention, + # Convert all QONNX Quant nodes to Multithreshold nodes + "step_qonnx_to_finn", + # Tidy up the graph after converting from QONNX to FINN format + # Note: Triggers a verification step + "step_tidy_up", + # Positional encoding needs to be streamlined first with slightly + # different order of certain streamlining transformations to avoid + # weird rounding issue of intermediate results + step_streamline_positional, + # Custom streamlining for models containing attention operators + step_streamline_attention, + # Streamlining of the residual branches + step_streamline_residual, + # Streamline the normalization layers, i.e., transposed batch norm + step_streamline_norms, + # Another round using the default streamlining steps + # Note: Triggers a verification step + "step_streamline", + # New conversion of the scaled dot-product attention pattern + step_convert_attention_to_hw, + # Another tidy-up step to remove unnecessary dimensions and + # operations after converting the attention operators to HLS + step_tidy_up_post_attention, + # Convert the elementwise binary operations to hardware operators. 
+ # These include for example adding residual branches and positional + # encoding + step_convert_elementwise_binary_to_hw, + # Convert the Gather layer realizing the input token embedding to + # the FINN hardware implementation, i.e., the Lookup layer + step_convert_lookup_to_hw, + # Properly replicate the stream feeding the query, key and value + # projections + step_replicate_streams, + # Convert most other layers supported by FINN to HW operators + "step_convert_to_hw", + # Specialize HW layer implementations as either HLS or RTL + "step_specialize_layers", + "step_create_dataflow_partition", + # Set the folding configuration to meet the cycles per sequence + # target + set_target_parallelization(seq_len, emb_dim), + # Apply folding configuration, specifying hardware implementation + # details + # Note: This triggers a verification step + step_apply_folding_config, + "step_minimize_bit_width", + # The ScaledDotProductAttention custom op does not define any + # estimates + "step_generate_estimate_reports", + "step_hw_codegen", + "step_hw_ipgen", + # Set the attention- and residual-related FIFO depths insert FIFOs + # and apply folding configuration once again + # Note: Implement all FIFOs with a depth at least as deep as the + # sequence length in URAM. 
+ set_fifo_depths(seq_len, emb_dim, uram_threshold=seq_len), + # Run additional node-by-node verification in RTL simulation of the + # model before creating the stitched IP + # Note: end-to-end verification of the stitched IP in RTL simulation + # is still not possible due to missing float IPs + node_by_node_cppsim, + # Only for debugging for now, does not work if "vivado" style + # StreamingFIFOs are used + # node_by_node_rtlsim, + + test_step_insert_tlastmarker, # required for instrumentation_wrapper + + "step_create_stitched_ip", + + # "step_measure_rtlsim_performance", # not possible due to float components + + step_synth_harness, #TODO: replace with instr wrapper (or port it into this step) + + #"step_out_of_context_synthesis", # for synthesis results (e.g. utilization) + + # normal deployment TODO: replace with instr wrapper (or port it into this step as an option) + #"step_synthesize_bitfile", + #"step_make_pynq_driver", + #"step_deployment_package", + + #test_step_gen_vitis_xo, # preparation step for original instr wrapper integration + #test_step_gen_instrumentation_wrapper, # preparation step for original instr wrapper integration + + #test_step_gen_instrwrap_sim, # preparation step for simulation of original instr wrapper integration + #test_step_run_instrwrap_sim, # simulation with instr wrapper, disabled for now due to extreme runtime + + #test_step_export_xo, # preparation step for original instr wrapper integration + #test_step_build_platform # synthesis with instr wrapper + ] + ) + # Run the build process on the dummy attention operator graph + # TODO: maybe let this function return the cfg only, so it can be modified by bench context + build.build_dataflow_cfg(self.build_inputs["onnx_path"], cfg) + + def run(self): + self.steps_full_build_flow() diff --git a/benchmarking/dut/transformer_radioml.py b/benchmarking/dut/transformer_radioml.py new file mode 100644 index 0000000000..4d77cb4b8d --- /dev/null +++ b/benchmarking/dut/transformer_radioml.py 
@@ -0,0 +1,336 @@ +# Adapted from Christoph's attention-dummy repository + +# PyTorch base package: Math and Tensor Stuff +import torch +# Brevitas wrapper around PyTorch tensors adding quantization information +from brevitas.quant_tensor import QuantTensor +# Brevitas: Quantized versions of PyTorch layers +from brevitas.nn import ( + QuantMultiheadAttention, + QuantEltwiseAdd, + QuantIdentity, + QuantLinear, + QuantReLU +) +# Progressbar +from tqdm import trange +import numpy as np +from brevitas.export import export_qonnx +import random +import json +import subprocess +# FINN dataflow builder +import finn.builder.build_dataflow as build +import finn.builder.build_dataflow_config as build_cfg +from finn.builder.build_dataflow_config import AutoFIFOSizingMethod +from bench_base import bench, step_synth_harness +import os +from util import summarize_table, summarize_section, power_xml_to_dict, prepare_inputs, delete_dir_contents + +# Custom build steps required to streamline and convert the attention operator +from dut.transformer_custom_steps import ( + step_tidy_up_pre_attention, + step_tidy_up_post_attention, + step_streamline_attention, + step_streamline_residual, + step_streamline_norms, + step_streamline_positional, + step_convert_attention_to_hw, + step_convert_elementwise_binary_to_hw, + step_convert_lookup_to_hw, + step_replicate_streams, + set_target_parallelization, + set_fifo_depths, + step_apply_folding_config, + node_by_node_rtlsim, + node_by_node_cppsim +) +from performance.platform_build_steps import( + test_step_gen_vitis_xo, + test_step_gen_instrumentation_wrapper, + test_step_gen_instrwrap_sim, + test_step_insert_tlastmarker, + test_step_export_xo, + test_step_build_platform, + test_step_run_instrwrap_sim +) + +### ADAPTED FROM utils.py +# Seeds all relevant random number generators to the same seed for +# reproducibility +def seed(s): + random.seed(s) + np.random.seed(s) + torch.manual_seed(s) + +template_folding_yaml = """ +# Per operator type 
default configurations +defaults: + # Scaled dot-product attention head implemented via HLS + ScaledDotProductAttention_hls: + # Type of memory to be used for internal buffer storage + # Options: auto, block, distributed, ultra + ram_style: block + # Type of memory to be used for threshold storage + # Options: auto, block, distributed + ram_style_thresholds: block + # Type of memory to be used fo the attention mask (if present) + # Options: auto, block, distributed + ram_style_mask: block + # Resource type to be used for implementing multiplications/MACs + # Options: auto, lut or dsp + mac_resource: lut + # Addition of two inputs (constants or streamed) implemented via HLS + ElementwiseAdd_hls: + # Type of memory to be used for internal buffer storage and/or constant + # parameter tensors + # Options: auto, block, distributed, ultra + ram_style: distributed + # Matrix vector activation unit implemented via HLS + MVAU_hls: + # Resource type to be used for implementing multiplications/MACs + # Options: auto, lut or dsp + resType: dsp + # Memory mode for weight storage + # Options: internal_embedded, internal_decoupled, external + mem_mode: internal_decoupled + # Type of memory to be used for weight storage if "internal_decoupled" + # Options: auto, block, distributed, ultra + ram_style: block + # Type of memory to be used for threshold storage + # Options: auto, block, distributed + ram_style_thresholds: block + # Makes weights writeable through AXI-lite interface at runtime + runtime_writeable_weights: 0 + # Matrix vector activation unit implemented via RTL + MVAU_rtl: + # Resource type to be used for implementing multiplications/MACs + # Options: auto, lut or dsp + # Note: RTL MVAU currently does not support LUT-based implementation + resType: dsp + # Memory mode for weight storage + # Options: internal_embedded, internal_decoupled, external + mem_mode: internal_decoupled + # Type of memory to be used for weight storage if "internal_decoupled" + # Options: auto, 
block, distributed, ultra + ram_style: block + # Makes weights writeable through AXI-lite interface at runtime + runtime_writeable_weights: 0 + # Multi-thresholds implemented via HLS (applies to standalone thresholds) + Thresholding_hls: + # Memory mode for threshold storage + # Options: internal_embedded, internal_decoupled + mem_mode: internal_decoupled + # Type of memory to be used for threshold storage if "internal_decoupled" + # Options: distributed, block + ram_style: distributed + # Makes thresholds writeable through AXI-lite interface at runtime + runtime_writeable_weights: 0 + # Multi-thresholds implemented via RTL (applies to standalone thresholds) + Thresholding_rtl: + # Decides to use BRAM, URAM or LUTs for threshold memory, depending on the + # depth of the thresholds + # Note: This combination forces "distributed" LUT implementation + depth_trigger_uram: 2147483647 # "infinity" + depth_trigger_bram: 2147483647 # "infinity" + # # Note: This combination forces "block" RAM implementation + # depth_trigger_uram: 0 + # depth_trigger_bram: 1 + # # Note: This combination forces "ultra" RAM implementation + # depth_trigger_uram: 1 + # depth_trigger_bram: 0 + # # Note: This combination is equivalent to "auto" + # depth_trigger_uram: 0 + # depth_trigger_bram: 0 + # Makes thresholds writeable through AXI-lite interface at runtime + runtime_writeable_weights: 0 + # FIFO implemented via RTL (there is no HLS FIFO implementation in FINN) + StreamingFIFO_rtl: + # RTL vs. IPI implementation of FIFOs + # Options: rtl, vivado + impl_style: rtl + # Resource type for FIFOs when impl_style is vivado + # Options: auto, block, distributed, ultra + ram_style: distributed + # Individual, named node-specific configurations here + # ... 
+""" + +class bench_transformer_radioml(bench): + def step_build(self): + #with open("params.yaml") as file: + # params = yaml.safe_load(file) + # Seed all RNGs + seed(self.params["seed"]) + # Extract sequence length and embedding dimension from parameters + _, seq_len, emb_dim = np.load(self.build_inputs["input_npy_path"]).shape + + # Prepare config files + # TODO: make configurable + # TODO: log intermediate files such as inp.npy, folding.yaml, or specialize_layers.jon as artifacts, maybe create in unique temp dirs + specialize_layers_dict = { + "Defaults": { + "preferred_impl_style": ["rtl", ["MVAU", "Thresholding"]] + }, + "": { + "preferred_impl_style": "" + } + } + with open("specialize_layers.json", "w") as f: + json.dump(specialize_layers_dict, f, indent=2) + with open("folding.yaml", "w") as f: + f.write(template_folding_yaml) + + # Create a configuration for building the scaled dot-product attention + # operator to a hardware accelerator + cfg = build_cfg.DataflowBuildConfig( + # Unpack the build configuration parameters + #**params["build"], + output_dir = self.build_inputs["build_dir"], + stitched_ip_gen_dcp = True, + synth_clk_period_ns = self.clock_period_ns, + board = self.board, + shell_flow_type = "vivado_zynq", #TODO: Alveo support + folding_config_file = "folding.yaml", + specialize_layers_config_file = "specialize_layers.json", + standalone_thresholds = True, + max_multithreshold_bit_width = 16, + mvau_wwidth_max = 2048, + split_large_fifos = True, + + verbose=False, # if True prints stdout and stderr to console instead of build_dataflow.log + + generate_outputs=[ + build_cfg.DataflowOutputType.ESTIMATE_REPORTS, + build_cfg.DataflowOutputType.STITCHED_IP, # required for HarnessBuild, OOC_SYNTH, and RTLSIM + #build_cfg.DataflowOutputType.PYNQ_DRIVER, #TODO: currently broken (assert i_consumer.op_type == "StreamingDataflowPartition"), might be useful for functional verification on hw later + #build_cfg.DataflowOutputType.OOC_SYNTH, # requires 
stitched-ip, not needed because ZynqBuild/HarnessBuild is performed + #build_cfg.DataflowOutputType.BITFILE, # does not require stitched-ip, not needed because HarnessBuild is performed + #build_cfg.DataflowOutputType.RTLSIM_PERFORMANCE, # not possible due to float components + #build_cfg.DataflowOutputType.DEPLOYMENT_PACKAGE # not needed, just a copy operation + ], + + verify_steps=[ + # Verify the model after converting to the FINN onnx dialect + build_cfg.VerificationStepType.QONNX_TO_FINN_PYTHON, + # Verify the model again using python mode after the default + # streamlining step + build_cfg.VerificationStepType.STREAMLINED_PYTHON, + # Verify the model again after tidy up transformations, right before + # converting to HLS + build_cfg.VerificationStepType.TIDY_UP_PYTHON, + # Verify the model after generating C++ HLS and applying folding + build_cfg.VerificationStepType.FOLDED_HLS_CPPSIM, + ], + # File with test inputs for verification + verify_input_npy=self.build_inputs["input_npy_path"], + # File with expected test outputs for verification + verify_expected_output_npy=self.build_inputs["output_npy_path"], + # Save the intermediate model graphs + save_intermediate_models=True, + # Avoid RTL simulation for setting the FIFO sizes + auto_fifo_strategy=AutoFIFOSizingMethod.CHARACTERIZE, + # Do not automatically set FIFO sizes as this requires RTL simulation + # not implemented for the attention operator + auto_fifo_depths=False, + # Build steps to execute + steps=[ + # Need to apply some tidy-up transformations before converting to + # the finn dialect of onnx + step_tidy_up_pre_attention, + # Convert all QONNX Quant nodes to Multithreshold nodes + "step_qonnx_to_finn", + # Tidy up the graph after converting from QONNX to FINN format + # Note: Triggers a verification step + "step_tidy_up", + # Positional encoding needs to be streamlined first with slightly + # different order of certain streamlining transformations to avoid + # weird rounding issue of intermediate 
results + step_streamline_positional, + # Custom streamlining for models containing attention operators + step_streamline_attention, + # Streamlining of the residual branches + step_streamline_residual, + # Streamline the normalization layers, i.e., transposed batch norm + step_streamline_norms, + # Another round using the default streamlining steps + # Note: Triggers a verification step + "step_streamline", + # New conversion of the scaled dot-product attention pattern + step_convert_attention_to_hw, + # Another tidy-up step to remove unnecessary dimensions and + # operations after converting the attention operators to HLS + step_tidy_up_post_attention, + # Convert the elementwise binary operations to hardware operators. + # These include for example adding residual branches and positional + # encoding + step_convert_elementwise_binary_to_hw, + # Convert the Gather layer realizing the input token embedding to + # the FINN hardware implementation, i.e., the Lookup layer + step_convert_lookup_to_hw, + # Properly replicate the stream feeding the query, key and value + # projections + step_replicate_streams, + # Convert most other layers supported by FINN to HW operators + "step_convert_to_hw", + # Specialize HW layer implementations as either HLS or RTL + "step_specialize_layers", + "step_create_dataflow_partition", + # Set the folding configuration to meet the cycles per sequence + # target + set_target_parallelization(seq_len, emb_dim), + # Apply folding configuration, specifying hardware implementation + # details + # Note: This triggers a verification step + step_apply_folding_config, + "step_minimize_bit_width", + # The ScaledDotProductAttention custom op does not define any + # estimates + "step_generate_estimate_reports", + "step_hw_codegen", + "step_hw_ipgen", + # Set the attention- and residual-related FIFO depths insert FIFOs + # and apply folding configuration once again + # Note: Implement all FIFOs with a depth at least as deep as the + # sequence length 
in URAM. + set_fifo_depths(seq_len, emb_dim, uram_threshold=seq_len), + # Run additional node-by-node verification in RTL simulation of the + # model before creating the stitched IP + # Note: end-to-end verification of the stitched IP in RTL simulation + # is still not possible due to missing float IPs + node_by_node_cppsim, + # Only for debugging for now, does not work if "vivado" style + # StreamingFIFOs are used + # node_by_node_rtlsim, + + test_step_insert_tlastmarker, # required for instrumentation_wrapper + + "step_create_stitched_ip", + + # "step_measure_rtlsim_performance", # not possible due to float components + + step_synth_harness, #TODO: replace with instr wrapper (or port it into this step) + + #"step_out_of_context_synthesis", # for synthesis results (e.g. utilization) + + # normal deployment TODO: replace with instr wrapper (or port it into this step as an option) + #"step_synthesize_bitfile", + #"step_make_pynq_driver", + #"step_deployment_package", + + #test_step_gen_vitis_xo, # preparation step for original instr wrapper integration + #test_step_gen_instrumentation_wrapper, # preparation step for original instr wrapper integration + + #test_step_gen_instrwrap_sim, # preparation step for simulation of original instr wrapper integration + #test_step_run_instrwrap_sim, # simulation with instr wrapper, disabled for now due to extreme runtime + + #test_step_export_xo, # preparation step for original instr wrapper integration + #test_step_build_platform # synthesis with instr wrapper + ] + ) + # Run the build process on the dummy attention operator graph + # TODO: maybe let this function return the cfg only, so it can be modified by bench context + build.build_dataflow_cfg(self.build_inputs["onnx_path"], cfg) + + def run(self): + self.steps_full_build_flow() diff --git a/benchmarking/harness/sink/ip/component.xml b/benchmarking/harness/sink/ip/component.xml new file mode 100644 index 0000000000..cb20a9abad --- /dev/null +++ 
b/benchmarking/harness/sink/ip/component.xml @@ -0,0 +1,256 @@ + + + xilinx.com + user + harness_sink + 1.0 + + + s_axis_0 + + + + + + + TDATA + + + s_axis_0_tdata + + + + + TVALID + + + s_axis_0_tvalid + + + + + TREADY + + + s_axis_0_tready + + + + + + + + + xilinx_anylanguagesynthesis + Synthesis + :vivado.xilinx.com:synthesis + Verilog + harness_sink + + xilinx_anylanguagesynthesis_view_fileset + + + + viewChecksum + 18b9f9a4 + + + + + xilinx_anylanguagebehavioralsimulation + Simulation + :vivado.xilinx.com:simulation + Verilog + harness_sink + + xilinx_anylanguagebehavioralsimulation_view_fileset + + + + viewChecksum + 18b9f9a4 + + + + + xilinx_xpgui + UI Layout + :vivado.xilinx.com:xgui.ui + + xilinx_xpgui_view_fileset + + + + viewChecksum + 6955aee3 + + + + + + + enable + + in + + + std_logic + xilinx_anylanguagesynthesis + xilinx_anylanguagebehavioralsimulation + + + + + + valid + + out + + + std_logic + xilinx_anylanguagesynthesis + xilinx_anylanguagebehavioralsimulation + + + + + + checksum + + out + + + std_logic + xilinx_anylanguagesynthesis + xilinx_anylanguagebehavioralsimulation + + + + + + s_axis_0_tdata + + in + + 7 + 0 + + + + std_logic_vector + xilinx_anylanguagesynthesis + xilinx_anylanguagebehavioralsimulation + + + + 0 + + + + + s_axis_0_tvalid + + in + + + std_logic + xilinx_anylanguagesynthesis + xilinx_anylanguagebehavioralsimulation + + + + + + s_axis_0_tready + + out + + + std_logic + xilinx_anylanguagesynthesis + xilinx_anylanguagebehavioralsimulation + + + + + + + + STREAM_WIDTH + Stream Width + 8 + + + + + + xilinx_anylanguagesynthesis_view_fileset + + src/harness_sink.v + verilogSource + CHECKSUM_18b9f9a4 + IMPORTED_FILE + + + + xilinx_anylanguagebehavioralsimulation_view_fileset + + src/harness_sink.v + verilogSource + IMPORTED_FILE + + + + xilinx_xpgui_view_fileset + + xgui/harness_sink_v1_0.tcl + tclSource + CHECKSUM_6955aee3 + XGUI_VERSION_2 + + + + harness_sink_v1_0 + + + STREAM_WIDTH + Stream Width + 8 + + + Component_Name + 
harness_sink_v1_0 + + + + + + zynq + qzynq + azynq + zynquplus + + + /UserIP + + harness_sink_v1_0 + level_0 + package_project + 2 + 2023-08-22T13:34:35Z + + + 2022.2 + + + + + + + + + + + + + diff --git a/benchmarking/harness/sink/ip/src/harness_sink.v b/benchmarking/harness/sink/ip/src/harness_sink.v new file mode 100644 index 0000000000..e6b95e7797 --- /dev/null +++ b/benchmarking/harness/sink/ip/src/harness_sink.v @@ -0,0 +1,39 @@ +`timescale 1ns / 1ps +////////////////////////////////////////////////////////////////////////////////// +// Company: +// Engineer: +// +// Create Date: 08/22/2023 02:19:08 PM +// Design Name: +// Module Name: harness_sink +// Project Name: +// Target Devices: +// Tool Versions: +// Description: +// +// Dependencies: +// +// Revision: +// Revision 0.01 - File Created +// Additional Comments: +// +////////////////////////////////////////////////////////////////////////////////// + + +module harness_sink #( + parameter STREAM_WIDTH=8 +)( + input enable, + output valid, + output checksum, + input [STREAM_WIDTH-1:0] s_axis_0_tdata, + input s_axis_0_tvalid, + output s_axis_0_tready +); + +assign s_axis_0_tready = enable; + +assign valid = s_axis_0_tvalid; +assign checksum = ^s_axis_0_tdata; + +endmodule diff --git a/benchmarking/harness/sink/ip/xgui/harness_sink_v1_0.tcl b/benchmarking/harness/sink/ip/xgui/harness_sink_v1_0.tcl new file mode 100644 index 0000000000..eb752d53a5 --- /dev/null +++ b/benchmarking/harness/sink/ip/xgui/harness_sink_v1_0.tcl @@ -0,0 +1,25 @@ +# Definitional proc to organize widgets for parameters. 
+proc init_gui { IPINST } { + ipgui::add_param $IPINST -name "Component_Name" + #Adding Page + set Page_0 [ipgui::add_page $IPINST -name "Page 0"] + ipgui::add_param $IPINST -name "STREAM_WIDTH" -parent ${Page_0} + + +} + +proc update_PARAM_VALUE.STREAM_WIDTH { PARAM_VALUE.STREAM_WIDTH } { + # Procedure called to update STREAM_WIDTH when any of the dependent parameters in the arguments change +} + +proc validate_PARAM_VALUE.STREAM_WIDTH { PARAM_VALUE.STREAM_WIDTH } { + # Procedure called to validate STREAM_WIDTH + return true +} + + +proc update_MODELPARAM_VALUE.STREAM_WIDTH { MODELPARAM_VALUE.STREAM_WIDTH PARAM_VALUE.STREAM_WIDTH } { + # Procedure called to set VHDL generic/Verilog parameter value(s) based on TCL parameter value + set_property value [get_property value ${PARAM_VALUE.STREAM_WIDTH}] ${MODELPARAM_VALUE.STREAM_WIDTH} +} + diff --git a/benchmarking/harness/vector_xor.v b/benchmarking/harness/vector_xor.v new file mode 100644 index 0000000000..3361860ab8 --- /dev/null +++ b/benchmarking/harness/vector_xor.v @@ -0,0 +1,32 @@ +`timescale 1ns / 1ps +////////////////////////////////////////////////////////////////////////////////// +// Company: +// Engineer: +// +// Create Date: 08/22/2023 02:19:08 PM +// Design Name: +// Module Name: harness_sink +// Project Name: +// Target Devices: +// Tool Versions: +// Description: +// +// Dependencies: +// +// Revision: +// Revision 0.01 - File Created +// Additional Comments: +// +////////////////////////////////////////////////////////////////////////////////// + + +module vector_xor #( + parameter WIDTH=8 +)( + input [WIDTH-1:0] in_data, + output out_data +); + +assign out_data = ^in_data; + +endmodule diff --git a/benchmarking/templates.py b/benchmarking/templates.py new file mode 100644 index 0000000000..c8bf944380 --- /dev/null +++ b/benchmarking/templates.py @@ -0,0 +1,213 @@ +# Template strings for benchmarking + + +# power report scripting based on Lucas Reuter: +template_open = """ +open_project $PROJ_PATH$ 
+open_run $RUN$ +""" + +template_single_test = """ +set_switching_activity -toggle_rate $TOGGLE_RATE$ -static_probability $STATIC_PROB$ -hier -type lut [get_cells -r finn_design_i/.*] +set_switching_activity -toggle_rate $TOGGLE_RATE$ -static_probability $STATIC_PROB$ -hier -type register [get_cells -r finn_design_i/.*] +set_switching_activity -deassert_resets +report_power -file $REPORT_PATH$/$REPORT_NAME$.xml -format xml +reset_switching_activity -hier -type lut [get_cells -r finn_design_i/.*] +reset_switching_activity -hier -type register [get_cells -r finn_design_i/.*] +""" + +# template_single_test_type = """ +# set_switching_activity -toggle_rate $TOGGLE_RATE$ -static_probability $STATIC_PROB$ -hier -type $SWITCH_TARGET$ [get_cells -r finn_design_i/.*] +# set_switching_activity -deassert_resets +# report_power -file $REPORT_PATH$/$REPORT_NAME$.xml -format xml +# reset_switching_activity -hier -type $SWITCH_TARGET$ [get_cells -r finn_design_i/.*] +# """ + +template_sim_power = """ +set_property SOURCE_SET sources_1 [get_filesets sim_1] +import_files -fileset sim_1 -norecurse $TB_FILE_PATH$ +set_property top switching_simulation_tb [get_filesets sim_1] +update_compile_order -fileset sim_1 + +launch_simulation -mode post-implementation -type functional +restart +open_saif $SAIF_FILE_PATH$ +log_saif [get_objects -r /switching_simulation_tb/dut/*] +run $SIM_DURATION_NS$ ns +close_saif + +read_saif $SAIF_FILE_PATH$ +report_power -file $REPORT_PATH$/$REPORT_NAME$.xml -format xml +""" + +# TODO: configurable clock frequency +template_switching_simulation_tb = """ +`timescale 1 ns/10 ps + +module switching_simulation_tb; +reg clk; +reg rst; + +//dut inputs +reg tready; +reg [$INSTREAM_WIDTH$-1:0] tdata; +reg tvalid; + +//dut outputs +wire [$OUTSTREAM_WIDTH$-1:0] accel_tdata; +wire accel_tready; +wire accel_tvalid; + +finn_design_wrapper dut( + .ap_clk(clk), + .ap_rst_n(rst), + .m_axis_0_tdata(accel_tdata), + .m_axis_0_tready(tready), + .m_axis_0_tvalid(accel_tvalid), 
+ .s_axis_0_tdata(tdata), + .s_axis_0_tready(accel_tready), + .s_axis_0_tvalid(tvalid) + ); + +always + begin + clk = 0; + #2.5; + clk = 1; + #2.5; + end + +integer i; +initial + begin + tready = 0; + tdata = 0; + tvalid = 0; + rst = 0; + #50; + rst = 1; + tvalid = 1; + tready = 1; + while(1) + begin + for (i = 0; i < $INSTREAM_WIDTH$/$DTYPE_WIDTH$; i = i+1) begin + tdata[i*$DTYPE_WIDTH$ +: $DTYPE_WIDTH$] = $RANDOM_FUNCTION$; + end + #5; + end + end +endmodule +""" + +zynq_harness_template = """ +set FREQ_MHZ %s +set NUM_AXILITE %d +if {$NUM_AXILITE > 9} { + error "Maximum 10 AXI-Lite interfaces supported" +} +set NUM_AXIMM %d +set BOARD %s +set FPGA_PART %s +create_project finn_zynq_link ./ -part $FPGA_PART + +# set board part repo paths to find boards installed by FINN +set paths_prop [get_property BOARD_PART_REPO_PATHS [current_project]] +set paths_param [get_param board.repoPaths] +lappend paths_prop $::env(FINN_ROOT)/deps/board_files +lappend paths_param $::env(FINN_ROOT)/deps/board_files +set_property BOARD_PART_REPO_PATHS $paths_prop [current_project] +set_param board.repoPaths $paths_param + +if {$BOARD == "RFSoC2x2"} { + set_property board_part xilinx.com:rfsoc2x2:part0:1.1 [current_project] + set ZYNQ_TYPE "zynq_us+" +} else { + puts "Unrecognized board" +} + +create_bd_design "top" +if {$ZYNQ_TYPE == "zynq_us+"} { + set zynq_ps_vlnv [get_property VLNV [get_ipdefs "xilinx.com:ip:zynq_ultra_ps_e:*"]] + create_bd_cell -type ip -vlnv $zynq_ps_vlnv zynq_ps + apply_bd_automation -rule xilinx.com:bd_rule:zynq_ultra_ps_e -config {apply_board_preset "1" } [get_bd_cells zynq_ps] + set_property CONFIG.PSU__DISPLAYPORT__PERIPHERAL__ENABLE {0} [get_bd_cells zynq_ps] + #activate one slave port, deactivate the second master port + set_property -dict [list CONFIG.PSU__USE__S_AXI_GP2 {0}] [get_bd_cells zynq_ps] + set_property -dict [list CONFIG.PSU__USE__M_AXI_GP1 {0}] [get_bd_cells zynq_ps] + #set frequency of PS clock (this can't always be exactly met) + set_property 
-dict [list CONFIG.PSU__OVERRIDE__BASIC_CLOCK {0}] [get_bd_cells zynq_ps] + set_property -dict [list CONFIG.PSU__CRL_APB__PL0_REF_CTRL__FREQMHZ [expr int($FREQ_MHZ)]] [get_bd_cells zynq_ps] +} else { + puts "Unrecognized Zynq type" +} + +#instantiate axi interconnect, axi smartconnect +set interconnect_vlnv [get_property VLNV [get_ipdefs -all "xilinx.com:ip:axi_interconnect:*" -filter design_tool_contexts=~*IPI*]] +#set smartconnect_vlnv [get_property VLNV [get_ipdefs "xilinx.com:ip:smartconnect:*"]] +create_bd_cell -type ip -vlnv $interconnect_vlnv axi_interconnect_0 +#create_bd_cell -type ip -vlnv $smartconnect_vlnv smartconnect_0 +#set number of axilite interfaces, and number of axi master interfaces +#set_property -dict [list CONFIG.NUM_SI $NUM_AXIMM] [get_bd_cells smartconnect_0] +set_property -dict [list CONFIG.NUM_MI $NUM_AXILITE] [get_bd_cells axi_interconnect_0] + +#create reset controller and connect interconnects to PS +if {$ZYNQ_TYPE == "zynq_us+"} { + set axi_peripheral_base 0xA0000000 + #connect_bd_intf_net [get_bd_intf_pins smartconnect_0/M00_AXI] [get_bd_intf_pins zynq_ps/S_AXI_HP0_FPD] + connect_bd_intf_net [get_bd_intf_pins zynq_ps/M_AXI_HPM0_FPD] -boundary_type upper [get_bd_intf_pins axi_interconnect_0/S00_AXI] + #connect interconnect clocks and resets + apply_bd_automation -rule xilinx.com:bd_rule:clkrst -config { Clk {/zynq_ps/pl_clk0} Freq {} Ref_Clk0 {} Ref_Clk1 {} Ref_Clk2 {}} [get_bd_pins axi_interconnect_0/ACLK] + apply_bd_automation -rule xilinx.com:bd_rule:clkrst -config { Clk {/zynq_ps/pl_clk0} Freq {} Ref_Clk0 {} Ref_Clk1 {} Ref_Clk2 {}} [get_bd_pins axi_interconnect_0/S00_ACLK] + #apply_bd_automation -rule xilinx.com:bd_rule:clkrst -config { Clk {/zynq_ps/pl_clk0} Freq {} Ref_Clk0 {} Ref_Clk1 {} Ref_Clk2 {}} [get_bd_pins zynq_ps/saxihp0_fpd_aclk] +} +#connect_bd_net [get_bd_pins axi_interconnect_0/ARESETN] [get_bd_pins smartconnect_0/aresetn] + +#procedure used by below IP instantiations to map BD address segments based on the axi 
interface aperture +proc assign_axi_addr_proc {axi_intf_path} { + #global variable holds current base address + global axi_peripheral_base + #infer range + set range [expr 2**[get_property CONFIG.ADDR_WIDTH [get_bd_intf_pins $axi_intf_path]]] + set range [expr $range < 4096 ? 4096 : $range] + #align base address to range + set offset [expr ($axi_peripheral_base + ($range-1)) & ~($range-1)] + #perform assignment + assign_bd_address [get_bd_addr_segs $axi_intf_path/Reg*] -offset $offset -range $range + #advance base address + set axi_peripheral_base [expr $offset + $range] +} + +#custom IP instantiations/connections start here +%s + +#finalize clock and reset connections for interconnects +if {$ZYNQ_TYPE == "zynq_us+"} { + apply_bd_automation -rule xilinx.com:bd_rule:clkrst -config { Clk {/zynq_ps/pl_clk0} } [get_bd_pins axi_interconnect_0/M*_ACLK] +} + +save_bd_design +assign_bd_address +validate_bd_design + +set_property SYNTH_CHECKPOINT_MODE "Hierarchical" [ get_files top.bd ] +make_wrapper -files [get_files top.bd] -import -fileset sources_1 -top + +#set_property strategy Flow_PerfOptimized_high [get_runs synth_1] +#set_property STEPS.SYNTH_DESIGN.ARGS.DIRECTIVE AlternateRoutability [get_runs synth_1] +#set_property STEPS.SYNTH_DESIGN.ARGS.RETIMING true [get_runs synth_1] +#set_property strategy Performance_ExtraTimingOpt [get_runs impl_1] +#set_property STEPS.OPT_DESIGN.ARGS.DIRECTIVE Explore [get_runs impl_1] +#set_property STEPS.POST_ROUTE_PHYS_OPT_DESIGN.ARGS.DIRECTIVE AggressiveExplore [get_runs impl_1] +#set_property STEPS.PHYS_OPT_DESIGN.ARGS.DIRECTIVE AggressiveExplore [get_runs impl_1] +#set_property STEPS.POST_ROUTE_PHYS_OPT_DESIGN.IS_ENABLED true [get_runs impl_1] + +# out-of-context synth can't be used for bitstream generation +# set_property -name {STEPS.SYNTH_DESIGN.ARGS.MORE OPTIONS} -value {-mode out_of_context} -objects [get_runs synth_1] +launch_runs -to_step write_bitstream impl_1 +wait_on_run [get_runs impl_1] + +# generate synthesis report 
+open_run impl_1 +report_utilization -hierarchical -hierarchical_depth 4 -file synth_report.xml -format xml +close_project +""" diff --git a/benchmarking/util.py b/benchmarking/util.py new file mode 100644 index 0000000000..17dec02762 --- /dev/null +++ b/benchmarking/util.py @@ -0,0 +1,87 @@ +# Utility functions for benchmarking +import os, shutil +from qonnx.core.datatype import DataType +import xml.etree.ElementTree as ET + +def _find_rows_and_headers(table): + rows = table.findall("tablerow") + headers = [] + + for row in rows: + headers = row.findall("tableheader") + if len(headers) > 0: + break + return (rows, headers) + + +def summarize_table(table): + table_summary = {} + table_summary["headers"] = [] + rows, headers = _find_rows_and_headers(table) + + if len(headers) > 0: + string = "Header: " + for header in headers: + table_summary["headers"].append(header.attrib["contents"]) + string = string + header.attrib["contents"] + " " + # print(string.rstrip()) + + for row in rows: + cells = row.findall("tablecell") + if len(cells) > 0: + cell_name = cells[0].attrib["contents"] + string = cell_name + table_summary[cell_name] = [] + for cell in cells[1:]: + table_summary[cell_name].append(cell.attrib["contents"]) + string = string + cell.attrib["contents"] + " " + # print(string.rstrip()) + + return table_summary + + +def summarize_section(section): + section_summary = {} + section_summary["tables"] = [] + section_summary["subsections"] = {} + + # print("Section:", section.attrib["title"]) + tables = section.findall("table") + sub_sections = section.findall("section") + for table in tables: + section_summary["tables"].append(summarize_table(table)) + # print("") + for sub_section in sub_sections: + section_summary["subsections"][sub_section.attrib["title"]] = summarize_section(sub_section) + + return section_summary + + +def power_xml_to_dict(xml_path): + tree = ET.parse(xml_path) + root = tree.getroot() + sections = root.findall("section") + result = {} + + for 
section in sections: + result[section.attrib["title"]] = summarize_section(section) + + return result + +def prepare_inputs(input_tensor, idt, wdt): + if wdt == DataType["BIPOLAR"] and idt == DataType["BIPOLAR"]: + # convert bipolar to binary + return {"inp": (input_tensor + 1) / 2} + else: + return {"inp": input_tensor} + +def delete_dir_contents(dir): + for filename in os.listdir(dir): + file_path = os.path.join(dir, filename) + try: + if os.path.isfile(file_path) or os.path.islink(file_path): + os.unlink(file_path) + elif os.path.isdir(file_path): + shutil.rmtree(file_path) + except Exception as e: + print('Failed to delete %s. Reason: %s' % (file_path, e)) From cc61f000c16a66c104b94f018c598174be3125a7 Mon Sep 17 00:00:00 2001 From: Felix Jentzsch Date: Wed, 29 Jan 2025 14:19:41 +0000 Subject: [PATCH 002/125] Pull in new Transformer flow --- benchmarking/bench.py | 4 - .../cfg/resnet50_fifosizing_test.json | 2 +- benchmarking/dut/transformer.py | 125 +- benchmarking/dut/transformer_custom_steps.py | 1091 ++++++++++++----- benchmarking/dut/transformer_gpt.py | 348 ------ benchmarking/dut/transformer_radioml.py | 336 ----- 6 files changed, 864 insertions(+), 1042 deletions(-) delete mode 100644 benchmarking/dut/transformer_gpt.py delete mode 100644 benchmarking/dut/transformer_radioml.py diff --git a/benchmarking/bench.py b/benchmarking/bench.py index 77f62bd775..db6f00c159 100644 --- a/benchmarking/bench.py +++ b/benchmarking/bench.py @@ -130,10 +130,6 @@ def get_default_session_options_new(): # give bench subclass name directly in config? 
if config_select.startswith("mvau"): bench_object = bench_mvau(params, task_id, run_id, artifacts_dir, save_dir) - elif config_select.startswith("transformer_radioml"): - bench_object = bench_transformer_radioml(params, task_id, run_id, artifacts_dir, save_dir) - elif config_select.startswith("transformer_gpt"): - bench_object = bench_transformer_gpt(params, task_id, run_id, artifacts_dir, save_dir) elif config_select.startswith("transformer"): bench_object = bench_transformer(params, task_id, run_id, artifacts_dir, save_dir) elif config_select.startswith("fifosizing"): diff --git a/benchmarking/cfg/resnet50_fifosizing_test.json b/benchmarking/cfg/resnet50_fifosizing_test.json index 1e85b972da..fbb0075dae 100644 --- a/benchmarking/cfg/resnet50_fifosizing_test.json +++ b/benchmarking/cfg/resnet50_fifosizing_test.json @@ -5,7 +5,7 @@ "specialize_path": ["/pc2/groups/hpc-prf-ekiapp/felix/ci_models/ResNet-50/U250_specialize_layers.json"], "floorplan_path": ["/pc2/groups/hpc-prf-ekiapp/felix/ci_models/ResNet-50/floorplan_resnet50.json"], - "board": ["U250"], + "board": ["U280"], "clock_period_ns": [4], "strategy": ["analytical"], diff --git a/benchmarking/dut/transformer.py b/benchmarking/dut/transformer.py index 0dc6444a55..ed9991100b 100644 --- a/benchmarking/dut/transformer.py +++ b/benchmarking/dut/transformer.py @@ -13,6 +13,7 @@ QuantReLU ) import os +from qonnx.core.modelwrapper import ModelWrapper # Progressbar from tqdm import trange import numpy as np @@ -26,34 +27,37 @@ import finn.builder.build_dataflow_config as build_cfg from finn.builder.build_dataflow_config import AutoFIFOSizingMethod from bench_base import bench, step_synth_harness +from finn.util.basic import alveo_part_map + +# Range information structure for seeding the range analysis for converting +# quantized activations to MultiThreshold +from qonnx.util.range_analysis import RangeInfo # Custom build steps required to streamline and convert the attention operator from 
dut.transformer_custom_steps import ( - step_tidy_up_pre_attention, - step_tidy_up_post_attention, - step_streamline_attention, - step_streamline_residual, - step_streamline_norms, - step_streamline_positional, + prepare_graph, + step_streamline, step_convert_attention_to_hw, step_convert_elementwise_binary_to_hw, step_convert_lookup_to_hw, + step_convert_split_concat_to_hw, + step_convert_depth_wise_to_hw, step_replicate_streams, set_target_parallelization, set_fifo_depths, step_apply_folding_config, - node_by_node_rtlsim, - node_by_node_cppsim -) -from performance.platform_build_steps import( - test_step_gen_vitis_xo, - test_step_gen_instrumentation_wrapper, - test_step_gen_instrwrap_sim, - test_step_insert_tlastmarker, - test_step_export_xo, - test_step_build_platform, - test_step_run_instrwrap_sim + node_by_node_rtlsim, # noqa: Maybe unused, only for debugging + node_by_node_cppsim, ) +# from performance.platform_build_steps import( +# test_step_gen_vitis_xo, +# test_step_gen_instrumentation_wrapper, +# test_step_gen_instrwrap_sim, +# test_step_insert_tlastmarker, +# test_step_export_xo, +# test_step_build_platform, +# test_step_run_instrwrap_sim +# ) ### ADAPTED FROM utils.py # Seeds all relevant random number generators to the same seed for @@ -791,6 +795,9 @@ def patch_non_affine_norms(model: torch.nn.Module): # noqa: Shadows model class bench_transformer(bench): def step_export_onnx(self, output_onnx_path): + # Generates a dummy transformer block, + # not used for actual models (RadioML, GPT, etc.) 
+ # Load the parameters file #params = dvc.api.params_show("params.yaml") # Seed all RNGs @@ -841,9 +848,10 @@ def step_export_onnx(self, output_onnx_path): # Compute attention output o = model(x) # Save the input and output data for verification purposes later - # TODO: go via self.build_inputs["input_npy_path"] np.save("inp.npy", x.detach().numpy()) np.save("out.npy", o.detach().numpy()) + self.build_inputs["input_npy_path"] = "inp.npy" + self.build_inputs["output_npy_path"] = "out.npy" # Export the model graph to QONNX #export_qonnx(model, (x,), "attention.onnx", **self.params["export"]) export_qonnx(model, (x,), output_onnx_path, @@ -856,8 +864,23 @@ def step_build(self): # Seed all RNGs seed(self.params["seed"]) # Extract sequence length and embedding dimension from parameters - seq_len, emb_dim = self.params["model_seq_len"], self.params["model_emb_dim"] - + if "model_seq_len" in self.params and "model_emb_dim" in self.params: + # for dummy Transformer DUT + seq_len, emb_dim = self.params["model_seq_len"], self.params["model_emb_dim"] + else: + # for real input models + _, seq_len, emb_dim = np.load(self.build_inputs["input_npy_path"]).shape + # TODO: use the following to get dimensions for GPT models? + #model = ModelWrapper(self.build_inputs["onnx_path"]) + #_, emb_dim, seq_len = model.get_tensor_shape("/emb_add/input_quant/export_handler/Quant_output_0") + + # Read the input value range information for the dataset from the parameters + # Note: Consider calibrating this on the fly from the dataset + range = [ -100, +100 ] # params["build"]["range"] # TODO: make configurable? 
+ input_range = tuple(np.array([range]).T) + # Construct the seed range information of the input tensor + range_info = RangeInfo(shape=(1, seq_len, emb_dim), range=input_range) + # Prepare config files # TODO: make configurable # TODO: log intermediate files such as inp.npy, folding.yaml, or specialize_layers.jon as artifacts, maybe create in unique temp dirs @@ -874,16 +897,21 @@ def step_build(self): with open("folding.yaml", "w") as f: f.write(template_folding_yaml) + if self.board in alveo_part_map: + shell_flow = "vitis_alveo" + else: + shell_flow = "vivado_zynq" + # Create a configuration for building the scaled dot-product attention # operator to a hardware accelerator cfg = build_cfg.DataflowBuildConfig( # Unpack the build configuration parameters - #**params["build"], + #**params["build"]["finn"], output_dir = self.build_inputs["build_dir"], - stitched_ip_gen_dcp = True, + stitched_ip_gen_dcp = False, # only needed for further manual integration synth_clk_period_ns = self.clock_period_ns, board = self.board, - shell_flow_type = "vivado_zynq", #TODO: Alveo support + shell_flow_type = shell_flow, folding_config_file = "folding.yaml", specialize_layers_config_file = "specialize_layers.json", standalone_thresholds = True, @@ -915,11 +943,14 @@ def step_build(self): build_cfg.VerificationStepType.TIDY_UP_PYTHON, # Verify the model after generating C++ HLS and applying folding build_cfg.VerificationStepType.FOLDED_HLS_CPPSIM, + # No RTL Simulation support for now ], # File with test inputs for verification - verify_input_npy="inp.npy", + verify_input_npy=self.build_inputs["input_npy_path"], # File with expected test outputs for verification - verify_expected_output_npy="out.npy", + verify_expected_output_npy=self.build_inputs["output_npy_path"], + # Output full context dump for verification steps + verify_save_full_context=True, # Save the intermediate model graphs save_intermediate_models=True, # Avoid RTL simulation for setting the FIFO sizes @@ -929,39 
+960,27 @@ def step_build(self): auto_fifo_depths=False, # Build steps to execute steps=[ - # Need to apply some tidy-up transformations before converting to - # the finn dialect of onnx - step_tidy_up_pre_attention, - # Convert all QONNX Quant nodes to Multithreshold nodes - "step_qonnx_to_finn", - # Tidy up the graph after converting from QONNX to FINN format - # Note: Triggers a verification step - "step_tidy_up", - # Positional encoding needs to be streamlined first with slightly - # different order of certain streamlining transformations to avoid - # weird rounding issue of intermediate results - step_streamline_positional, - # Custom streamlining for models containing attention operators - step_streamline_attention, - # Streamlining of the residual branches - step_streamline_residual, - # Streamline the normalization layers, i.e., transposed batch norm - step_streamline_norms, - # Another round using the default streamlining steps - # Note: Triggers a verification step - "step_streamline", - # New conversion of the scaled dot-product attention pattern + # Prepares the QONNX graph to be consumed by FINN: Cleanup, lowering + # and Quant to MultiThreshold conversion + prepare_graph(range_info=range_info), + # Unified exhaustive streamlining of complex model topologies + # including attention, residuals and splits + step_streamline, + # conversion of the scaled dot-product attention pattern to + # hardware, including cleanup and data layout squeezing step_convert_attention_to_hw, - # Another tidy-up step to remove unnecessary dimensions and - # operations after converting the attention operators to HLS - step_tidy_up_post_attention, # Convert the elementwise binary operations to hardware operators. 
# These include for example adding residual branches and positional # encoding step_convert_elementwise_binary_to_hw, - # Convert the Gather layer realizing the input token embedding to - # the FINN hardware implementation, i.e., the Lookup layer + # Convert Lookup layers, e.g., token embedding, to hardware custom + # operators step_convert_lookup_to_hw, + # Convert Split and Concat operators to hardware, e.g., splits + # contained in the GLU activation + step_convert_split_concat_to_hw, + # Convert depth-wise convolution MatMuls to VVUs + step_convert_depth_wise_to_hw, # Properly replicate the stream feeding the query, key and value # projections step_replicate_streams, @@ -997,7 +1016,7 @@ def step_build(self): # StreamingFIFOs are used # node_by_node_rtlsim, - test_step_insert_tlastmarker, # required for instrumentation_wrapper + #test_step_insert_tlastmarker, # required for instrumentation_wrapper "step_create_stitched_ip", diff --git a/benchmarking/dut/transformer_custom_steps.py b/benchmarking/dut/transformer_custom_steps.py index d28a4c501a..e122f79a0d 100644 --- a/benchmarking/dut/transformer_custom_steps.py +++ b/benchmarking/dut/transformer_custom_steps.py @@ -1,81 +1,88 @@ -# ADAPTED FROM Christoph's attention-dummy build_steps.py - +# ADAPTED FROM Christoph's radioml-transformer repository, specifically these files: +# build_steps.py +# custom/apply_config.py +# custom/composed_transformation.py +# custom/streamline.py + +# Python warning messages +import warnings +# Copies of python objects +from copy import deepcopy # Copies (deep-copies) python objects import copy # Numpy for loading and comparing the verification input/output import numpy as np # YAML for loading experiment configurations import yaml + # QONNX wrapper of ONNX model graphs from qonnx.core.modelwrapper import ModelWrapper -# QONNX quantization data types -from qonnx.core.datatype import DataType -# Converts ONNX graph nodes to QONNX custom-ops if possible -from 
qonnx.custom_op.registry import getCustomOp +# Range information structure for seeding the range analysis for converting +# quantized activations to MultiThreshold +from qonnx.util.range_analysis import RangeInfo + # QONNX graph transformations for renaming and cleaning up from qonnx.transformation.general import ( Transformation, GiveUniqueNodeNames, GiveReadableTensorNames, - RemoveUnusedTensors, - RemoveStaticGraphInputs, GiveUniqueParameterTensors, - ConvertDivToMul, - ConvertSubToAdd + RemoveStaticGraphInputs, + RemoveUnusedTensors, ) -# Converts BatchNorm operation to affine transformation -from qonnx.transformation.batchnorm_to_affine import BatchNormToAffine -# QONNX graph transformations for inferring datatypes and shapes +# QONNX graph transformations for annotating the graph with datatype and shape +# information from qonnx.transformation.infer_datatypes import InferDataTypes from qonnx.transformation.infer_shapes import InferShapes -from qonnx.transformation.infer_data_layouts import InferDataLayouts -# QONNX cleanup transformations -from qonnx.transformation.remove import RemoveIdentityOps -# Precompute constant output nodes + +# If we have a convolution with a bias tensors input, QONNX and later FINN +# expect the bias to be expressed as a standalone Add node following the Conv +# node. 
+from qonnx.transformation.extract_conv_bias import ExtractBiasFromConv +# Converts BatchNorm operation to affine transformation +from qonnx.transformation.batchnorm_to_affine import BatchNormToAffine +# Converts Gemm operation to MatMul with extracted standalone bias op +from qonnx.transformation.gemm_to_matmul import GemmToMatMul +# Converts Conv to Im2Col and MatMul with extracted standalone bias op +from qonnx.transformation.lower_convs_to_matmul import LowerConvsToMatMul +# Transposes the initializer tensors of a Quant node instead of having a +# standalone Transpose following +from qonnx.transformation.quant_constant_folding import ( + FoldTransposeIntoQuantInit +) +# Collapses chains of constants into a single constant operation or even +# initializer tensors. from qonnx.transformation.fold_constants import FoldConstants -# Streamlining transformation: This is a collection of various transformations -from finn.transformation.streamline import ( - ConvertSignToThres, RoundAndClipThresholds +# Folds quantizers into weight tensor initializers, needed for lowering +# convolutions to MatMuls +from finn.transformation.qonnx.fold_quant_weights import FoldQuantWeights +# FINN streamlining transformations reordering the graph +from finn.transformation.streamline.reorder import ( + MoveTransposePastFork, + MoveTransposePastEltwise, + MoveTransposePastJoinMul, + MoveTransposePastJoinAdd, + MoveTransposePastSplit, + MoveTransposePastJoinConcat, + MoveSqueezePastMultiThreshold, + MoveSqueezePastMatMul ) -# Fuse/Absorb operations +# FINN streamlining transformations absorbing tensors/nodes into others from finn.transformation.streamline.absorb import ( AbsorbAddIntoMultiThreshold, AbsorbSignBiasIntoMultiThreshold, - FactorOutMulSignMagnitude, - AbsorbMulIntoMultiThreshold, - Absorb1BitMulIntoMatMul, - Absorb1BitMulIntoConv -) -# Reorder operations -from finn.transformation.streamline.reorder import ( - MoveMulPastFork, - MoveLinearPastFork, - MoveTransposePastFork, - 
MoveLinearPastEltwiseAdd, - MoveScalarLinearPastInvariants, - MoveTransposePastEltwise, - MoveMulPastMaxPool, - MoveAddPastMul, - MoveScalarAddPastMatMul, - MoveAddPastConv, - MoveScalarMulPastMatMul, - MoveScalarMulPastConv, ) -# Collapse consecutive operations of the same type +# FINN streamlining transformations fusing/collapsing operations of the same +# kind from finn.transformation.streamline.collapse_repeated import ( - CollapseRepeatedMul, - CollapseRepeatedTranspose, - CollapseRepeatedAdd + CollapseRepeatedTranspose ) -# FINN transformation converting ONNX nodes to hardware custom operators -from finn.transformation.fpgadataflow.convert_to_hw_layers import ( - InferElementwiseBinaryOperation, - InferLookupLayer -) -# Remove some operations without real effect +# FINN streamlining transformations removing nodes without real effect from the +# graph from finn.transformation.streamline.remove import ( RemoveIdentityTranspose, - RemoveIdentityReshape + RemoveIdentityReshape, + RemoveIdentityOps ) # Cleanup transformation getting rid of 3d data layout from finn.transformation.squeeze import Squeeze @@ -87,14 +94,33 @@ # Mult-Head Attention support from finn.transformation.fpgadataflow.attention_heads import ( InferMultiHeads, - MoveSplitMultiHeadsPastMultiThreshold, UnrollMultiHeadAttention, + MoveSplitMultiHeadsPastMultiThreshold, MoveMergeMultiHeadsPastMultiThreshold ) -# Stream replication for outputs with multiple consumers +# Converts (infers) ONNX and QONNX nodes to FINN hardware CustomOps +from finn.transformation.fpgadataflow.convert_to_hw_layers import ( + InferSqueeze, + InferUnsqueeze, + InferElementwiseBinaryOperation, + InferSplitLayer, + InferConcatLayer, + InferLookupLayer, + InferVectorVectorActivation +) +# Converts fork-nodes to ReplicateStream hardware operator from finn.transformation.fpgadataflow.replicate_stream import ( InferReplicateStream ) +# Standard QONNX to FINN conversion function +from 
finn.transformation.qonnx.convert_qonnx_to_finn import ConvertQONNXtoFINN +from finn.transformation.qonnx.quant_act_to_multithreshold import ( + default_filter_function_generator, +) +# QONNX quantization data types +from qonnx.core.datatype import DataType +# Converts ONNX graph nodes to QONNX custom-ops if possible +from qonnx.custom_op.registry import getCustomOp # Inserts data-width converter and FIFO nodes into the model graph from finn.transformation.fpgadataflow.insert_dwc import InsertDWC from finn.transformation.fpgadataflow.insert_fifo import InsertFIFO @@ -125,6 +151,78 @@ # Execute onnx model graphs from the dataflow parent for verification from finn.util.test import execute_parent +# Base class for all QONNX graph transformations and some basic cleanup +# transformations +from qonnx.transformation.general import ( + Transformation, + ConvertDivToMul, + ConvertSubToAdd, +) + +# QONNX graph transformations for annotating the graph with datatype and shape +# information +from qonnx.transformation.infer_datatypes import InferDataTypes +from qonnx.transformation.infer_shapes import InferShapes +# Converts BatchNorm operation to affine transformation +from qonnx.transformation.batchnorm_to_affine import BatchNormToAffine + +# Groups node inputs by dynamic vs. 
initializer category +from finn.transformation.streamline.absorb import group_inputs_by_category + +# FINN streamlining transformations converting and rounding values +from finn.transformation.streamline import ( + ConvertSignToThres, + RoundAndClipThresholds +) +# FINN streamlining transformations reordering the graph +from finn.transformation.streamline.reorder import ( + MoveMulPastFork, + MoveTransposePastFork, + MoveLinearPastEltwiseAdd, + MoveScalarLinearPastInvariants, + MoveTransposePastEltwise, + MoveMulPastMaxPool, + MoveAddPastMul, + MoveScalarAddPastMatMul, + MoveAddPastConv, + MoveScalarMulPastMatMul, + MoveScalarMulPastConv, + MoveTransposePastJoinMul, + MoveTransposePastJoinAdd, + MoveMulPastJoinAdd, + MoveAddPastJoinAdd, + MoveScalarLinearPastSplit, + MoveAffinePastJoinConcat, + MoveMulPastJoinConcat, + MoveAddPastJoinConcat, + MoveTransposePastSplit, + MoveTransposePastJoinConcat, + MoveSqueezePastMultiThreshold, + is_scalar +) +# FINN streamlining transformations absorbing tensors/nodes into others +from finn.transformation.streamline.absorb import ( + AbsorbAddIntoMultiThreshold, + AbsorbSignBiasIntoMultiThreshold, + FactorOutMulSignMagnitude, + AbsorbMulIntoMultiThreshold, + Absorb1BitMulIntoMatMul, + Absorb1BitMulIntoConv, + AbsorbTransposeIntoMultiThreshold +) +# FINN streamlining transformations fusing/collapsing operations of the same +# kind +from finn.transformation.streamline.collapse_repeated import ( + CollapseRepeatedMul, + CollapseRepeatedTranspose, + CollapseRepeatedAdd +) +# FINN streamlining transformations removing nodes without real effect from the +# graph +from finn.transformation.streamline.remove import ( + RemoveIdentityTranspose, + RemoveIdentityReshape +) # Composes graph transformations such that each individual transformation as # well as the whole sequence is applied exhaustively @@ -164,201 +262,627 @@ def apply(self, model: ModelWrapper): # noqa model = model.transform(RemoveIdentityOps()) model = 
model.transform(GiveUniqueNodeNames()) model = model.transform(GiveReadableTensorNames()) + model = model.transform(InferShapes()) model = model.transform(InferDataTypes()) # Return the transformed model and indicate whether the graph actually # has been transformed by at least one transformation so the whole # sequence of transformations will be reapplied return model, graph_modified +# # Custom conversion from Quant to MultiThreshold +# TODO: Enable once fixed... +# from custom.quant_activation_to_multithreshold import ( +# QuantActivationToMultiThreshold +# ) -# Custom Streamlining transformation: Similar to the built-in transformations -# but exhaustively reapplied until none of the transformations can be applied -# anymore. -def Streamline(): # noqa: Uppercase - return ComposedTransformation([ - ConvertSubToAdd(), - ConvertDivToMul(), - BatchNormToAffine(), - ConvertSignToThres(), - MoveMulPastMaxPool(), - AbsorbSignBiasIntoMultiThreshold(), - MoveScalarLinearPastInvariants(), - MoveAddPastMul(), - MoveScalarAddPastMatMul(), - MoveAddPastConv(), - MoveScalarMulPastMatMul(), - MoveScalarMulPastConv(), - MoveAddPastMul(), - CollapseRepeatedAdd(), - CollapseRepeatedMul(), - MoveMulPastMaxPool(), - AbsorbAddIntoMultiThreshold(), - FactorOutMulSignMagnitude(), - AbsorbMulIntoMultiThreshold(), - Absorb1BitMulIntoMatMul(), - Absorb1BitMulIntoConv(), - RoundAndClipThresholds(), - ]) - - -# Function running transformations necessary to clean up models containing -# attention operators -def step_tidy_up_pre_attention(model: ModelWrapper, _): - # Add shape and datatype annotations throughout all the graph - model = model.transform(InferDataTypes()) # noqa Duplicate - model = model.transform(InferShapes()) - - # Cleanup the graph by removing redundant, unnecessary and constant nodes - # and tensors and give unique names to everything remaining - model = model.transform(GiveUniqueNodeNames()) - model = model.transform(GiveReadableTensorNames()) - model = 
model.transform(RemoveStaticGraphInputs()) - model = model.transform(RemoveUnusedTensors()) - model = model.transform(GiveUniqueParameterTensors()) - model = model.transform(FoldConstants()) - - # Remove unnecessary shape and layout transformations - model = model.transform(RemoveIdentityReshape()) - model = model.transform(RemoveIdentityTranspose()) - # Insert tensor layout annotations for Quant to MultiThreshold transform - # to determine the correct output channel dimension - model = model.transform(InferDataLayouts()) - # Return the tidied up model - return model - - -# Variant of streamlining transformations adapted to attention operators -def step_streamline_attention(model: ModelWrapper, cfg: DataflowBuildConfig): - # Exhaustively apply the pattern of streamlining and moving past fork-nodes - model = model.transform(ComposedTransformation([ - # Apply the set of standard streamlining transformations from finn to - # the model - Streamline(), - # We need a custom streamlining step to enable streamlining through - # certain fork-nodes Note: This transform is part of finn, but not - # included in the standard streamlining transformations - MoveLinearPastFork(), - # Streamline again there should be more transformations enabled after - # moving some nodes past forks - Streamline(), - ])) - - # If configured, run a verification of the transformed model on some sample - # inputs - if (VerificationStepType.STREAMLINED_PYTHON in - cfg._resolve_verification_steps()): # noqa - verify_step( - model, cfg, "streamlined_attention_python", need_parent=False - ) - - # Return the streamlined model - return model - +# Moves scale factor, i.e., scalar Mul and Div, past Im2Col (and Col2Im): These +# cannot be handled by MoveScalarLinearPastInvariants as potential padding makes +# Add-Im2Col not commute to Im2Col-Add +class MoveScalesPastIm2Col(Transformation): + # Applies the transform to a whole model graph + def apply(self, model: ModelWrapper): # noqa + # Get the model graph 
out of the model wrapper object + graph = model.graph + # Keep track of whether the graph has been modified + graph_modified = False + # Iterate all nodes in the graph keeping track of the index + for index, node in enumerate(graph.node): + # Applies to Mul operation types + if node.op_type in {"Mul", "Div"}: + # Cannot handle fork- or join-multiplications + if model.is_fork_node(node) or model.is_join_node(node): + # Softly skip this node + continue + # Only handles one forking output for now + if len(node.output) > 1: + # Softly skip this node + continue + # The first input must be dynamically received from upstream + if model.get_initializer(node.input[0]) is not None: + # Softly skip this node + continue + # Test whether the node initializer is a scalar... + if not is_scalar(model.get_initializer(node.input[1])): + # Softly skip this node + continue + # As this is not a fork-node, there can be at most one successor + successor = model.find_direct_successors(node) + # If this is the final operation in the graph, there might be no + # successor + if successor is None: + # Softly skip this node + continue + # Now there is exactly one successor which needs to be extracted + # from the list + successor = successor[0] + # Handle both, Im2Col and the inverse Col2Im, as well as padding + if successor.op_type in {"Im2Col", "Col2Im", "Pad"}: + # Get names of all tensors involved in connecting the + # nodes + inp = node.input[0] # noqa: Duplicate + mid = node.output[0] + out = successor.output[0] + # Rewire the graph to feed original input into the + # Add node first + successor.input[0] = inp + # Repurpose the middle tensor for the output of the Add + successor.output[0] = mid + # The Mul operator now gets the middle tensor as its + # input + node.input[0] = mid + # Mul now produces the original output tensor + node.output[0] = out + # Delete the shape annotation of the connecting tensors + # to be re-done later + model.set_tensor_shape(mid, None) + 
model.set_tensor_shape(out, None) + # Track whether the graph has been modified, never + # resets to False + graph_modified = True + # Break the loop after deleting shape annotations to + # immediately re-do these before changing the next + # operator + break + # Redo datatype and shape annotations + model = model.transform(InferShapes()) + model = model.transform(InferDataTypes()) + # Return the transformed model and indicate whether the transformation + # needs to be applied again + return model, graph_modified -# Streamlining transformations to be applied to residual branches -def step_streamline_residual(model: ModelWrapper, cfg: DataflowBuildConfig): - # Exhaustively apply the pattern for streamlining residual branches. This - # ensures streamlining to work for arbitrary many consecutive residual - # blocks, where one "round" of these transformations is required per block. - model = model.transform(ComposedTransformation([ - # Streamline the residual connections by moving scale factors past - # elementwise add nodes - MoveLinearPastEltwiseAdd(), - MoveLinearPastFork(), - MoveScalarLinearPastInvariants(), - # Do the normal streamlining flow once again - Streamline(), - ])) +# Moves scalar linear elementwise operations past fork nodes, applies to Add, +# Mul, Sub, Div, etc. 
+class MoveScalarLinearPastFork(Transformation): + # Applies the transform to a whole model graph + def apply(self, model: ModelWrapper): # noqa + # Get the model graph out of the model wrapper object + graph = model.graph + # Keep track of whether the graph has been modified + graph_modified = False + # Iterate all nodes in the graph keeping track of the index + for index, node in enumerate(graph.node): + # Applies to Mul-like and Add-like operation types + if node.op_type in {"Add", "Sub", "Mul", "Div"}: + # Only handles non-joining forks for now + if not model.is_fork_node(node) or model.is_join_node(node): + # Softly skip this node + continue + # Only handles one forking output for now + if len(node.output) > 1: + # Softly skip this node + continue + # Test whether the node initializer is a scalar... + if not is_scalar(model.get_initializer(node.input[1])): + # Softly skip this node + continue + # We need to insert a replica of this operation in front of each + # consumer node + for consumer in model.find_direct_successors(node): + # Create an exact replica of this operator + copy = deepcopy(node) + # Insert a new unique tensor connecting the output of the + # copy to the consumer + copy.output[0] = model.make_new_valueinfo_name() + # The original node might be connecting to multiple inputs + # of the consumer... 
+ for idx, inp in enumerate(consumer.input): + # Find each instance of connection from original node + if inp == node.output[0]: + # Rewire to connect to the replica + consumer.input[idx] = copy.output[0] + # Insert the new replica node into the graph + graph.node.insert(index + 1, copy) + # Remove the original node from the graph + graph.node.remove(node) + # Redo datatype and shape annotations + model = model.transform(InferShapes()) + model = model.transform(InferDataTypes()) + # Return the transformed model and indicate whether the transformation + # needs to be applied again + return model, graph_modified - # If configured, run a verification of the transformed model on some sample - # inputs - if (VerificationStepType.STREAMLINED_PYTHON in - cfg._resolve_verification_steps()): # noqa - verify_step( - model, cfg, "streamlined_residual_python", need_parent=False - ) +# Moves constant elementwise multiplication past another joining multiplication +class MoveConstMulPastJoinMul(Transformation): + # Applies the transform to a whole model graph # noqa: Duplicate + def apply(self, model: ModelWrapper): # noqa + # Get the model graph out of the model wrapper object + graph = model.graph + # Keep track of whether the graph has been modified + graph_modified = False + # Iterate all nodes in the graph keeping track of the index + for index, node in enumerate(graph.node): + # Applies to Mul operation types + if node.op_type == "Mul": + # Currently does not handle fork- or join-nodes + if model.is_fork_node(node) or model.is_join_node(node): + # Softly skip this node + continue + # As this is not a fork-node, there can be at most one successor + successor = model.find_direct_successors(node) + # If Squeeze is the final operation in the graph, there might + # be no successor + if successor is None: + # Softly skip this node + continue + # Now there is exactly one successor which needs to be extracted + # from the list + successor = successor[0] + # Applies to 
Multiplications + if successor.op_type in {"Mul"}: + # Applies only if the second multiplication is a join-node + if model.is_join_node(successor): + # Get names of all tensors involved in connecting the + # nodes + inp = node.input[0] # noqa: Duplicate + mid = node.output[0] + out = successor.output[0] + # Need to match the correct input of the joining second + # multiplication + for i, name in enumerate(successor.input): + # If the successors input currently matches the + # intermediate tensors, this input needs to be + # rewired + if name == mid: + # Rewire the graph to feed original into the + # second Mul node first + successor.input[i] = inp + # Note: Do not break here as it is perfectly + # legal to connect the same tensor multiple + # times to different inputs + # Repurpose the middle tensor for the output of the + # second Mul + successor.output[0] = mid + # The first Mul operator now gets the middle tensor as + # its input + node.input[0] = mid + # The first Mul now produces the original output tensor + node.output[0] = out + # Delete the shape annotation of the connecting tensors + # to be re-done later + model.set_tensor_shape(mid, None) + model.set_tensor_shape(out, None) + # Track whether the graph has been modified, never + # resets to False + graph_modified = True + # Break the loop after deleting shape annotations to + # immediately re-do these before changing the next + # operator + break + # Redo datatype and shape annotations + model = model.transform(InferShapes()) + model = model.transform(InferDataTypes()) + # Return the transformed model and indicate whether the transformation + # needs to be applied again + return model, graph_modified + +# Moves elementwise additions past MatMul operations: Applicable if each +# operation has one initializer input +class MoveAddPastMatMul(Transformation): + # Applies the transform to a whole model graph # noqa: Duplicate + def apply(self, model: ModelWrapper): # noqa + # Get the model graph out of the 
model wrapper object + graph = model.graph + # Keep track of whether the graph has been modified + graph_modified = False + # Iterate all nodes in the graph keeping track of the index + for index, node in enumerate(graph.node): + # Applies to Add operations + if node.op_type == "Add": + # If the add is a join operation, we do not have a constant + # added to the input + if model.is_join_node(node): + # Skip transforming this + continue + # If the Add is a fork operation we should first distribute the + # Add into the branches + if model.is_fork_node(node): + # Issue a warning to make the use aware of this potential + # transformation if the fork is moved first + warnings.warn( + f"{self.__class__.__name__}:" + f" Skipping near match: {node.name} is a fork-node," + f" try MoveLinearPastFork first" + ) + # Skip transforming this node as moving this would lead + # to messed up or detached graph + continue + # Decompose the inputs into the dynamic and the constant + # initializer input + (x_name,), (c_name,) = group_inputs_by_category(node, model) + # Now check the successor node which must be a MatMul + consumer = model.find_direct_successors(node) + # If there is no consumer, this Add seems to be last node of the + # graph + if not consumer: + # Skip transforming this + continue + # There must be exactly one consumer now + consumer = consumer[0] + # This transformation only applies to Add in front of MatMul + if not consumer.op_type == "MatMul": + # Skip this if not MatMul + continue + # MatMul may not be a join operation to apply this + # transformation + if model.is_join_node(consumer): + # Skip transforming without warning (there is nothing we can + # do about this) + continue + # Decompose the inputs to the MatMul to get the weight tensor + # name (the other input is the output of the Add) + _, (w_name,) = group_inputs_by_category(consumer, model) + # Read the weights and the constant addition tensor + w = model.get_initializer(w_name) + c = 
model.get_initializer(c_name) + # Determine whether the weights are the left or right input to + # the MatMul + left = w_name == consumer.input[0] + # Apply the weights to the constant tensor + c = np.matmul(w, c) if left else np.matmul(c, w) + # Insert the transformed tensor back into the mode as an + # initializer + model.set_initializer(c_name, c) + # The connecting tensors of this pattern + inp = x_name + mid = node.output[0] + out = consumer.output[0] + # Rewire the graph pattern connecting the input to the MatMul + # and the MatMul output to the Add node + consumer.input[1 if left else 0] = inp + # The Add now produces the original MatMul output + node.output[0] = out + # The middel tensor connects to the Add input + node.input[0 if node.input[0] == x_name else 1] = mid + # The MatMul feeds the middle tensors + consumer.output[0] = mid + # Delete the shape annotation of the connecting tensors + # to be re-done later + model.set_tensor_shape(mid, None) + model.set_tensor_shape(out, None) + # Delete the type annotations of the connecting tensors + # to be re-done later + # model.set_tensor_datatype(mid, None) + # model.set_tensor_datatype(out, None) + # Track whether the graph has been modified, never + # resets to False + graph_modified = True + # Break the loop after deleting shape annotations to + # immediately re-do these before changing the next + # operator + break + # Redo datatype and shape annotations + model = model.transform(InferShapes()) + model = model.transform(InferDataTypes()) + # Return the transformed model and indicate whether the transformation + # needs to be applied again + return model, graph_modified - # Return the streamlined model - return model +# Moves elementwise multiplication past elementwise addition if one input to +# each of the operators is a known constant +# Note: Reverse of MoveAddPastMul +class MoveMulPastAdd(Transformation): + # Applies the transform to a whole model graph + def apply(self, model: ModelWrapper): # noqa + 
# Get the model graph out of the model wrapper object + graph = model.graph + # Keep track of whether the graph has been modified + graph_modified = False + # Iterate all nodes in the graph keeping track of the index + for index, node in enumerate(graph.node): + # Applies to Mul operation types + if node.op_type == "Mul": + # Currently does not handle fork- or join-nodes + if model.is_fork_node(node) or model.is_join_node(node): + # Softly skip this node + continue + # As this is not a fork-node, there can be at most one successor + successor = model.find_direct_successors(node) + # If Squeeze is the final operation in the graph, there might + # be no successor + if successor is None: + # Softly skip this node + continue + # Now there is exactly one successor which needs to be extracted + # from the list + successor = successor[0] + # Applies to additions + if successor.op_type in {"Add"}: + # The addition may not join as we need to know the second + # input + if not model.is_join_node(successor): + # Get the constant initializer tensors for both + # operations: y = s * x + b + _, s_name = group_inputs_by_category(node, model) + _, b_name = group_inputs_by_category(successor, model) + # Skip if either node has no constant initializer + if not s_name or not b_name: + # Skip without warning ok? 
+ continue + # There must be exactly one constant per operations + assert len(s_name) == 1, \ + f"To many constant inputs for {node}" + assert len(b_name) == 1, \ + f"To many constant inputs for {successor}" + # Now read the initializer tensors + s = model.get_initializer(*s_name) + b = model.get_initializer(*b_name) + # Update the addition initializer according to the + # distributive law + model.set_initializer(*b_name, b / s) + # Get names of all tensors involved in connecting the + # nodes + inp = node.input[0] # noqa: Duplicate + mid = node.output[0] + out = successor.output[0] + # Rewire the graph to feed original input into the + # Add node first + successor.input[0] = inp + # Repurpose the middle tensor for the output of the Add + successor.output[0] = mid + # The Mul operator now gets the middle tensor as its + # input + node.input[0] = mid + # Mul now produces the original output tensor + node.output[0] = out + # Delete the shape annotation of the connecting tensors + # to be re-done later + model.set_tensor_shape(mid, None) + model.set_tensor_shape(out, None) + # Track whether the graph has been modified, never + # resets to False + graph_modified = True + # Break the loop after deleting shape annotations to + # immediately re-do these before changing the next + # operator + break + # Redo datatype and shape annotations + model = model.transform(InferShapes()) + model = model.transform(InferDataTypes()) + # Return the transformed model and indicate whether the transformation + # needs to be applied again + return model, graph_modified -# Streamlining transformation to be applied to the normalization layers -def step_streamline_norms(model: ModelWrapper, cfg: DataflowBuildConfig): - # Exhaustively apply the pattern for streamlining norms. This ensures - # streamlining to work for arbitrary many consecutive blocks, where one - # round of these transformations is required per block. 
- model = model.transform(ComposedTransformation([ - # Streamline transposed batch normalization (move transposes past the - # scale-bias operator, so they can be collapsed afterward) - MoveTransposePastEltwise(), - # There should now be transposes next to each other which can be - # collapsed - CollapseRepeatedTranspose(), - # The transposes around the batch normalization should be collapsed by - # now and cancel each other out - RemoveIdentityTranspose(), - # Nested, exhaustive compositions of transformations +# Define a set of custom streamlining transformations: These are applied once +# during the actual streamlining step and once after converting attention to +# hardware (the associated cleanup afterward might enable some Streamlining +# transformations once again) +def Streamline(): # noqa: Uppercase + # Return a set of exhaustively applies transformations + return ComposedTransformation([ + # On skip-connections: prefer pushing scalar multiplication forward + # before MoveAddPastMul + MoveMulPastFork(), + # The "standard" set of FINN streamlining transformations or at least + # inspired by them but applied exhaustively until none of them changes + # the graph anymore. 
+ # Note: Covers most parts of non-branching linear topologies + ComposedTransformation([ + ConvertSubToAdd(), + ConvertDivToMul(), + BatchNormToAffine(), + ConvertSignToThres(), + MoveMulPastMaxPool(), + AbsorbSignBiasIntoMultiThreshold(), + MoveScalarLinearPastInvariants(), + MoveAddPastMul(), + MoveScalarAddPastMatMul(), + MoveAddPastConv(), + MoveScalarMulPastMatMul(), + MoveScalarMulPastConv(), + MoveAddPastMul(), + CollapseRepeatedAdd(), + CollapseRepeatedMul(), + MoveMulPastMaxPool(), + AbsorbAddIntoMultiThreshold(), + FactorOutMulSignMagnitude(), + AbsorbMulIntoMultiThreshold(), + Absorb1BitMulIntoMatMul(), + Absorb1BitMulIntoConv(), + ]), + # Streamlining scales and biases forward through residual topologies + # Note: This mostly covers forking and joining operations ComposedTransformation([ - # We now might have transpose operations accumulating in front of - # fork nodes + # Note: This is probably the most common way of joining skip + # connections, i.e., this corresponds to the original residual + # addition, i.e., y = f(x) + x + MoveLinearPastEltwiseAdd(), + MoveScalarLinearPastFork(), + MoveScalarLinearPastInvariants(), + MoveMulPastFork(), + MoveMulPastJoinAdd(), + MoveAddPastJoinAdd(), + # Note: This brings constant Muls (i.e., quantizer scales to be + # removed) forward through joining Muls (i.e., those ending up + # as actual hardware operators). 
+ MoveConstMulPastJoinMul() + ]), + # Streamlining scales and biases forward through shape/layout changing + # operations, i.e., mostly transposes + ComposedTransformation([ + # Convolution inputs and padding + MoveScalesPastIm2Col(), + # Streamlining for Split and Concat operations + MoveScalarLinearPastSplit(), + MoveAffinePastJoinConcat(), + MoveMulPastJoinConcat(), + MoveAddPastJoinConcat(), + # Move transposes around to some place where they could be removed + # later, i.e., where they collapse into identities MoveTransposePastFork(), + MoveTransposePastSplit(), + MoveTransposePastJoinConcat(), MoveTransposePastEltwise(), + MoveTransposePastJoinMul(), + MoveTransposePastJoinAdd(), CollapseRepeatedTranspose(), + # Remove identity shape/layout transformations RemoveIdentityTranspose(), + RemoveIdentityReshape(), + # Squeeze operators can be moved past the thresholding + MoveSqueezePastMultiThreshold(), + # A certain type of 4d-layout transpose can be absorbed (actually + # moved past) MultiThreshold operations + AbsorbTransposeIntoMultiThreshold(), ]), - # This might have caused the normalization scale and bias to accumulate - # in front of transpose or fork node - MoveLinearPastEltwiseAdd(), - MoveLinearPastFork(), - MoveScalarLinearPastInvariants(), - # This might have enabled more streamlining transformations - Streamline(), - # We need a custom streamlining step to enable streamlining through - # certain fork-nodes Note: This transform is part of finn, but not - # included in the standard streamlining transformations - MoveLinearPastFork(), - # This might have enabled more streamlining transformations - Streamline(), - ])) - - # If configured, run a verification of the transformed model on some sample - # inputs - if (VerificationStepType.STREAMLINED_PYTHON in - cfg._resolve_verification_steps()): # noqa - verify_step(model, cfg, "streamlined_norms_python", need_parent=False) - - # Return the streamlined model - return model + # Only round and clip after all 
streamlining transformations have + # been applied exhaustively. + # Note: Might still enable another round of streamlining. + RoundAndClipThresholds(), + ]) -# Streamlining transformation to be applied to the positional encoding layer -def step_streamline_positional(model: ModelWrapper, cfg: DataflowBuildConfig): - # There is probably a division in front of the quantized positional - # encoding, which is exactly the inverse of the multiplication in front of - # that: The are the matching scale factors of the shared input quantizer of - # input and positional encoding. Convert the division to multiplication, so - # these two can be merged. - model = model.transform(ConvertDivToMul()) - # Merge the quantization scales of shared input quantizers - model = model.transform(CollapseRepeatedMul()) - # Push scalar multiplications, probably scale factors of quantizers, into - # the branches of a fork - model = model.transform(MoveMulPastFork()) +# Prepares the graph to be consumed by FINN: +# 1. Some graph cleanup removing unused tensors, nodes without effect and +# folding constants, i.e., collapsing chains of operations on constant tensors +# 2. Lowers some "more complex" operations: converts Conv and Gemm to MatMul and +# BatchNorm to Mul and Add operations followed by some necessary cleanup +# 3. 
Converts all QONNX Quant nodes to MultiThreshold operations which can +# absorb scales and biases during streamlining +def prepare_graph(range_info: RangeInfo): + # Wrap the actual transformation/build step function + def step_prepare_graph(model: ModelWrapper, cfg: DataflowBuildConfig): + # Exhaustively apply the set of cleanup transformations + model = model.transform(ComposedTransformation([ + # Adds shape and datatype annotations to all tensors in this graph + InferDataTypes(), + InferShapes(), + # Cleanup the graph by removing redundant, unnecessary and constant + # nodes and tensors and give unique names to everything remaining + GiveUniqueNodeNames(), + GiveReadableTensorNames(), + RemoveStaticGraphInputs(), + RemoveUnusedTensors(), + GiveUniqueParameterTensors(), + FoldConstants(), + # Remove unnecessary shape and layout transformations + RemoveIdentityReshape(), + RemoveIdentityTranspose(), + # Redo shape and datatype annotations after removing nodes and + # tensors + InferShapes(), + InferDataTypes(), + ])) + # If configured, run a verification of the transformed model on some + # sample inputs + if (VerificationStepType.TIDY_UP_PYTHON in + cfg._resolve_verification_steps()): # noqa + verify_step( + model, cfg, "tidied_up_python", need_parent=False + ) + # Exhaustively apply the lowering transformations + model = model.transform(ComposedTransformation([ + # Moves the bias input to the Conv operator as a separate Add node + # behind the Conv node + ExtractBiasFromConv(), + # Converts Gemm nodes to MatMul (+ bias) + GemmToMatMul(), + # Need to do some constant and weight folding first + FoldConstants(), + FoldTransposeIntoQuantInit(), + FoldQuantWeights(), + # Annotate the graph with shape and data type information + InferShapes(), + InferDataTypes(), + # Converts Conv layers to MatMul + LowerConvsToMatMul(), + # Converts BatchNorm to affine scale and bias + BatchNormToAffine(), + # Annotate the graph with shape and data type information + InferShapes(), + 
InferDataTypes(), + ])) + # If configured, run a verification of the transformed model on some + # sample inputs + if (VerificationStepType.QONNX_TO_FINN_PYTHON in + cfg._resolve_verification_steps()): # noqa + verify_step( + model, cfg, "lowered_python", need_parent=False + ) + # Apply the quantizer to MultiThreshold conversion + # Note: This is exhaustive as well as single .transform reapplies as + # long as possible. + # TODO: Enable once fixed... + # model = model.transform(QuantActivationToMultiThreshold(range_info)) + # If configured, run a verification of the transformed model on some + # sample inputs + if (VerificationStepType.QONNX_TO_FINN_PYTHON in + cfg._resolve_verification_steps()): # noqa + verify_step( + model, cfg, "quant_to_thresholds_ra_python", need_parent=False + ) + # Apply the standard QONNX to FINN conversion step to convert the + # remaining quantizers not yet covered by the new range analysis based + # method + model = model.transform(ConvertQONNXtoFINN( + filter_function=default_filter_function_generator( + max_multithreshold_bit_width=cfg.max_multithreshold_bit_width + ) + )) + # If configured, run a verification of the transformed model on some + # sample inputs + if (VerificationStepType.QONNX_TO_FINN_PYTHON in + cfg._resolve_verification_steps()): # noqa + verify_step( + model, cfg, "prepared_graph_python", need_parent=False + ) + # Return the transformed model + return model - # If configured, run a verification of the transformed model on some sample - # inputs + # Return the wrapped transformation step function + return step_prepare_graph + + +# Applies the custom set of exhaustive streamlining transformations, also taking +# special topology like attention, residuals, splits and transposes into account +def step_streamline(model: ModelWrapper, cfg: DataflowBuildConfig): + # These should not be applied exhaustively with the other streamlining + # transformations to not end up in cycles. 
+ # Note: This is essential to allow some Add operations to be + # absorbed by the next round's AbsorbSignBiasIntoMultiThreshold + model = model.transform(MoveMulPastAdd()) + model = model.transform(AbsorbSignBiasIntoMultiThreshold()) + # Exhaustively apply the following set of transformations to streamline the + # graph with the overall goal of collecting scales and biases in front of + # MultiThreshold operations or, alternatively, at the end of the graph. + # Note: Contains some sets of nested exhaustive transformations meant for + # particular architectural patterns, e.g., residual topologies. + model = model.transform(Streamline()) + # If configured, run a verification of the transformed model on some + # sample inputs if (VerificationStepType.STREAMLINED_PYTHON in cfg._resolve_verification_steps()): # noqa verify_step( - model, cfg, "streamlined_positional_python", need_parent=False + model, cfg, "streamlined_python", need_parent=False ) - - # Return the streamlined model + # Return the transformed model return model -# Function running the InferScaledDotProductAttention transformation -def step_convert_attention_to_hw(model: ModelWrapper, _): +# Converts scaled dot-product attention operations to FINN hardware operations +# Note: This includes some necessary cleanup after converting the pattern, in +# particular squeezing the data layouts throughout the graph +def step_convert_attention_to_hw(model: ModelWrapper, _: DataflowBuildConfig): # Try to infer reshaping of attention heads model = model.transform(InferMultiHeads()) # noqa: Duplicate # Try to mode the mult-head splitting past the multi thresholds @@ -374,6 +898,40 @@ def step_convert_attention_to_hw(model: ModelWrapper, _): model = model.transform(MoveMergeMultiHeadsPastMultiThreshold()) # If applicable, absorb the final thresholds into the attention operator model = model.transform(AbsorbMultiThresholdIntoScaledDotProductAttention()) + # Squeeze (i.e., remove dimensions of size 1) the data layouts 
throughout + # the graph to treat the time dimension as the batch dimension for all MVU + # and Threshold operators + model = model.transform(Squeeze()) + # Squeezing might have turned further transpose and reshape operations into + # identities (those which just swapped around the dimensions of size 1) + model = model.transform(ComposedTransformation([ + # Move transposes around to some place where they could be removed + # later, i.e., where they collapse into identities + MoveTransposePastFork(), + MoveTransposePastSplit(), + MoveTransposePastJoinConcat(), + MoveTransposePastEltwise(), + MoveTransposePastJoinMul(), + MoveTransposePastJoinAdd(), + CollapseRepeatedTranspose(), + # Remove identity shape/layout transformations + RemoveIdentityTranspose(), + RemoveIdentityReshape(), + # Squeeze operators can be moved past MatMuls and thresholding + MoveSqueezePastMatMul(), + MoveSqueezePastMultiThreshold(), + ])) + # Squeezing might enable absorbing adds into thresholds once again + model = model.transform(AbsorbAddIntoMultiThreshold()) + # If applicable, absorb the final thresholds into the attention operator + # Note: Might be applicable again after squeezing a transpose away + model = model.transform(AbsorbMultiThresholdIntoScaledDotProductAttention()) + # We should do another round of streamlining to be sure and support more + # general architectural patterns, we are not aware of yet... 
+ model = model.transform(Streamline()) + # Convert Squeeze and Unsqueeze operators to hardware operations + model = model.transform(InferSqueeze()) + model = model.transform(InferUnsqueeze()) # Return the model with attention and multi-heads mapped to hardware # operators return model @@ -389,6 +947,11 @@ def step_convert_elementwise_binary_to_hw(model: ModelWrapper, _): )) +# Converts Split and Concat operations to hardware custom operators +def step_convert_split_concat_to_hw(model: ModelWrapper, _): + return model.transform(InferSplitLayer()).transform(InferConcatLayer()) + + # Function running the transformations to convert Gather, i.e., index lookup, # nodes to their hardware implementations def step_convert_lookup_to_hw(model: ModelWrapper, _): @@ -407,43 +970,18 @@ def step_convert_lookup_to_hw(model: ModelWrapper, _): return model.transform(InferLookupLayer()) +# Converts depth-wise convolution to hardware operator calling the +# InferVectorVectorActivation transformation +def step_convert_depth_wise_to_hw(model: ModelWrapper, _: DataflowBuildConfig): + return model.transform(InferVectorVectorActivation()) + + # Function running the InferReplicateStream transformation def step_replicate_streams(model: ModelWrapper, _): # Properly replicate the stream feeding the query, key and value projections return model.transform(InferReplicateStream()) -# Post-processing tidy-up squeezing dimensions and identity operators left over -# from mapping the attention operators -def step_tidy_up_post_attention(model: ModelWrapper, _): - # Remove dimensions of size 1 (single batch tensors) - model = model.transform(Squeeze()) - model = model.transform(RemoveIdentityTranspose()) - - # Squeezing might enable absorbing adds into thresholds once again - model = model.transform(AbsorbAddIntoMultiThreshold()) - # If applicable, absorb the final thresholds into the attention operator - # Note: Might be applicable again after squeezing a transpose away - model = 
model.transform(AbsorbMultiThresholdIntoScaledDotProductAttention()) - - # Squeezing might enable some more streamlining transformations once again - model = model.transform(ComposedTransformation([ - # Streamline the residual connections by moving scale factors past - # elementwise add nodes - MoveLinearPastEltwiseAdd(), - MoveLinearPastFork(), - MoveScalarLinearPastInvariants(), - # Do the normal streamlining flow once again - Streamline(), - ])) - - # Clean up the names for debugging - model = model.transform(GiveUniqueNodeNames()) - model = model.transform(GiveReadableTensorNames()) - # Return the tidied up model - return model - - # Custom step for setting the parallelism to meet the target of T^2 cycles per # sequence def set_target_parallelization(seq_len: int, @@ -736,49 +1274,6 @@ def step_apply_folding_config(model: ModelWrapper, cfg: DataflowBuildConfig): return model -# Runs a node-by-node Python simulation of the model saving the fill execution -# context -# Note: Assumes no execution mode to be set -def node_by_node_python(model: ModelWrapper, cfg: DataflowBuildConfig): - # Save the original model - original = model - # Copy the model - model = copy.deepcopy(model) - - # Load the verification input/output pair - inp = np.load(cfg.verify_input_npy) # noqa - out = np.load(cfg.verify_expected_output_npy) - - # Path to the parent model wrapping the streaming dataflow partition and the - # wrapped child model, i.e., the inside of the streaming dataflow partition - parent = f"{cfg.output_dir}/intermediate_models/dataflow_parent.onnx" - child = f"{cfg.output_dir}/intermediate_models/verify_cppsim.onnx" - # Save the child model prepared for python simulation - model.save(child) - # Load the parent model to pass to verification execution - parent_model = ModelWrapper(parent) - - # Reshape the input/output to match the model - inp = inp.reshape(parent_model.get_tensor_shape(model.graph.input[0].name)) - out = 
out.reshape(parent_model.get_tensor_shape(model.graph.output[0].name)) - - # Execute the onnx model to collect the result - # context = execute_onnx(model, context, return_full_exec_context=True) - context = execute_parent(parent, child, inp, return_full_ctx=True) - # Extract the output tensor from the execution context - model_out = context[parent_model.graph.output[0].name] - # Compare input to output - result = {True: "SUCCESS", False: "FAIL"}[ - np.allclose(out, model_out, atol=1e-3) - ] - # Save the verification outputs into the configured build directory - verification_output = f"{cfg.output_dir}/verification_output/" - # Save the verification execution context - np.savez(f"{verification_output}/verify_python_{result}.npz", **context) - # Return the original, unmodified model - return original - - # Runs a node-by-node C++ simulation of the model saving the fill execution # context def node_by_node_cppsim(model: ModelWrapper, cfg: DataflowBuildConfig): @@ -816,9 +1311,7 @@ def node_by_node_cppsim(model: ModelWrapper, cfg: DataflowBuildConfig): # Extract the output tensor from the execution context model_out = context[parent_model.graph.output[0].name] # Compare input to output - result = {True: "SUCCESS", False: "FAIL"}[ - np.allclose(out, model_out, atol=1e-3) - ] + result = {True: "SUCCESS", False: "FAIL"}[np.allclose(out, model_out)] # Save the verification outputs into the configured build directory verification_output = f"{cfg.output_dir}/verification_output/" # Save the verification execution context @@ -867,9 +1360,7 @@ def node_by_node_rtlsim(model: ModelWrapper, cfg: DataflowBuildConfig): # Extract the output tensor from the execution context model_out = context[parent_model.graph.output[0].name] # Compare input to output - result = {True: "SUCCESS", False: "FAIL"}[ - np.allclose(out, model_out, atol=1e-3) - ] + result = {True: "SUCCESS", False: "FAIL"}[np.allclose(out, model_out)] # Save the verification outputs into the configured build directory 
verification_output = f"{cfg.output_dir}/verification_output/" # Save the verification execution context diff --git a/benchmarking/dut/transformer_gpt.py b/benchmarking/dut/transformer_gpt.py deleted file mode 100644 index 5ee77483ab..0000000000 --- a/benchmarking/dut/transformer_gpt.py +++ /dev/null @@ -1,348 +0,0 @@ -# Adapted from Christoph's attention-dummy repository - -# PyTorch base package: Math and Tensor Stuff -import torch -# Brevitas wrapper around PyTorch tensors adding quantization information -from brevitas.quant_tensor import QuantTensor -# Brevitas: Quantized versions of PyTorch layers -from brevitas.nn import ( - QuantMultiheadAttention, - QuantEltwiseAdd, - QuantIdentity, - QuantLinear, - QuantReLU -) -from qonnx.core.modelwrapper import ModelWrapper -# Progressbar -from tqdm import trange -import numpy as np -from brevitas.export import export_qonnx -import random -import json -import subprocess -# FINN dataflow builder -import finn.builder.build_dataflow as build -import finn.builder.build_dataflow_config as build_cfg -from finn.builder.build_dataflow_config import AutoFIFOSizingMethod -from bench_base import bench, step_synth_harness -import os -from util import summarize_table, summarize_section, power_xml_to_dict, prepare_inputs, delete_dir_contents - -# Custom build steps required to streamline and convert the attention operator -from dut.transformer_custom_steps import ( - step_tidy_up_pre_attention, - step_tidy_up_post_attention, - step_streamline_attention, - step_streamline_residual, - step_streamline_norms, - step_streamline_positional, - step_convert_attention_to_hw, - step_convert_elementwise_binary_to_hw, - step_convert_lookup_to_hw, - step_replicate_streams, - set_target_parallelization, - set_fifo_depths, - step_apply_folding_config, - node_by_node_rtlsim, # noqa: Maybe unused, only for debugging - node_by_node_python, # noqa: Maybe unused, only for debugging - node_by_node_cppsim -) -from performance.platform_build_steps import( 
- test_step_gen_vitis_xo, - test_step_gen_instrumentation_wrapper, - test_step_gen_instrwrap_sim, - test_step_insert_tlastmarker, - test_step_export_xo, - test_step_build_platform, - test_step_run_instrwrap_sim -) - -### ADAPTED FROM utils.py -# Seeds all relevant random number generators to the same seed for -# reproducibility -def seed(s): - random.seed(s) - np.random.seed(s) - torch.manual_seed(s) - -template_folding_yaml = """ -# Per operator type default configurations -defaults: - # Scaled dot-product attention head implemented via HLS - ScaledDotProductAttention_hls: - # Type of memory to be used for internal buffer storage - # Options: auto, block, distributed, ultra - ram_style: block - # Type of memory to be used for threshold storage - # Options: auto, block, distributed - ram_style_thresholds: block - # Type of memory to be used fo the attention mask (if present) - # Options: auto, block, distributed - ram_style_mask: block - # Resource type to be used for implementing multiplications/MACs - # Options: auto, lut or dsp - mac_resource: lut - # Addition of two inputs (constants or streamed) implemented via HLS - ElementwiseAdd_hls: - # Type of memory to be used for internal buffer storage and/or constant - # parameter tensors - # Options: auto, block, distributed, ultra - ram_style: distributed - # Matrix vector activation unit implemented via HLS - MVAU_hls: - # Resource type to be used for implementing multiplications/MACs - # Options: auto, lut or dsp - resType: dsp - # Memory mode for weight storage - # Options: internal_embedded, internal_decoupled, external - mem_mode: internal_decoupled - # Type of memory to be used for weight storage if "internal_decoupled" - # Options: auto, block, distributed, ultra - ram_style: block - # Type of memory to be used for threshold storage - # Options: auto, block, distributed - ram_style_thresholds: block - # Makes weights writeable through AXI-lite interface at runtime - runtime_writeable_weights: 0 - # Matrix 
vector activation unit implemented via RTL - MVAU_rtl: - # Resource type to be used for implementing multiplications/MACs - # Options: auto, lut or dsp - # Note: RTL MVAU currently does not support LUT-based implementation - resType: dsp - # Memory mode for weight storage - # Options: internal_embedded, internal_decoupled, external - mem_mode: internal_decoupled - # Type of memory to be used for weight storage if "internal_decoupled" - # Options: auto, block, distributed, ultra - ram_style: block - # Makes weights writeable through AXI-lite interface at runtime - runtime_writeable_weights: 0 - # Multi-thresholds implemented via HLS (applies to standalone thresholds) - Thresholding_hls: - # Memory mode for threshold storage - # Options: internal_embedded, internal_decoupled - mem_mode: internal_decoupled - # Type of memory to be used for threshold storage if "internal_decoupled" - # Options: distributed, block - ram_style: distributed - # Makes thresholds writeable through AXI-lite interface at runtime - runtime_writeable_weights: 0 - # Multi-thresholds implemented via RTL (applies to standalone thresholds) - Thresholding_rtl: - # Decides to use BRAM, URAM or LUTs for threshold memory, depending on the - # depth of the thresholds - # Note: This combination forces "distributed" LUT implementation - depth_trigger_uram: 2147483647 # "infinity" - depth_trigger_bram: 2147483647 # "infinity" - # # Note: This combination forces "block" RAM implementation - # depth_trigger_uram: 0 - # depth_trigger_bram: 1 - # # Note: This combination forces "ultra" RAM implementation - # depth_trigger_uram: 1 - # depth_trigger_bram: 0 - # # Note: This combination is equivalent to "auto" - # depth_trigger_uram: 0 - # depth_trigger_bram: 0 - # Makes thresholds writeable through AXI-lite interface at runtime - runtime_writeable_weights: 0 - # FIFO implemented via RTL (there is no HLS FIFO implementation in FINN) - StreamingFIFO_rtl: - # RTL vs. 
IPI implementation of FIFOs - # Options: rtl, vivado - impl_style: rtl - # Resource type for FIFOs when impl_style is vivado - # Options: auto, block, distributed, ultra - ram_style: distributed - # Individual, named node-specific configurations here - # ... -""" - -class bench_transformer_gpt(bench): - def step_build(self): - #with open("params.yaml") as file: - # params = yaml.safe_load(file) - # Seed all RNGs - seed(self.params["seed"]) - - # Extract sequence length and embedding dimension from the output of the - # first quantizer in the model - # Note: Embedding and Sequence dimension flip later - model = ModelWrapper(self.build_inputs["onnx_path"]) - _, emb_dim, seq_len = model.get_tensor_shape( - "/emb_add/input_quant/export_handler/Quant_output_0" - ) - - # Prepare config files - # TODO: make configurable - # TODO: log intermediate files such as inp.npy, folding.yaml, or specialize_layers.jon as artifacts, maybe create in unique temp dirs - specialize_layers_dict = { - "Defaults": { - "preferred_impl_style": ["rtl", ["MVAU", "Thresholding"]] - }, - "": { - "preferred_impl_style": "" - } - } - with open("specialize_layers.json", "w") as f: - json.dump(specialize_layers_dict, f, indent=2) - with open("folding.yaml", "w") as f: - f.write(template_folding_yaml) - - #TODO: make configurable instead of hardcoding exception - self.board = "U280" - self.part = "xcu280-fsvh2892-2L-e" - - # Create a configuration for building the scaled dot-product attention - # operator to a hardware accelerator - cfg = build_cfg.DataflowBuildConfig( - # Unpack the build configuration parameters - #**params["build"], - output_dir = self.build_inputs["build_dir"], - stitched_ip_gen_dcp = True, - synth_clk_period_ns = self.clock_period_ns, - board = self.board, - shell_flow_type = "vitis_alveo", #TODO: proper Alveo support instead of hardcoding - folding_config_file = "folding.yaml", - specialize_layers_config_file = "specialize_layers.json", - standalone_thresholds = True, - 
max_multithreshold_bit_width = 16, - mvau_wwidth_max = 2048, - split_large_fifos = True, - - verbose=False, # if True prints stdout and stderr to console instead of build_dataflow.log - - generate_outputs=[ - build_cfg.DataflowOutputType.ESTIMATE_REPORTS, - build_cfg.DataflowOutputType.STITCHED_IP, # required for HarnessBuild, OOC_SYNTH, and RTLSIM - #build_cfg.DataflowOutputType.PYNQ_DRIVER, #TODO: currently broken (assert i_consumer.op_type == "StreamingDataflowPartition"), might be useful for functional verification on hw later - #build_cfg.DataflowOutputType.OOC_SYNTH, # requires stitched-ip, not needed because ZynqBuild/HarnessBuild is performed - #build_cfg.DataflowOutputType.BITFILE, # does not require stitched-ip, not needed because HarnessBuild is performed - #build_cfg.DataflowOutputType.RTLSIM_PERFORMANCE, # not possible due to float components - #build_cfg.DataflowOutputType.DEPLOYMENT_PACKAGE # not needed, just a copy operation - ], - - verify_steps=[ - # Verify the model after converting to the FINN onnx dialect - build_cfg.VerificationStepType.QONNX_TO_FINN_PYTHON, - # Verify the model again using python mode after the default - # streamlining step - build_cfg.VerificationStepType.STREAMLINED_PYTHON, - # Verify the model again after tidy up transformations, right before - # converting to HLS - build_cfg.VerificationStepType.TIDY_UP_PYTHON, - # Verify the model after generating C++ HLS and applying folding - build_cfg.VerificationStepType.FOLDED_HLS_CPPSIM, - ], - # File with test inputs for verification - verify_input_npy=self.build_inputs["input_npy_path"], - # File with expected test outputs for verification - verify_expected_output_npy=self.build_inputs["output_npy_path"], - # Save the intermediate model graphs - save_intermediate_models=True, - # Avoid RTL simulation for setting the FIFO sizes - auto_fifo_strategy=AutoFIFOSizingMethod.CHARACTERIZE, - # Do not automatically set FIFO sizes as this requires RTL simulation - # not implemented for the 
attention operator - auto_fifo_depths=False, - # Build steps to execute - steps=[ - # Need to apply some tidy-up transformations before converting to - # the finn dialect of onnx - step_tidy_up_pre_attention, - # Convert all QONNX Quant nodes to Multithreshold nodes - "step_qonnx_to_finn", - # Tidy up the graph after converting from QONNX to FINN format - # Note: Triggers a verification step - "step_tidy_up", - # Positional encoding needs to be streamlined first with slightly - # different order of certain streamlining transformations to avoid - # weird rounding issue of intermediate results - step_streamline_positional, - # Custom streamlining for models containing attention operators - step_streamline_attention, - # Streamlining of the residual branches - step_streamline_residual, - # Streamline the normalization layers, i.e., transposed batch norm - step_streamline_norms, - # Another round using the default streamlining steps - # Note: Triggers a verification step - "step_streamline", - # New conversion of the scaled dot-product attention pattern - step_convert_attention_to_hw, - # Another tidy-up step to remove unnecessary dimensions and - # operations after converting the attention operators to HLS - step_tidy_up_post_attention, - # Convert the elementwise binary operations to hardware operators. 
- # These include for example adding residual branches and positional - # encoding - step_convert_elementwise_binary_to_hw, - # Convert the Gather layer realizing the input token embedding to - # the FINN hardware implementation, i.e., the Lookup layer - step_convert_lookup_to_hw, - # Properly replicate the stream feeding the query, key and value - # projections - step_replicate_streams, - # Convert most other layers supported by FINN to HW operators - "step_convert_to_hw", - # Specialize HW layer implementations as either HLS or RTL - "step_specialize_layers", - "step_create_dataflow_partition", - # Set the folding configuration to meet the cycles per sequence - # target - set_target_parallelization(seq_len, emb_dim), - # Apply folding configuration, specifying hardware implementation - # details - # Note: This triggers a verification step - step_apply_folding_config, - "step_minimize_bit_width", - # The ScaledDotProductAttention custom op does not define any - # estimates - "step_generate_estimate_reports", - "step_hw_codegen", - "step_hw_ipgen", - # Set the attention- and residual-related FIFO depths insert FIFOs - # and apply folding configuration once again - # Note: Implement all FIFOs with a depth at least as deep as the - # sequence length in URAM. 
- set_fifo_depths(seq_len, emb_dim, uram_threshold=seq_len), - # Run additional node-by-node verification in RTL simulation of the - # model before creating the stitched IP - # Note: end-to-end verification of the stitched IP in RTL simulation - # is still not possible due to missing float IPs - node_by_node_cppsim, - # Only for debugging for now, does not work if "vivado" style - # StreamingFIFOs are used - # node_by_node_rtlsim, - - test_step_insert_tlastmarker, # required for instrumentation_wrapper - - "step_create_stitched_ip", - - # "step_measure_rtlsim_performance", # not possible due to float components - - step_synth_harness, #TODO: replace with instr wrapper (or port it into this step) - - #"step_out_of_context_synthesis", # for synthesis results (e.g. utilization) - - # normal deployment TODO: replace with instr wrapper (or port it into this step as an option) - #"step_synthesize_bitfile", - #"step_make_pynq_driver", - #"step_deployment_package", - - #test_step_gen_vitis_xo, # preparation step for original instr wrapper integration - #test_step_gen_instrumentation_wrapper, # preparation step for original instr wrapper integration - - #test_step_gen_instrwrap_sim, # preparation step for simulation of original instr wrapper integration - #test_step_run_instrwrap_sim, # simulation with instr wrapper, disabled for now due to extreme runtime - - #test_step_export_xo, # preparation step for original instr wrapper integration - #test_step_build_platform # synthesis with instr wrapper - ] - ) - # Run the build process on the dummy attention operator graph - # TODO: maybe let this function return the cfg only, so it can be modified by bench context - build.build_dataflow_cfg(self.build_inputs["onnx_path"], cfg) - - def run(self): - self.steps_full_build_flow() diff --git a/benchmarking/dut/transformer_radioml.py b/benchmarking/dut/transformer_radioml.py deleted file mode 100644 index 4d77cb4b8d..0000000000 --- a/benchmarking/dut/transformer_radioml.py +++ 
/dev/null @@ -1,336 +0,0 @@ -# Adapted from Christoph's attention-dummy repository - -# PyTorch base package: Math and Tensor Stuff -import torch -# Brevitas wrapper around PyTorch tensors adding quantization information -from brevitas.quant_tensor import QuantTensor -# Brevitas: Quantized versions of PyTorch layers -from brevitas.nn import ( - QuantMultiheadAttention, - QuantEltwiseAdd, - QuantIdentity, - QuantLinear, - QuantReLU -) -# Progressbar -from tqdm import trange -import numpy as np -from brevitas.export import export_qonnx -import random -import json -import subprocess -# FINN dataflow builder -import finn.builder.build_dataflow as build -import finn.builder.build_dataflow_config as build_cfg -from finn.builder.build_dataflow_config import AutoFIFOSizingMethod -from bench_base import bench, step_synth_harness -import os -from util import summarize_table, summarize_section, power_xml_to_dict, prepare_inputs, delete_dir_contents - -# Custom build steps required to streamline and convert the attention operator -from dut.transformer_custom_steps import ( - step_tidy_up_pre_attention, - step_tidy_up_post_attention, - step_streamline_attention, - step_streamline_residual, - step_streamline_norms, - step_streamline_positional, - step_convert_attention_to_hw, - step_convert_elementwise_binary_to_hw, - step_convert_lookup_to_hw, - step_replicate_streams, - set_target_parallelization, - set_fifo_depths, - step_apply_folding_config, - node_by_node_rtlsim, - node_by_node_cppsim -) -from performance.platform_build_steps import( - test_step_gen_vitis_xo, - test_step_gen_instrumentation_wrapper, - test_step_gen_instrwrap_sim, - test_step_insert_tlastmarker, - test_step_export_xo, - test_step_build_platform, - test_step_run_instrwrap_sim -) - -### ADAPTED FROM utils.py -# Seeds all relevant random number generators to the same seed for -# reproducibility -def seed(s): - random.seed(s) - np.random.seed(s) - torch.manual_seed(s) - -template_folding_yaml = """ -# Per 
operator type default configurations -defaults: - # Scaled dot-product attention head implemented via HLS - ScaledDotProductAttention_hls: - # Type of memory to be used for internal buffer storage - # Options: auto, block, distributed, ultra - ram_style: block - # Type of memory to be used for threshold storage - # Options: auto, block, distributed - ram_style_thresholds: block - # Type of memory to be used fo the attention mask (if present) - # Options: auto, block, distributed - ram_style_mask: block - # Resource type to be used for implementing multiplications/MACs - # Options: auto, lut or dsp - mac_resource: lut - # Addition of two inputs (constants or streamed) implemented via HLS - ElementwiseAdd_hls: - # Type of memory to be used for internal buffer storage and/or constant - # parameter tensors - # Options: auto, block, distributed, ultra - ram_style: distributed - # Matrix vector activation unit implemented via HLS - MVAU_hls: - # Resource type to be used for implementing multiplications/MACs - # Options: auto, lut or dsp - resType: dsp - # Memory mode for weight storage - # Options: internal_embedded, internal_decoupled, external - mem_mode: internal_decoupled - # Type of memory to be used for weight storage if "internal_decoupled" - # Options: auto, block, distributed, ultra - ram_style: block - # Type of memory to be used for threshold storage - # Options: auto, block, distributed - ram_style_thresholds: block - # Makes weights writeable through AXI-lite interface at runtime - runtime_writeable_weights: 0 - # Matrix vector activation unit implemented via RTL - MVAU_rtl: - # Resource type to be used for implementing multiplications/MACs - # Options: auto, lut or dsp - # Note: RTL MVAU currently does not support LUT-based implementation - resType: dsp - # Memory mode for weight storage - # Options: internal_embedded, internal_decoupled, external - mem_mode: internal_decoupled - # Type of memory to be used for weight storage if "internal_decoupled" - # 
Options: auto, block, distributed, ultra - ram_style: block - # Makes weights writeable through AXI-lite interface at runtime - runtime_writeable_weights: 0 - # Multi-thresholds implemented via HLS (applies to standalone thresholds) - Thresholding_hls: - # Memory mode for threshold storage - # Options: internal_embedded, internal_decoupled - mem_mode: internal_decoupled - # Type of memory to be used for threshold storage if "internal_decoupled" - # Options: distributed, block - ram_style: distributed - # Makes thresholds writeable through AXI-lite interface at runtime - runtime_writeable_weights: 0 - # Multi-thresholds implemented via RTL (applies to standalone thresholds) - Thresholding_rtl: - # Decides to use BRAM, URAM or LUTs for threshold memory, depending on the - # depth of the thresholds - # Note: This combination forces "distributed" LUT implementation - depth_trigger_uram: 2147483647 # "infinity" - depth_trigger_bram: 2147483647 # "infinity" - # # Note: This combination forces "block" RAM implementation - # depth_trigger_uram: 0 - # depth_trigger_bram: 1 - # # Note: This combination forces "ultra" RAM implementation - # depth_trigger_uram: 1 - # depth_trigger_bram: 0 - # # Note: This combination is equivalent to "auto" - # depth_trigger_uram: 0 - # depth_trigger_bram: 0 - # Makes thresholds writeable through AXI-lite interface at runtime - runtime_writeable_weights: 0 - # FIFO implemented via RTL (there is no HLS FIFO implementation in FINN) - StreamingFIFO_rtl: - # RTL vs. IPI implementation of FIFOs - # Options: rtl, vivado - impl_style: rtl - # Resource type for FIFOs when impl_style is vivado - # Options: auto, block, distributed, ultra - ram_style: distributed - # Individual, named node-specific configurations here - # ... 
-""" - -class bench_transformer_radioml(bench): - def step_build(self): - #with open("params.yaml") as file: - # params = yaml.safe_load(file) - # Seed all RNGs - seed(self.params["seed"]) - # Extract sequence length and embedding dimension from parameters - _, seq_len, emb_dim = np.load(self.build_inputs["input_npy_path"]).shape - - # Prepare config files - # TODO: make configurable - # TODO: log intermediate files such as inp.npy, folding.yaml, or specialize_layers.jon as artifacts, maybe create in unique temp dirs - specialize_layers_dict = { - "Defaults": { - "preferred_impl_style": ["rtl", ["MVAU", "Thresholding"]] - }, - "": { - "preferred_impl_style": "" - } - } - with open("specialize_layers.json", "w") as f: - json.dump(specialize_layers_dict, f, indent=2) - with open("folding.yaml", "w") as f: - f.write(template_folding_yaml) - - # Create a configuration for building the scaled dot-product attention - # operator to a hardware accelerator - cfg = build_cfg.DataflowBuildConfig( - # Unpack the build configuration parameters - #**params["build"], - output_dir = self.build_inputs["build_dir"], - stitched_ip_gen_dcp = True, - synth_clk_period_ns = self.clock_period_ns, - board = self.board, - shell_flow_type = "vivado_zynq", #TODO: Alveo support - folding_config_file = "folding.yaml", - specialize_layers_config_file = "specialize_layers.json", - standalone_thresholds = True, - max_multithreshold_bit_width = 16, - mvau_wwidth_max = 2048, - split_large_fifos = True, - - verbose=False, # if True prints stdout and stderr to console instead of build_dataflow.log - - generate_outputs=[ - build_cfg.DataflowOutputType.ESTIMATE_REPORTS, - build_cfg.DataflowOutputType.STITCHED_IP, # required for HarnessBuild, OOC_SYNTH, and RTLSIM - #build_cfg.DataflowOutputType.PYNQ_DRIVER, #TODO: currently broken (assert i_consumer.op_type == "StreamingDataflowPartition"), might be useful for functional verification on hw later - #build_cfg.DataflowOutputType.OOC_SYNTH, # requires 
stitched-ip, not needed because ZynqBuild/HarnessBuild is performed - #build_cfg.DataflowOutputType.BITFILE, # does not require stitched-ip, not needed because HarnessBuild is performed - #build_cfg.DataflowOutputType.RTLSIM_PERFORMANCE, # not possible due to float components - #build_cfg.DataflowOutputType.DEPLOYMENT_PACKAGE # not needed, just a copy operation - ], - - verify_steps=[ - # Verify the model after converting to the FINN onnx dialect - build_cfg.VerificationStepType.QONNX_TO_FINN_PYTHON, - # Verify the model again using python mode after the default - # streamlining step - build_cfg.VerificationStepType.STREAMLINED_PYTHON, - # Verify the model again after tidy up transformations, right before - # converting to HLS - build_cfg.VerificationStepType.TIDY_UP_PYTHON, - # Verify the model after generating C++ HLS and applying folding - build_cfg.VerificationStepType.FOLDED_HLS_CPPSIM, - ], - # File with test inputs for verification - verify_input_npy=self.build_inputs["input_npy_path"], - # File with expected test outputs for verification - verify_expected_output_npy=self.build_inputs["output_npy_path"], - # Save the intermediate model graphs - save_intermediate_models=True, - # Avoid RTL simulation for setting the FIFO sizes - auto_fifo_strategy=AutoFIFOSizingMethod.CHARACTERIZE, - # Do not automatically set FIFO sizes as this requires RTL simulation - # not implemented for the attention operator - auto_fifo_depths=False, - # Build steps to execute - steps=[ - # Need to apply some tidy-up transformations before converting to - # the finn dialect of onnx - step_tidy_up_pre_attention, - # Convert all QONNX Quant nodes to Multithreshold nodes - "step_qonnx_to_finn", - # Tidy up the graph after converting from QONNX to FINN format - # Note: Triggers a verification step - "step_tidy_up", - # Positional encoding needs to be streamlined first with slightly - # different order of certain streamlining transformations to avoid - # weird rounding issue of intermediate 
results - step_streamline_positional, - # Custom streamlining for models containing attention operators - step_streamline_attention, - # Streamlining of the residual branches - step_streamline_residual, - # Streamline the normalization layers, i.e., transposed batch norm - step_streamline_norms, - # Another round using the default streamlining steps - # Note: Triggers a verification step - "step_streamline", - # New conversion of the scaled dot-product attention pattern - step_convert_attention_to_hw, - # Another tidy-up step to remove unnecessary dimensions and - # operations after converting the attention operators to HLS - step_tidy_up_post_attention, - # Convert the elementwise binary operations to hardware operators. - # These include for example adding residual branches and positional - # encoding - step_convert_elementwise_binary_to_hw, - # Convert the Gather layer realizing the input token embedding to - # the FINN hardware implementation, i.e., the Lookup layer - step_convert_lookup_to_hw, - # Properly replicate the stream feeding the query, key and value - # projections - step_replicate_streams, - # Convert most other layers supported by FINN to HW operators - "step_convert_to_hw", - # Specialize HW layer implementations as either HLS or RTL - "step_specialize_layers", - "step_create_dataflow_partition", - # Set the folding configuration to meet the cycles per sequence - # target - set_target_parallelization(seq_len, emb_dim), - # Apply folding configuration, specifying hardware implementation - # details - # Note: This triggers a verification step - step_apply_folding_config, - "step_minimize_bit_width", - # The ScaledDotProductAttention custom op does not define any - # estimates - "step_generate_estimate_reports", - "step_hw_codegen", - "step_hw_ipgen", - # Set the attention- and residual-related FIFO depths insert FIFOs - # and apply folding configuration once again - # Note: Implement all FIFOs with a depth at least as deep as the - # sequence length 
in URAM. - set_fifo_depths(seq_len, emb_dim, uram_threshold=seq_len), - # Run additional node-by-node verification in RTL simulation of the - # model before creating the stitched IP - # Note: end-to-end verification of the stitched IP in RTL simulation - # is still not possible due to missing float IPs - node_by_node_cppsim, - # Only for debugging for now, does not work if "vivado" style - # StreamingFIFOs are used - # node_by_node_rtlsim, - - test_step_insert_tlastmarker, # required for instrumentation_wrapper - - "step_create_stitched_ip", - - # "step_measure_rtlsim_performance", # not possible due to float components - - step_synth_harness, #TODO: replace with instr wrapper (or port it into this step) - - #"step_out_of_context_synthesis", # for synthesis results (e.g. utilization) - - # normal deployment TODO: replace with instr wrapper (or port it into this step as an option) - #"step_synthesize_bitfile", - #"step_make_pynq_driver", - #"step_deployment_package", - - #test_step_gen_vitis_xo, # preparation step for original instr wrapper integration - #test_step_gen_instrumentation_wrapper, # preparation step for original instr wrapper integration - - #test_step_gen_instrwrap_sim, # preparation step for simulation of original instr wrapper integration - #test_step_run_instrwrap_sim, # simulation with instr wrapper, disabled for now due to extreme runtime - - #test_step_export_xo, # preparation step for original instr wrapper integration - #test_step_build_platform # synthesis with instr wrapper - ] - ) - # Run the build process on the dummy attention operator graph - # TODO: maybe let this function return the cfg only, so it can be modified by bench context - build.build_dataflow_cfg(self.build_inputs["onnx_path"], cfg) - - def run(self): - self.steps_full_build_flow() From 47cb5ac3eb387b3ab80b2b0cbb1ac40271ef5806 Mon Sep 17 00:00:00 2001 From: Felix Jentzsch Date: Wed, 29 Jan 2025 14:53:17 +0000 Subject: [PATCH 003/125] Fix imports --- 
benchmarking/dut/transformer_custom_steps.py | 679 +++++++++---------- 1 file changed, 328 insertions(+), 351 deletions(-) diff --git a/benchmarking/dut/transformer_custom_steps.py b/benchmarking/dut/transformer_custom_steps.py index e122f79a0d..2dc387a94a 100644 --- a/benchmarking/dut/transformer_custom_steps.py +++ b/benchmarking/dut/transformer_custom_steps.py @@ -4,226 +4,207 @@ # custom/composed_transformation.py # custom/streamline.py -# Python warning messages -import warnings -# Copies of python objects -from copy import deepcopy # Copies (deep-copies) python objects import copy + # Numpy for loading and comparing the verification input/output import numpy as np + +# Python warning messages +import warnings + # YAML for loading experiment configurations import yaml +# Copies of python objects +from copy import deepcopy + +# QONNX quantization data types +from qonnx.core.datatype import DataType + # QONNX wrapper of ONNX model graphs from qonnx.core.modelwrapper import ModelWrapper -# Range information structure for seeding the range analysis for converting -# quantized activations to MultiThreshold -from qonnx.util.range_analysis import RangeInfo + +# Converts ONNX graph nodes to QONNX custom-ops if possible +from qonnx.custom_op.registry import getCustomOp + +# Converts BatchNorm operation to affine transformation +from qonnx.transformation.batchnorm_to_affine import BatchNormToAffine + +# If we have a convolution with a bias tensors input, QONNX and later FINN +# expect the bias to be expressed as a standalone Add node following the Conv +# node. +from qonnx.transformation.extract_conv_bias import ExtractBiasFromConv + +# Collapses chains of constants into a single constant operation or even +# initializer tensors. 
+from qonnx.transformation.fold_constants import FoldConstants + +# Converts Gemm operation to MatMul with extracted standalone bias op +from qonnx.transformation.gemm_to_matmul import GemmToMatMul # QONNX graph transformations for renaming and cleaning up from qonnx.transformation.general import ( - Transformation, - GiveUniqueNodeNames, + ConvertDivToMul, + ConvertSubToAdd, GiveReadableTensorNames, + GiveUniqueNodeNames, GiveUniqueParameterTensors, RemoveStaticGraphInputs, RemoveUnusedTensors, + Transformation, ) + # QONNX graph transformations for annotating the graph with datatype and shape # information from qonnx.transformation.infer_datatypes import InferDataTypes from qonnx.transformation.infer_shapes import InferShapes -# If we have a convolution with a bias tensors input, QONNX and later FINN -# expect the bias to be expressed as a standalone Add node following the Conv -# node. -from qonnx.transformation.extract_conv_bias import ExtractBiasFromConv -# Converts BatchNorm operation to affine transformation -from qonnx.transformation.batchnorm_to_affine import BatchNormToAffine -# Converts Gemm operation to MatMul with extracted standalone bias op -from qonnx.transformation.gemm_to_matmul import GemmToMatMul # Converts Conv to Im2Col and MatMul with extracted standalone bias op from qonnx.transformation.lower_convs_to_matmul import LowerConvsToMatMul + # Transposes the initializer tensors of a Quant node instead of having a # standalone Transpose following -from qonnx.transformation.quant_constant_folding import ( - FoldTransposeIntoQuantInit -) -# Collapses chains of constants into a single constant operation or even -# initializer tensors. 
-from qonnx.transformation.fold_constants import FoldConstants -# Folds quantizers into weight tensor initializers, needed for lowering -# convolutions to MatMuls -from finn.transformation.qonnx.fold_quant_weights import FoldQuantWeights -# FINN streamlining transformations reordering the graph -from finn.transformation.streamline.reorder import ( - MoveTransposePastFork, - MoveTransposePastEltwise, - MoveTransposePastJoinMul, - MoveTransposePastJoinAdd, - MoveTransposePastSplit, - MoveTransposePastJoinConcat, - MoveSqueezePastMultiThreshold, - MoveSqueezePastMatMul -) -# FINN streamlining transformations absorbing tensors/nodes into others -from finn.transformation.streamline.absorb import ( - AbsorbAddIntoMultiThreshold, - AbsorbSignBiasIntoMultiThreshold, -) -# FINN streamlining transformations fusing/collapsing operations of the same -# kind -from finn.transformation.streamline.collapse_repeated import ( - CollapseRepeatedTranspose -) -# FINN streamlining transformations removing nodes without real effect from the -# graph -from finn.transformation.streamline.remove import ( - RemoveIdentityTranspose, - RemoveIdentityReshape, - RemoveIdentityOps -) -# Cleanup transformation getting rid of 3d data layout -from finn.transformation.squeeze import Squeeze +from qonnx.transformation.quant_constant_folding import FoldTransposeIntoQuantInit +from qonnx.transformation.remove import RemoveIdentityOps + +# Range information structure for seeding the range analysis for converting +# quantized activations to MultiThreshold +from qonnx.util.range_analysis import RangeInfo + +# FINN dataflow builder configuration +from finn.builder.build_dataflow_config import DataflowBuildConfig, VerificationStepType + +# FINN verification after build/graph transformation steps +from finn.builder.build_dataflow_steps import verify_step + # Detects the attention pattern and converts to hardware custom op from finn.transformation.fpgadataflow.attention import ( + 
AbsorbMultiThresholdIntoScaledDotProductAttention, InferScaledDotProductAttention, - AbsorbMultiThresholdIntoScaledDotProductAttention ) + # Mult-Head Attention support from finn.transformation.fpgadataflow.attention_heads import ( InferMultiHeads, - UnrollMultiHeadAttention, + MoveMergeMultiHeadsPastMultiThreshold, MoveSplitMultiHeadsPastMultiThreshold, - MoveMergeMultiHeadsPastMultiThreshold + UnrollMultiHeadAttention, ) +from finn.transformation.fpgadataflow.compile_cppsim import CompileCppSim + # Converts (infers) ONNX and QONNX nodes to FINN hardware CustomOps from finn.transformation.fpgadataflow.convert_to_hw_layers import ( - InferSqueeze, - InferUnsqueeze, - InferElementwiseBinaryOperation, - InferSplitLayer, InferConcatLayer, + InferElementwiseBinaryOperation, InferLookupLayer, - InferVectorVectorActivation -) -# Converts fork-nodes to ReplicateStream hardware operator -from finn.transformation.fpgadataflow.replicate_stream import ( - InferReplicateStream -) -# Standard QONNX to FINN conversion function -from finn.transformation.qonnx.convert_qonnx_to_finn import ConvertQONNXtoFINN -from finn.transformation.qonnx.quant_act_to_multithreshold import ( - default_filter_function_generator, + InferSplitLayer, + InferSqueeze, + InferUnsqueeze, + InferVectorVectorActivation, ) -# QONNX quantization data types -from qonnx.core.datatype import DataType -# Converts ONNX graph nodes to QONNX custom-ops if possible -from qonnx.custom_op.registry import getCustomOp +from finn.transformation.fpgadataflow.hlssynth_ip import HLSSynthIP + # Inserts data-width converter and FIFO nodes into the model graph from finn.transformation.fpgadataflow.insert_dwc import InsertDWC from finn.transformation.fpgadataflow.insert_fifo import InsertFIFO + +# Transformations preparing the operators for synthesis and simulation +from finn.transformation.fpgadataflow.prepare_cppsim import PrepareCppSim +from finn.transformation.fpgadataflow.prepare_ip import PrepareIP +from 
finn.transformation.fpgadataflow.prepare_rtlsim import PrepareRTLSim + +# Converts fork-nodes to ReplicateStream hardware operator +from finn.transformation.fpgadataflow.replicate_stream import InferReplicateStream +from finn.transformation.fpgadataflow.set_exec_mode import SetExecMode + # Splitting and removing of FIFOs from the model graph from finn.transformation.fpgadataflow.set_fifo_depths import ( RemoveShallowFIFOs, SplitLargeFIFOs, ) -# Specializes each layer's implementation style: HLS or RTL implementation -from finn.transformation.fpgadataflow.specialize_layers import SpecializeLayers -# FINN dataflow builder configuration -from finn.builder.build_dataflow_config import ( - VerificationStepType, DataflowBuildConfig -) + # Graph transformation setting the folding, i.e., parallelization configuration from finn.transformation.fpgadataflow.set_folding import SetFolding -# FINN verification after build/graph transformation steps -from finn.builder.build_dataflow_steps import verify_step -# Transformations preparing the operators for synthesis and simulation -from finn.transformation.fpgadataflow.prepare_cppsim import PrepareCppSim -from finn.transformation.fpgadataflow.compile_cppsim import CompileCppSim -from finn.transformation.fpgadataflow.hlssynth_ip import HLSSynthIP -from finn.transformation.fpgadataflow.prepare_ip import PrepareIP -from finn.transformation.fpgadataflow.set_exec_mode import SetExecMode -from finn.transformation.fpgadataflow.prepare_rtlsim import PrepareRTLSim +# Specializes each layer's implementation style: HLS or RTL implementation +from finn.transformation.fpgadataflow.specialize_layers import SpecializeLayers -# Execute onnx model graphs from the dataflow parent for verification -from finn.util.test import execute_parent +# Standard QONNX to FINN conversion function +from finn.transformation.qonnx.convert_qonnx_to_finn import ConvertQONNXtoFINN -# Base class for all QONNX graph transformations and some basic cleanup -# 
transformations -from qonnx.transformation.general import ( - Transformation, - ConvertDivToMul, - ConvertSubToAdd, +# Folds quantizers into weight tensor initializers, needed for lowering +# convolutions to MatMuls +from finn.transformation.qonnx.fold_quant_weights import FoldQuantWeights +from finn.transformation.qonnx.quant_act_to_multithreshold import ( + default_filter_function_generator, ) -# QONNX graph transformations for annotating the graph with datatype and shape -# information -from qonnx.transformation.infer_datatypes import InferDataTypes -from qonnx.transformation.infer_shapes import InferShapes -# Converts BatchNorm operation to affine transformation -from qonnx.transformation.batchnorm_to_affine import BatchNormToAffine - -# Groups node inputs by dynamic vs. initializer category -from finn.transformation.streamline.absorb import group_inputs_by_category +# Cleanup transformation getting rid of 3d data layout +from finn.transformation.squeeze import Squeeze # FINN streamlining transformations converting and rounding values -from finn.transformation.streamline import ( - ConvertSignToThres, - RoundAndClipThresholds -) -# FINN streamlining transformations reordering the graph -from finn.transformation.streamline.reorder import ( - MoveMulPastFork, - MoveTransposePastFork, - MoveLinearPastEltwiseAdd, - MoveScalarLinearPastInvariants, - MoveTransposePastEltwise, - MoveMulPastMaxPool, - MoveAddPastMul, - MoveScalarAddPastMatMul, - MoveAddPastConv, - MoveScalarMulPastMatMul, - MoveScalarMulPastConv, - MoveTransposePastJoinMul, - MoveTransposePastJoinAdd, - MoveMulPastJoinAdd, - MoveAddPastJoinAdd, - MoveScalarLinearPastSplit, - MoveAffinePastJoinConcat, - MoveMulPastJoinConcat, - MoveAddPastJoinConcat, - MoveTransposePastSplit, - MoveTransposePastJoinConcat, - MoveSqueezePastMultiThreshold, - is_scalar -) -# FINN streamlining transformations absorbing tensors/nodes into others +from finn.transformation.streamline import ConvertSignToThres, 
RoundAndClipThresholds from finn.transformation.streamline.absorb import ( + Absorb1BitMulIntoConv, + Absorb1BitMulIntoMatMul, AbsorbAddIntoMultiThreshold, + AbsorbMulIntoMultiThreshold, AbsorbSignBiasIntoMultiThreshold, + AbsorbTransposeIntoMultiThreshold, FactorOutMulSignMagnitude, - AbsorbMulIntoMultiThreshold, - Absorb1BitMulIntoMatMul, - Absorb1BitMulIntoConv, - AbsorbTransposeIntoMultiThreshold + group_inputs_by_category, ) + # FINN streamlining transformations fusing/collapsing operations of the same # kind from finn.transformation.streamline.collapse_repeated import ( + CollapseRepeatedAdd, CollapseRepeatedMul, CollapseRepeatedTranspose, - CollapseRepeatedAdd ) + # FINN streamlining transformations removing nodes without real effect from the # graph from finn.transformation.streamline.remove import ( + RemoveIdentityReshape, RemoveIdentityTranspose, - RemoveIdentityReshape ) +# FINN streamlining transformations reordering the graph +from finn.transformation.streamline.reorder import ( + MoveAddPastConv, + MoveAddPastJoinAdd, + MoveAddPastJoinConcat, + MoveAddPastMul, + MoveAffinePastJoinConcat, + MoveLinearPastEltwiseAdd, + MoveMulPastFork, + MoveMulPastJoinAdd, + MoveMulPastJoinConcat, + MoveMulPastMaxPool, + MoveScalarAddPastMatMul, + MoveScalarLinearPastInvariants, + MoveScalarLinearPastSplit, + MoveScalarMulPastConv, + MoveScalarMulPastMatMul, + MoveSqueezePastMatMul, + MoveSqueezePastMultiThreshold, + MoveTransposePastEltwise, + MoveTransposePastFork, + MoveTransposePastJoinAdd, + MoveTransposePastJoinConcat, + MoveTransposePastJoinMul, + MoveTransposePastSplit, + is_scalar, +) + +# Execute onnx model graphs from the dataflow parent for verification +from finn.util.test import execute_parent + +# FINN streamlining transformations absorbing tensors/nodes into others + + # Composes graph transformations such that each individual transformation as # well as the whole sequence is applied exhaustively class ComposedTransformation(Transformation): @@ -269,12 
+250,14 @@ def apply(self, model: ModelWrapper): # noqa # sequence of transformations will be reapplied return model, graph_modified + # # Custom conversion from Quant to MultiThreshold # TODO: Enable once fixed... # from custom.quant_activation_to_multithreshold import ( # QuantActivationToMultiThreshold # ) + # Moves scale factor, i.e., scalar Mul and Div, past Im2Col (and Col2Im): These # cannot be handled by MoveScalarLinearPastInvariants as potential padding makes # Add-Im2Col not commute to Im2Col-Add @@ -350,6 +333,7 @@ def apply(self, model: ModelWrapper): # noqa # needs to be applied again return model, graph_modified + # Moves scalar linear elementwise operations past fork nodes, applies to Add, # Mul, Sub, Div, etc. class MoveScalarLinearPastFork(Transformation): @@ -401,6 +385,7 @@ def apply(self, model: ModelWrapper): # noqa # needs to be applied again return model, graph_modified + # Moves constant elementwise multiplication past another joining multiplication class MoveConstMulPastJoinMul(Transformation): # Applies the transform to a whole model graph # noqa: Duplicate @@ -474,7 +459,8 @@ def apply(self, model: ModelWrapper): # noqa # Return the transformed model and indicate whether the transformation # needs to be applied again return model, graph_modified - + + # Moves elementwise additions past MatMul operations: Applicable if each # operation has one initializer input class MoveAddPastMatMul(Transformation): @@ -620,10 +606,8 @@ def apply(self, model: ModelWrapper): # noqa # Skip without warning ok? 
continue # There must be exactly one constant per operations - assert len(s_name) == 1, \ - f"To many constant inputs for {node}" - assert len(b_name) == 1, \ - f"To many constant inputs for {successor}" + assert len(s_name) == 1, f"To many constant inputs for {node}" + assert len(b_name) == 1, f"To many constant inputs for {successor}" # Now read the initializer tensors s = model.get_initializer(*s_name) b = model.get_initializer(*b_name) @@ -663,93 +647,102 @@ def apply(self, model: ModelWrapper): # noqa # needs to be applied again return model, graph_modified + # Define a set of custom streamlining transformations: These are applied once # during the actual streamlining step and once after converting attention to # hardware (the associated cleanup afterward might enable some Streamlining # transformations once again) def Streamline(): # noqa: Uppercase # Return a set of exhaustively applies transformations - return ComposedTransformation([ - # On skip-connections: prefer pushing scalar multiplication forward - # before MoveAddPastMul - MoveMulPastFork(), - # The "standard" set of FINN streamlining transformations or at least - # inspired by them but applied exhaustively until none of them changes - # the graph anymore. 
- # Note: Covers most parts of non-branching linear topologies - ComposedTransformation([ - ConvertSubToAdd(), - ConvertDivToMul(), - BatchNormToAffine(), - ConvertSignToThres(), - MoveMulPastMaxPool(), - AbsorbSignBiasIntoMultiThreshold(), - MoveScalarLinearPastInvariants(), - MoveAddPastMul(), - MoveScalarAddPastMatMul(), - MoveAddPastConv(), - MoveScalarMulPastMatMul(), - MoveScalarMulPastConv(), - MoveAddPastMul(), - CollapseRepeatedAdd(), - CollapseRepeatedMul(), - MoveMulPastMaxPool(), - AbsorbAddIntoMultiThreshold(), - FactorOutMulSignMagnitude(), - AbsorbMulIntoMultiThreshold(), - Absorb1BitMulIntoMatMul(), - Absorb1BitMulIntoConv(), - ]), - # Streamlining scales and biases forward through residual topologies - # Note: This mostly covers forking and joining operations - ComposedTransformation([ - # Note: This is probably the most common way of joining skip - # connections, i.e., this corresponds to the original residual - # addition, i.e., y = f(x) + x - MoveLinearPastEltwiseAdd(), - MoveScalarLinearPastFork(), - MoveScalarLinearPastInvariants(), + return ComposedTransformation( + [ + # On skip-connections: prefer pushing scalar multiplication forward + # before MoveAddPastMul MoveMulPastFork(), - MoveMulPastJoinAdd(), - MoveAddPastJoinAdd(), - # Note: This brings constant Muls (i.e., quantizer scales to be - # removed) forward through joining Muls (i.e., those ending up - # as actual hardware operators). 
- MoveConstMulPastJoinMul() - ]), - # Streamlining scales and biases forward through shape/layout changing - # operations, i.e., mostly transposes - ComposedTransformation([ - # Convolution inputs and padding - MoveScalesPastIm2Col(), - # Streamlining for Split and Concat operations - MoveScalarLinearPastSplit(), - MoveAffinePastJoinConcat(), - MoveMulPastJoinConcat(), - MoveAddPastJoinConcat(), - # Move transposes around to some place where they could be removed - # later, i.e., where they collapse into identities - MoveTransposePastFork(), - MoveTransposePastSplit(), - MoveTransposePastJoinConcat(), - MoveTransposePastEltwise(), - MoveTransposePastJoinMul(), - MoveTransposePastJoinAdd(), - CollapseRepeatedTranspose(), - # Remove identity shape/layout transformations - RemoveIdentityTranspose(), - RemoveIdentityReshape(), - # Squeeze operators can be moved past the thresholding - MoveSqueezePastMultiThreshold(), - # A certain type of 4d-layout transpose can be absorbed (actually - # moved past) MultiThreshold operations - AbsorbTransposeIntoMultiThreshold(), - ]), - # Only round and clip after all streamlining transformations have - # been applied exhaustively. - # Note: Might still enable another round of streamlining. - RoundAndClipThresholds(), - ]) + # The "standard" set of FINN streamlining transformations or at least + # inspired by them but applied exhaustively until none of them changes + # the graph anymore. 
+ # Note: Covers most parts of non-branching linear topologies + ComposedTransformation( + [ + ConvertSubToAdd(), + ConvertDivToMul(), + BatchNormToAffine(), + ConvertSignToThres(), + MoveMulPastMaxPool(), + AbsorbSignBiasIntoMultiThreshold(), + MoveScalarLinearPastInvariants(), + MoveAddPastMul(), + MoveScalarAddPastMatMul(), + MoveAddPastConv(), + MoveScalarMulPastMatMul(), + MoveScalarMulPastConv(), + MoveAddPastMul(), + CollapseRepeatedAdd(), + CollapseRepeatedMul(), + MoveMulPastMaxPool(), + AbsorbAddIntoMultiThreshold(), + FactorOutMulSignMagnitude(), + AbsorbMulIntoMultiThreshold(), + Absorb1BitMulIntoMatMul(), + Absorb1BitMulIntoConv(), + ] + ), + # Streamlining scales and biases forward through residual topologies + # Note: This mostly covers forking and joining operations + ComposedTransformation( + [ + # Note: This is probably the most common way of joining skip + # connections, i.e., this corresponds to the original residual + # addition, i.e., y = f(x) + x + MoveLinearPastEltwiseAdd(), + MoveScalarLinearPastFork(), + MoveScalarLinearPastInvariants(), + MoveMulPastFork(), + MoveMulPastJoinAdd(), + MoveAddPastJoinAdd(), + # Note: This brings constant Muls (i.e., quantizer scales to be + # removed) forward through joining Muls (i.e., those ending up + # as actual hardware operators). 
+ MoveConstMulPastJoinMul(), + ] + ), + # Streamlining scales and biases forward through shape/layout changing + # operations, i.e., mostly transposes + ComposedTransformation( + [ + # Convolution inputs and padding + MoveScalesPastIm2Col(), + # Streamlining for Split and Concat operations + MoveScalarLinearPastSplit(), + MoveAffinePastJoinConcat(), + MoveMulPastJoinConcat(), + MoveAddPastJoinConcat(), + # Move transposes around to some place where they could be removed + # later, i.e., where they collapse into identities + MoveTransposePastFork(), + MoveTransposePastSplit(), + MoveTransposePastJoinConcat(), + MoveTransposePastEltwise(), + MoveTransposePastJoinMul(), + MoveTransposePastJoinAdd(), + CollapseRepeatedTranspose(), + # Remove identity shape/layout transformations + RemoveIdentityTranspose(), + RemoveIdentityReshape(), + # Squeeze operators can be moved past the thresholding + MoveSqueezePastMultiThreshold(), + # A certain type of 4d-layout transpose can be absorbed (actually + # moved past) MultiThreshold operations + AbsorbTransposeIntoMultiThreshold(), + ] + ), + # Only round and clip after all streamlining transformations have + # been applied exhaustively. + # Note: Might still enable another round of streamlining. 
+ RoundAndClipThresholds(), + ] + ) # Prepares the graph to be consumed by FINN: @@ -763,62 +756,64 @@ def prepare_graph(range_info: RangeInfo): # Wrap the actual transformation/build step function def step_prepare_graph(model: ModelWrapper, cfg: DataflowBuildConfig): # Exhaustively apply the set of cleanup transformations - model = model.transform(ComposedTransformation([ - # Adds shape and datatype annotations to all tensors in this graph - InferDataTypes(), - InferShapes(), - # Cleanup the graph by removing redundant, unnecessary and constant - # nodes and tensors and give unique names to everything remaining - GiveUniqueNodeNames(), - GiveReadableTensorNames(), - RemoveStaticGraphInputs(), - RemoveUnusedTensors(), - GiveUniqueParameterTensors(), - FoldConstants(), - # Remove unnecessary shape and layout transformations - RemoveIdentityReshape(), - RemoveIdentityTranspose(), - # Redo shape and datatype annotations after removing nodes and - # tensors - InferShapes(), - InferDataTypes(), - ])) + model = model.transform( + ComposedTransformation( + [ + # Adds shape and datatype annotations to all tensors in this graph + InferDataTypes(), + InferShapes(), + # Cleanup the graph by removing redundant, unnecessary and constant + # nodes and tensors and give unique names to everything remaining + GiveUniqueNodeNames(), + GiveReadableTensorNames(), + RemoveStaticGraphInputs(), + RemoveUnusedTensors(), + GiveUniqueParameterTensors(), + FoldConstants(), + # Remove unnecessary shape and layout transformations + RemoveIdentityReshape(), + RemoveIdentityTranspose(), + # Redo shape and datatype annotations after removing nodes and + # tensors + InferShapes(), + InferDataTypes(), + ] + ) + ) # If configured, run a verification of the transformed model on some # sample inputs - if (VerificationStepType.TIDY_UP_PYTHON in - cfg._resolve_verification_steps()): # noqa - verify_step( - model, cfg, "tidied_up_python", need_parent=False - ) + if VerificationStepType.TIDY_UP_PYTHON in 
cfg._resolve_verification_steps(): # noqa + verify_step(model, cfg, "tidied_up_python", need_parent=False) # Exhaustively apply the lowering transformations - model = model.transform(ComposedTransformation([ - # Moves the bias input to the Conv operator as a separate Add node - # behind the Conv node - ExtractBiasFromConv(), - # Converts Gemm nodes to MatMul (+ bias) - GemmToMatMul(), - # Need to do some constant and weight folding first - FoldConstants(), - FoldTransposeIntoQuantInit(), - FoldQuantWeights(), - # Annotate the graph with shape and data type information - InferShapes(), - InferDataTypes(), - # Converts Conv layers to MatMul - LowerConvsToMatMul(), - # Converts BatchNorm to affine scale and bias - BatchNormToAffine(), - # Annotate the graph with shape and data type information - InferShapes(), - InferDataTypes(), - ])) + model = model.transform( + ComposedTransformation( + [ + # Moves the bias input to the Conv operator as a separate Add node + # behind the Conv node + ExtractBiasFromConv(), + # Converts Gemm nodes to MatMul (+ bias) + GemmToMatMul(), + # Need to do some constant and weight folding first + FoldConstants(), + FoldTransposeIntoQuantInit(), + FoldQuantWeights(), + # Annotate the graph with shape and data type information + InferShapes(), + InferDataTypes(), + # Converts Conv layers to MatMul + LowerConvsToMatMul(), + # Converts BatchNorm to affine scale and bias + BatchNormToAffine(), + # Annotate the graph with shape and data type information + InferShapes(), + InferDataTypes(), + ] + ) + ) # If configured, run a verification of the transformed model on some # sample inputs - if (VerificationStepType.QONNX_TO_FINN_PYTHON in - cfg._resolve_verification_steps()): # noqa - verify_step( - model, cfg, "lowered_python", need_parent=False - ) + if VerificationStepType.QONNX_TO_FINN_PYTHON in cfg._resolve_verification_steps(): # noqa + verify_step(model, cfg, "lowered_python", need_parent=False) # Apply the quantizer to MultiThreshold 
conversion # Note: This is exhaustive as well as single .transform reapplies as # long as possible. @@ -826,26 +821,22 @@ def step_prepare_graph(model: ModelWrapper, cfg: DataflowBuildConfig): # model = model.transform(QuantActivationToMultiThreshold(range_info)) # If configured, run a verification of the transformed model on some # sample inputs - if (VerificationStepType.QONNX_TO_FINN_PYTHON in - cfg._resolve_verification_steps()): # noqa - verify_step( - model, cfg, "quant_to_thresholds_ra_python", need_parent=False - ) + if VerificationStepType.QONNX_TO_FINN_PYTHON in cfg._resolve_verification_steps(): # noqa + verify_step(model, cfg, "quant_to_thresholds_ra_python", need_parent=False) # Apply the standard QONNX to FINN conversion step to convert the # remaining quantizers not yet covered by the new range analysis based # method - model = model.transform(ConvertQONNXtoFINN( - filter_function=default_filter_function_generator( - max_multithreshold_bit_width=cfg.max_multithreshold_bit_width + model = model.transform( + ConvertQONNXtoFINN( + filter_function=default_filter_function_generator( + max_multithreshold_bit_width=cfg.max_multithreshold_bit_width + ) ) - )) + ) # If configured, run a verification of the transformed model on some # sample inputs - if (VerificationStepType.QONNX_TO_FINN_PYTHON in - cfg._resolve_verification_steps()): # noqa - verify_step( - model, cfg, "prepared_graph_python", need_parent=False - ) + if VerificationStepType.QONNX_TO_FINN_PYTHON in cfg._resolve_verification_steps(): # noqa + verify_step(model, cfg, "prepared_graph_python", need_parent=False) # Return the transformed model return model @@ -870,11 +861,8 @@ def step_streamline(model: ModelWrapper, cfg: DataflowBuildConfig): model = model.transform(Streamline()) # If configured, run a verification of the transformed model on some # sample inputs - if (VerificationStepType.STREAMLINED_PYTHON in - cfg._resolve_verification_steps()): # noqa - verify_step( - model, cfg, 
"streamlined_python", need_parent=False - ) + if VerificationStepType.STREAMLINED_PYTHON in cfg._resolve_verification_steps(): # noqa + verify_step(model, cfg, "streamlined_python", need_parent=False) # Return the transformed model return model @@ -904,23 +892,27 @@ def step_convert_attention_to_hw(model: ModelWrapper, _: DataflowBuildConfig): model = model.transform(Squeeze()) # Squeezing might have turned further transpose and reshape operations into # identities (those which just swapped around the dimensions of size 1) - model = model.transform(ComposedTransformation([ - # Move transposes around to some place where they could be removed - # later, i.e., where they collapse into identities - MoveTransposePastFork(), - MoveTransposePastSplit(), - MoveTransposePastJoinConcat(), - MoveTransposePastEltwise(), - MoveTransposePastJoinMul(), - MoveTransposePastJoinAdd(), - CollapseRepeatedTranspose(), - # Remove identity shape/layout transformations - RemoveIdentityTranspose(), - RemoveIdentityReshape(), - # Squeeze operators can be moved past MatMuls and thresholding - MoveSqueezePastMatMul(), - MoveSqueezePastMultiThreshold(), - ])) + model = model.transform( + ComposedTransformation( + [ + # Move transposes around to some place where they could be removed + # later, i.e., where they collapse into identities + MoveTransposePastFork(), + MoveTransposePastSplit(), + MoveTransposePastJoinConcat(), + MoveTransposePastEltwise(), + MoveTransposePastJoinMul(), + MoveTransposePastJoinAdd(), + CollapseRepeatedTranspose(), + # Remove identity shape/layout transformations + RemoveIdentityTranspose(), + RemoveIdentityReshape(), + # Squeeze operators can be moved past MatMuls and thresholding + MoveSqueezePastMatMul(), + MoveSqueezePastMultiThreshold(), + ] + ) + ) # Squeezing might enable absorbing adds into thresholds once again model = model.transform(AbsorbAddIntoMultiThreshold()) # If applicable, absorb the final thresholds into the attention operator @@ -942,9 +934,9 @@ def 
step_convert_attention_to_hw(model: ModelWrapper, _: DataflowBuildConfig): def step_convert_elementwise_binary_to_hw(model: ModelWrapper, _): # Convert elementwise operations to hardware operators # Note: Do not convert the final Mul operator at the output - return model.transform(InferElementwiseBinaryOperation( - InferElementwiseBinaryOperation.reject_output_dequant - )) + return model.transform( + InferElementwiseBinaryOperation(InferElementwiseBinaryOperation.reject_output_dequant) + ) # Converts Split and Concat operations to hardware custom operators @@ -984,13 +976,10 @@ def step_replicate_streams(model: ModelWrapper, _): # Custom step for setting the parallelism to meet the target of T^2 cycles per # sequence -def set_target_parallelization(seq_len: int, - emb_dim: int): # noqa: emb_dim +def set_target_parallelization(seq_len: int, emb_dim: int): # noqa: emb_dim # The wrapping function is a generator and this is the actual build step # function taking the model and build configuration - def step_set_target_parallelization( - model: ModelWrapper, cfg: DataflowBuildConfig - ): + def step_set_target_parallelization(model: ModelWrapper, cfg: DataflowBuildConfig): # Run over all nodes in the model graph to look for attention operators, # which are currently not handled by the SetFolding transformation for index, node in enumerate(model.graph.node): @@ -1006,9 +995,9 @@ def step_set_target_parallelization( inst.set_nodeattr("SeqFold", seq_len) # Apply the built-in folding configuration transformation with the # T^2 target cycles - model = model.transform(SetFolding( - seq_len ** 2, cfg.mvau_wwidth_max, cfg.folding_two_pass_relaxation - )) + model = model.transform( + SetFolding(seq_len**2, cfg.mvau_wwidth_max, cfg.folding_two_pass_relaxation) + ) # TODO: Extract the folding configuration # Return the model with configured parallelization return model @@ -1033,8 +1022,7 @@ def apply(self, model: ModelWrapper): # noqa # Iterate all nodes in the graph keeping track 
of the index for index, node in enumerate(graph.node): # A node should not be named "defaults"... - assert node.name != "defaults", \ - "Node has reserved name 'defaults'" + assert node.name != "defaults", "Node has reserved name 'defaults'" # Convert this to the custom-op instance for easy access to node # attributes inst = getCustomOp(node) @@ -1059,9 +1047,7 @@ def apply(self, model: ModelWrapper): # noqa # Custom build step trying to set appropriate FIFO sizes for the transformer -def set_fifo_depths( - seq_len: int, emb_dim: int, uram_threshold: int = 32 # noqa: emb_dim -): +def set_fifo_depths(seq_len: int, emb_dim: int, uram_threshold: int = 32): # noqa: emb_dim # The wrapping function is a generator and this is the actual build step # function taking the model and build configuration def step_set_fifo_depths(model: ModelWrapper, cfg: DataflowBuildConfig): @@ -1091,9 +1077,7 @@ def step_set_fifo_depths(model: ModelWrapper, cfg: DataflowBuildConfig): # Each folded input stream needs to be buffered completely # TODO: Not exactly sure whether this is always correct or just # the worst-case - in_depths = [ - inst.get_number_input_values(i) for i in range(num_inputs) - ] + in_depths = [inst.get_number_input_values(i) for i in range(num_inputs)] # Note: No special treatment of the output FIFO # out_depths = ... @@ -1113,7 +1097,7 @@ def step_set_fifo_depths(model: ModelWrapper, cfg: DataflowBuildConfig): # figuring out which of the two is the longer/deeper branch # in terms of cycles to set a corresponding buffer only to # the shorter branch. - in_depths = [seq_len ** 2, seq_len ** 2] + in_depths = [seq_len**2, seq_len**2] # Note: No special treatment of the output FIFO # out_depths = ... 
@@ -1131,16 +1115,14 @@ def step_set_fifo_depths(model: ModelWrapper, cfg: DataflowBuildConfig): # no other depth is specified) model = model.transform(InsertFIFO(create_shallow_fifos=True)) # Specialize the implementation variant of the (newly added FIFO) layers - model = model.transform( - SpecializeLayers(cfg._resolve_fpga_part()) # noqa: Access _ method - ) + model = model.transform(SpecializeLayers(cfg._resolve_fpga_part())) # noqa: Access _ method model = model.transform(GiveUniqueNodeNames()) model = model.transform(GiveReadableTensorNames()) # Only applies if a configuration file is given if cfg.folding_config_file is not None: # Load the configuration dictionary form YAML file - with (open(cfg.folding_config_file, "r") as file): + with open(cfg.folding_config_file, "r") as file: # Load YAML string config = yaml.safe_load(file) # Assign unique names to the nodes which can be matched by @@ -1232,9 +1214,7 @@ def step_set_fifo_depths(model: ModelWrapper, cfg: DataflowBuildConfig): # After FIFOs are ready to go, call PrepareIP and HLSSynthIP again # this will only run for the new nodes (e.g. 
FIFOs and DWCs) model = model.transform( - PrepareIP( - cfg._resolve_fpga_part(), cfg._resolve_hls_clk_period() # noqa - ) + PrepareIP(cfg._resolve_fpga_part(), cfg._resolve_hls_clk_period()) # noqa ) model = model.transform(HLSSynthIP()) @@ -1250,7 +1230,7 @@ def step_apply_folding_config(model: ModelWrapper, cfg: DataflowBuildConfig): # Only applies if a configuration file is given if cfg.folding_config_file is not None: # Load the configuration dictionary form YAML file - with (open(cfg.folding_config_file, "r") as file): + with open(cfg.folding_config_file, "r") as file: # Load YAML string config = yaml.safe_load(file) # Assign unique names to the nodes which can be matched by @@ -1260,8 +1240,7 @@ def step_apply_folding_config(model: ModelWrapper, cfg: DataflowBuildConfig): model = model.transform(ApplyConfig(config)) # If configured, run a verification of the transformed model on some sample # inputs - if (VerificationStepType.FOLDED_HLS_CPPSIM in - cfg._resolve_verification_steps()): # noqa + if VerificationStepType.FOLDED_HLS_CPPSIM in cfg._resolve_verification_steps(): # noqa # Prepare C++ Simulation for verification model = model.transform(PrepareCppSim()) model = model.transform(CompileCppSim()) @@ -1331,9 +1310,7 @@ def node_by_node_rtlsim(model: ModelWrapper, cfg: DataflowBuildConfig): model = model.transform(SetExecMode("rtlsim")) # Generates the C++ source and compiles the RTL simulation model = model.transform(GiveUniqueNodeNames()) - model = model.transform(PrepareIP( - cfg._resolve_fpga_part(), cfg.synth_clk_period_ns) # noqa - ) + model = model.transform(PrepareIP(cfg._resolve_fpga_part(), cfg.synth_clk_period_ns)) # noqa model = model.transform(HLSSynthIP()) model = model.transform(PrepareRTLSim()) From cb7152939516fc341d718edcff16b28e6c1672a1 Mon Sep 17 00:00:00 2001 From: Felix Jentzsch Date: Wed, 29 Jan 2025 15:04:24 +0000 Subject: [PATCH 004/125] Fix imports --- benchmarking/bench.py | 2 -- 1 file changed, 2 deletions(-) diff --git 
a/benchmarking/bench.py b/benchmarking/bench.py index db6f00c159..b34951f34b 100644 --- a/benchmarking/bench.py +++ b/benchmarking/bench.py @@ -8,8 +8,6 @@ from dut.mvau import bench_mvau from dut.transformer import bench_transformer -from dut.transformer_radioml import bench_transformer_radioml -from dut.transformer_gpt import bench_transformer_gpt from dut.fifosizing import bench_fifosizing, bench_metafi_fifosizing, bench_resnet50_fifosizing From 7d8a5f153f16f854ef9a227c3baac2552ca9c914 Mon Sep 17 00:00:00 2001 From: Felix Jentzsch Date: Wed, 29 Jan 2025 15:40:50 +0000 Subject: [PATCH 005/125] Add convformer, workaround streamlining --- benchmarking/cfg/transformer_radioml_all.json | 5 +++++ benchmarking/dut/transformer_custom_steps.py | 3 ++- 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/benchmarking/cfg/transformer_radioml_all.json b/benchmarking/cfg/transformer_radioml_all.json index 7dbdc217d7..f2000fb9c3 100644 --- a/benchmarking/cfg/transformer_radioml_all.json +++ b/benchmarking/cfg/transformer_radioml_all.json @@ -3,5 +3,10 @@ "seed": [12], "model_dir": ["/pc2/groups/hpc-prf-ekiapp/felix/ci_models/radioml_0"], "dut_duplication": [1] + }, + { + "seed": [12], + "model_dir": ["/pc2/groups/hpc-prf-ekiapp/felix/ci_models/radioml_convformer"], + "dut_duplication": [1] } ] diff --git a/benchmarking/dut/transformer_custom_steps.py b/benchmarking/dut/transformer_custom_steps.py index 2dc387a94a..91bdebb206 100644 --- a/benchmarking/dut/transformer_custom_steps.py +++ b/benchmarking/dut/transformer_custom_steps.py @@ -179,6 +179,7 @@ MoveAddPastMul, MoveAffinePastJoinConcat, MoveLinearPastEltwiseAdd, + MoveLinearPastFork, MoveMulPastFork, MoveMulPastJoinAdd, MoveMulPastJoinConcat, @@ -696,7 +697,7 @@ def Streamline(): # noqa: Uppercase # connections, i.e., this corresponds to the original residual # addition, i.e., y = f(x) + x MoveLinearPastEltwiseAdd(), - MoveScalarLinearPastFork(), + MoveLinearPastFork(), #DEBUG for positional encoding 
streamlining, MoveScalarLinearPastFork() MoveScalarLinearPastInvariants(), MoveMulPastFork(), MoveMulPastJoinAdd(), From 51a5fdf21355760e5f40efaa849e37ebe77b8af6 Mon Sep 17 00:00:00 2001 From: Felix Jentzsch Date: Wed, 29 Jan 2025 21:22:31 +0000 Subject: [PATCH 006/125] Combine test and benchmark CI defs --- .gitlab-ci.yml | 152 +++++++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 141 insertions(+), 11 deletions(-) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index ebfa2f6f88..b44a26cdc1 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -1,37 +1,107 @@ stages: - - update - - build + - sync + - singularity_build - load_deps - test - - trigger_benchmarks variables: PIPELINE_NAME: description: "Optional name to better identify this pipeline" value: "" + TEST_SUITE: + description: "Select test suite to run" + value: "full" + options: + - "none" + - "quicktest" + - "main" + - "rtlsim" + - "end2end" + - "full" CPU_CORES: description: "Select number of CPU cores and test workers" value: "8" PARALLEL_JOBS: - description: "Number of parallel Slurm array jobs per CI job" + description: "Number of parallel Slurm array jobs per Benchmark job" value: "2" SLURM_TIMEOUT: - description: "Timeout" - value: "2-0" # [days-hours] - MANUAL_CFG_PATH: - description: "Use this config file instead of configs stored in the repo. Path must be accessible to runner" - value: "" + description: "Select SLURM timeout" + value: "3-0" # [days-hours] SLURM_PARTITION: description: "Slurm partition (e.g., normal, largemem, fpga, gpu)" value: "normal" SLURM_QOS: description: "Optional QoS option (include --qos, e.g., --qos express)" value: "" + MANUAL_CFG_PATH: + description: "Use this config file instead of configs stored in the repo. 
Path must be accessible to runner" + value: "" FINN_XILINX_VERSION: value: "2022.2" + SINGULARITY_IMG_SELECT: + value: "finn_dev.sif" workflow: name: '$PIPELINE_NAME' + rules: + # Run pipeline for GitHub PRs to dev (does not support PRs from forks) + - if: $CI_PIPELINE_SOURCE == "external_pull_request_event" && $CI_EXTERNAL_PULL_REQUEST_TARGET_BRANCH_NAME == "dev" + # Run pipeline for pushes to dev + - if: $CI_COMMIT_BRANCH == "dev" + # Run pipeline if manually triggered via API or web GUI + - if: $CI_PIPELINE_SOURCE == "api" + - if: $CI_PIPELINE_SOURCE == "web" + # Run pipeline if scheduled (only for nightly sync of finn-dev) + - if: $CI_PIPELINE_SOURCE == "schedule" + +Sync finn-dev: + id_tokens: + CI_JOB_JWT: + aud: https://git.uni-paderborn.de + stage: sync + tags: + # Run where full Docker + Singularity is available + - image_build + rules: + # Only run on a schedule + - if: $CI_PIPELINE_SOURCE == "schedule" + script: + - mkdir -p ../github_clone && cd ../github_clone + - rm -rf finn-plus # Ensure we do a fresh clone (TODO: better way to handle this on job level?) + - git clone git@github.com:eki-project/finn-plus.git && cd finn-plus + - git remote add upstream https://github.com/Xilinx/finn.git + - git checkout finn-dev + - git pull upstream dev + - git push origin finn-dev + +Singularity Image Build: + id_tokens: + CI_JOB_JWT: + aud: https://git.uni-paderborn.de + stage: singularity_build + tags: + # Run where full Docker + Singularity is available + - image_build + rules: + # Do not run on a schedule + - if: $CI_PIPELINE_SOURCE == "schedule" + when: never + # Only run if relevant files changed relative to dev branch + - changes: + paths: + - requirements.txt + - docker/Dockerfile.finn + - docker/finn_entrypoint.sh + - docker/quicktest.sh + compare_to: "dev" + script: + - docker build --no-cache -f docker/Dockerfile.finn --tag=finn_docker_export . 
+ - apptainer build --force finn_singularity_image.sif docker-daemon://finn_docker_export:latest + - rsync -vh finn_singularity_image.sif $PATH_SINGULARITY_IMG_BUILD/finn-plus/finn_$CI_COMMIT_REF_SLUG.sif + after_script: # Clean caches + - echo 'y' | docker image prune + - echo 'y' | docker builder prune + - echo 'y' | apptainer cache clean Fetch Repos: id_tokens: @@ -40,6 +110,12 @@ Fetch Repos: stage: load_deps tags: - login + rules: + # Do not run on a schedule + - if: $CI_PIPELINE_SOURCE == "schedule" + when: never + # Otherwise run + - when: always cache: key: $CI_COMMIT_SHA paths: @@ -47,9 +123,58 @@ Fetch Repos: script: - ./fetch-repos.sh +FINN Test Suite 2022.2: + id_tokens: + CI_JOB_JWT: + aud: https://git.uni-paderborn.de + stage: test + rules: + # Do not run on a schedule + - if: $CI_PIPELINE_SOURCE == "schedule" + when: never + # Do not run if test suite has been deselected + - if: $TEST_SUITE == "none" + when: never + # Select different Singularity image if it deviates from default (dev branch) + - changes: + paths: + - requirements.txt + - docker/Dockerfile.finn + - docker/finn_entrypoint.sh + - docker/quicktest.sh + compare_to: "dev" + variables: + SINGULARITY_IMG_SELECT: "finn_$CI_COMMIT_REF_SLUG.sif" + # Always run, as long as there was no prior failure + - when: on_success + cache: + key: $CI_COMMIT_SHA + policy: pull + paths: + - deps + variables: + SCHEDULER_PARAMETERS: "-A $PROJECT_ACCOUNT -p $SLURM_PARTITION -t $SLURM_TIMEOUT $SLURM_QOS --nodes 1 --ntasks 1 --cpus-per-task $CPU_CORES --exclusive" + PYTEST_PARALLEL: "$CPU_CORES" + FINN_SINGULARITY: "$PATH_SINGULARITY_IMG/finn-plus/$SINGULARITY_IMG_SELECT" + FINN_XILINX_VERSION: "2022.2" + before_script: + - cp -dfR .. $PATH_WORKDIR # Copy to working directory (e.g. 
RAMdisk) + - cd $PATH_WORKDIR/finn-plus + - module load system singularity + script: + - ./run-docker.sh quicktest.sh $TEST_SUITE + +FINN Test Suite 2024.1: + extends: FINN Test Suite 2022.2 + variables: + FINN_XILINX_VERSION: "2024.1" + Bench (Manual): - stage: trigger_benchmarks + stage: test rules: + # Do not run on a schedule + - if: $CI_PIPELINE_SOURCE == "schedule" + when: never - if: $MANUAL_CFG_PATH != "" trigger: include: benchmarking/bench-ci.yml @@ -60,8 +185,11 @@ Bench (Manual): BENCH_CFG: "manual" Bench: - stage: trigger_benchmarks + stage: test rules: + # Do not run on a schedule + - if: $CI_PIPELINE_SOURCE == "schedule" + when: never - if: $MANUAL_CFG_PATH == "" trigger: include: benchmarking/bench-ci.yml @@ -76,6 +204,8 @@ Bench: #fifo: fifosizing_test, metafi_fifosizing_test, resnet50_fifosizing_test #transformer: transformer_test, transformer_radioml_all +#TODO: add selector for none, reduced, full benchmark suite + #TODO: introduce result collect job on parent level for easier visualization/excel interfacing #TODO: more control via (optional) variables #TODO: move power measurement from polling-based script to its own job/runner From 941984e6f5116ec1318fddb278b53ca1437bc50c Mon Sep 17 00:00:00 2001 From: Felix Jentzsch Date: Thu, 30 Jan 2025 15:14:37 +0000 Subject: [PATCH 007/125] Refactor DUTs --- .gitlab-ci.yml | 2 +- benchmarking/bench.py | 41 +-- benchmarking/bench_base.py | 246 ++++++++++++---- benchmarking/cfg/fifosizing_test.json | 5 +- benchmarking/cfg/metafi_fifosizing_test.json | 7 +- benchmarking/cfg/metafi_test.json | 10 + benchmarking/cfg/mvau_test.json | 1 + .../cfg/resnet50_fifosizing_test.json | 8 +- benchmarking/cfg/resnet50_test.json | 13 + benchmarking/cfg/transformer_gpt_all.json | 4 + benchmarking/cfg/transformer_radioml_all.json | 2 + benchmarking/cfg/transformer_sweep.json | 5 + benchmarking/cfg/transformer_test.json | 1 + benchmarking/dut/metafi.py | 83 ++++++ benchmarking/dut/resnet50.py | 57 ++++ .../{fifosizing.py => 
synthetic_nonlinear.py} | 263 +----------------- benchmarking/dut/transformer.py | 47 ++-- 17 files changed, 430 insertions(+), 365 deletions(-) create mode 100644 benchmarking/cfg/metafi_test.json create mode 100644 benchmarking/cfg/resnet50_test.json create mode 100644 benchmarking/dut/metafi.py create mode 100644 benchmarking/dut/resnet50.py rename benchmarking/dut/{fifosizing.py => synthetic_nonlinear.py} (50%) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 2d28f34602..066a7dc289 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -198,7 +198,7 @@ Bench: pipeline_variables: true parallel: matrix: - - BENCH_CFG: [mvau_test] + - BENCH_CFG: [mvau_test, resnet50_test, metafi_test] #dev: mvau_test #fifo: fifosizing_test, metafi_fifosizing_test, resnet50_fifosizing_test diff --git a/benchmarking/bench.py b/benchmarking/bench.py index b34951f34b..f3a4c0f424 100644 --- a/benchmarking/bench.py +++ b/benchmarking/bench.py @@ -7,9 +7,22 @@ import onnxruntime as ort from dut.mvau import bench_mvau -from dut.transformer import bench_transformer -from dut.fifosizing import bench_fifosizing, bench_metafi_fifosizing, bench_resnet50_fifosizing - +from dut.resnet50 import bench_resnet50 +from dut.metafi import bench_metafi +from dut.synthetic_nonlinear import bench_synthetic_nonlinear + +dut = dict() +dut["mvau"] = bench_mvau +dut["resnet50"] = bench_resnet50 +dut["metafi"] = bench_metafi +dut["synthetic_nonlinear"] = bench_synthetic_nonlinear + +# TODO: remove guard once transformer support has been fully merged +try: + from dut.transformer import bench_transformer + dut["transformer"] = bench_transformer +except ImportError: + pass def main(config_name): exit_code = 0 @@ -124,20 +137,16 @@ def get_default_session_options_new(): log_dict = {"run_id": run_id, "task_id": task_id, "params": params} - # Determine which DUT to run TODO: do this lookup more generically? - # give bench subclass name directly in config? 
- if config_select.startswith("mvau"): - bench_object = bench_mvau(params, task_id, run_id, artifacts_dir, save_dir) - elif config_select.startswith("transformer"): - bench_object = bench_transformer(params, task_id, run_id, artifacts_dir, save_dir) - elif config_select.startswith("fifosizing"): - bench_object = bench_fifosizing(params, task_id, run_id, artifacts_dir, save_dir) - elif config_select.startswith("metafi_fifosizing"): - bench_object = bench_metafi_fifosizing(params, task_id, run_id, artifacts_dir, save_dir) - elif config_select.startswith("resnet50_fifosizing"): - bench_object = bench_resnet50_fifosizing(params, task_id, run_id, artifacts_dir, save_dir) + # Create bench object for respective DUT + if "dut" in params: + if params.dut in dut: + bench_object = dut[params.dut](params, task_id, run_id, artifacts_dir, save_dir) + else: + print("ERROR: unknown DUT specified") + return 1 else: - print("ERROR: unknown DUT specified") + print("ERROR: no DUT specified") + return 1 start_time = time.time() try: diff --git a/benchmarking/bench_base.py b/benchmarking/bench_base.py index 5c191d911f..0bd7be6907 100644 --- a/benchmarking/bench_base.py +++ b/benchmarking/bench_base.py @@ -6,6 +6,7 @@ import time import traceback import glob +import numpy as np from shutil import copy as shcopy from shutil import copytree import finn.core.onnx_exec as oxe @@ -33,6 +34,7 @@ gen_finn_dt_tensor, roundup_to_integer_multiple, ) +import finn.builder.build_dataflow as build from finn.analysis.fpgadataflow.post_synth_res import post_synth_res from qonnx.core.modelwrapper import ModelWrapper from finn.builder.build_dataflow_config import DataflowBuildConfig @@ -613,21 +615,22 @@ def save_local_artifacts_collection(self): for (name, source_path) in self.local_artifacts_collection: self.save_local_artifact(name, source_path) + # only used in simple flow (TODO: unify) def step_make_model(self): - # may be implemented in subclass pass - + + # only used in full build flow def 
step_export_onnx(self): - # may be implemented in subclass pass - def step_build(self): - # may be implemented in subclass + # only used in full build flow + def step_build_setup(self): pass + # defaults to full build flow + # may be overwritten by subclass (e.g., to call simple flow instead) def run(self): - # must be implemented in subclass - pass + self.steps_full_build_flow() def step_finn_estimate(self): # Gather FINN estimates @@ -813,51 +816,172 @@ def step_synth_power(self): def step_parse_builder_output(self, build_dir): # Used to parse selected reports/logs into the output json dict for DUTs that use a full FINN builder flow - # COPY bitstreams and other outputs - # TODO: integrate better (e.g. as artifact) and remove redundant copy - # TODO: make this more configurable or switch to job/artifact based power measurement - # TODO: make compatible to new instr wrapper (or however we generate these outputs) - shcopy(os.path.join(build_dir, "harness/top_wrapper.bit"), - os.path.join(self.save_dir_bitstreams, "run_%d.bit" % self.run_id)) - shcopy(os.path.join(build_dir, "harness/top.hwh"), - os.path.join(self.save_dir_bitstreams, "run_%d.hwh" % self.run_id)) - shcopy(os.path.join(build_dir, "harness/synth_report.xml"), - os.path.join(self.save_dir_bitstreams, "run_%d.xml" % self.run_id)) - clock_period_mhz = int(1.0 / self.clock_period_ns * 1000.0) - measurement_settings = {"freq_mhz": clock_period_mhz} - with open(os.path.join(self.save_dir_bitstreams, "run_%d_settings.json"%self.run_id), "w") as f: - json.dump(measurement_settings, f, indent=2) + ### SAVE BITSTREAMS ### + if (os.path.exists(os.path.join(build_dir, "harness"))): + # TODO: integrate better (e.g. 
as artifact) and remove redundant copy + # TODO: make this more configurable or switch to job/artifact based power measurement + # TODO: make compatible to new instr wrapper (or however we generate these outputs) + shcopy(os.path.join(build_dir, "harness/top_wrapper.bit"), + os.path.join(self.save_dir_bitstreams, "run_%d.bit" % self.run_id)) + shcopy(os.path.join(build_dir, "harness/top.hwh"), + os.path.join(self.save_dir_bitstreams, "run_%d.hwh" % self.run_id)) + shcopy(os.path.join(build_dir, "harness/synth_report.xml"), + os.path.join(self.save_dir_bitstreams, "run_%d.xml" % self.run_id)) + clock_period_mhz = int(1.0 / self.clock_period_ns * 1000.0) + measurement_settings = {"freq_mhz": clock_period_mhz} + with open(os.path.join(self.save_dir_bitstreams, "run_%d_settings.json"%self.run_id), "w") as f: + json.dump(measurement_settings, f, indent=2) + else: + pass #TODO: warn/skip? + + ### CHECK FOR VERIFICATION STEP SUCCESS ### + if (os.path.exists(os.path.join(build_dir, "verification_output"))): + # Collect all verification output filenames + outputs = glob.glob(os.path.join(build_dir, "verification_output/*.npy")) + # Extract the verification status for each verification output by matching + # to the SUCCESS string contained in the filename + status = all([ + out.split("_")[-1].split(".")[0] == "SUCCESS" for out in outputs + ]) + + # Construct a dictionary reporting the verification status as string + self.output_dict["builder_verification"] = {"verification": {True: "success", False: "fail"}[status]} + # TODO: mark job as failed if verification fails + else: + pass #TODO: warn/skip? + + ### PARSE SYNTH RESOURCE REPORT ### + if (os.path.exists(os.path.join(build_dir, "harness/post_synth_resources.json"))): + report_path = os.path.join(build_dir, "harness/post_synth_resources.json") + # TODO: check multiple possible sources for this log (e.g. 
if OOC synth or Zynbuild was run) + report_filter = "(top)" + # Open the report file + with open(report_path) as file: + # Load the JSON formatted report + report = pd.read_json(file, orient="index") + # Filter the reported rows according to some regex filter rule + report = report.filter(regex=report_filter, axis="rows") + # Generate a summary of the total resources + summary = report.sum() + + #TODO: parse finn estimates, hls estimates, step times, rtlsim performance(rtlsim n=1, n=100) + #TODO: optional simulation of instr wrapper instead of running on hw + + self.output_dict["builder"] = summary.to_dict() + else: + pass #TODO: warn/skip? + + ### ANALYZE FIFOs ### + fifo_info = {} + # TODO: skip if not present + model_final = ModelWrapper(build_dir + "/intermediate_models/step_create_stitched_ip.onnx") + + fifo_info["fifo_depths"] = {} + fifo_info["fifo_sizes"] = {} + total_fifo_size = 0 + for node in model_final.get_nodes_by_op_type("StreamingFIFO_rtl"): + node_inst = getCustomOp(node) + fifo_info["fifo_depths"][node.name] = node_inst.get_nodeattr("depth") + fifo_info["fifo_sizes"][node.name] = node_inst.get_instream_width() * node_inst.get_nodeattr("depth") + total_fifo_size += fifo_info["fifo_sizes"][node.name] + fifo_info["total_fifo_size_kB"] = int(total_fifo_size / 8.0 / 1000.0) + + self.output_dict["fifos"] = fifo_info + + def step_fifotest(self, onnx_path, cfg, build_dir): + # requires certain output products (e.g., ESTIMATE_REPORTS, RTLSIM_PERFORMANCE) + # TODO: check them and skip/warn if missing + log = {} + # load performance reports + with open(build_dir + "/report/estimate_network_performance.json") as f: + est_data = json.load(f) + with open(build_dir + "/report/rtlsim_performance.json") as f: + sim_data = json.load(f) + + # check for deadlock + model_final = ModelWrapper(build_dir + "/intermediate_models/step_create_stitched_ip.onnx") + first_node = getCustomOp(model_final.find_consumer(model_final.graph.input[0].name)) + last_node = 
getCustomOp(model_final.find_producer(model_final.graph.output[0].name)) + input_txns_expected = np.prod(first_node.get_folded_input_shape()[:-1]) * self.params["fifo_rtlsim_n"] + output_txns_expected = np.prod(last_node.get_folded_output_shape()[:-1]) * self.params["fifo_rtlsim_n"] + deadlock = sim_data["N_IN_TXNS"] != input_txns_expected or sim_data["N_OUT_TXNS"] != output_txns_expected + log["deadlock"] = deadlock.tolist() + + # check rtlsim throughput + throughput = sim_data["throughput[images/s]"] + stable_throughput = sim_data["stable_throughput[images/s]"] + estimated_throughput = est_data["estimated_throughput_fps"] + throughput_factor = throughput / estimated_throughput + stable_throughput_factor = stable_throughput / estimated_throughput + + # TODO: Take throughput or stable_throughput? + throughput_pass = throughput_factor > self.params["fifo_throughput_factor_threshold"] + + log["throughput_pass"] = throughput_pass + log["throughput"] = throughput + log["stable_throughput"] = stable_throughput + log["estimated_throughput"] = estimated_throughput + + # reduce individual FIFO sizes by some amount and observe throughput drop or deadlock appear + fifo_reduction_pass = [] + log["fifo_reduction_results"] = {} + model_orig = ModelWrapper(build_dir + "/intermediate_models/step_hw_ipgen.onnx") + for node_orig in model_orig.get_nodes_by_op_type("StreamingFIFO_rtl"): + model = copy.deepcopy(model_orig) + node = model.get_node_from_name(node_orig.name) + node_inst = getCustomOp(node) + + # skip shallow FIFOs + # TODO: do we need to consider rounding-up of FIFO depths for impl_style=vivado? 
+ if node_inst.get_nodeattr("depth") <= self.params["fifo_reduction_skip_threshold"]: + log["fifo_reduction_results"][node.name] = "skip" + continue + + # reduce depth of current FIFO and reset generated code + node_inst.set_nodeattr("depth", int(node_inst.get_nodeattr("depth") * self.params["fifo_reduction_factor"])) + node_inst.set_nodeattr("code_gen_dir_ipgen", "") + node_inst.set_nodeattr("ip_path", "") + node_inst.set_nodeattr("ipgen_path", "") + + # save model variation + tmp_output_dir_var = build_dir + "/variations/" + node.name + os.makedirs(tmp_output_dir_var) + model.save(tmp_output_dir_var + "/model.onnx") + + # build again, only re-run necessary steps to save time + cfg.output_dir = tmp_output_dir_var + cfg.steps = ["step_hw_codegen", "step_create_stitched_ip", "step_measure_rtlsim_performance"] + build.build_dataflow_cfg(tmp_output_dir_var + "/model.onnx", cfg) + + # load performance report + with open(tmp_output_dir_var + "/report/rtlsim_performance.json") as f: + sim_data = json.load(f) + + # check for deadlock + model_final = ModelWrapper(tmp_output_dir_var + "/intermediate_models/step_create_stitched_ip.onnx") + first_node = getCustomOp(model_final.find_consumer(model_final.graph.input[0].name)) + last_node = getCustomOp(model_final.find_producer(model_final.graph.output[0].name)) + input_txns_expected = np.prod(first_node.get_folded_input_shape()[:-1]) * self.params["fifo_rtlsim_n"] + output_txns_expected = np.prod(last_node.get_folded_output_shape()[:-1]) * self.params["fifo_rtlsim_n"] + var_deadlock = sim_data["N_IN_TXNS"] != input_txns_expected or sim_data["N_OUT_TXNS"] != output_txns_expected + + # check rtlsim throughput + var_throughput = sim_data["throughput[images/s]"] + var_stable_throughput = sim_data["stable_throughput[images/s]"] + # TODO: take throughput or stable_throughput? 
+ throughput_drop = (throughput - var_throughput) / throughput + + if var_deadlock: + fifo_reduction_pass.append(True) + log["fifo_reduction_results"][node.name] = 1.0 + elif throughput_drop > self.params["fifo_reduction_throughput_drop_threshold"]: + fifo_reduction_pass.append(True) + log["fifo_reduction_results"][node.name] = throughput_drop + else: + fifo_reduction_pass.append(False) + log["fifo_reduction_results"][node.name] = "fail (no drop)" - # CHECK FOR VERIFICATION STEP SUCCESS - # Collect all verification output filenames - outputs = glob.glob(os.path.join(build_dir, "verification_output/*.npy")) - # Extract the verification status for each verification output by matching - # to the SUCCESS string contained in the filename - status = all([ - out.split("_")[-1].split(".")[0] == "SUCCESS" for out in outputs - ]) - - # Construct a dictionary reporting the verification status as string - self.output_dict["builder_verification"] = {"verification": {True: "success", False: "fail"}[status]} - # TODO: mark job as failed if verification fails - - # PARSE LOGS - report_path = os.path.join(build_dir, "harness/post_synth_resources.json") - # TODO: check multiple possible sources for this log (e.g. 
if OOC synth or Zynbuild was run) - report_filter = "(top)" - # Open the report file - with open(report_path) as file: - # Load the JSON formatted report - report = pd.read_json(file, orient="index") - # Filter the reported rows according to some regex filter rule - report = report.filter(regex=report_filter, axis="rows") - # Generate a summary of the total resources - summary = report.sum() - - #TODO: parse finn estimates, hls estimates, step times, (rtlsim n=1, n=100) - #TODO: add vivado latency simulation for special transformer case - - self.output_dict["builder"] = summary.to_dict() + self.output_dict["fifos"]["fifotest"] = log def steps_simple_model_flow(self): # Default step sequence for benchmarking a simple model (mostly single operators/custom_ops) @@ -898,6 +1022,7 @@ def steps_simple_model_flow(self): def steps_full_build_flow(self): # Default step sequence for benchmarking a full FINN builder flow + ### SETUP ### # Use a temporary dir for buildflow-related files (next to FINN_BUILD_DIR) # Ensure it exists but is empty (clear potential artifacts from previous runs) tmp_buildflow_dir = os.path.join(os.environ["PATH_WORKDIR"], "buildflow") @@ -907,6 +1032,7 @@ def steps_full_build_flow(self): os.makedirs(self.build_inputs["build_dir"], exist_ok=True) self.local_artifacts_collection.append(("build_output", self.build_inputs["build_dir"])) + ### MODEL CREATION/IMPORT ### if "model_dir" in self.params: # input ONNX model and verification input/output pairs are provided model_dir = self.params["model_dir"] @@ -928,6 +1054,22 @@ def steps_full_build_flow(self): if "floorplan_path" in self.params: self.build_inputs["floorplan_path"] = self.params["floorplan_path"] - self.step_build() + ### BUILD SETUP ### + cfg = self.step_build_setup() + cfg.board = self.board + if "folding_path" in self.build_inputs: + cfg.folding_config_file = self.build_inputs["folding_path"] + if "specialize_path" in self.build_inputs: + cfg.specialize_layers_config_file = 
self.build_inputs["specialize_path"] + if "floorplan_path" in self.build_inputs: + cfg.floorplan_path = self.build_inputs["floorplan_path"] + ### BUILD ### + build.build_dataflow_cfg(self.build_inputs["onnx_path"], cfg) + + ### ANALYSIS ### self.step_parse_builder_output(self.build_inputs["build_dir"]) + + # Only run in-depth FIFO test if selected + if "fifo_rtlsim_n" in self.params: + self.step_fifotest(self.build_inputs["onnx_path"], cfg, self.build_inputs["build_dir"]) diff --git a/benchmarking/cfg/fifosizing_test.json b/benchmarking/cfg/fifosizing_test.json index 890f4c5b66..519b7fe430 100644 --- a/benchmarking/cfg/fifosizing_test.json +++ b/benchmarking/cfg/fifosizing_test.json @@ -1,5 +1,6 @@ [ { + "dut": "synthetic_nonlinear", "dim": [32], "kernel_size": [5], "ch": [4], @@ -12,8 +13,8 @@ "strategy": ["analytical", "rtlsim"], - "rtlsim_n": [10], - "throughput_factor_threshold": [0.9], + "fifo_rtlsim_n": [10], + "fifo_throughput_factor_threshold": [0.9], "fifo_reduction_skip_threshold": [64], "fifo_reduction_factor": [0.5], "fifo_reduction_throughput_drop_threshold": [0.01] diff --git a/benchmarking/cfg/metafi_fifosizing_test.json b/benchmarking/cfg/metafi_fifosizing_test.json index 2a3aa895ab..7540949eaf 100644 --- a/benchmarking/cfg/metafi_fifosizing_test.json +++ b/benchmarking/cfg/metafi_fifosizing_test.json @@ -1,15 +1,14 @@ [ { + "dut": "metafi", "model_path": ["/pc2/groups/hpc-prf-ekiapp/felix/ci_models/metafi/model.onnx"], "folding_path": ["/pc2/groups/hpc-prf-ekiapp/felix/ci_models/metafi/auto_folding_config _metaFi_f25.json"], "board": ["RFSoC2x2"], "clock_period_ns": [10], - "strategy": ["analytical"], - - "rtlsim_n": [10], - "throughput_factor_threshold": [0.9], + "fifo_rtlsim_n": [10], + "fifo_throughput_factor_threshold": [0.9], "fifo_reduction_skip_threshold": [1024], "fifo_reduction_factor": [0.5], "fifo_reduction_throughput_drop_threshold": [0.01] diff --git a/benchmarking/cfg/metafi_test.json b/benchmarking/cfg/metafi_test.json new file mode 
100644 index 0000000000..63a26d0dbc --- /dev/null +++ b/benchmarking/cfg/metafi_test.json @@ -0,0 +1,10 @@ +[ + { + "dut": "metafi", + "model_path": ["/pc2/groups/hpc-prf-ekiapp/felix/ci_models/metafi/model.onnx"], + "folding_path": ["/pc2/groups/hpc-prf-ekiapp/felix/ci_models/metafi/auto_folding_config _metaFi_f25.json"], + + "board": ["RFSoC2x2"], + "clock_period_ns": [10] + } + ] \ No newline at end of file diff --git a/benchmarking/cfg/mvau_test.json b/benchmarking/cfg/mvau_test.json index 0c3abdb574..e9fc3358b5 100644 --- a/benchmarking/cfg/mvau_test.json +++ b/benchmarking/cfg/mvau_test.json @@ -1,5 +1,6 @@ [ { + "dut": ["mvau"], "idt": ["INT4","INT2"], "wdt": ["INT4"], "act": ["INT4"], diff --git a/benchmarking/cfg/resnet50_fifosizing_test.json b/benchmarking/cfg/resnet50_fifosizing_test.json index fbb0075dae..9ded5630f0 100644 --- a/benchmarking/cfg/resnet50_fifosizing_test.json +++ b/benchmarking/cfg/resnet50_fifosizing_test.json @@ -1,5 +1,7 @@ [ { + "dut": "resnet50", + "model_path": ["/pc2/groups/hpc-prf-ekiapp/felix/ci_models/ResNet-50/resnet50_w1a2_exported.onnx"], "folding_path": ["/pc2/groups/hpc-prf-ekiapp/felix/ci_models/ResNet-50/U250_folding_config.json"], "specialize_path": ["/pc2/groups/hpc-prf-ekiapp/felix/ci_models/ResNet-50/U250_specialize_layers.json"], @@ -8,10 +10,8 @@ "board": ["U280"], "clock_period_ns": [4], - "strategy": ["analytical"], - - "rtlsim_n": [2], - "throughput_factor_threshold": [0.9], + "fifo_rtlsim_n": [2], + "fifo_throughput_factor_threshold": [0.9], "fifo_reduction_skip_threshold": [1024], "fifo_reduction_factor": [0.5], "fifo_reduction_throughput_drop_threshold": [0.01] diff --git a/benchmarking/cfg/resnet50_test.json b/benchmarking/cfg/resnet50_test.json new file mode 100644 index 0000000000..bb9a65873e --- /dev/null +++ b/benchmarking/cfg/resnet50_test.json @@ -0,0 +1,13 @@ +[ + { + "dut": "resnet50", + + "model_path": ["/pc2/groups/hpc-prf-ekiapp/felix/ci_models/ResNet-50/resnet50_w1a2_exported.onnx"], + 
"folding_path": ["/pc2/groups/hpc-prf-ekiapp/felix/ci_models/ResNet-50/U250_folding_config.json"], + "specialize_path": ["/pc2/groups/hpc-prf-ekiapp/felix/ci_models/ResNet-50/U250_specialize_layers.json"], + "floorplan_path": ["/pc2/groups/hpc-prf-ekiapp/felix/ci_models/ResNet-50/floorplan_resnet50.json"], + + "board": ["U280"], + "clock_period_ns": [4] + } + ] \ No newline at end of file diff --git a/benchmarking/cfg/transformer_gpt_all.json b/benchmarking/cfg/transformer_gpt_all.json index 27c426606e..fd228710f1 100644 --- a/benchmarking/cfg/transformer_gpt_all.json +++ b/benchmarking/cfg/transformer_gpt_all.json @@ -1,20 +1,24 @@ [ { + "dut": "transformer", "seed": [12], "model_dir": ["/pc2/groups/hpc-prf-ekiapp/felix/ci_models/gpt_a"], "dut_duplication": [1] }, { + "dut": "transformer", "seed": [12], "model_dir": ["/pc2/groups/hpc-prf-ekiapp/felix/ci_models/gpt_b"], "dut_duplication": [1] }, { + "dut": "transformer", "seed": [12], "model_dir": ["/pc2/groups/hpc-prf-ekiapp/felix/ci_models/gpt_c"], "dut_duplication": [1] }, { + "dut": "transformer", "seed": [12], "model_dir": ["/pc2/groups/hpc-prf-ekiapp/felix/ci_models/gpt_d"], "dut_duplication": [1] diff --git a/benchmarking/cfg/transformer_radioml_all.json b/benchmarking/cfg/transformer_radioml_all.json index f2000fb9c3..207839f5d5 100644 --- a/benchmarking/cfg/transformer_radioml_all.json +++ b/benchmarking/cfg/transformer_radioml_all.json @@ -1,10 +1,12 @@ [ { + "dut": "transformer", "seed": [12], "model_dir": ["/pc2/groups/hpc-prf-ekiapp/felix/ci_models/radioml_0"], "dut_duplication": [1] }, { + "dut": "transformer", "seed": [12], "model_dir": ["/pc2/groups/hpc-prf-ekiapp/felix/ci_models/radioml_convformer"], "dut_duplication": [1] diff --git a/benchmarking/cfg/transformer_sweep.json b/benchmarking/cfg/transformer_sweep.json index d10c4d94ca..d30df90b87 100644 --- a/benchmarking/cfg/transformer_sweep.json +++ b/benchmarking/cfg/transformer_sweep.json @@ -1,5 +1,6 @@ [ { + "dut": "transformer", "seed": [12], 
"calibration_passes": [32], @@ -18,6 +19,7 @@ "dut_duplication": [1] }, { + "dut": "transformer", "seed": [12], "calibration_passes": [32], @@ -36,6 +38,7 @@ "dut_duplication": [1] }, { + "dut": "transformer", "seed": [12], "calibration_passes": [32], @@ -54,6 +57,7 @@ "dut_duplication": [1] }, { + "dut": "transformer", "seed": [12], "calibration_passes": [32], @@ -72,6 +76,7 @@ "dut_duplication": [1] }, { + "dut": "transformer", "seed": [12], "calibration_passes": [32], diff --git a/benchmarking/cfg/transformer_test.json b/benchmarking/cfg/transformer_test.json index 784d96f93d..d7346e6068 100644 --- a/benchmarking/cfg/transformer_test.json +++ b/benchmarking/cfg/transformer_test.json @@ -1,5 +1,6 @@ [ { + "dut": "transformer", "seed": [12], "calibration_passes": [32], diff --git a/benchmarking/dut/metafi.py b/benchmarking/dut/metafi.py new file mode 100644 index 0000000000..94bb4b068c --- /dev/null +++ b/benchmarking/dut/metafi.py @@ -0,0 +1,83 @@ +import finn.builder.build_dataflow_config as build_cfg + +from bench_base import bench + +# # custom steps +# from custom_steps import ( +# step_extract_absorb_bias, +# step_pre_streamline, +# step_residual_convert_to_hw, +# step_residual_streamline, +# step_residual_tidy, +# step_residual_topo, +# step_set_preferred_impl_style, +# step_convert_final_layers +# ) + +class bench_metafi(bench): + def step_build_setup(self): + # create build config for MetaFi models + + steps = [ + # step_residual_tidy, + # step_extract_absorb_bias, + # step_residual_topo, + # step_pre_streamline, + # step_residual_streamline, + # step_residual_convert_to_hw, + "step_create_dataflow_partition", + # step_set_preferred_impl_style, + "step_specialize_layers", + "step_target_fps_parallelization", + "step_apply_folding_config", + "step_minimize_bit_width", + "step_generate_estimate_reports", + "step_set_fifo_depths", + "step_hw_codegen", + "step_hw_ipgen", + "step_create_stitched_ip", + "step_measure_rtlsim_performance", + 
"step_out_of_context_synthesis", + "step_synthesize_bitfile", + "step_make_pynq_driver", + "step_deployment_package", + ] + + cfg = build_cfg.DataflowBuildConfig( + output_dir = self.build_inputs["build_dir"], + synth_clk_period_ns = self.clock_period_ns, + steps=steps, + verbose=False, + target_fps=None, #23 + shell_flow_type=build_cfg.ShellFlowType.VIVADO_ZYNQ, # TODO: generalize/adapt to new back-end + #vitis_platform=vitis_platform, + + auto_fifo_depths=False, + split_large_fifos=False, # probably needed #TODO: account for this in FIFO reduction test + + # general rtlsim settings + force_python_rtlsim=False, + rtlsim_batch_size=self.params["rtlsim_n"], + + # folding_config_file=folding_config_file, + # folding_config_file="/home/rz/project/finn-examples/build/vgg10-radioml/folding_config/auto_folding_config.json", + # specialize_layers_config_file = "output_%s_%s" % (model_name, release_platform_name) + "/template_specialize_layers_config.json", + # specialize_layers_config_file = "/home/rz/project/finn-examples/build/vgg10-radioml/specialize_layers_config/template_specialize_layers_config.json", + auto_fifo_strategy="characterize", + characteristic_function_strategy=self.params["strategy"], + #large_fifo_mem_style=build_cfg.LargeFIFOMemStyle.AUTO, + # standalone_thresholds=True, + # enable extra performance optimizations (physopt) + vitis_opt_strategy=build_cfg.VitisOptStrategyCfg.PERFORMANCE_BEST, + generate_outputs=[ + build_cfg.DataflowOutputType.ESTIMATE_REPORTS, + build_cfg.DataflowOutputType.STITCHED_IP, + build_cfg.DataflowOutputType.RTLSIM_PERFORMANCE, + build_cfg.DataflowOutputType.OOC_SYNTH, # not required for FIFO test, include for general testing + ], + ) + + # where is this used and why? 
+ cfg.use_conv_rtl = True, # use rtl for conv layers (MVAU cannot use rtl in our model) + + return cfg \ No newline at end of file diff --git a/benchmarking/dut/resnet50.py b/benchmarking/dut/resnet50.py new file mode 100644 index 0000000000..701f7f65e2 --- /dev/null +++ b/benchmarking/dut/resnet50.py @@ -0,0 +1,57 @@ +import finn.builder.build_dataflow_config as build_cfg +from finn.util.basic import alveo_default_platform + +from dut.resnet50_custom_steps import ( + step_resnet50_tidy, + step_resnet50_streamline, + step_resnet50_convert_to_hw, + step_resnet50_slr_floorplan, + ) + +from bench_base import bench + +class bench_resnet50(bench): + def step_build_setup(self): + # create build config for ResNet-50 (based on finn-examples) + + resnet50_build_steps = [ + step_resnet50_tidy, + step_resnet50_streamline, + step_resnet50_convert_to_hw, + "step_create_dataflow_partition", + "step_specialize_layers", + "step_apply_folding_config", + "step_minimize_bit_width", + "step_generate_estimate_reports", + "step_set_fifo_depths", + "step_hw_codegen", + "step_hw_ipgen", + step_resnet50_slr_floorplan, + "step_create_stitched_ip", # was not in finn-examples + "step_measure_rtlsim_performance", # was not in finn-examples + "step_out_of_context_synthesis", # was not in finn-examples + "step_synthesize_bitfile", + "step_make_pynq_driver", + "step_deployment_package", + ] + + cfg = build_cfg.DataflowBuildConfig( + output_dir = self.build_inputs["build_dir"], + synth_clk_period_ns = self.clock_period_ns, + steps=resnet50_build_steps, + shell_flow_type=build_cfg.ShellFlowType.VITIS_ALVEO, # TODO: generalize/adapt to new back-end + auto_fifo_depths=False, + split_large_fifos=True, + vitis_platform=alveo_default_platform[self.board], # TODO: generalize/adapt to new back-end + + # enable extra performance optimizations (physopt) + vitis_opt_strategy=build_cfg.VitisOptStrategyCfg.PERFORMANCE_BEST, + generate_outputs=[ + build_cfg.DataflowOutputType.ESTIMATE_REPORTS, + 
build_cfg.DataflowOutputType.STITCHED_IP, + build_cfg.DataflowOutputType.RTLSIM_PERFORMANCE, + build_cfg.DataflowOutputType.OOC_SYNTH, # not required for FIFO test, include for general testing + ], + ) + + return cfg \ No newline at end of file diff --git a/benchmarking/dut/fifosizing.py b/benchmarking/dut/synthetic_nonlinear.py similarity index 50% rename from benchmarking/dut/fifosizing.py rename to benchmarking/dut/synthetic_nonlinear.py index 46b972deb0..3193432798 100644 --- a/benchmarking/dut/fifosizing.py +++ b/benchmarking/dut/synthetic_nonlinear.py @@ -28,12 +28,7 @@ from finn.util.test import get_trained_network_and_ishape from finn.util.basic import alveo_default_platform -from dut.resnet50_custom_steps import ( - step_resnet50_tidy, - step_resnet50_streamline, - step_resnet50_convert_to_hw, - step_resnet50_slr_floorplan, - ) + from bench_base import bench @@ -251,13 +246,11 @@ def combine_blocks(lb, rb, ifm_dim, ch, pe): model = model.transform(GiveReadableTensorNames()) return model -class bench_fifosizing(bench): +class bench_synthetic_nonlinear(bench): def step_export_onnx(self, onnx_export_path): np.random.seed(0) tmp_output_dir = make_build_dir("test_fifosizing") - #TODO: generalize FIFO test so it can be used by other FIFO-related unit tests - # or make into a build flow output product "fifo_report" #TODO: allow manual folding/fifo config as input #TODO: is a scenario possible where reducing depth of a single FIFO at a time is not sufficient for testing tightness? 
@@ -318,259 +311,7 @@ def step_build_setup(self): ) return cfg - - def step_fifotest(self, onnx_path, cfg, build_dir): - log = {} - build.build_dataflow_cfg(onnx_path, cfg) - - # load performance reports - with open(build_dir + "/report/estimate_network_performance.json") as f: - est_data = json.load(f) - with open(build_dir + "/report/rtlsim_performance.json") as f: - sim_data = json.load(f) - - # check for deadlock - model_final = ModelWrapper(build_dir + "/intermediate_models/step_create_stitched_ip.onnx") - first_node = getCustomOp(model_final.find_consumer(model_final.graph.input[0].name)) - last_node = getCustomOp(model_final.find_producer(model_final.graph.output[0].name)) - input_txns_expected = np.prod(first_node.get_folded_input_shape()[:-1]) * self.params["rtlsim_n"] - output_txns_expected = np.prod(last_node.get_folded_output_shape()[:-1]) * self.params["rtlsim_n"] - deadlock = sim_data["N_IN_TXNS"] != input_txns_expected or sim_data["N_OUT_TXNS"] != output_txns_expected - log["deadlock"] = deadlock.tolist() - - # check rtlsim throughput - throughput = sim_data["throughput[images/s]"] - stable_throughput = sim_data["stable_throughput[images/s]"] - estimated_throughput = est_data["estimated_throughput_fps"] - throughput_factor = throughput / estimated_throughput - stable_throughput_factor = stable_throughput / estimated_throughput - - # TODO: Take throughput or stable_throughput? 
- throughput_pass = throughput_factor > self.params["throughput_factor_threshold"] - - log["throughput_pass"] = throughput_pass - log["throughput"] = throughput - log["stable_throughput"] = stable_throughput - log["estimated_throughput"] = estimated_throughput - - # log FIFO sizes for easier inspection - log["fifo_depths"] = {} - log["fifo_sizes"] = {} - total_fifo_size = 0 - for node in model_final.get_nodes_by_op_type("StreamingFIFO_rtl"): - node_inst = getCustomOp(node) - log["fifo_depths"][node.name] = node_inst.get_nodeattr("depth") - log["fifo_sizes"][node.name] = node_inst.get_instream_width() * node_inst.get_nodeattr("depth") - total_fifo_size += log["fifo_sizes"][node.name] - log["total_fifo_size_kB"] = int(total_fifo_size / 8.0 / 1000.0) - - # reduce individual FIFO sizes by some amount and observe throughput drop or deadlock appear - fifo_reduction_pass = [] - log["fifo_reduction_results"] = {} - model_orig = ModelWrapper(build_dir + "/intermediate_models/step_hw_ipgen.onnx") - for node_orig in model_orig.get_nodes_by_op_type("StreamingFIFO_rtl"): - model = copy.deepcopy(model_orig) - node = model.get_node_from_name(node_orig.name) - node_inst = getCustomOp(node) - - # skip shallow FIFOs - # TODO: do we need to consider rounding-up of FIFO depths for impl_style=vivado? 
- if node_inst.get_nodeattr("depth") <= self.params["fifo_reduction_skip_threshold"]: - log["fifo_reduction_results"][node.name] = "skip" - continue - - # reduce depth of current FIFO and reset generated code - node_inst.set_nodeattr("depth", int(node_inst.get_nodeattr("depth") * self.params["fifo_reduction_factor"])) - node_inst.set_nodeattr("code_gen_dir_ipgen", "") - node_inst.set_nodeattr("ip_path", "") - node_inst.set_nodeattr("ipgen_path", "") - - # save model variation - tmp_output_dir_var = build_dir + "/variations/" + node.name - os.makedirs(tmp_output_dir_var) - model.save(tmp_output_dir_var + "/model.onnx") - - # build again, only re-run necessary steps to save time - cfg.output_dir = tmp_output_dir_var - cfg.steps = ["step_hw_codegen", "step_create_stitched_ip", "step_measure_rtlsim_performance"] - build.build_dataflow_cfg(tmp_output_dir_var + "/model.onnx", cfg) - - # load performance report - with open(tmp_output_dir_var + "/report/rtlsim_performance.json") as f: - sim_data = json.load(f) - - # check for deadlock - model_final = ModelWrapper(tmp_output_dir_var + "/intermediate_models/step_create_stitched_ip.onnx") - first_node = getCustomOp(model_final.find_consumer(model_final.graph.input[0].name)) - last_node = getCustomOp(model_final.find_producer(model_final.graph.output[0].name)) - input_txns_expected = np.prod(first_node.get_folded_input_shape()[:-1]) * self.params["rtlsim_n"] - output_txns_expected = np.prod(last_node.get_folded_output_shape()[:-1]) * self.params["rtlsim_n"] - var_deadlock = sim_data["N_IN_TXNS"] != input_txns_expected or sim_data["N_OUT_TXNS"] != output_txns_expected - - # check rtlsim throughput - var_throughput = sim_data["throughput[images/s]"] - var_stable_throughput = sim_data["stable_throughput[images/s]"] - # TODO: take throughput or stable_throughput? 
- throughput_drop = (throughput - var_throughput) / throughput - - if var_deadlock: - fifo_reduction_pass.append(True) - log["fifo_reduction_results"][node.name] = 1.0 - elif throughput_drop > self.params["fifo_reduction_throughput_drop_threshold"]: - fifo_reduction_pass.append(True) - log["fifo_reduction_results"][node.name] = throughput_drop - else: - fifo_reduction_pass.append(False) - log["fifo_reduction_results"][node.name] = "fail (no drop)" - - self.output_dict["fifosizing_testresults"] = log - - def step_build(self): - # TODO: rename steps to model three phases: model creation/import, dataflow build, analysis - # dataflow build should be easily swappable and adpaptable to finn-examples - cfg = self.step_build_setup() - cfg.board = self.board - if "folding_path" in self.build_inputs: - cfg.folding_config_file = self.build_inputs["folding_path"] - if "specialize_path" in self.build_inputs: - cfg.specialize_layers_config_file = self.build_inputs["specialize_path"] - self.step_fifotest(self.build_inputs["onnx_path"], cfg, self.build_inputs["build_dir"]) def step_parse_builder_output(self, build_dir): # build output itself is not relevant here (yet) pass - - def run(self): - self.steps_full_build_flow() - - -# # custom steps -# from custom_steps import ( -# step_extract_absorb_bias, -# step_pre_streamline, -# step_residual_convert_to_hw, -# step_residual_streamline, -# step_residual_tidy, -# step_residual_topo, -# step_set_preferred_impl_style, -# step_convert_final_layers -# ) - -# TODO: put these definitions into separate files/classes so we can use them for other types of benchmaks as well -class bench_metafi_fifosizing(bench_fifosizing): - def step_build_setup(self): - # create build config for MetaFi models - - steps = [ - # step_residual_tidy, - # step_extract_absorb_bias, - # step_residual_topo, - # step_pre_streamline, - # step_residual_streamline, - # step_residual_convert_to_hw, - "step_create_dataflow_partition", - # step_set_preferred_impl_style, - 
"step_specialize_layers", - "step_target_fps_parallelization", - "step_apply_folding_config", - "step_minimize_bit_width", - "step_generate_estimate_reports", - "step_set_fifo_depths", - "step_hw_codegen", - "step_hw_ipgen", - "step_create_stitched_ip", - "step_measure_rtlsim_performance", - "step_out_of_context_synthesis", - "step_synthesize_bitfile", - "step_make_pynq_driver", - "step_deployment_package", - ] - - cfg = build_cfg.DataflowBuildConfig( - output_dir = self.build_inputs["build_dir"], - synth_clk_period_ns = self.clock_period_ns, - steps=steps, - verbose=False, - target_fps=None, #23 - shell_flow_type=build_cfg.ShellFlowType.VIVADO_ZYNQ, # TODO: generalize/adapt to new back-end - #vitis_platform=vitis_platform, - - auto_fifo_depths=False, - split_large_fifos=False, # probably needed #TODO: account for this in FIFO reduction test - - # general rtlsim settings - force_python_rtlsim=False, - rtlsim_batch_size=self.params["rtlsim_n"], - - # folding_config_file=folding_config_file, - # folding_config_file="/home/rz/project/finn-examples/build/vgg10-radioml/folding_config/auto_folding_config.json", - # specialize_layers_config_file = "output_%s_%s" % (model_name, release_platform_name) + "/template_specialize_layers_config.json", - # specialize_layers_config_file = "/home/rz/project/finn-examples/build/vgg10-radioml/specialize_layers_config/template_specialize_layers_config.json", - auto_fifo_strategy="characterize", - characteristic_function_strategy=self.params["strategy"], - #large_fifo_mem_style=build_cfg.LargeFIFOMemStyle.AUTO, - # standalone_thresholds=True, - # enable extra performance optimizations (physopt) - vitis_opt_strategy=build_cfg.VitisOptStrategyCfg.PERFORMANCE_BEST, - generate_outputs=[ - build_cfg.DataflowOutputType.ESTIMATE_REPORTS, - build_cfg.DataflowOutputType.STITCHED_IP, - build_cfg.DataflowOutputType.RTLSIM_PERFORMANCE, - build_cfg.DataflowOutputType.OOC_SYNTH, # not required for FIFO test, include for general testing - ], - ) - - # 
where is this used and why? - cfg.use_conv_rtl = True, # use rtl for conv layers (MVAU cannot use rtl in our model) - - return cfg - - -class bench_resnet50_fifosizing(bench_fifosizing): - def step_build_setup(self): - # create build config for ResNet-50 (based on finn-examples) - - resnet50_build_steps = [ - step_resnet50_tidy, - step_resnet50_streamline, - step_resnet50_convert_to_hw, - "step_create_dataflow_partition", - "step_specialize_layers", - "step_apply_folding_config", - "step_minimize_bit_width", - "step_generate_estimate_reports", - "step_set_fifo_depths", - "step_hw_codegen", - "step_hw_ipgen", - step_resnet50_slr_floorplan, - "step_create_stitched_ip", # was not in finn-examples - "step_measure_rtlsim_performance", # was not in finn-examples - "step_out_of_context_synthesis", # was not in finn-examples - "step_synthesize_bitfile", - "step_make_pynq_driver", - "step_deployment_package", - ] - - cfg = build_cfg.DataflowBuildConfig( - output_dir = self.build_inputs["build_dir"], - synth_clk_period_ns = self.clock_period_ns, - steps=resnet50_build_steps, - shell_flow_type=build_cfg.ShellFlowType.VITIS_ALVEO, # TODO: generalize/adapt to new back-end - auto_fifo_depths=False, - split_large_fifos=True, - vitis_platform=alveo_default_platform[self.board], # TODO: generalize/adapt to new back-end - - # enable extra performance optimizations (physopt) - vitis_opt_strategy=build_cfg.VitisOptStrategyCfg.PERFORMANCE_BEST, - generate_outputs=[ - build_cfg.DataflowOutputType.ESTIMATE_REPORTS, - build_cfg.DataflowOutputType.STITCHED_IP, - build_cfg.DataflowOutputType.RTLSIM_PERFORMANCE, - build_cfg.DataflowOutputType.OOC_SYNTH, # not required for FIFO test, include for general testing - ], - ) - - # non-standard build parameter for custom step - cfg.floorplan_path = self.build_inputs["floorplan_path"] - - return cfg \ No newline at end of file diff --git a/benchmarking/dut/transformer.py b/benchmarking/dut/transformer.py index ed9991100b..305cac8188 100644 --- 
a/benchmarking/dut/transformer.py +++ b/benchmarking/dut/transformer.py @@ -858,7 +858,7 @@ def step_export_onnx(self, output_onnx_path): opset_version = 14, do_constant_folding = True) - def step_build(self): + def step_build_setup(self): #with open("params.yaml") as file: # params = yaml.safe_load(file) # Seed all RNGs @@ -910,7 +910,6 @@ def step_build(self): output_dir = self.build_inputs["build_dir"], stitched_ip_gen_dcp = False, # only needed for further manual integration synth_clk_period_ns = self.clock_period_ns, - board = self.board, shell_flow_type = shell_flow, folding_config_file = "folding.yaml", specialize_layers_config_file = "specialize_layers.json", @@ -928,7 +927,7 @@ def step_build(self): #build_cfg.DataflowOutputType.PYNQ_DRIVER, #TODO: currently broken (assert i_consumer.op_type == "StreamingDataflowPartition"), might be useful for functional verification on hw later #build_cfg.DataflowOutputType.OOC_SYNTH, # requires stitched-ip, not needed because ZynqBuild/HarnessBuild is performed #build_cfg.DataflowOutputType.BITFILE, # does not require stitched-ip, not needed because HarnessBuild is performed - #build_cfg.DataflowOutputType.RTLSIM_PERFORMANCE, # not possible due to float components + #build_cfg.DataflowOutputType.RTLSIM_PERFORMANCE, # not possible due to float components TODO: try with pyXSI #build_cfg.DataflowOutputType.DEPLOYMENT_PACKAGE # not needed, just a copy operation ], @@ -1041,25 +1040,23 @@ def step_build(self): #test_step_build_platform # synthesis with instr wrapper ] ) - # Run the build process on the dummy attention operator graph - # TODO: maybe let this function return the cfg only, so it can be modified by bench context - build.build_dataflow_cfg(self.build_inputs["onnx_path"], cfg) - - def run(self): - self.steps_full_build_flow() - - # DEBUG code for live logging of long instr wrapper simulation: - # live_log_dir_path = os.path.join(self.save_dir, "vivado_sim_log", "run_%d" % (self.run_id), "vivado.log") - # 
os.makedirs(os.path.join(self.save_dir, "vivado_sim_log", "run_%d" % (self.run_id)), exist_ok=True) - # sim_output_dir = build_dir + "/instrwrap_sim" - # # Prepare bash script - # bash_script = os.getcwd() + "/run_vivado_sim.sh" - # with open(bash_script, "w") as script: - # script.write("#!/bin/bash\n") - # script.write("cd %s\n"%(sim_output_dir)) - # script.write("vivado -mode batch -source make_instrwrap_sim_proj.tcl &> %s\n"%(live_log_dir_path)) - # # Run script - # print("Running Vivado simulation of instrumentation wrapper") - # sub_proc = subprocess.Popen(["bash", bash_script]) - # sub_proc.communicate() - ####### + + return cfg + + #def run(self): + # self.steps_full_build_flow() + # DEBUG code for live logging of long instr wrapper simulation: + # live_log_dir_path = os.path.join(self.save_dir, "vivado_sim_log", "run_%d" % (self.run_id), "vivado.log") + # os.makedirs(os.path.join(self.save_dir, "vivado_sim_log", "run_%d" % (self.run_id)), exist_ok=True) + # sim_output_dir = build_dir + "/instrwrap_sim" + # # Prepare bash script + # bash_script = os.getcwd() + "/run_vivado_sim.sh" + # with open(bash_script, "w") as script: + # script.write("#!/bin/bash\n") + # script.write("cd %s\n"%(sim_output_dir)) + # script.write("vivado -mode batch -source make_instrwrap_sim_proj.tcl &> %s\n"%(live_log_dir_path)) + # # Run script + # print("Running Vivado simulation of instrumentation wrapper") + # sub_proc = subprocess.Popen(["bash", bash_script]) + # sub_proc.communicate() + ####### From f6d196b69249d405fac5e2003c68ead216c42139 Mon Sep 17 00:00:00 2001 From: Felix Jentzsch Date: Thu, 30 Jan 2025 15:36:31 +0000 Subject: [PATCH 008/125] Fix bench class lookup --- benchmarking/bench.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/benchmarking/bench.py b/benchmarking/bench.py index f3a4c0f424..7e38a2f0c8 100644 --- a/benchmarking/bench.py +++ b/benchmarking/bench.py @@ -139,8 +139,8 @@ def get_default_session_options_new(): # Create bench object 
for respective DUT if "dut" in params: - if params.dut in dut: - bench_object = dut[params.dut](params, task_id, run_id, artifacts_dir, save_dir) + if params["dut"] in dut: + bench_object = dut[params["dut"]](params, task_id, run_id, artifacts_dir, save_dir) else: print("ERROR: unknown DUT specified") return 1 From c6ae70fa5f929f0176befb0c919fd6fdc5e7bf0b Mon Sep 17 00:00:00 2001 From: Felix Jentzsch Date: Thu, 30 Jan 2025 15:54:50 +0000 Subject: [PATCH 009/125] Fix cfgs --- benchmarking/cfg/fifosizing_test.json | 2 +- benchmarking/cfg/metafi_fifosizing_test.json | 2 +- benchmarking/cfg/metafi_test.json | 2 +- benchmarking/cfg/resnet50_fifosizing_test.json | 2 +- benchmarking/cfg/resnet50_test.json | 2 +- benchmarking/cfg/transformer_gpt_all.json | 8 ++++---- benchmarking/cfg/transformer_radioml_all.json | 4 ++-- benchmarking/cfg/transformer_sweep.json | 10 +++++----- benchmarking/cfg/transformer_test.json | 2 +- 9 files changed, 17 insertions(+), 17 deletions(-) diff --git a/benchmarking/cfg/fifosizing_test.json b/benchmarking/cfg/fifosizing_test.json index 519b7fe430..d3d4559e43 100644 --- a/benchmarking/cfg/fifosizing_test.json +++ b/benchmarking/cfg/fifosizing_test.json @@ -1,6 +1,6 @@ [ { - "dut": "synthetic_nonlinear", + "dut": ["synthetic_nonlinear"], "dim": [32], "kernel_size": [5], "ch": [4], diff --git a/benchmarking/cfg/metafi_fifosizing_test.json b/benchmarking/cfg/metafi_fifosizing_test.json index 7540949eaf..a98089d046 100644 --- a/benchmarking/cfg/metafi_fifosizing_test.json +++ b/benchmarking/cfg/metafi_fifosizing_test.json @@ -1,6 +1,6 @@ [ { - "dut": "metafi", + "dut": ["metafi"], "model_path": ["/pc2/groups/hpc-prf-ekiapp/felix/ci_models/metafi/model.onnx"], "folding_path": ["/pc2/groups/hpc-prf-ekiapp/felix/ci_models/metafi/auto_folding_config _metaFi_f25.json"], diff --git a/benchmarking/cfg/metafi_test.json b/benchmarking/cfg/metafi_test.json index 63a26d0dbc..2d382d3a61 100644 --- a/benchmarking/cfg/metafi_test.json +++ 
b/benchmarking/cfg/metafi_test.json @@ -1,6 +1,6 @@ [ { - "dut": "metafi", + "dut": ["metafi"], "model_path": ["/pc2/groups/hpc-prf-ekiapp/felix/ci_models/metafi/model.onnx"], "folding_path": ["/pc2/groups/hpc-prf-ekiapp/felix/ci_models/metafi/auto_folding_config _metaFi_f25.json"], diff --git a/benchmarking/cfg/resnet50_fifosizing_test.json b/benchmarking/cfg/resnet50_fifosizing_test.json index 9ded5630f0..82b3d36659 100644 --- a/benchmarking/cfg/resnet50_fifosizing_test.json +++ b/benchmarking/cfg/resnet50_fifosizing_test.json @@ -1,6 +1,6 @@ [ { - "dut": "resnet50", + "dut": ["resnet50"], "model_path": ["/pc2/groups/hpc-prf-ekiapp/felix/ci_models/ResNet-50/resnet50_w1a2_exported.onnx"], "folding_path": ["/pc2/groups/hpc-prf-ekiapp/felix/ci_models/ResNet-50/U250_folding_config.json"], diff --git a/benchmarking/cfg/resnet50_test.json b/benchmarking/cfg/resnet50_test.json index bb9a65873e..19c555dd9d 100644 --- a/benchmarking/cfg/resnet50_test.json +++ b/benchmarking/cfg/resnet50_test.json @@ -1,6 +1,6 @@ [ { - "dut": "resnet50", + "dut": ["resnet50"], "model_path": ["/pc2/groups/hpc-prf-ekiapp/felix/ci_models/ResNet-50/resnet50_w1a2_exported.onnx"], "folding_path": ["/pc2/groups/hpc-prf-ekiapp/felix/ci_models/ResNet-50/U250_folding_config.json"], diff --git a/benchmarking/cfg/transformer_gpt_all.json b/benchmarking/cfg/transformer_gpt_all.json index fd228710f1..4b1ee011c1 100644 --- a/benchmarking/cfg/transformer_gpt_all.json +++ b/benchmarking/cfg/transformer_gpt_all.json @@ -1,24 +1,24 @@ [ { - "dut": "transformer", + "dut": ["transformer"], "seed": [12], "model_dir": ["/pc2/groups/hpc-prf-ekiapp/felix/ci_models/gpt_a"], "dut_duplication": [1] }, { - "dut": "transformer", + "dut": ["transformer"], "seed": [12], "model_dir": ["/pc2/groups/hpc-prf-ekiapp/felix/ci_models/gpt_b"], "dut_duplication": [1] }, { - "dut": "transformer", + "dut": ["transformer"], "seed": [12], "model_dir": ["/pc2/groups/hpc-prf-ekiapp/felix/ci_models/gpt_c"], "dut_duplication": [1] }, { - 
"dut": "transformer", + "dut": ["transformer"], "seed": [12], "model_dir": ["/pc2/groups/hpc-prf-ekiapp/felix/ci_models/gpt_d"], "dut_duplication": [1] diff --git a/benchmarking/cfg/transformer_radioml_all.json b/benchmarking/cfg/transformer_radioml_all.json index 207839f5d5..f2c8733c20 100644 --- a/benchmarking/cfg/transformer_radioml_all.json +++ b/benchmarking/cfg/transformer_radioml_all.json @@ -1,12 +1,12 @@ [ { - "dut": "transformer", + "dut": ["transformer"], "seed": [12], "model_dir": ["/pc2/groups/hpc-prf-ekiapp/felix/ci_models/radioml_0"], "dut_duplication": [1] }, { - "dut": "transformer", + "dut": ["transformer"], "seed": [12], "model_dir": ["/pc2/groups/hpc-prf-ekiapp/felix/ci_models/radioml_convformer"], "dut_duplication": [1] diff --git a/benchmarking/cfg/transformer_sweep.json b/benchmarking/cfg/transformer_sweep.json index d30df90b87..e1795ff3f8 100644 --- a/benchmarking/cfg/transformer_sweep.json +++ b/benchmarking/cfg/transformer_sweep.json @@ -1,6 +1,6 @@ [ { - "dut": "transformer", + "dut": ["transformer"], "seed": [12], "calibration_passes": [32], @@ -19,7 +19,7 @@ "dut_duplication": [1] }, { - "dut": "transformer", + "dut": ["transformer"], "seed": [12], "calibration_passes": [32], @@ -38,7 +38,7 @@ "dut_duplication": [1] }, { - "dut": "transformer", + "dut": ["transformer"], "seed": [12], "calibration_passes": [32], @@ -57,7 +57,7 @@ "dut_duplication": [1] }, { - "dut": "transformer", + "dut": ["transformer"], "seed": [12], "calibration_passes": [32], @@ -76,7 +76,7 @@ "dut_duplication": [1] }, { - "dut": "transformer", + "dut": ["transformer"], "seed": [12], "calibration_passes": [32], diff --git a/benchmarking/cfg/transformer_test.json b/benchmarking/cfg/transformer_test.json index d7346e6068..a740a447b6 100644 --- a/benchmarking/cfg/transformer_test.json +++ b/benchmarking/cfg/transformer_test.json @@ -1,6 +1,6 @@ [ { - "dut": "transformer", + "dut": ["transformer"], "seed": [12], "calibration_passes": [32], From 
3d4e7a618d0dbb5413e0ba34b56f190dc033f34a Mon Sep 17 00:00:00 2001 From: Felix Jentzsch Date: Thu, 30 Jan 2025 16:16:02 +0000 Subject: [PATCH 010/125] Fix misc --- benchmarking/bench_base.py | 7 ++++++- benchmarking/cfg/metafi_test.json | 4 +++- benchmarking/cfg/resnet50_test.json | 4 +++- benchmarking/dut/metafi.py | 3 +-- benchmarking/dut/resnet50.py | 2 +- benchmarking/dut/synthetic_nonlinear.py | 1 - 6 files changed, 14 insertions(+), 7 deletions(-) diff --git a/benchmarking/bench_base.py b/benchmarking/bench_base.py index 0bd7be6907..5ed6750820 100644 --- a/benchmarking/bench_base.py +++ b/benchmarking/bench_base.py @@ -1055,8 +1055,13 @@ def steps_full_build_flow(self): self.build_inputs["floorplan_path"] = self.params["floorplan_path"] ### BUILD SETUP ### + # TODO: select output products here, depending on what shall be tested + # TODO: set as much as possible here, e.g. verbose, debug, force_python, vitisopt, shell_flow cfg = self.step_build_setup() cfg.board = self.board + if "fifo_rtlsim_n" in self.params: + # TODO: determine automatically or replace by exact instr wrapper sim + cfg.rtlsim_batch_size=self.params["fifo_rtlsim_n"] if "folding_path" in self.build_inputs: cfg.folding_config_file = self.build_inputs["folding_path"] if "specialize_path" in self.build_inputs: @@ -1071,5 +1076,5 @@ def steps_full_build_flow(self): self.step_parse_builder_output(self.build_inputs["build_dir"]) # Only run in-depth FIFO test if selected - if "fifo_rtlsim_n" in self.params: + if "fifo_throughput_factor_threshold" in self.params: self.step_fifotest(self.build_inputs["onnx_path"], cfg, self.build_inputs["build_dir"]) diff --git a/benchmarking/cfg/metafi_test.json b/benchmarking/cfg/metafi_test.json index 2d382d3a61..b0989eabca 100644 --- a/benchmarking/cfg/metafi_test.json +++ b/benchmarking/cfg/metafi_test.json @@ -5,6 +5,8 @@ "folding_path": ["/pc2/groups/hpc-prf-ekiapp/felix/ci_models/metafi/auto_folding_config _metaFi_f25.json"], "board": ["RFSoC2x2"], - 
"clock_period_ns": [10] + "clock_period_ns": [10], + + "fifo_rtlsim_n": [10] } ] \ No newline at end of file diff --git a/benchmarking/cfg/resnet50_test.json b/benchmarking/cfg/resnet50_test.json index 19c555dd9d..30131923a4 100644 --- a/benchmarking/cfg/resnet50_test.json +++ b/benchmarking/cfg/resnet50_test.json @@ -8,6 +8,8 @@ "floorplan_path": ["/pc2/groups/hpc-prf-ekiapp/felix/ci_models/ResNet-50/floorplan_resnet50.json"], "board": ["U280"], - "clock_period_ns": [4] + "clock_period_ns": [4], + + "fifo_rtlsim_n": [2] } ] \ No newline at end of file diff --git a/benchmarking/dut/metafi.py b/benchmarking/dut/metafi.py index 94bb4b068c..52e31eabee 100644 --- a/benchmarking/dut/metafi.py +++ b/benchmarking/dut/metafi.py @@ -57,7 +57,6 @@ def step_build_setup(self): # general rtlsim settings force_python_rtlsim=False, - rtlsim_batch_size=self.params["rtlsim_n"], # folding_config_file=folding_config_file, # folding_config_file="/home/rz/project/finn-examples/build/vgg10-radioml/folding_config/auto_folding_config.json", @@ -68,7 +67,7 @@ def step_build_setup(self): #large_fifo_mem_style=build_cfg.LargeFIFOMemStyle.AUTO, # standalone_thresholds=True, # enable extra performance optimizations (physopt) - vitis_opt_strategy=build_cfg.VitisOptStrategyCfg.PERFORMANCE_BEST, + vitis_opt_strategy=build_cfg.VitisOptStrategy.PERFORMANCE_BEST, generate_outputs=[ build_cfg.DataflowOutputType.ESTIMATE_REPORTS, build_cfg.DataflowOutputType.STITCHED_IP, diff --git a/benchmarking/dut/resnet50.py b/benchmarking/dut/resnet50.py index 701f7f65e2..c4f80737c0 100644 --- a/benchmarking/dut/resnet50.py +++ b/benchmarking/dut/resnet50.py @@ -45,7 +45,7 @@ def step_build_setup(self): vitis_platform=alveo_default_platform[self.board], # TODO: generalize/adapt to new back-end # enable extra performance optimizations (physopt) - vitis_opt_strategy=build_cfg.VitisOptStrategyCfg.PERFORMANCE_BEST, + vitis_opt_strategy=build_cfg.VitisOptStrategy.PERFORMANCE_BEST, generate_outputs=[ 
build_cfg.DataflowOutputType.ESTIMATE_REPORTS, build_cfg.DataflowOutputType.STITCHED_IP, diff --git a/benchmarking/dut/synthetic_nonlinear.py b/benchmarking/dut/synthetic_nonlinear.py index 3193432798..852d47012f 100644 --- a/benchmarking/dut/synthetic_nonlinear.py +++ b/benchmarking/dut/synthetic_nonlinear.py @@ -301,7 +301,6 @@ def step_build_setup(self): target_fps=None, # general rtlsim settings force_python_rtlsim=False, - rtlsim_batch_size=self.params["rtlsim_n"], shell_flow_type=build_cfg.ShellFlowType.VIVADO_ZYNQ, # TODO: generalize/adapt to new back-end generate_outputs=[ build_cfg.DataflowOutputType.ESTIMATE_REPORTS, From a5bd7ab2a9ab235425833ea2be01cc4f8c1268ad Mon Sep 17 00:00:00 2001 From: Felix Jentzsch Date: Thu, 30 Jan 2025 16:53:51 +0000 Subject: [PATCH 011/125] Unify fifosizing settings --- benchmarking/bench_base.py | 18 +++++++++++++++++- benchmarking/cfg/fifosizing_test.json | 3 ++- benchmarking/cfg/metafi_fifosizing_test.json | 2 ++ benchmarking/cfg/metafi_test.json | 2 ++ benchmarking/cfg/resnet50_fifosizing_test.json | 2 ++ benchmarking/cfg/resnet50_test.json | 2 ++ benchmarking/dut/metafi.py | 9 ++------- benchmarking/dut/resnet50.py | 1 - benchmarking/dut/synthetic_nonlinear.py | 9 ++------- benchmarking/dut/transformer.py | 3 --- 10 files changed, 31 insertions(+), 20 deletions(-) diff --git a/benchmarking/bench_base.py b/benchmarking/bench_base.py index 5ed6750820..04583c1652 100644 --- a/benchmarking/bench_base.py +++ b/benchmarking/bench_base.py @@ -1059,9 +1059,25 @@ def steps_full_build_flow(self): # TODO: set as much as possible here, e.g. 
verbose, debug, force_python, vitisopt, shell_flow cfg = self.step_build_setup() cfg.board = self.board + cfg.verbose = False + cfg.enable_build_pdb_debug = False + cfg.force_python_rtlsim = False + + # "manual or "characterize" or "largefifo_rtlsim" + if "fifo_method" in self.params: + if self.params["fifo_method"] == "manual": + cfg.auto_fifo_depths = False + else: + cfg.auto_fifo_depths = True + cfg.auto_fifo_strategy = self.params["fifo_method"] + # only relevant for "characterize" method: "rtlsim" or "analytical" + if "fifo_strategy" in self.params: + cfg.characteristic_function_strategy = self.params["fifo_strategy"] + + # TODO: determine automatically or replace by exact instr wrapper sim if "fifo_rtlsim_n" in self.params: - # TODO: determine automatically or replace by exact instr wrapper sim cfg.rtlsim_batch_size=self.params["fifo_rtlsim_n"] + if "folding_path" in self.build_inputs: cfg.folding_config_file = self.build_inputs["folding_path"] if "specialize_path" in self.build_inputs: diff --git a/benchmarking/cfg/fifosizing_test.json b/benchmarking/cfg/fifosizing_test.json index d3d4559e43..20e2588282 100644 --- a/benchmarking/cfg/fifosizing_test.json +++ b/benchmarking/cfg/fifosizing_test.json @@ -11,7 +11,8 @@ "lb_num_layers": [1], "rb_num_layers": [3], - "strategy": ["analytical", "rtlsim"], + "fifo_method": ["characterize"], + "fifo_strategy": ["analytical", "rtlsim"], "fifo_rtlsim_n": [10], "fifo_throughput_factor_threshold": [0.9], diff --git a/benchmarking/cfg/metafi_fifosizing_test.json b/benchmarking/cfg/metafi_fifosizing_test.json index a98089d046..6a441cbcd5 100644 --- a/benchmarking/cfg/metafi_fifosizing_test.json +++ b/benchmarking/cfg/metafi_fifosizing_test.json @@ -7,6 +7,8 @@ "board": ["RFSoC2x2"], "clock_period_ns": [10], + "fifo_method": ["largefifo_rtlsim"], + "fifo_rtlsim_n": [10], "fifo_throughput_factor_threshold": [0.9], "fifo_reduction_skip_threshold": [1024], diff --git a/benchmarking/cfg/metafi_test.json 
b/benchmarking/cfg/metafi_test.json index b0989eabca..7ede065c76 100644 --- a/benchmarking/cfg/metafi_test.json +++ b/benchmarking/cfg/metafi_test.json @@ -7,6 +7,8 @@ "board": ["RFSoC2x2"], "clock_period_ns": [10], + "fifo_method": ["manual"], + "fifo_rtlsim_n": [10] } ] \ No newline at end of file diff --git a/benchmarking/cfg/resnet50_fifosizing_test.json b/benchmarking/cfg/resnet50_fifosizing_test.json index 82b3d36659..b4dddc24f9 100644 --- a/benchmarking/cfg/resnet50_fifosizing_test.json +++ b/benchmarking/cfg/resnet50_fifosizing_test.json @@ -10,6 +10,8 @@ "board": ["U280"], "clock_period_ns": [4], + "fifo_method": ["largefifo_rtlsim"], + "fifo_rtlsim_n": [2], "fifo_throughput_factor_threshold": [0.9], "fifo_reduction_skip_threshold": [1024], diff --git a/benchmarking/cfg/resnet50_test.json b/benchmarking/cfg/resnet50_test.json index 30131923a4..df81e83661 100644 --- a/benchmarking/cfg/resnet50_test.json +++ b/benchmarking/cfg/resnet50_test.json @@ -10,6 +10,8 @@ "board": ["U280"], "clock_period_ns": [4], + "fifo_method": ["manual"], + "fifo_rtlsim_n": [2] } ] \ No newline at end of file diff --git a/benchmarking/dut/metafi.py b/benchmarking/dut/metafi.py index 52e31eabee..462314c2ec 100644 --- a/benchmarking/dut/metafi.py +++ b/benchmarking/dut/metafi.py @@ -47,23 +47,18 @@ def step_build_setup(self): output_dir = self.build_inputs["build_dir"], synth_clk_period_ns = self.clock_period_ns, steps=steps, - verbose=False, + target_fps=None, #23 shell_flow_type=build_cfg.ShellFlowType.VIVADO_ZYNQ, # TODO: generalize/adapt to new back-end #vitis_platform=vitis_platform, - auto_fifo_depths=False, split_large_fifos=False, # probably needed #TODO: account for this in FIFO reduction test - # general rtlsim settings - force_python_rtlsim=False, - # folding_config_file=folding_config_file, # folding_config_file="/home/rz/project/finn-examples/build/vgg10-radioml/folding_config/auto_folding_config.json", # specialize_layers_config_file = "output_%s_%s" % (model_name, 
release_platform_name) + "/template_specialize_layers_config.json", # specialize_layers_config_file = "/home/rz/project/finn-examples/build/vgg10-radioml/specialize_layers_config/template_specialize_layers_config.json", - auto_fifo_strategy="characterize", - characteristic_function_strategy=self.params["strategy"], + #large_fifo_mem_style=build_cfg.LargeFIFOMemStyle.AUTO, # standalone_thresholds=True, # enable extra performance optimizations (physopt) diff --git a/benchmarking/dut/resnet50.py b/benchmarking/dut/resnet50.py index c4f80737c0..87c6e04e2e 100644 --- a/benchmarking/dut/resnet50.py +++ b/benchmarking/dut/resnet50.py @@ -40,7 +40,6 @@ def step_build_setup(self): synth_clk_period_ns = self.clock_period_ns, steps=resnet50_build_steps, shell_flow_type=build_cfg.ShellFlowType.VITIS_ALVEO, # TODO: generalize/adapt to new back-end - auto_fifo_depths=False, split_large_fifos=True, vitis_platform=alveo_default_platform[self.board], # TODO: generalize/adapt to new back-end diff --git a/benchmarking/dut/synthetic_nonlinear.py b/benchmarking/dut/synthetic_nonlinear.py index 852d47012f..a3039d6c5f 100644 --- a/benchmarking/dut/synthetic_nonlinear.py +++ b/benchmarking/dut/synthetic_nonlinear.py @@ -291,16 +291,11 @@ def step_build_setup(self): cfg = build_cfg.DataflowBuildConfig( output_dir = self.build_inputs["build_dir"], synth_clk_period_ns = self.clock_period_ns, - verbose=False, - # only works with characterization-based FIFO-sizing - auto_fifo_depths=True, - auto_fifo_strategy="characterize", - characteristic_function_strategy=self.params["strategy"], + split_large_fifos=False, # manual folding target_fps=None, - # general rtlsim settings - force_python_rtlsim=False, + shell_flow_type=build_cfg.ShellFlowType.VIVADO_ZYNQ, # TODO: generalize/adapt to new back-end generate_outputs=[ build_cfg.DataflowOutputType.ESTIMATE_REPORTS, diff --git a/benchmarking/dut/transformer.py b/benchmarking/dut/transformer.py index 305cac8188..014da2e13e 100644 --- 
a/benchmarking/dut/transformer.py +++ b/benchmarking/dut/transformer.py @@ -918,9 +918,6 @@ def step_build_setup(self): mvau_wwidth_max = 2048, split_large_fifos = True, - verbose = False, # if True prints stdout and stderr to console instead of build_dataflow.log - enable_build_pdb_debug = False, - generate_outputs=[ build_cfg.DataflowOutputType.ESTIMATE_REPORTS, build_cfg.DataflowOutputType.STITCHED_IP, # required for HarnessBuild, OOC_SYNTH, and RTLSIM From e6998fb09df332b6fa8d73275ccee0d92574ae0e Mon Sep 17 00:00:00 2001 From: Felix Jentzsch Date: Thu, 30 Jan 2025 20:33:16 +0000 Subject: [PATCH 012/125] Use correct Singularity image for benchmarks --- .gitlab-ci.yml | 20 ++++++++++++++++++++ benchmarking/bench-ci.yml | 1 - 2 files changed, 20 insertions(+), 1 deletion(-) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 066a7dc289..d30b08becc 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -175,6 +175,16 @@ Bench (Manual): # Do not run on a schedule - if: $CI_PIPELINE_SOURCE == "schedule" when: never + # Select different Singularity image if it deviates from default (dev branch) + - changes: + paths: + - requirements.txt + - docker/Dockerfile.finn + - docker/finn_entrypoint.sh + - docker/quicktest.sh + compare_to: "dev" + variables: + SINGULARITY_IMG_SELECT: "finn_$CI_COMMIT_REF_SLUG.sif" - if: $MANUAL_CFG_PATH != "" trigger: include: benchmarking/bench-ci.yml @@ -190,6 +200,16 @@ Bench: # Do not run on a schedule - if: $CI_PIPELINE_SOURCE == "schedule" when: never + # Select different Singularity image if it deviates from default (dev branch) + - changes: + paths: + - requirements.txt + - docker/Dockerfile.finn + - docker/finn_entrypoint.sh + - docker/quicktest.sh + compare_to: "dev" + variables: + SINGULARITY_IMG_SELECT: "finn_$CI_COMMIT_REF_SLUG.sif" - if: $MANUAL_CFG_PATH == "" trigger: include: benchmarking/bench-ci.yml diff --git a/benchmarking/bench-ci.yml b/benchmarking/bench-ci.yml index f50bd1d3f8..5fdcd360f2 100644 --- 
a/benchmarking/bench-ci.yml +++ b/benchmarking/bench-ci.yml @@ -19,7 +19,6 @@ FINN Build: variables: SCHEDULER_PARAMETERS: "-A $PROJECT_ACCOUNT -p $SLURM_PARTITION -t $SLURM_TIMEOUT $SLURM_QOS --nodes 1 --ntasks 1 --cpus-per-task $CPU_CORES --mem 128G --array 0-$( expr $PARALLEL_JOBS - 1 )" PYTEST_PARALLEL: "$CPU_CORES" - FINN_SINGULARITY: "$PATH_SINGULARITY_IMG/xilinx/finn_dev.sif" before_script: - cp -dfR .. $PATH_WORKDIR # Copy to working directory (e.g. RAMdisk) - cd $PATH_WORKDIR/finn-plus From be19a1af1555c9dc498524e064b6fab9b9133b19 Mon Sep 17 00:00:00 2001 From: Felix Jentzsch Date: Thu, 30 Jan 2025 20:39:55 +0000 Subject: [PATCH 013/125] Select Singularity image in child pipeline --- .gitlab-ci.yml | 20 -------------------- benchmarking/bench-ci.yml | 13 +++++++++++++ 2 files changed, 13 insertions(+), 20 deletions(-) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index d30b08becc..066a7dc289 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -175,16 +175,6 @@ Bench (Manual): # Do not run on a schedule - if: $CI_PIPELINE_SOURCE == "schedule" when: never - # Select different Singularity image if it deviates from default (dev branch) - - changes: - paths: - - requirements.txt - - docker/Dockerfile.finn - - docker/finn_entrypoint.sh - - docker/quicktest.sh - compare_to: "dev" - variables: - SINGULARITY_IMG_SELECT: "finn_$CI_COMMIT_REF_SLUG.sif" - if: $MANUAL_CFG_PATH != "" trigger: include: benchmarking/bench-ci.yml @@ -200,16 +190,6 @@ Bench: # Do not run on a schedule - if: $CI_PIPELINE_SOURCE == "schedule" when: never - # Select different Singularity image if it deviates from default (dev branch) - - changes: - paths: - - requirements.txt - - docker/Dockerfile.finn - - docker/finn_entrypoint.sh - - docker/quicktest.sh - compare_to: "dev" - variables: - SINGULARITY_IMG_SELECT: "finn_$CI_COMMIT_REF_SLUG.sif" - if: $MANUAL_CFG_PATH == "" trigger: include: benchmarking/bench-ci.yml diff --git a/benchmarking/bench-ci.yml b/benchmarking/bench-ci.yml index 
5fdcd360f2..3485ebfdfe 100644 --- a/benchmarking/bench-ci.yml +++ b/benchmarking/bench-ci.yml @@ -19,6 +19,19 @@ FINN Build: variables: SCHEDULER_PARAMETERS: "-A $PROJECT_ACCOUNT -p $SLURM_PARTITION -t $SLURM_TIMEOUT $SLURM_QOS --nodes 1 --ntasks 1 --cpus-per-task $CPU_CORES --mem 128G --array 0-$( expr $PARALLEL_JOBS - 1 )" PYTEST_PARALLEL: "$CPU_CORES" + rules: + # Select different Singularity image if it deviates from default (dev branch) + - changes: + paths: + - requirements.txt + - docker/Dockerfile.finn + - docker/finn_entrypoint.sh + - docker/quicktest.sh + compare_to: "dev" + variables: + SINGULARITY_IMG_SELECT: "finn_$CI_COMMIT_REF_SLUG.sif" + # Always run (when triggered), as long as there was no prior failure + - when: on_success before_script: - cp -dfR .. $PATH_WORKDIR # Copy to working directory (e.g. RAMdisk) - cd $PATH_WORKDIR/finn-plus From 5db60fad0f7da3c59fd0c54b340e7ee457a280b7 Mon Sep 17 00:00:00 2001 From: Felix Jentzsch Date: Thu, 30 Jan 2025 21:14:48 +0000 Subject: [PATCH 014/125] Fix img --- benchmarking/bench-ci.yml | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/benchmarking/bench-ci.yml b/benchmarking/bench-ci.yml index 3485ebfdfe..05980e689f 100644 --- a/benchmarking/bench-ci.yml +++ b/benchmarking/bench-ci.yml @@ -16,9 +16,6 @@ FINN Build: CI_JOB_JWT: aud: https://git.uni-paderborn.de stage: synth - variables: - SCHEDULER_PARAMETERS: "-A $PROJECT_ACCOUNT -p $SLURM_PARTITION -t $SLURM_TIMEOUT $SLURM_QOS --nodes 1 --ntasks 1 --cpus-per-task $CPU_CORES --mem 128G --array 0-$( expr $PARALLEL_JOBS - 1 )" - PYTEST_PARALLEL: "$CPU_CORES" rules: # Select different Singularity image if it deviates from default (dev branch) - changes: @@ -32,6 +29,10 @@ FINN Build: SINGULARITY_IMG_SELECT: "finn_$CI_COMMIT_REF_SLUG.sif" # Always run (when triggered), as long as there was no prior failure - when: on_success + variables: + SCHEDULER_PARAMETERS: "-A $PROJECT_ACCOUNT -p $SLURM_PARTITION -t $SLURM_TIMEOUT $SLURM_QOS --nodes 1 
--ntasks 1 --cpus-per-task $CPU_CORES --mem 128G --array 0-$( expr $PARALLEL_JOBS - 1 )" + PYTEST_PARALLEL: "$CPU_CORES" + FINN_SINGULARITY: "$PATH_SINGULARITY_IMG/finn-plus/$SINGULARITY_IMG_SELECT" before_script: - cp -dfR .. $PATH_WORKDIR # Copy to working directory (e.g. RAMdisk) - cd $PATH_WORKDIR/finn-plus From 98176098d9a3d10c7a8ee9bd3393da2bd72490d5 Mon Sep 17 00:00:00 2001 From: Felix Jentzsch Date: Fri, 31 Jan 2025 08:09:03 +0000 Subject: [PATCH 015/125] Try fix for Transformer streamlining --- benchmarking/dut/transformer_custom_steps.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/benchmarking/dut/transformer_custom_steps.py b/benchmarking/dut/transformer_custom_steps.py index 91bdebb206..28f23ded7c 100644 --- a/benchmarking/dut/transformer_custom_steps.py +++ b/benchmarking/dut/transformer_custom_steps.py @@ -687,6 +687,10 @@ def Streamline(): # noqa: Uppercase AbsorbMulIntoMultiThreshold(), Absorb1BitMulIntoMatMul(), Absorb1BitMulIntoConv(), + MoveMulPastAdd(), + AbsorbMulIntoMultiThreshold(), + AbsorbAddIntoMultiThreshold(), + MoveAddPastMul() ] ), # Streamlining scales and biases forward through residual topologies From dbeb3a0e0b8bd210ac71056cc0251e8f1f9daad7 Mon Sep 17 00:00:00 2001 From: Felix Jentzsch Date: Fri, 31 Jan 2025 09:16:25 +0000 Subject: [PATCH 016/125] Display .sif file name --- benchmarking/bench.py | 7 +++---- run-docker.sh | 2 +- 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/benchmarking/bench.py b/benchmarking/bench.py index 7e38a2f0c8..855f57cd50 100644 --- a/benchmarking/bench.py +++ b/benchmarking/bench.py @@ -63,14 +63,13 @@ def get_default_session_options_new(): os.makedirs(os.path.join(artifacts_dir, "tasks_output"), exist_ok=True) log_path = os.path.join(artifacts_dir, "tasks_output", "task_%d.json" % (task_id)) - # save dir for saving bitstreams (and optionally full build artifacts for debugging (TODO)) - # TODO: make this more configurable or switch to job/artifact based power measurement + # local 
save dir for large artifacts (e.g., build output, tmp dir dump for debugging) if job_id == 0: #DEBUG mode save_dir = experiment_dir + "_save" else: - save_dir = os.path.join("/scratch/hpc-prf-radioml/felix/jobs/", - "CI_" + os.environ.get("CI_PIPELINE_IID") + "_" + os.environ.get("CI_PIPELINE_NAME")) + save_dir = os.path.join(os.environ.get("LOCAL_ARTIFACT_DIR"), + "CI_" + os.environ.get("CI_PIPELINE_ID") + "_" + os.environ.get("CI_PIPELINE_NAME")) print("Saving additional artifacts in path: %s" % save_dir) os.makedirs(save_dir, exist_ok=True) diff --git a/run-docker.sh b/run-docker.sh index 4047205e57..b99615e2e8 100755 --- a/run-docker.sh +++ b/run-docker.sh @@ -318,7 +318,7 @@ else SINGULARITY_EXEC="${SINGULARITY_EXEC//"-v "/"-B "}" SINGULARITY_EXEC="${SINGULARITY_EXEC//"-w "/"--pwd "}" CMD_TO_RUN="$SINGULARITY_BASE $SINGULARITY_EXEC $FINN_SINGULARITY /usr/local/bin/finn_entrypoint.sh $DOCKER_CMD" - gecho "FINN_SINGULARITY is set, launching Singularity container instead of Docker" + gecho "FINN_SINGULARITY is set, launching Singularity container instead of Docker: $FINN_SINGULARITY" fi $CMD_TO_RUN From c4dbd34de4e912674722d2b690c9f6b436de85e7 Mon Sep 17 00:00:00 2001 From: Felix Jentzsch Date: Fri, 31 Jan 2025 10:15:17 +0000 Subject: [PATCH 017/125] Disable fifo reduction testing for big models --- benchmarking/bench.py | 1 + benchmarking/cfg/metafi_fifosizing_test.json | 2 +- benchmarking/cfg/resnet50_fifosizing_test.json | 4 ++-- benchmarking/cfg/resnet50_test.json | 2 +- 4 files changed, 5 insertions(+), 4 deletions(-) diff --git a/benchmarking/bench.py b/benchmarking/bench.py index 855f57cd50..efc38eed41 100644 --- a/benchmarking/bench.py +++ b/benchmarking/bench.py @@ -163,6 +163,7 @@ def get_default_session_options_new(): log_dict["status"] = "failed" print("Run failed: " + traceback.format_exc()) exit_code = 1 + # TODO: exception catch all in builder prevents internal failures from being caught here log_dict["total_time"] = int(time.time() - start_time) 
log_dict["output"] = output_dict diff --git a/benchmarking/cfg/metafi_fifosizing_test.json b/benchmarking/cfg/metafi_fifosizing_test.json index 6a441cbcd5..7e7ff45de9 100644 --- a/benchmarking/cfg/metafi_fifosizing_test.json +++ b/benchmarking/cfg/metafi_fifosizing_test.json @@ -11,7 +11,7 @@ "fifo_rtlsim_n": [10], "fifo_throughput_factor_threshold": [0.9], - "fifo_reduction_skip_threshold": [1024], + "fifo_reduction_skip_threshold": [99999999999], "fifo_reduction_factor": [0.5], "fifo_reduction_throughput_drop_threshold": [0.01] } diff --git a/benchmarking/cfg/resnet50_fifosizing_test.json b/benchmarking/cfg/resnet50_fifosizing_test.json index b4dddc24f9..10806ef1a6 100644 --- a/benchmarking/cfg/resnet50_fifosizing_test.json +++ b/benchmarking/cfg/resnet50_fifosizing_test.json @@ -7,14 +7,14 @@ "specialize_path": ["/pc2/groups/hpc-prf-ekiapp/felix/ci_models/ResNet-50/U250_specialize_layers.json"], "floorplan_path": ["/pc2/groups/hpc-prf-ekiapp/felix/ci_models/ResNet-50/floorplan_resnet50.json"], - "board": ["U280"], + "board": ["U250"], "clock_period_ns": [4], "fifo_method": ["largefifo_rtlsim"], "fifo_rtlsim_n": [2], "fifo_throughput_factor_threshold": [0.9], - "fifo_reduction_skip_threshold": [1024], + "fifo_reduction_skip_threshold": [99999999999], "fifo_reduction_factor": [0.5], "fifo_reduction_throughput_drop_threshold": [0.01] } diff --git a/benchmarking/cfg/resnet50_test.json b/benchmarking/cfg/resnet50_test.json index df81e83661..8cef76af87 100644 --- a/benchmarking/cfg/resnet50_test.json +++ b/benchmarking/cfg/resnet50_test.json @@ -7,7 +7,7 @@ "specialize_path": ["/pc2/groups/hpc-prf-ekiapp/felix/ci_models/ResNet-50/U250_specialize_layers.json"], "floorplan_path": ["/pc2/groups/hpc-prf-ekiapp/felix/ci_models/ResNet-50/floorplan_resnet50.json"], - "board": ["U280"], + "board": ["U250"], "clock_period_ns": [4], "fifo_method": ["manual"], From 349995fbfd8a5e6e908ed495a70c23e9f540a4a3 Mon Sep 17 00:00:00 2001 From: Felix Jentzsch Date: Fri, 31 Jan 2025 
12:35:46 +0000 Subject: [PATCH 018/125] Try 2nd streamlining fix --- benchmarking/bench_base.py | 1 + benchmarking/dut/transformer_custom_steps.py | 9 +++++---- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/benchmarking/bench_base.py b/benchmarking/bench_base.py index 04583c1652..24d8369055 100644 --- a/benchmarking/bench_base.py +++ b/benchmarking/bench_base.py @@ -1062,6 +1062,7 @@ def steps_full_build_flow(self): cfg.verbose = False cfg.enable_build_pdb_debug = False cfg.force_python_rtlsim = False + #rtlsim_use_vivado_comps # TODO ? # "manual or "characterize" or "largefifo_rtlsim" if "fifo_method" in self.params: diff --git a/benchmarking/dut/transformer_custom_steps.py b/benchmarking/dut/transformer_custom_steps.py index 28f23ded7c..9c2a07d05e 100644 --- a/benchmarking/dut/transformer_custom_steps.py +++ b/benchmarking/dut/transformer_custom_steps.py @@ -687,10 +687,6 @@ def Streamline(): # noqa: Uppercase AbsorbMulIntoMultiThreshold(), Absorb1BitMulIntoMatMul(), Absorb1BitMulIntoConv(), - MoveMulPastAdd(), - AbsorbMulIntoMultiThreshold(), - AbsorbAddIntoMultiThreshold(), - MoveAddPastMul() ] ), # Streamlining scales and biases forward through residual topologies @@ -864,6 +860,11 @@ def step_streamline(model: ModelWrapper, cfg: DataflowBuildConfig): # Note: Contains some sets of nested exhaustive transformations meant for # particular architectural patterns, e.g., residual topologies. 
model = model.transform(Streamline()) + # DEBUG for streamlining after moving to MoveLinearPastFork with workaround applied + model = model.transform(MoveMulPastAdd()) + model = model.transform(AbsorbMulIntoMultiThreshold()) + model = model.transform(AbsorbAddIntoMultiThreshold()) + model = model.transform(MoveAddPastMul()) # If configured, run a verification of the transformed model on some # sample inputs if VerificationStepType.STREAMLINED_PYTHON in cfg._resolve_verification_steps(): # noqa From 139c62448a3ae7f555f3f44f4e748fcfb3eb40b1 Mon Sep 17 00:00:00 2001 From: Felix Jentzsch Date: Fri, 31 Jan 2025 14:08:55 +0000 Subject: [PATCH 019/125] ResNet disable inferdatalayouts --- benchmarking/dut/resnet50_custom_steps.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/benchmarking/dut/resnet50_custom_steps.py b/benchmarking/dut/resnet50_custom_steps.py index ddf8b0d0de..e808072baa 100644 --- a/benchmarking/dut/resnet50_custom_steps.py +++ b/benchmarking/dut/resnet50_custom_steps.py @@ -175,7 +175,7 @@ def step_resnet50_streamline(model: ModelWrapper, cfg: DataflowBuildConfig): def step_resnet50_convert_to_hw(model: ModelWrapper, cfg: DataflowBuildConfig): model.set_tensor_datatype(model.graph.input[0].name, DataType["UINT8"]) - model = model.transform(InferDataLayouts()) + #model = model.transform(InferDataLayouts()) model = model.transform(DoubleToSingleFloat()) model = model.transform(InferDataTypes()) model = model.transform(SortGraph()) @@ -196,7 +196,7 @@ def step_resnet50_convert_to_hw(model: ModelWrapper, cfg: DataflowBuildConfig): ] for trn in to_hw_transformations: model = model.transform(trn()) - model = model.transform(InferDataLayouts()) + #model = model.transform(InferDataLayouts()) model = model.transform(GiveUniqueNodeNames()) model = model.transform(InferDataTypes()) From 358a2c6b155d7fe65ea1402c87e881a689e8c446 Mon Sep 17 00:00:00 2001 From: Felix Jentzsch Date: Fri, 31 Jan 2025 16:01:36 +0000 Subject: [PATCH 020/125] Use 
dotenv artifact --- .gitlab-ci.yml | 23 ++++++++++------------- benchmarking/bench-ci.yml | 20 ++++++-------------- 2 files changed, 16 insertions(+), 27 deletions(-) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 066a7dc289..773d0ebb42 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -38,8 +38,6 @@ variables: value: "" FINN_XILINX_VERSION: value: "2022.2" - SINGULARITY_IMG_SELECT: - value: "finn_dev.sif" workflow: name: '$PIPELINE_NAME' @@ -98,10 +96,15 @@ Singularity Image Build: - docker build --no-cache -f docker/Dockerfile.finn --tag=finn_docker_export . - apptainer build --force finn_singularity_image.sif docker-daemon://finn_docker_export:latest - rsync -vh finn_singularity_image.sif $PATH_SINGULARITY_IMG_BUILD/finn-plus/finn_$CI_COMMIT_REF_SLUG.sif + - echo SINGULARITY_IMG_SELECT=finn_$CI_COMMIT_REF_SLUG.sif > FINN_environment.env after_script: # Clean caches - echo 'y' | docker image prune - echo 'y' | docker builder prune - echo 'y' | apptainer cache clean + # Save env var selecting Singularity image to be used in subsequent jobs + artifacts: + reports: + dotenv: FINN_environment.env Fetch Repos: id_tokens: @@ -135,16 +138,6 @@ FINN Test Suite 2022.2: # Do not run if test suite has been deselected - if: $TEST_SUITE == "none" when: never - # Select different Singularity image if it deviates from default (dev branch) - - changes: - paths: - - requirements.txt - - docker/Dockerfile.finn - - docker/finn_entrypoint.sh - - docker/quicktest.sh - compare_to: "dev" - variables: - SINGULARITY_IMG_SELECT: "finn_$CI_COMMIT_REF_SLUG.sif" # Always run, as long as there was no prior failure - when: on_success cache: @@ -155,12 +148,13 @@ FINN Test Suite 2022.2: variables: SCHEDULER_PARAMETERS: "-A $PROJECT_ACCOUNT -p $SLURM_PARTITION -t $SLURM_TIMEOUT $SLURM_QOS --nodes 1 --ntasks 1 --cpus-per-task $CPU_CORES --exclusive" PYTEST_PARALLEL: "$CPU_CORES" - FINN_SINGULARITY: "$PATH_SINGULARITY_IMG/finn-plus/$SINGULARITY_IMG_SELECT" + SINGULARITY_IMG_SELECT: 
"finn_dev.sif" # may be overwritten by dotenv artifact FINN_XILINX_VERSION: "2022.2" before_script: - cp -dfR .. $PATH_WORKDIR # Copy to working directory (e.g. RAMdisk) - cd $PATH_WORKDIR/finn-plus - module load system singularity + - export FINN_SINGULARITY=$PATH_SINGULARITY_IMG/finn-plus/$SINGULARITY_IMG_SELECT script: - ./run-docker.sh quicktest.sh $TEST_SUITE @@ -182,6 +176,7 @@ Bench (Manual): forward: pipeline_variables: true variables: + PARENT_PIPELINE_ID: $CI_PIPELINE_ID BENCH_CFG: "manual" Bench: @@ -196,6 +191,8 @@ Bench: strategy: depend forward: pipeline_variables: true + variables: + PARENT_PIPELINE_ID: $CI_PIPELINE_ID parallel: matrix: - BENCH_CFG: [mvau_test, resnet50_test, metafi_test] diff --git a/benchmarking/bench-ci.yml b/benchmarking/bench-ci.yml index 05980e689f..877caee30d 100644 --- a/benchmarking/bench-ci.yml +++ b/benchmarking/bench-ci.yml @@ -16,27 +16,19 @@ FINN Build: CI_JOB_JWT: aud: https://git.uni-paderborn.de stage: synth - rules: - # Select different Singularity image if it deviates from default (dev branch) - - changes: - paths: - - requirements.txt - - docker/Dockerfile.finn - - docker/finn_entrypoint.sh - - docker/quicktest.sh - compare_to: "dev" - variables: - SINGULARITY_IMG_SELECT: "finn_$CI_COMMIT_REF_SLUG.sif" - # Always run (when triggered), as long as there was no prior failure - - when: on_success + needs: + - pipeline: $PARENT_PIPELINE_ID + job: Singularity Image Build + optional: true variables: SCHEDULER_PARAMETERS: "-A $PROJECT_ACCOUNT -p $SLURM_PARTITION -t $SLURM_TIMEOUT $SLURM_QOS --nodes 1 --ntasks 1 --cpus-per-task $CPU_CORES --mem 128G --array 0-$( expr $PARALLEL_JOBS - 1 )" PYTEST_PARALLEL: "$CPU_CORES" - FINN_SINGULARITY: "$PATH_SINGULARITY_IMG/finn-plus/$SINGULARITY_IMG_SELECT" + SINGULARITY_IMG_SELECT: "finn_dev.sif" # may be overwritten by dotenv artifact before_script: - cp -dfR .. $PATH_WORKDIR # Copy to working directory (e.g. 
RAMdisk) - cd $PATH_WORKDIR/finn-plus - module load system singularity + - export FINN_SINGULARITY=$PATH_SINGULARITY_IMG/finn-plus/$SINGULARITY_IMG_SELECT script: - ./run-docker.sh python benchmarking/bench.py $BENCH_CFG cache: From cda98a665d7d443887354903400716ae08474c0b Mon Sep 17 00:00:00 2001 From: Felix Jentzsch Date: Fri, 31 Jan 2025 16:21:49 +0000 Subject: [PATCH 021/125] Try without optional --- benchmarking/bench-ci.yml | 1 - 1 file changed, 1 deletion(-) diff --git a/benchmarking/bench-ci.yml b/benchmarking/bench-ci.yml index 877caee30d..388cd18e73 100644 --- a/benchmarking/bench-ci.yml +++ b/benchmarking/bench-ci.yml @@ -19,7 +19,6 @@ FINN Build: needs: - pipeline: $PARENT_PIPELINE_ID job: Singularity Image Build - optional: true variables: SCHEDULER_PARAMETERS: "-A $PROJECT_ACCOUNT -p $SLURM_PARTITION -t $SLURM_TIMEOUT $SLURM_QOS --nodes 1 --ntasks 1 --cpus-per-task $CPU_CORES --mem 128G --array 0-$( expr $PARALLEL_JOBS - 1 )" PYTEST_PARALLEL: "$CPU_CORES" From 91da4f5bc490c1eb52d96b67bbc477bec35125b3 Mon Sep 17 00:00:00 2001 From: Felix Jentzsch Date: Fri, 31 Jan 2025 16:56:49 +0000 Subject: [PATCH 022/125] Try optional again --- benchmarking/bench-ci.yml | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/benchmarking/bench-ci.yml b/benchmarking/bench-ci.yml index 388cd18e73..44ceda3265 100644 --- a/benchmarking/bench-ci.yml +++ b/benchmarking/bench-ci.yml @@ -17,8 +17,9 @@ FINN Build: aud: https://git.uni-paderborn.de stage: synth needs: - - pipeline: $PARENT_PIPELINE_ID - job: Singularity Image Build + - job: Singularity Image Build + pipeline: $PARENT_PIPELINE_ID + optional: true variables: SCHEDULER_PARAMETERS: "-A $PROJECT_ACCOUNT -p $SLURM_PARTITION -t $SLURM_TIMEOUT $SLURM_QOS --nodes 1 --ntasks 1 --cpus-per-task $CPU_CORES --mem 128G --array 0-$( expr $PARALLEL_JOBS - 1 )" PYTEST_PARALLEL: "$CPU_CORES" From 0b92591a97f5bec677cad44c4afdccf821d0d922 Mon Sep 17 00:00:00 2001 From: Felix Jentzsch Date: Fri, 31 Jan 2025 17:18:40 
+0000 Subject: [PATCH 023/125] Workaround optional artifact --- .gitlab-ci.yml | 10 +++++++++- benchmarking/bench-ci.yml | 4 +--- 2 files changed, 10 insertions(+), 4 deletions(-) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 773d0ebb42..e8249863bf 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -123,8 +123,17 @@ Fetch Repos: key: $CI_COMMIT_SHA paths: - deps + variables: + SINGULARITY_IMG_SELECT: "finn_dev.sif" # default, may be overwritten by dotenv artifact script: - ./fetch-repos.sh + # Workaround for https://gitlab.com/gitlab-org/gitlab/-/issues/349538 + # Passing artifacts from optional parent jobs to child pipelines is not supported + # Therefore, we pass the dotenv artifact from "Singularity Image Build" through this job + - echo SINGULARITY_IMG_SELECT=$SINGULARITY_IMG_SELECT > FINN_environment_passthrough.env + artifacts: + reports: + dotenv: FINN_environment_passthrough.env FINN Test Suite 2022.2: id_tokens: @@ -148,7 +157,6 @@ FINN Test Suite 2022.2: variables: SCHEDULER_PARAMETERS: "-A $PROJECT_ACCOUNT -p $SLURM_PARTITION -t $SLURM_TIMEOUT $SLURM_QOS --nodes 1 --ntasks 1 --cpus-per-task $CPU_CORES --exclusive" PYTEST_PARALLEL: "$CPU_CORES" - SINGULARITY_IMG_SELECT: "finn_dev.sif" # may be overwritten by dotenv artifact FINN_XILINX_VERSION: "2022.2" before_script: - cp -dfR .. $PATH_WORKDIR # Copy to working directory (e.g. 
RAMdisk) diff --git a/benchmarking/bench-ci.yml b/benchmarking/bench-ci.yml index 44ceda3265..5cf0568c31 100644 --- a/benchmarking/bench-ci.yml +++ b/benchmarking/bench-ci.yml @@ -17,13 +17,11 @@ FINN Build: aud: https://git.uni-paderborn.de stage: synth needs: - - job: Singularity Image Build + - job: Fetch Repos pipeline: $PARENT_PIPELINE_ID - optional: true variables: SCHEDULER_PARAMETERS: "-A $PROJECT_ACCOUNT -p $SLURM_PARTITION -t $SLURM_TIMEOUT $SLURM_QOS --nodes 1 --ntasks 1 --cpus-per-task $CPU_CORES --mem 128G --array 0-$( expr $PARALLEL_JOBS - 1 )" PYTEST_PARALLEL: "$CPU_CORES" - SINGULARITY_IMG_SELECT: "finn_dev.sif" # may be overwritten by dotenv artifact before_script: - cp -dfR .. $PATH_WORKDIR # Copy to working directory (e.g. RAMdisk) - cd $PATH_WORKDIR/finn-plus From b7145aa1cd2dc1d96a358196d544b63560268acc Mon Sep 17 00:00:00 2001 From: Felix Jentzsch Date: Sun, 2 Feb 2025 16:04:04 +0000 Subject: [PATCH 024/125] Revert RN-50 removal of inferdatalayouts --- benchmarking/dut/resnet50_custom_steps.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/benchmarking/dut/resnet50_custom_steps.py b/benchmarking/dut/resnet50_custom_steps.py index e808072baa..ddf8b0d0de 100644 --- a/benchmarking/dut/resnet50_custom_steps.py +++ b/benchmarking/dut/resnet50_custom_steps.py @@ -175,7 +175,7 @@ def step_resnet50_streamline(model: ModelWrapper, cfg: DataflowBuildConfig): def step_resnet50_convert_to_hw(model: ModelWrapper, cfg: DataflowBuildConfig): model.set_tensor_datatype(model.graph.input[0].name, DataType["UINT8"]) - #model = model.transform(InferDataLayouts()) + model = model.transform(InferDataLayouts()) model = model.transform(DoubleToSingleFloat()) model = model.transform(InferDataTypes()) model = model.transform(SortGraph()) @@ -196,7 +196,7 @@ def step_resnet50_convert_to_hw(model: ModelWrapper, cfg: DataflowBuildConfig): ] for trn in to_hw_transformations: model = model.transform(trn()) - #model = 
model.transform(InferDataLayouts()) + model = model.transform(InferDataLayouts()) model = model.transform(GiveUniqueNodeNames()) model = model.transform(InferDataTypes()) From 503f73ee53d917500641c63161fbe7b45bd7db60 Mon Sep 17 00:00:00 2001 From: Felix Jentzsch Date: Sun, 2 Feb 2025 16:20:12 +0000 Subject: [PATCH 025/125] Sweep over fifosim n --- benchmarking/bench_base.py | 19 +++++++++++++------ benchmarking/cfg/fifosizing_test.json | 2 +- benchmarking/cfg/metafi_fifosizing_test.json | 4 +++- benchmarking/cfg/metafi_test.json | 2 +- .../cfg/resnet50_fifosizing_test.json | 4 +++- benchmarking/cfg/resnet50_test.json | 2 +- benchmarking/dut/metafi.py | 2 +- 7 files changed, 23 insertions(+), 12 deletions(-) diff --git a/benchmarking/bench_base.py b/benchmarking/bench_base.py index 24d8369055..18797579f7 100644 --- a/benchmarking/bench_base.py +++ b/benchmarking/bench_base.py @@ -902,8 +902,8 @@ def step_fifotest(self, onnx_path, cfg, build_dir): model_final = ModelWrapper(build_dir + "/intermediate_models/step_create_stitched_ip.onnx") first_node = getCustomOp(model_final.find_consumer(model_final.graph.input[0].name)) last_node = getCustomOp(model_final.find_producer(model_final.graph.output[0].name)) - input_txns_expected = np.prod(first_node.get_folded_input_shape()[:-1]) * self.params["fifo_rtlsim_n"] - output_txns_expected = np.prod(last_node.get_folded_output_shape()[:-1]) * self.params["fifo_rtlsim_n"] + input_txns_expected = np.prod(first_node.get_folded_input_shape()[:-1]) * self.params["rtlsim_n"] + output_txns_expected = np.prod(last_node.get_folded_output_shape()[:-1]) * self.params["rtlsim_n"] deadlock = sim_data["N_IN_TXNS"] != input_txns_expected or sim_data["N_OUT_TXNS"] != output_txns_expected log["deadlock"] = deadlock.tolist() @@ -961,8 +961,8 @@ def step_fifotest(self, onnx_path, cfg, build_dir): model_final = ModelWrapper(tmp_output_dir_var + "/intermediate_models/step_create_stitched_ip.onnx") first_node = 
getCustomOp(model_final.find_consumer(model_final.graph.input[0].name)) last_node = getCustomOp(model_final.find_producer(model_final.graph.output[0].name)) - input_txns_expected = np.prod(first_node.get_folded_input_shape()[:-1]) * self.params["fifo_rtlsim_n"] - output_txns_expected = np.prod(last_node.get_folded_output_shape()[:-1]) * self.params["fifo_rtlsim_n"] + input_txns_expected = np.prod(first_node.get_folded_input_shape()[:-1]) * self.params["rtlsim_n"] + output_txns_expected = np.prod(last_node.get_folded_output_shape()[:-1]) * self.params["rtlsim_n"] var_deadlock = sim_data["N_IN_TXNS"] != input_txns_expected or sim_data["N_OUT_TXNS"] != output_txns_expected # check rtlsim throughput @@ -1063,6 +1063,8 @@ def steps_full_build_flow(self): cfg.enable_build_pdb_debug = False cfg.force_python_rtlsim = False #rtlsim_use_vivado_comps # TODO ? + #cfg.default_swg_exception + #cfg.large_fifo_mem_style # "manual or "characterize" or "largefifo_rtlsim" if "fifo_method" in self.params: @@ -1075,9 +1077,14 @@ def steps_full_build_flow(self): if "fifo_strategy" in self.params: cfg.characteristic_function_strategy = self.params["fifo_strategy"] + # Batch size used for RTLSim performance measurement (and in-depth FIFO test here) # TODO: determine automatically or replace by exact instr wrapper sim - if "fifo_rtlsim_n" in self.params: - cfg.rtlsim_batch_size=self.params["fifo_rtlsim_n"] + if "rtlsim_n" in self.params: + cfg.rtlsim_batch_size=self.params["rtlsim_n"] + + # Batch size used for FIFO sizing (largefifo_rtlsim only) + if "fifo_rtlsim_n": + cfg.fifosim_n_inferences=self.params["fifo_rtlsim_n"] if "folding_path" in self.build_inputs: cfg.folding_config_file = self.build_inputs["folding_path"] diff --git a/benchmarking/cfg/fifosizing_test.json b/benchmarking/cfg/fifosizing_test.json index 20e2588282..cf49aa80a7 100644 --- a/benchmarking/cfg/fifosizing_test.json +++ b/benchmarking/cfg/fifosizing_test.json @@ -14,7 +14,7 @@ "fifo_method": ["characterize"], 
"fifo_strategy": ["analytical", "rtlsim"], - "fifo_rtlsim_n": [10], + "rtlsim_n": [10], "fifo_throughput_factor_threshold": [0.9], "fifo_reduction_skip_threshold": [64], "fifo_reduction_factor": [0.5], diff --git a/benchmarking/cfg/metafi_fifosizing_test.json b/benchmarking/cfg/metafi_fifosizing_test.json index 7e7ff45de9..02116cfeb5 100644 --- a/benchmarking/cfg/metafi_fifosizing_test.json +++ b/benchmarking/cfg/metafi_fifosizing_test.json @@ -7,9 +7,11 @@ "board": ["RFSoC2x2"], "clock_period_ns": [10], + "rtlsim_n": [10], + "fifo_method": ["largefifo_rtlsim"], - "fifo_rtlsim_n": [10], + "fifo_rtlsim_n": [2, 4, 8], "fifo_throughput_factor_threshold": [0.9], "fifo_reduction_skip_threshold": [99999999999], "fifo_reduction_factor": [0.5], diff --git a/benchmarking/cfg/metafi_test.json b/benchmarking/cfg/metafi_test.json index 7ede065c76..0ee1339441 100644 --- a/benchmarking/cfg/metafi_test.json +++ b/benchmarking/cfg/metafi_test.json @@ -9,6 +9,6 @@ "fifo_method": ["manual"], - "fifo_rtlsim_n": [10] + "rtlsim_n": [3] } ] \ No newline at end of file diff --git a/benchmarking/cfg/resnet50_fifosizing_test.json b/benchmarking/cfg/resnet50_fifosizing_test.json index 10806ef1a6..c4dc4daf78 100644 --- a/benchmarking/cfg/resnet50_fifosizing_test.json +++ b/benchmarking/cfg/resnet50_fifosizing_test.json @@ -10,9 +10,11 @@ "board": ["U250"], "clock_period_ns": [4], + "rtlsim_n": [10], + "fifo_method": ["largefifo_rtlsim"], - "fifo_rtlsim_n": [2], + "fifo_rtlsim_n": [2, 4, 8], "fifo_throughput_factor_threshold": [0.9], "fifo_reduction_skip_threshold": [99999999999], "fifo_reduction_factor": [0.5], diff --git a/benchmarking/cfg/resnet50_test.json b/benchmarking/cfg/resnet50_test.json index 8cef76af87..4937cb8395 100644 --- a/benchmarking/cfg/resnet50_test.json +++ b/benchmarking/cfg/resnet50_test.json @@ -12,6 +12,6 @@ "fifo_method": ["manual"], - "fifo_rtlsim_n": [2] + "rtlsim_n": [3] } ] \ No newline at end of file diff --git a/benchmarking/dut/metafi.py 
b/benchmarking/dut/metafi.py index 462314c2ec..7808f11856 100644 --- a/benchmarking/dut/metafi.py +++ b/benchmarking/dut/metafi.py @@ -52,7 +52,7 @@ def step_build_setup(self): shell_flow_type=build_cfg.ShellFlowType.VIVADO_ZYNQ, # TODO: generalize/adapt to new back-end #vitis_platform=vitis_platform, - split_large_fifos=False, # probably needed #TODO: account for this in FIFO reduction test + split_large_fifos=True, # probably needed #TODO: account for this in FIFO reduction test # folding_config_file=folding_config_file, # folding_config_file="/home/rz/project/finn-examples/build/vgg10-radioml/folding_config/auto_folding_config.json", From 2c0903d3cbe260ebd7ca5e18af48013e14a7205c Mon Sep 17 00:00:00 2001 From: Felix Jentzsch Date: Sun, 2 Feb 2025 17:15:04 +0000 Subject: [PATCH 026/125] Log partial results in failure --- benchmarking/bench.py | 7 ++----- benchmarking/bench_base.py | 8 +++++++- benchmarking/dut/synthetic_nonlinear.py | 4 ---- 3 files changed, 9 insertions(+), 10 deletions(-) diff --git a/benchmarking/bench.py b/benchmarking/bench.py index efc38eed41..686c97ddc2 100644 --- a/benchmarking/bench.py +++ b/benchmarking/bench.py @@ -150,23 +150,20 @@ def get_default_session_options_new(): start_time = time.time() try: bench_object.run() - output_dict = bench_object.output_dict - if output_dict is None: - output_dict = {} + if not bench_object.output_dict: log_dict["status"] = "skipped" print("Run skipped") else: log_dict["status"] = "ok" print("Run completed") except Exception: - output_dict = {} log_dict["status"] = "failed" print("Run failed: " + traceback.format_exc()) exit_code = 1 # TODO: exception catch all in builder prevents internal failures from being caught here log_dict["total_time"] = int(time.time() - start_time) - log_dict["output"] = output_dict + log_dict["output"] = bench_object.output_dict log.append(log_dict) # overwrite output log file every time to allow early abort with open(log_path, "w") as f: diff --git 
a/benchmarking/bench_base.py b/benchmarking/bench_base.py index 18797579f7..8565dfb57f 100644 --- a/benchmarking/bench_base.py +++ b/benchmarking/bench_base.py @@ -981,6 +981,8 @@ def step_fifotest(self, onnx_path, cfg, build_dir): fifo_reduction_pass.append(False) log["fifo_reduction_results"][node.name] = "fail (no drop)" + if "fifos" not in self.output_dict: + self.output_dict["fifos"] = {} self.output_dict["fifos"]["fifotest"] = log def steps_simple_model_flow(self): @@ -992,7 +994,11 @@ def steps_simple_model_flow(self): do_synth_power = self.params["do_synth_power"] if "do_synth_power" in self.params else False # Perform steps - model, dut_info = self.step_make_model() + make_model_result = self.step_make_model() + if make_model_result is None: + return + else: + model, dut_info = make_model_result # Save model for logging purposes # TODO: benchmarking infrastructure could be integrated deeper into ONNX IR and FINN custom_op/transformation infrastructure diff --git a/benchmarking/dut/synthetic_nonlinear.py b/benchmarking/dut/synthetic_nonlinear.py index a3039d6c5f..19ba3a6ce0 100644 --- a/benchmarking/dut/synthetic_nonlinear.py +++ b/benchmarking/dut/synthetic_nonlinear.py @@ -305,7 +305,3 @@ def step_build_setup(self): ) return cfg - - def step_parse_builder_output(self, build_dir): - # build output itself is not relevant here (yet) - pass From 9d71a4ab42c0220a3d2f5b4c5a8538f5e2a6479d Mon Sep 17 00:00:00 2001 From: Felix Jentzsch Date: Sun, 2 Feb 2025 17:49:44 +0000 Subject: [PATCH 027/125] Fix typo --- benchmarking/bench_base.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/benchmarking/bench_base.py b/benchmarking/bench_base.py index 8565dfb57f..895e849a53 100644 --- a/benchmarking/bench_base.py +++ b/benchmarking/bench_base.py @@ -1089,7 +1089,7 @@ def steps_full_build_flow(self): cfg.rtlsim_batch_size=self.params["rtlsim_n"] # Batch size used for FIFO sizing (largefifo_rtlsim only) - if "fifo_rtlsim_n": + if "fifo_rtlsim_n" in 
self.params: cfg.fifosim_n_inferences=self.params["fifo_rtlsim_n"] if "folding_path" in self.build_inputs: From 6c744f85f84605cc04b00e1a505d49b44acbb94f Mon Sep 17 00:00:00 2001 From: Felix Jentzsch Date: Tue, 4 Feb 2025 15:07:08 +0000 Subject: [PATCH 028/125] Fifo testcase extension --- benchmarking/bench_base.py | 4 ++ benchmarking/cfg/metafi_fifosizing_test.json | 39 +++++++++++++++- .../cfg/resnet50_fifosizing_test.json | 45 ++++++++++++++++++- 3 files changed, 86 insertions(+), 2 deletions(-) diff --git a/benchmarking/bench_base.py b/benchmarking/bench_base.py index 895e849a53..7374e4007e 100644 --- a/benchmarking/bench_base.py +++ b/benchmarking/bench_base.py @@ -1099,6 +1099,10 @@ def steps_full_build_flow(self): if "floorplan_path" in self.build_inputs: cfg.floorplan_path = self.build_inputs["floorplan_path"] + # Default of 1M cycles is insufficient for MetaFi (6M) and RN-50 (2.5M) + # TODO: make configurable or set on pipeline level? + os.environ["LIVENESS_THRESHOLD"] = "10000000" + ### BUILD ### build.build_dataflow_cfg(self.build_inputs["onnx_path"], cfg) diff --git a/benchmarking/cfg/metafi_fifosizing_test.json b/benchmarking/cfg/metafi_fifosizing_test.json index 02116cfeb5..f61ec93217 100644 --- a/benchmarking/cfg/metafi_fifosizing_test.json +++ b/benchmarking/cfg/metafi_fifosizing_test.json @@ -2,16 +2,53 @@ { "dut": ["metafi"], "model_path": ["/pc2/groups/hpc-prf-ekiapp/felix/ci_models/metafi/model.onnx"], - "folding_path": ["/pc2/groups/hpc-prf-ekiapp/felix/ci_models/metafi/auto_folding_config _metaFi_f25.json"], + "folding_path": ["/pc2/groups/hpc-prf-ekiapp/felix/ci_models/metafi/metafi_fifosizing_xsi_n2.json"], "board": ["RFSoC2x2"], "clock_period_ns": [10], "rtlsim_n": [10], + "fifo_method": ["manual"], + + "fifo_rtlsim_n": [2], + "fifo_throughput_factor_threshold": [0.9], + "fifo_reduction_skip_threshold": [99999999999], + "fifo_reduction_factor": [0.5], + "fifo_reduction_throughput_drop_threshold": [0.01] + }, + { + "dut": ["metafi"], + 
"model_path": ["/pc2/groups/hpc-prf-ekiapp/felix/ci_models/metafi/model.onnx"], + "folding_path": ["/pc2/groups/hpc-prf-ekiapp/felix/ci_models/metafi/auto_folding_config _metaFi_f25.json"], + + "board": ["RFSoC2x2"], + "clock_period_ns": [10], + + "rtlsim_n": [5], + "fifo_method": ["largefifo_rtlsim"], "fifo_rtlsim_n": [2, 4, 8], + "fifo_throttle_factor": [0.5, 2], + "fifo_throughput_factor_threshold": [0.9], + "fifo_reduction_skip_threshold": [99999999999], + "fifo_reduction_factor": [0.5], + "fifo_reduction_throughput_drop_threshold": [0.01] + }, + { + "dut": ["metafi"], + "model_path": ["/pc2/groups/hpc-prf-ekiapp/felix/ci_models/metafi/model.onnx"], + "folding_path": ["/pc2/groups/hpc-prf-ekiapp/felix/ci_models/metafi/auto_folding_config _metaFi_f25.json"], + + "board": ["RFSoC2x2"], + "clock_period_ns": [10], + + "rtlsim_n": [5], + + "fifo_method": ["characterize"], + "fifo_strategy": ["rtlsim", "analytical"], + "fifo_throughput_factor_threshold": [0.9], "fifo_reduction_skip_threshold": [99999999999], "fifo_reduction_factor": [0.5], diff --git a/benchmarking/cfg/resnet50_fifosizing_test.json b/benchmarking/cfg/resnet50_fifosizing_test.json index c4dc4daf78..075acda981 100644 --- a/benchmarking/cfg/resnet50_fifosizing_test.json +++ b/benchmarking/cfg/resnet50_fifosizing_test.json @@ -3,7 +3,7 @@ "dut": ["resnet50"], "model_path": ["/pc2/groups/hpc-prf-ekiapp/felix/ci_models/ResNet-50/resnet50_w1a2_exported.onnx"], - "folding_path": ["/pc2/groups/hpc-prf-ekiapp/felix/ci_models/ResNet-50/U250_folding_config.json"], + "folding_path": ["/pc2/groups/hpc-prf-ekiapp/felix/ci_models/ResNet-50/rn-50_fifosizing_xsi_n2.json"], "specialize_path": ["/pc2/groups/hpc-prf-ekiapp/felix/ci_models/ResNet-50/U250_specialize_layers.json"], "floorplan_path": ["/pc2/groups/hpc-prf-ekiapp/felix/ci_models/ResNet-50/floorplan_resnet50.json"], @@ -12,9 +12,52 @@ "rtlsim_n": [10], + "fifo_method": ["manual"], + + "fifo_rtlsim_n": [2], + "fifo_throughput_factor_threshold": [0.9], + 
"fifo_reduction_skip_threshold": [99999999999], + "fifo_reduction_factor": [0.5], + "fifo_reduction_throughput_drop_threshold": [0.01] + }, + { + "dut": ["resnet50"], + + "model_path": ["/pc2/groups/hpc-prf-ekiapp/felix/ci_models/ResNet-50/resnet50_w1a2_exported.onnx"], + "folding_path": ["/pc2/groups/hpc-prf-ekiapp/felix/ci_models/ResNet-50/U250_folding_config.json"], + "specialize_path": ["/pc2/groups/hpc-prf-ekiapp/felix/ci_models/ResNet-50/U250_specialize_layers.json"], + "floorplan_path": ["/pc2/groups/hpc-prf-ekiapp/felix/ci_models/ResNet-50/floorplan_resnet50.json"], + + "board": ["U250"], + "clock_period_ns": [4], + + "rtlsim_n": [5], + "fifo_method": ["largefifo_rtlsim"], "fifo_rtlsim_n": [2, 4, 8], + "fifo_throttle_factor": [0.5, 2], + "fifo_throughput_factor_threshold": [0.9], + "fifo_reduction_skip_threshold": [99999999999], + "fifo_reduction_factor": [0.5], + "fifo_reduction_throughput_drop_threshold": [0.01] + }, + { + "dut": ["resnet50"], + + "model_path": ["/pc2/groups/hpc-prf-ekiapp/felix/ci_models/ResNet-50/resnet50_w1a2_exported.onnx"], + "folding_path": ["/pc2/groups/hpc-prf-ekiapp/felix/ci_models/ResNet-50/U250_folding_config.json"], + "specialize_path": ["/pc2/groups/hpc-prf-ekiapp/felix/ci_models/ResNet-50/U250_specialize_layers.json"], + "floorplan_path": ["/pc2/groups/hpc-prf-ekiapp/felix/ci_models/ResNet-50/floorplan_resnet50.json"], + + "board": ["U250"], + "clock_period_ns": [4], + + "rtlsim_n": [5], + + "fifo_method": ["characterize"], + "fifo_strategy": ["rtlsim", "analytical"], + "fifo_throughput_factor_threshold": [0.9], "fifo_reduction_skip_threshold": [99999999999], "fifo_reduction_factor": [0.5], From b17cc23b93808b4b06b0092e9c4b40a725c37331 Mon Sep 17 00:00:00 2001 From: Felix Jentzsch Date: Wed, 5 Feb 2025 10:40:18 +0000 Subject: [PATCH 029/125] Missing change from merge branch --- benchmarking/bench_base.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/benchmarking/bench_base.py b/benchmarking/bench_base.py index 
7374e4007e..9493a12786 100644 --- a/benchmarking/bench_base.py +++ b/benchmarking/bench_base.py @@ -1092,6 +1092,10 @@ def steps_full_build_flow(self): if "fifo_rtlsim_n" in self.params: cfg.fifosim_n_inferences=self.params["fifo_rtlsim_n"] + # Manual correction factor for FIFO-Sim input throttling + if "fifo_throttle_factor" in self.params: + cfg.fifo_throttle_factor = self.params["fifo_throttle_factor"] + if "folding_path" in self.build_inputs: cfg.folding_config_file = self.build_inputs["folding_path"] if "specialize_path" in self.build_inputs: From 7956a58ebe0ed91a4eb7a6fcc2f3242bba0361e6 Mon Sep 17 00:00:00 2001 From: Felix Jentzsch Date: Thu, 6 Feb 2025 15:58:02 +0000 Subject: [PATCH 030/125] Increase stack size, NUM_WORKERS --- benchmarking/bench-ci.yml | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/benchmarking/bench-ci.yml b/benchmarking/bench-ci.yml index 5cf0568c31..c3c40d4b0e 100644 --- a/benchmarking/bench-ci.yml +++ b/benchmarking/bench-ci.yml @@ -21,15 +21,17 @@ FINN Build: pipeline: $PARENT_PIPELINE_ID variables: SCHEDULER_PARAMETERS: "-A $PROJECT_ACCOUNT -p $SLURM_PARTITION -t $SLURM_TIMEOUT $SLURM_QOS --nodes 1 --ntasks 1 --cpus-per-task $CPU_CORES --mem 128G --array 0-$( expr $PARALLEL_JOBS - 1 )" + NUM_DEFAULT_WORKERS: "$CPU_CORES" PYTEST_PARALLEL: "$CPU_CORES" before_script: - cp -dfR .. $PATH_WORKDIR # Copy to working directory (e.g. 
RAMdisk) - cd $PATH_WORKDIR/finn-plus - module load system singularity + - ulimit -s unlimited # Increase stack size limit - export FINN_SINGULARITY=$PATH_SINGULARITY_IMG/finn-plus/$SINGULARITY_IMG_SELECT script: - ./run-docker.sh python benchmarking/bench.py $BENCH_CFG - cache: + cache: key: $CI_COMMIT_SHA policy: pull paths: From 76a780b9e6097d2947b304e16d006332cc16a563 Mon Sep 17 00:00:00 2001 From: Felix Jentzsch Date: Thu, 6 Feb 2025 16:00:54 +0000 Subject: [PATCH 031/125] Adapt transformer flow to new FINN+ dev --- benchmarking/dut/transformer_custom_steps.py | 598 +------------------ 1 file changed, 7 insertions(+), 591 deletions(-) diff --git a/benchmarking/dut/transformer_custom_steps.py b/benchmarking/dut/transformer_custom_steps.py index 9c2a07d05e..1a96117e22 100644 --- a/benchmarking/dut/transformer_custom_steps.py +++ b/benchmarking/dut/transformer_custom_steps.py @@ -1,8 +1,6 @@ # ADAPTED FROM Christoph's radioml-transformer repository, specifically these files: # build_steps.py # custom/apply_config.py -# custom/composed_transformation.py -# custom/streamline.py # Copies (deep-copies) python objects import copy @@ -10,15 +8,9 @@ # Numpy for loading and comparing the verification input/output import numpy as np -# Python warning messages -import warnings - # YAML for loading experiment configurations import yaml -# Copies of python objects -from copy import deepcopy - # QONNX quantization data types from qonnx.core.datatype import DataType @@ -31,6 +23,9 @@ # Converts BatchNorm operation to affine transformation from qonnx.transformation.batchnorm_to_affine import BatchNormToAffine +# Transformation for exhaustively composing transformations +from qonnx.transformation.composed import ComposedTransformation + # If we have a convolution with a bias tensors input, QONNX and later FINN # expect the bias to be expressed as a standalone Add node following the Conv # node. 
@@ -45,8 +40,6 @@ # QONNX graph transformations for renaming and cleaning up from qonnx.transformation.general import ( - ConvertDivToMul, - ConvertSubToAdd, GiveReadableTensorNames, GiveUniqueNodeNames, GiveUniqueParameterTensors, @@ -66,7 +59,6 @@ # Transposes the initializer tensors of a Quant node instead of having a # standalone Transpose following from qonnx.transformation.quant_constant_folding import FoldTransposeIntoQuantInit -from qonnx.transformation.remove import RemoveIdentityOps # Range information structure for seeding the range analysis for converting # quantized activations to MultiThreshold @@ -142,27 +134,15 @@ # Cleanup transformation getting rid of 3d data layout from finn.transformation.squeeze import Squeeze - -# FINN streamlining transformations converting and rounding values -from finn.transformation.streamline import ConvertSignToThres, RoundAndClipThresholds from finn.transformation.streamline.absorb import ( - Absorb1BitMulIntoConv, - Absorb1BitMulIntoMatMul, AbsorbAddIntoMultiThreshold, AbsorbMulIntoMultiThreshold, AbsorbSignBiasIntoMultiThreshold, - AbsorbTransposeIntoMultiThreshold, - FactorOutMulSignMagnitude, - group_inputs_by_category, ) # FINN streamlining transformations fusing/collapsing operations of the same # kind -from finn.transformation.streamline.collapse_repeated import ( - CollapseRepeatedAdd, - CollapseRepeatedMul, - CollapseRepeatedTranspose, -) +from finn.transformation.streamline.collapse_repeated import CollapseRepeatedTranspose # FINN streamlining transformations removing nodes without real effect from the # graph @@ -173,22 +153,8 @@ # FINN streamlining transformations reordering the graph from finn.transformation.streamline.reorder import ( - MoveAddPastConv, - MoveAddPastJoinAdd, - MoveAddPastJoinConcat, MoveAddPastMul, - MoveAffinePastJoinConcat, - MoveLinearPastEltwiseAdd, - MoveLinearPastFork, - MoveMulPastFork, - MoveMulPastJoinAdd, - MoveMulPastJoinConcat, - MoveMulPastMaxPool, - MoveScalarAddPastMatMul, - 
MoveScalarLinearPastInvariants, - MoveScalarLinearPastSplit, - MoveScalarMulPastConv, - MoveScalarMulPastMatMul, + MoveMulPastAdd, MoveSqueezePastMatMul, MoveSqueezePastMultiThreshold, MoveTransposePastEltwise, @@ -197,554 +163,12 @@ MoveTransposePastJoinConcat, MoveTransposePastJoinMul, MoveTransposePastSplit, - is_scalar, ) +from finn.transformation.streamline.streamline_plus import StreamlinePlus as Streamline # Execute onnx model graphs from the dataflow parent for verification from finn.util.test import execute_parent -# FINN streamlining transformations absorbing tensors/nodes into others - - -# Composes graph transformations such that each individual transformation as -# well as the whole sequence is applied exhaustively -class ComposedTransformation(Transformation): - # Initializes the transformation given a list of transformations - def __init__(self, transformations: list[Transformation]): - # Initialize the transformation base class - super().__init__() - # Register the list of transformations to be applied in apply() - self.transformations = transformations - - # Applies the transform to a whole model graph - def apply(self, model: ModelWrapper): # noqa - # Keep track of whether the graph has been modified - graph_modified = False - # Iterate all transformations to be applied - for transformation in self.transformations: - # Start each transformation on a deep copy of the model to mimic the - # behavior of ModelWrapper.transform() - model = copy.deepcopy(model) - # Exhaustively apply the transformation until it no longer modifies - # the graph - while True: - # Apply the transformation once, reporting back whether any node - # or pattern has been modified - model, _graph_modified = transformation.apply(model) - # Keep track whether the graph has been modified at least once - graph_modified = graph_modified or _graph_modified - # Break the loop if this transformation did not change anything - if not _graph_modified: - break - # Apply the cleanup 
transformations of the ModelWrapper - model.cleanup() - # Apply some further cleanup transformations to the model graph - # removing some clutter and keeping all names readable and ordered - # at any time - model = model.transform(RemoveIdentityOps()) - model = model.transform(GiveUniqueNodeNames()) - model = model.transform(GiveReadableTensorNames()) - model = model.transform(InferShapes()) - model = model.transform(InferDataTypes()) - # Return the transformed model and indicate whether the graph actually - # has been transformed by at least one transformation so the whole - # sequence of transformations will be reapplied - return model, graph_modified - - -# # Custom conversion from Quant to MultiThreshold -# TODO: Enable once fixed... -# from custom.quant_activation_to_multithreshold import ( -# QuantActivationToMultiThreshold -# ) - - -# Moves scale factor, i.e., scalar Mul and Div, past Im2Col (and Col2Im): These -# cannot be handled by MoveScalarLinearPastInvariants as potential padding makes -# Add-Im2Col not commute to Im2Col-Add -class MoveScalesPastIm2Col(Transformation): - # Applies the transform to a whole model graph - def apply(self, model: ModelWrapper): # noqa - # Get the model graph out of the model wrapper object - graph = model.graph - # Keep track of whether the graph has been modified - graph_modified = False - # Iterate all nodes in the graph keeping track of the index - for index, node in enumerate(graph.node): - # Applies to Mul operation types - if node.op_type in {"Mul", "Div"}: - # Cannot handle fork- or join-multiplications - if model.is_fork_node(node) or model.is_join_node(node): - # Softly skip this node - continue - # Only handles one forking output for now - if len(node.output) > 1: - # Softly skip this node - continue - # The first input must be dynamically received from upstream - if model.get_initializer(node.input[0]) is not None: - # Softly skip this node - continue - # Test whether the node initializer is a scalar... 
- if not is_scalar(model.get_initializer(node.input[1])): - # Softly skip this node - continue - # As this is not a fork-node, there can be at most one successor - successor = model.find_direct_successors(node) - # If this is the final operation in the graph, there might be no - # successor - if successor is None: - # Softly skip this node - continue - # Now there is exactly one successor which needs to be extracted - # from the list - successor = successor[0] - # Handle both, Im2Col and the inverse Col2Im, as well as padding - if successor.op_type in {"Im2Col", "Col2Im", "Pad"}: - # Get names of all tensors involved in connecting the - # nodes - inp = node.input[0] # noqa: Duplicate - mid = node.output[0] - out = successor.output[0] - # Rewire the graph to feed original input into the - # Add node first - successor.input[0] = inp - # Repurpose the middle tensor for the output of the Add - successor.output[0] = mid - # The Mul operator now gets the middle tensor as its - # input - node.input[0] = mid - # Mul now produces the original output tensor - node.output[0] = out - # Delete the shape annotation of the connecting tensors - # to be re-done later - model.set_tensor_shape(mid, None) - model.set_tensor_shape(out, None) - # Track whether the graph has been modified, never - # resets to False - graph_modified = True - # Break the loop after deleting shape annotations to - # immediately re-do these before changing the next - # operator - break - # Redo datatype and shape annotations - model = model.transform(InferShapes()) - model = model.transform(InferDataTypes()) - # Return the transformed model and indicate whether the transformation - # needs to be applied again - return model, graph_modified - - -# Moves scalar linear elementwise operations past fork nodes, applies to Add, -# Mul, Sub, Div, etc. 
-class MoveScalarLinearPastFork(Transformation): - # Applies the transform to a whole model graph - def apply(self, model: ModelWrapper): # noqa - # Get the model graph out of the model wrapper object - graph = model.graph - # Keep track of whether the graph has been modified - graph_modified = False - # Iterate all nodes in the graph keeping track of the index - for index, node in enumerate(graph.node): - # Applies to Mul-like and Add-like operation types - if node.op_type in {"Add", "Sub", "Mul", "Div"}: - # Only handles non-joining forks for now - if not model.is_fork_node(node) or model.is_join_node(node): - # Softly skip this node - continue - # Only handles one forking output for now - if len(node.output) > 1: - # Softly skip this node - continue - # Test whether the node initializer is a scalar... - if not is_scalar(model.get_initializer(node.input[1])): - # Softly skip this node - continue - # We need to insert a replica of this operation in front of each - # consumer node - for consumer in model.find_direct_successors(node): - # Create an exact replica of this operator - copy = deepcopy(node) - # Insert a new unique tensor connecting the output of the - # copy to the consumer - copy.output[0] = model.make_new_valueinfo_name() - # The original node might be connecting to multiple inputs - # of the consumer... 
- for idx, inp in enumerate(consumer.input): - # Find each instance of connection from original node - if inp == node.output[0]: - # Rewire to connect to the replica - consumer.input[idx] = copy.output[0] - # Insert the new replica node into the graph - graph.node.insert(index + 1, copy) - # Remove the original node from the graph - graph.node.remove(node) - # Redo datatype and shape annotations - model = model.transform(InferShapes()) - model = model.transform(InferDataTypes()) - # Return the transformed model and indicate whether the transformation - # needs to be applied again - return model, graph_modified - - -# Moves constant elementwise multiplication past another joining multiplication -class MoveConstMulPastJoinMul(Transformation): - # Applies the transform to a whole model graph # noqa: Duplicate - def apply(self, model: ModelWrapper): # noqa - # Get the model graph out of the model wrapper object - graph = model.graph - # Keep track of whether the graph has been modified - graph_modified = False - # Iterate all nodes in the graph keeping track of the index - for index, node in enumerate(graph.node): - # Applies to Mul operation types - if node.op_type == "Mul": - # Currently does not handle fork- or join-nodes - if model.is_fork_node(node) or model.is_join_node(node): - # Softly skip this node - continue - # As this is not a fork-node, there can be at most one successor - successor = model.find_direct_successors(node) - # If Squeeze is the final operation in the graph, there might - # be no successor - if successor is None: - # Softly skip this node - continue - # Now there is exactly one successor which needs to be extracted - # from the list - successor = successor[0] - # Applies to Multiplications - if successor.op_type in {"Mul"}: - # Applies only if the second multiplication is a join-node - if model.is_join_node(successor): - # Get names of all tensors involved in connecting the - # nodes - inp = node.input[0] # noqa: Duplicate - mid = 
node.output[0] - out = successor.output[0] - # Need to match the correct input of the joining second - # multiplication - for i, name in enumerate(successor.input): - # If the successors input currently matches the - # intermediate tensors, this input needs to be - # rewired - if name == mid: - # Rewire the graph to feed original into the - # second Mul node first - successor.input[i] = inp - # Note: Do not break here as it is perfectly - # legal to connect the same tensor multiple - # times to different inputs - # Repurpose the middle tensor for the output of the - # second Mul - successor.output[0] = mid - # The first Mul operator now gets the middle tensor as - # its input - node.input[0] = mid - # The first Mul now produces the original output tensor - node.output[0] = out - # Delete the shape annotation of the connecting tensors - # to be re-done later - model.set_tensor_shape(mid, None) - model.set_tensor_shape(out, None) - # Track whether the graph has been modified, never - # resets to False - graph_modified = True - # Break the loop after deleting shape annotations to - # immediately re-do these before changing the next - # operator - break - # Redo datatype and shape annotations - model = model.transform(InferShapes()) - model = model.transform(InferDataTypes()) - # Return the transformed model and indicate whether the transformation - # needs to be applied again - return model, graph_modified - - -# Moves elementwise additions past MatMul operations: Applicable if each -# operation has one initializer input -class MoveAddPastMatMul(Transformation): - # Applies the transform to a whole model graph # noqa: Duplicate - def apply(self, model: ModelWrapper): # noqa - # Get the model graph out of the model wrapper object - graph = model.graph - # Keep track of whether the graph has been modified - graph_modified = False - # Iterate all nodes in the graph keeping track of the index - for index, node in enumerate(graph.node): - # Applies to Add operations - if 
node.op_type == "Add": - # If the add is a join operation, we do not have a constant - # added to the input - if model.is_join_node(node): - # Skip transforming this - continue - # If the Add is a fork operation we should first distribute the - # Add into the branches - if model.is_fork_node(node): - # Issue a warning to make the use aware of this potential - # transformation if the fork is moved first - warnings.warn( - f"{self.__class__.__name__}:" - f" Skipping near match: {node.name} is a fork-node," - f" try MoveLinearPastFork first" - ) - # Skip transforming this node as moving this would lead - # to messed up or detached graph - continue - # Decompose the inputs into the dynamic and the constant - # initializer input - (x_name,), (c_name,) = group_inputs_by_category(node, model) - # Now check the successor node which must be a MatMul - consumer = model.find_direct_successors(node) - # If there is no consumer, this Add seems to be last node of the - # graph - if not consumer: - # Skip transforming this - continue - # There must be exactly one consumer now - consumer = consumer[0] - # This transformation only applies to Add in front of MatMul - if not consumer.op_type == "MatMul": - # Skip this if not MatMul - continue - # MatMul may not be a join operation to apply this - # transformation - if model.is_join_node(consumer): - # Skip transforming without warning (there is nothing we can - # do about this) - continue - # Decompose the inputs to the MatMul to get the weight tensor - # name (the other input is the output of the Add) - _, (w_name,) = group_inputs_by_category(consumer, model) - # Read the weights and the constant addition tensor - w = model.get_initializer(w_name) - c = model.get_initializer(c_name) - # Determine whether the weights are the left or right input to - # the MatMul - left = w_name == consumer.input[0] - # Apply the weights to the constant tensor - c = np.matmul(w, c) if left else np.matmul(c, w) - # Insert the transformed tensor back 
into the mode as an - # initializer - model.set_initializer(c_name, c) - # The connecting tensors of this pattern - inp = x_name - mid = node.output[0] - out = consumer.output[0] - # Rewire the graph pattern connecting the input to the MatMul - # and the MatMul output to the Add node - consumer.input[1 if left else 0] = inp - # The Add now produces the original MatMul output - node.output[0] = out - # The middel tensor connects to the Add input - node.input[0 if node.input[0] == x_name else 1] = mid - # The MatMul feeds the middle tensors - consumer.output[0] = mid - # Delete the shape annotation of the connecting tensors - # to be re-done later - model.set_tensor_shape(mid, None) - model.set_tensor_shape(out, None) - # Delete the type annotations of the connecting tensors - # to be re-done later - # model.set_tensor_datatype(mid, None) - # model.set_tensor_datatype(out, None) - # Track whether the graph has been modified, never - # resets to False - graph_modified = True - # Break the loop after deleting shape annotations to - # immediately re-do these before changing the next - # operator - break - # Redo datatype and shape annotations - model = model.transform(InferShapes()) - model = model.transform(InferDataTypes()) - # Return the transformed model and indicate whether the transformation - # needs to be applied again - return model, graph_modified - - -# Moves elementwise multiplication past elementwise addition if one input to -# each of the operators is a known constant -# Note: Reverse of MoveAddPastMul -class MoveMulPastAdd(Transformation): - # Applies the transform to a whole model graph - def apply(self, model: ModelWrapper): # noqa - # Get the model graph out of the model wrapper object - graph = model.graph - # Keep track of whether the graph has been modified - graph_modified = False - # Iterate all nodes in the graph keeping track of the index - for index, node in enumerate(graph.node): - # Applies to Mul operation types - if node.op_type == "Mul": - 
# Currently does not handle fork- or join-nodes - if model.is_fork_node(node) or model.is_join_node(node): - # Softly skip this node - continue - # As this is not a fork-node, there can be at most one successor - successor = model.find_direct_successors(node) - # If Squeeze is the final operation in the graph, there might - # be no successor - if successor is None: - # Softly skip this node - continue - # Now there is exactly one successor which needs to be extracted - # from the list - successor = successor[0] - # Applies to additions - if successor.op_type in {"Add"}: - # The addition may not join as we need to know the second - # input - if not model.is_join_node(successor): - # Get the constant initializer tensors for both - # operations: y = s * x + b - _, s_name = group_inputs_by_category(node, model) - _, b_name = group_inputs_by_category(successor, model) - # Skip if either node has no constant initializer - if not s_name or not b_name: - # Skip without warning ok? - continue - # There must be exactly one constant per operations - assert len(s_name) == 1, f"To many constant inputs for {node}" - assert len(b_name) == 1, f"To many constant inputs for {successor}" - # Now read the initializer tensors - s = model.get_initializer(*s_name) - b = model.get_initializer(*b_name) - # Update the addition initializer according to the - # distributive law - model.set_initializer(*b_name, b / s) - # Get names of all tensors involved in connecting the - # nodes - inp = node.input[0] # noqa: Duplicate - mid = node.output[0] - out = successor.output[0] - # Rewire the graph to feed original input into the - # Add node first - successor.input[0] = inp - # Repurpose the middle tensor for the output of the Add - successor.output[0] = mid - # The Mul operator now gets the middle tensor as its - # input - node.input[0] = mid - # Mul now produces the original output tensor - node.output[0] = out - # Delete the shape annotation of the connecting tensors - # to be re-done later - 
model.set_tensor_shape(mid, None) - model.set_tensor_shape(out, None) - # Track whether the graph has been modified, never - # resets to False - graph_modified = True - # Break the loop after deleting shape annotations to - # immediately re-do these before changing the next - # operator - break - # Redo datatype and shape annotations - model = model.transform(InferShapes()) - model = model.transform(InferDataTypes()) - # Return the transformed model and indicate whether the transformation - # needs to be applied again - return model, graph_modified - - -# Define a set of custom streamlining transformations: These are applied once -# during the actual streamlining step and once after converting attention to -# hardware (the associated cleanup afterward might enable some Streamlining -# transformations once again) -def Streamline(): # noqa: Uppercase - # Return a set of exhaustively applies transformations - return ComposedTransformation( - [ - # On skip-connections: prefer pushing scalar multiplication forward - # before MoveAddPastMul - MoveMulPastFork(), - # The "standard" set of FINN streamlining transformations or at least - # inspired by them but applied exhaustively until none of them changes - # the graph anymore. 
- # Note: Covers most parts of non-branching linear topologies - ComposedTransformation( - [ - ConvertSubToAdd(), - ConvertDivToMul(), - BatchNormToAffine(), - ConvertSignToThres(), - MoveMulPastMaxPool(), - AbsorbSignBiasIntoMultiThreshold(), - MoveScalarLinearPastInvariants(), - MoveAddPastMul(), - MoveScalarAddPastMatMul(), - MoveAddPastConv(), - MoveScalarMulPastMatMul(), - MoveScalarMulPastConv(), - MoveAddPastMul(), - CollapseRepeatedAdd(), - CollapseRepeatedMul(), - MoveMulPastMaxPool(), - AbsorbAddIntoMultiThreshold(), - FactorOutMulSignMagnitude(), - AbsorbMulIntoMultiThreshold(), - Absorb1BitMulIntoMatMul(), - Absorb1BitMulIntoConv(), - ] - ), - # Streamlining scales and biases forward through residual topologies - # Note: This mostly covers forking and joining operations - ComposedTransformation( - [ - # Note: This is probably the most common way of joining skip - # connections, i.e., this corresponds to the original residual - # addition, i.e., y = f(x) + x - MoveLinearPastEltwiseAdd(), - MoveLinearPastFork(), #DEBUG for positional encoding streamlining, MoveScalarLinearPastFork() - MoveScalarLinearPastInvariants(), - MoveMulPastFork(), - MoveMulPastJoinAdd(), - MoveAddPastJoinAdd(), - # Note: This brings constant Muls (i.e., quantizer scales to be - # removed) forward through joining Muls (i.e., those ending up - # as actual hardware operators). 
- MoveConstMulPastJoinMul(), - ] - ), - # Streamlining scales and biases forward through shape/layout changing - # operations, i.e., mostly transposes - ComposedTransformation( - [ - # Convolution inputs and padding - MoveScalesPastIm2Col(), - # Streamlining for Split and Concat operations - MoveScalarLinearPastSplit(), - MoveAffinePastJoinConcat(), - MoveMulPastJoinConcat(), - MoveAddPastJoinConcat(), - # Move transposes around to some place where they could be removed - # later, i.e., where they collapse into identities - MoveTransposePastFork(), - MoveTransposePastSplit(), - MoveTransposePastJoinConcat(), - MoveTransposePastEltwise(), - MoveTransposePastJoinMul(), - MoveTransposePastJoinAdd(), - CollapseRepeatedTranspose(), - # Remove identity shape/layout transformations - RemoveIdentityTranspose(), - RemoveIdentityReshape(), - # Squeeze operators can be moved past the thresholding - MoveSqueezePastMultiThreshold(), - # A certain type of 4d-layout transpose can be absorbed (actually - # moved past) MultiThreshold operations - AbsorbTransposeIntoMultiThreshold(), - ] - ), - # Only round and clip after all streamlining transformations have - # been applied exhaustively. - # Note: Might still enable another round of streamlining. - RoundAndClipThresholds(), - ] - ) - # Prepares the graph to be consumed by FINN: # 1. Some graph cleanup removing unused tensors, nodes without effect and @@ -815,15 +239,7 @@ def step_prepare_graph(model: ModelWrapper, cfg: DataflowBuildConfig): # sample inputs if VerificationStepType.QONNX_TO_FINN_PYTHON in cfg._resolve_verification_steps(): # noqa verify_step(model, cfg, "lowered_python", need_parent=False) - # Apply the quantizer to MultiThreshold conversion - # Note: This is exhaustive as well as single .transform reapplies as - # long as possible. - # TODO: Enable once fixed... 
- # model = model.transform(QuantActivationToMultiThreshold(range_info)) - # If configured, run a verification of the transformed model on some - # sample inputs - if VerificationStepType.QONNX_TO_FINN_PYTHON in cfg._resolve_verification_steps(): # noqa - verify_step(model, cfg, "quant_to_thresholds_ra_python", need_parent=False) + # Apply the standard QONNX to FINN conversion step to convert the # remaining quantizers not yet covered by the new range analysis based # method From e1671b22f9ae7c977c89f740229c31bad25b4558 Mon Sep 17 00:00:00 2001 From: Felix Jentzsch Date: Thu, 6 Feb 2025 17:20:06 +0000 Subject: [PATCH 032/125] Enable Transformer benchmarks --- .gitlab-ci.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index ff3187f25d..7cf1f91e39 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -211,7 +211,7 @@ Bench: PARENT_PIPELINE_ID: $CI_PIPELINE_ID parallel: matrix: - - BENCH_CFG: [mvau_test, resnet50_test, metafi_test] + - BENCH_CFG: [mvau_test, resnet50_test, metafi_test, transformer_test, transformer_radioml_all] #dev: mvau_test #fifo: fifosizing_test, metafi_fifosizing_test, resnet50_fifosizing_test From 2cdfd86be7744820ffcd434b0a430e1efc334615 Mon Sep 17 00:00:00 2001 From: Felix Jentzsch Date: Fri, 14 Feb 2025 00:33:36 +0000 Subject: [PATCH 033/125] Add virtual HLS FIFO --- custom_hls/virtual_fifo.hpp | 81 +++++++ src/finn/builder/build_dataflow_config.py | 3 + src/finn/builder/build_dataflow_steps.py | 23 ++ .../custom_op/fpgadataflow/hls/__init__.py | 2 + .../fpgadataflow/hls/streamingfifo_hls.py | 208 ++++++++++++++++++ .../transformation/fpgadataflow/templates.py | 6 +- 6 files changed, 320 insertions(+), 3 deletions(-) create mode 100644 custom_hls/virtual_fifo.hpp create mode 100644 src/finn/custom_op/fpgadataflow/hls/streamingfifo_hls.py diff --git a/custom_hls/virtual_fifo.hpp b/custom_hls/virtual_fifo.hpp new file mode 100644 index 0000000000..85d71280bc --- /dev/null +++ 
b/custom_hls/virtual_fifo.hpp @@ -0,0 +1,81 @@ +#ifndef VIRTUAL_FIFO_HPP +#define VIRTUAL_FIFO_HPP + +#include +#include +#include + +// Utility Functions, taken from instrumentation wrapper +template +static void move( + hls::stream &src, + hls::stream &dst +) { +#pragma HLS pipeline II=1 style=flp + dst.write(src.read()); +} + +template +static void move( + hls::stream> &src, + hls::stream &dst +) { +#pragma HLS pipeline II=1 style=flp + dst.write(src.read().data); +} + +template +class Payload { +public: + using type = T; +}; +template +class Payload> { +public: + using type = T; +}; + +template +void VirtualFIFO(hls::stream > &in, hls::stream > &out, + ap_uint<32> mode, + ap_uint<32> depth, + ap_uint<32> &occupancy, + ap_uint<32> &max_occupancy) +{ + #pragma HLS pipeline II=1 style=flp + + static ap_uint<32> c_occupancy = 0; + static ap_uint<32> c_max_occupancy = 0; + #pragma HLS reset variable=c_occupancy + #pragma HLS reset variable=c_max_occupancy + + ap_uint inElem; + + bool read = mode == 0 || c_occupancy != depth; + bool write = c_occupancy != 0; + + // INPUT + if(read) + { + if(in.read_nb(inElem)) //disregard input data + { + c_occupancy++; + c_max_occupancy = (c_occupancy > c_max_occupancy) ? c_occupancy : c_max_occupancy; + } + } + + // OUTPUT + if(write) + { + if(out.write_nb(0)) //write dummy output data + { + c_occupancy--; + } + } + + // Update output status registers + occupancy = c_occupancy; + max_occupancy = c_max_occupancy; +} + +#endif diff --git a/src/finn/builder/build_dataflow_config.py b/src/finn/builder/build_dataflow_config.py index d6437a2e5c..c5e3995943 100644 --- a/src/finn/builder/build_dataflow_config.py +++ b/src/finn/builder/build_dataflow_config.py @@ -265,6 +265,9 @@ class DataflowBuildConfig: #: for each FIFO. auto_fifo_depths: Optional[bool] = True + # Enables experimental live FIFO sizing + live_fifo_sizing: Optional[bool] = False + #: Whether FIFO nodes with depth larger than 32768 will be split. 
#: Allow to configure very large FIFOs in the folding_config_file. split_large_fifos: Optional[bool] = False diff --git a/src/finn/builder/build_dataflow_steps.py b/src/finn/builder/build_dataflow_steps.py index 5163b2dbdb..fe0cb68a88 100644 --- a/src/finn/builder/build_dataflow_steps.py +++ b/src/finn/builder/build_dataflow_steps.py @@ -549,6 +549,29 @@ def step_set_fifo_depths(model: ModelWrapper, cfg: DataflowBuildConfig): `GiveUniqueNodeNames`. """ + # Experimental live FIFO-sizing, overwrites all other FIFO-related behavior + if cfg.live_fifo_sizing: + # Create all DWCs and FIFOs normally + model = model.transform(InsertDWC()) + model = model.transform(InsertFIFO(create_shallow_fifos=True)) + + # Specialize FIFOs to HLS back-end instead of default RTL back-end + for node in model.get_nodes_by_op_type("StreamingFIFO"): + node_inst = getCustomOp(node) + node_inst.set_nodeattr("preferred_impl_style", "hls") + model = model.transform(SpecializeLayers(cfg._resolve_fpga_part())) + + # Fix impl_style attribute + for node in model.get_nodes_by_op_type("StreamingFIFO_hls"): + node_inst = getCustomOp(node) + node_inst.set_nodeattr("impl_style", "virtual") + + # Clean up model + model = model.transform(GiveUniqueNodeNames()) + model = model.transform(GiveReadableTensorNames()) + + return model + if cfg.auto_fifo_depths: if cfg.auto_fifo_strategy == "characterize": model = model.transform(InsertDWC()) diff --git a/src/finn/custom_op/fpgadataflow/hls/__init__.py b/src/finn/custom_op/fpgadataflow/hls/__init__.py index 405c47a08d..d753fffa2e 100644 --- a/src/finn/custom_op/fpgadataflow/hls/__init__.py +++ b/src/finn/custom_op/fpgadataflow/hls/__init__.py @@ -47,6 +47,7 @@ StreamingDataWidthConverter_hls, ) from finn.custom_op.fpgadataflow.hls.streamingeltwise_hls import StreamingEltwise_hls +from finn.custom_op.fpgadataflow.hls.streamingfifo_hls import StreamingFIFO_hls from finn.custom_op.fpgadataflow.hls.streamingmaxpool_hls import StreamingMaxPool_hls from 
finn.custom_op.fpgadataflow.hls.thresholding_hls import Thresholding_hls from finn.custom_op.fpgadataflow.hls.tlastmarker_hls import TLastMarker_hls @@ -74,6 +75,7 @@ custom_op["StreamingEltwise_hls"] = StreamingEltwise_hls custom_op["StreamingDataWidthConverter_hls"] = StreamingDataWidthConverter_hls custom_op["StreamingMaxPool_hls"] = StreamingMaxPool_hls +custom_op["StreamingFIFO_hls"] = StreamingFIFO_hls custom_op["Thresholding_hls"] = Thresholding_hls custom_op["TLastMarker_hls"] = TLastMarker_hls custom_op["UpsampleNearestNeighbour_hls"] = UpsampleNearestNeighbour_hls diff --git a/src/finn/custom_op/fpgadataflow/hls/streamingfifo_hls.py b/src/finn/custom_op/fpgadataflow/hls/streamingfifo_hls.py new file mode 100644 index 0000000000..f17bc48fc6 --- /dev/null +++ b/src/finn/custom_op/fpgadataflow/hls/streamingfifo_hls.py @@ -0,0 +1,208 @@ +# Copyright (C) 2023, Advanced Micro Devices, Inc. +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# * Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# * Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# * Neither the name of FINN nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import numpy as np +import os +from qonnx.core.datatype import DataType + +from finn.custom_op.fpgadataflow.hlsbackend import HLSBackend +from finn.custom_op.fpgadataflow.streamingfifo import StreamingFIFO +from finn.util.data_packing import npy_to_rtlsim_input, rtlsim_output_to_npy + + +class StreamingFIFO_hls(StreamingFIFO, HLSBackend): + """HLS-based FIFO implementation. Currently only used as virtual FIFO for live FIFO-sizing.""" + + def get_nodeattr_types(self): + my_attrs = { + # Only purpose of this CustomOp for now: virtual FIFO for live FIFO-sizing + "impl_style": ("s", False, "virtual", {"virtual"}), + } + my_attrs.update(StreamingFIFO.get_nodeattr_types(self)) + my_attrs.update(HLSBackend.get_nodeattr_types(self)) + return my_attrs + + def global_includes(self): + self.code_gen_dict["$GLOBALS$"] = ['#include "virtual_fifo.hpp"'] + + def defines(self, var): + numReps = 1 + width = self.get_instream_width() + self.code_gen_dict["$DEFINES$"] = [ + "#define Width %d " % width, + "#define numReps %d" % numReps, + ] + + def strm_decl(self): + self.code_gen_dict["$STREAMDECLARATIONS$"] = [] + self.code_gen_dict["$STREAMDECLARATIONS$"].append( + 'hls::stream> in0_{} ("in0_{}");'.format( + self.get_instream_width(), self.hls_sname(), self.hls_sname() + ) + ) + self.code_gen_dict["$STREAMDECLARATIONS$"].append( + 'hls::stream> out_{} ("out_{}");'.format( + self.get_outstream_width(), self.hls_sname(), self.hls_sname() + ) + ) + + def 
docompute(self): + self.code_gen_dict["$DOCOMPUTE$"] = [ + """ + #pragma HLS dataflow disable_start_propagation + + static hls::stream> in_fifo; + static hls::stream>::type> out_fifo; + #pragma HLS stream variable=in_fifo depth=2 + #pragma HLS stream variable=out_fifo depth=2 + + // AXI-Stream -> FIFO + move(in0_%s, in_fifo); + + // Main + VirtualFIFO(in_fifo, out_fifo, mode, depth, occupancy, max_occupancy); + + // FIFO -> AXI-Stream + move(out_fifo, out_%s); + """ + % (self.hls_sname(), self.hls_sname()) + ] + + def blackboxfunction(self): + in_packed_bits = self.get_instream_width() + in_packed_hls_type = "ap_uint<%d>" % in_packed_bits + out_packed_bits = self.get_outstream_width() + out_packed_hls_type = "ap_uint<%d>" % out_packed_bits + self.code_gen_dict["$BLACKBOXFUNCTION$"] = [ + """void %s(hls::stream<%s > &in0_%s, hls::stream<%s > &out_%s, ap_uint<32> mode, + ap_uint<32> depth, ap_uint<32> &occupancy, ap_uint<32> &max_occupancy)""" + % ( + self.onnx_node.name, + in_packed_hls_type, + self.hls_sname(), + out_packed_hls_type, + self.hls_sname(), + ) + ] + + def pragmas(self): + self.code_gen_dict["$PRAGMAS$"] = [ + "#pragma HLS INTERFACE axis port=in0_" + self.hls_sname() + ] + self.code_gen_dict["$PRAGMAS$"].append( + "#pragma HLS INTERFACE axis port=out_" + self.hls_sname() + ) + self.code_gen_dict["$PRAGMAS$"].append("#pragma HLS INTERFACE s_axilite port=mode") + self.code_gen_dict["$PRAGMAS$"].append("#pragma HLS INTERFACE s_axilite port=depth") + self.code_gen_dict["$PRAGMAS$"].append("#pragma HLS INTERFACE s_axilite port=occupancy") + self.code_gen_dict["$PRAGMAS$"].append("#pragma HLS INTERFACE s_axilite port=max_occupancy") + self.code_gen_dict["$PRAGMAS$"].append("#pragma HLS INTERFACE ap_ctrl_none port=return") + + def get_verilog_top_module_intf_names(self): + # Overload default HWCustomOp implementation to add axilite control IF + intf_names = super().get_verilog_top_module_intf_names() + intf_names["axilite"] = ["s_axi_control"] + return 
intf_names + + def execute_node(self, context, graph): + mode = self.get_nodeattr("exec_mode") + node = self.onnx_node + exp_shape = self.get_normal_input_shape() + folded_ishape = self.get_folded_input_shape() + + # TODO ensure codegen dir exists + if mode == "cppsim": + code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim") + elif mode == "rtlsim": + code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen") + else: + raise Exception( + """Invalid value for attribute exec_mode! Is currently set to: {} + has to be set to one of the following value ("cppsim", "rtlsim")""".format( + mode + ) + ) + + inp = context[node.input[0]] + assert str(inp.dtype) == "float32", "Input datatype is not float32" + assert inp.shape == tuple(exp_shape), "Input shape does not match expected shape." + + if self.get_input_datatype() == DataType["BIPOLAR"]: + # store bipolar activations as binary + inp = (inp + 1) / 2 + export_idt = DataType["BINARY"] + else: + export_idt = self.get_input_datatype() + # reshape input into folded shape + reshaped_input = inp.reshape(folded_ishape) + # make copy before saving array + reshaped_input = reshaped_input.copy() + np.save(os.path.join(code_gen_dir, "input_0.npy"), reshaped_input) + + if mode == "cppsim": + output = inp + output = np.asarray([output], dtype=np.float32).reshape(*exp_shape) + context[node.output[0]] = output + + elif mode == "rtlsim": + sim = self.get_rtlsim() + nbits = self.get_instream_width() + rtlsim_inp = npy_to_rtlsim_input( + "{}/input_0.npy".format(code_gen_dir), export_idt, nbits + ) + super().reset_rtlsim(sim) + super().toggle_clk(sim) + rtlsim_output = self.rtlsim(sim, rtlsim_inp) + odt = export_idt + target_bits = odt.bitwidth() + packed_bits = self.get_outstream_width() + out_npy_path = "{}/output.npy".format(code_gen_dir) + out_shape = self.get_folded_output_shape() + rtlsim_output_to_npy( + rtlsim_output, out_npy_path, odt, out_shape, packed_bits, target_bits + ) + # load and reshape output + output = 
np.load(out_npy_path) + output = np.asarray([output], dtype=np.float32).reshape(exp_shape) + context[node.output[0]] = output + else: + raise Exception( + """Invalid value for attribute exec_mode! Is currently set to: {} + has to be set to "rtlsim" """.format( + mode + ) + ) + # binary -> bipolar if needed + if self.get_output_datatype() == DataType["BIPOLAR"]: + out = context[node.output[0]] + out = 2 * out - 1 + context[node.output[0]] = out + assert context[node.output[0]].shape == tuple( + exp_shape + ), """Output + shape doesn't match expected shape, should be same as input shape""" diff --git a/src/finn/transformation/fpgadataflow/templates.py b/src/finn/transformation/fpgadataflow/templates.py index ccf4e7a943..5c521720c4 100644 --- a/src/finn/transformation/fpgadataflow/templates.py +++ b/src/finn/transformation/fpgadataflow/templates.py @@ -92,9 +92,9 @@ custom_zynq_shell_template = """ set FREQ_MHZ %s set NUM_AXILITE %d -if {$NUM_AXILITE > 9} { - error "Maximum 10 AXI-Lite interfaces supported" -} +#if {$NUM_AXILITE > 9} { +# error "Maximum 10 AXI-Lite interfaces supported" +#} set NUM_AXIMM %d set BOARD %s set FPGA_PART %s From 7c04eb6e628cd21820bcef02ff624edfa3702b22 Mon Sep 17 00:00:00 2001 From: Felix Jentzsch Date: Fri, 14 Feb 2025 16:31:29 +0000 Subject: [PATCH 034/125] Integrate instrumentation into ZynqBuild --- custom_hls/instrumentation.template.cpp | 307 ++++++++++++++++++ custom_hls/instrumentation_sim.template.tcl | 67 ++++ custom_hls/instrumentation_tb.template.sv | 172 ++++++++++ src/finn/builder/build_dataflow_config.py | 4 + src/finn/builder/build_dataflow_steps.py | 22 ++ .../transformation/fpgadataflow/floorplan.py | 8 +- .../fpgadataflow/instrumentation.py | 203 ++++++++++++ .../fpgadataflow/make_zynq_proj.py | 88 ++++- 8 files changed, 860 insertions(+), 11 deletions(-) create mode 100644 custom_hls/instrumentation.template.cpp create mode 100644 custom_hls/instrumentation_sim.template.tcl create mode 100644 
custom_hls/instrumentation_tb.template.sv create mode 100644 src/finn/transformation/fpgadataflow/instrumentation.py diff --git a/custom_hls/instrumentation.template.cpp b/custom_hls/instrumentation.template.cpp new file mode 100644 index 0000000000..bf15d77a87 --- /dev/null +++ b/custom_hls/instrumentation.template.cpp @@ -0,0 +1,307 @@ +/****************************************************************************** + * Copyright (c) 2023, Xilinx, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, + * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION). 
HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, + * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR + * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF + * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + ******************************************************************************* + * @brief Instrumentation wrapper module for FINN IP characterization. + * @author Thomas B. Preusser + * @details + * Instrumentation wrapper intercepting the feature map input to and + * the feature map output from a FINN IP to measure processing latency and + * initiation interval in terms of clock cycles. The most recent readings + * are exposed via AXI-light. + * This wrapper can run the FINN IP detached from an external data source + * and sink by feeding LFSR-generated data and sinking the output without + * backpressure. + * This module is currently not integrated with the FINN compiler. It must + * be instantiated and integrated with the rest of the system in a manual + * process. + * + * @param PENDING maximum number of feature maps in the FINN dataflow pipeline + * @param ILEN number of input transactions per IFM + * @param OLEN number of output transactions per OFM + * @param KO number of subwords within output payload vector + * @param TI type of input payload vector + * @param TO type of output payload vector + *******************************************************************************/ + + #include + #include + #include + #include + + // Module Configuration + constexpr unsigned PENDING = @PENDING@; // Max. 
feature maps in flight + constexpr unsigned ILEN = @ILEN@; // Input words per IFM + constexpr unsigned OLEN = @OLEN@; // Output words per OFM + constexpr unsigned KO = @KO@; // Subwords within OFM transaction word + using TI = @TI@; // IFM transaction word + using TO = @TO@; // OFM transaction word + + //--------------------------------------------------------------------------- + // Utility Functions + static constexpr unsigned clog2 (unsigned x) { return x<2? 0 : 1+clog2((x+1)/2); } + static constexpr unsigned clog2nz(unsigned x) { return std::max(1u, clog2(x)); } + + template + static void move( + hls::stream &src, + hls::stream &dst + ) { + #pragma HLS pipeline II=1 style=flp + dst.write(src.read()); + } + + template + static void move( + hls::stream> &src, + hls::stream &dst + ) { + #pragma HLS pipeline II=1 style=flp + dst.write(src.read().data); + } + + template + class Payload { + public: + using type = T; + }; + template + class Payload> { + public: + using type = T; + }; + + /** + * Computes a checksum over a forwarded stream assumed to carry frames of + * N words further subdivided into K subwords. + * - Subword slicing can be customized typically by using a lambda. + * The provided DefaultSubwordSlicer assumes an `ap_(u)int`-like word + * type with a member `width` and a range-based slicing operator. It + * further assumes a little-endian arrangement of subwords within words + * for the canonical subword stream order. + * - Subwords wider than 23 bits are folded using bitwise XOR across + * slices of 23 bits starting from the LSB. + * - The folded subword values are weighted according to their position + * in the stream relative to the start of frame by a periodic weight + * sequence 1, 2, 3, ... + * - The weighted folded subword values are reduced to a checksum by an + * accumulation module 2^24. + * - A checksum is emitted for each completed frame. It is the concatenation + * of an 8-bit (modulo 256) frame counter and the 24-bit frame checksum. 
+ */ + template + class DefaultSubwordSlicer { + static_assert(T::width%K == 0, "Word size must be subword multiple."); + static constexpr unsigned W = T::width/K; + public: + ap_uint operator()(T const &x, unsigned const j) const { + #pragma HLS inline + return x((j+1)*W-1, j*W); + } + }; + + //--------------------------------------------------------------------------- + // Instrumentation Core + template< + unsigned PENDING, + unsigned ILEN, + unsigned OLEN, + unsigned KO, + typename TI, + typename TO + > + void instrument( + hls::stream &finnix, + hls::stream &finnox, + ap_uint<32> cfg, // [0] - 0:hold, 1:lfsr; [31:16] - LFSR seed + ap_uint<32> &status, // [0] - timestamp overflow; [1] - timestamp underflow + ap_uint<32> &latency, + ap_uint<32> &interval, + ap_uint<32> &checksum, + ap_uint<32> &min_latency + ) { + #pragma HLS pipeline II=1 style=flp + + // Timestamp Management State + using clock_t = ap_uint<32>; + static clock_t cnt_clk = 0; + #pragma HLS reset variable=cnt_clk + hls::stream timestamps; + #pragma HLS stream variable=timestamps depth=PENDING + static bool timestamp_ovf = false; + static bool timestamp_unf = false; + #pragma HLS reset variable=timestamp_ovf + #pragma HLS reset variable=timestamp_unf + + // Input Feed & Generation + constexpr unsigned LFSR_WIDTH = (TI::width+15)/16 * 16; + static ap_uint icnt = 0; + static ap_uint lfsr; + #pragma HLS reset variable=icnt + #pragma HLS reset variable=lfsr off + if(!finnix.full()) { + + bool const first = icnt == 0; + bool wr; + if(first) { + // Start of new feature map + wr = cfg[0]; + for(unsigned i = 0; i < LFSR_WIDTH; i += 16) { + #pragma HLS unroll + lfsr(15+i, i) = cfg(31, 16) ^ (i>>4)*33331; + } + } + else { + // Advance LFSR + wr = true; + for(unsigned i = 0; i < LFSR_WIDTH; i += 16) { + #pragma HLS unroll + lfsr(15+i, i) = (lfsr(15+i, i) >> 1) ^ ap_uint<16>(lfsr[i]? 
0 : 0x8805); + } + } + + if(wr) { + finnix.write_nb(lfsr); + if(first) timestamp_ovf |= !timestamps.write_nb(cnt_clk); + icnt = icnt == ILEN-1? decltype(icnt)(0) : decltype(icnt)(icnt + 1); + } + } + + // Output Tracking + static ap_uint ocnt = 0; + #pragma HLS reset variable=ocnt + static clock_t ts1 = 0; // last output timestamp + static clock_t last_latency = 0; + static clock_t last_interval = 0; + static clock_t cur_min_latency = ~0; + #pragma HLS reset variable=ts1 + #pragma HLS reset variable=last_latency + #pragma HLS reset variable=last_interval + #pragma HLS reset variable=cur_min_latency + + static ap_uint<8> pkts = 0; + #pragma HLS reset variable=pkts + static ap_uint< 2> coeff[3]; + static ap_uint<24> psum; + static ap_uint<32> last_checksum = 0; + #pragma HLS reset variable=coeff off + #pragma HLS reset variable=psum off + #pragma HLS reset variable=last_checksum + + TO oval; + if(finnox.read_nb(oval)) { + // Start of new output feature map + if(ocnt == 0) { + for(unsigned i = 0; i < 3; i++) coeff[i] = i+1; + psum = 0; + } + + // Update checksum + for(unsigned j = 0; j < KO; j++) { + #pragma HLS unroll + auto const v0 = DefaultSubwordSlicer()(oval, j); + constexpr unsigned W = 1 + (decltype(v0)::width-1)/23; + ap_uint v = v0; + ap_uint< 23> w = 0; + for(unsigned k = 0; k < W; k++) w ^= v(23*k+22, 23*k); + psum += (coeff[j%3][1]? (w, ap_uint<1>(0)) : ap_uint<24>(0)) + (coeff[j%3][0]? 
w : ap_uint<23>(0)); + } + + // Re-align coefficients + for(unsigned j = 0; j < 3; j++) { + #pragma HLS unroll + ap_uint<3> const cc = coeff[j] + ap_uint<3>(KO%3); + coeff[j] = cc(1, 0) + cc[2]; + } + + // Track frame position + if(ocnt != OLEN-1) ocnt++; + else { + clock_t ts0; + if(!timestamps.read_nb(ts0)) timestamp_unf = true; + else { + last_latency = cnt_clk - ts0; // completion - start + last_interval = cnt_clk - ts1; // completion - previous completion + cur_min_latency = std::min(cur_min_latency, last_latency); + ts1 = cnt_clk; // mark completion ^ + } + ocnt = 0; + + last_checksum = (pkts++, psum); + } + } + + // Advance Timestamp Counter + cnt_clk++; + + // Copy Status Outputs + status = timestamp_ovf | (timestamp_unf << 1); + latency = last_latency; + interval = last_interval; + checksum = last_checksum; + min_latency = cur_min_latency; + + } // instrument() + + void instrumentation_wrapper( + hls::stream &finnix, + hls::stream &finnox, + ap_uint<32> cfg, + ap_uint<32> &status, + ap_uint<32> &latency, + ap_uint<32> &interval, + ap_uint<32> &checksum, + ap_uint<32> &min_latency + ) { + #pragma HLS interface axis port=finnix + #pragma HLS interface axis port=finnox + #pragma HLS interface s_axilite bundle=ctrl port=cfg + #pragma HLS interface s_axilite bundle=ctrl port=status + #pragma HLS interface s_axilite bundle=ctrl port=latency + #pragma HLS interface s_axilite bundle=ctrl port=interval + #pragma HLS interface s_axilite bundle=ctrl port=checksum + #pragma HLS interface s_axilite bundle=ctrl port=min_latency + #pragma HLS interface ap_ctrl_none port=return + + #pragma HLS dataflow disable_start_propagation + static hls::stream finnix0; + static hls::stream::type> finnox0; + #pragma HLS stream variable=finnix0 depth=2 + #pragma HLS stream variable=finnox0 depth=2 + + // AXI-Stream -> FIFO + move(finnox, finnox0); + + // Main + instrument(finnix0, finnox0, cfg, status, latency, interval, checksum, min_latency); + + // FIFO -> AXI-Stream + move(finnix0, 
finnix); + + } // instrumentation_wrapper diff --git a/custom_hls/instrumentation_sim.template.tcl b/custom_hls/instrumentation_sim.template.tcl new file mode 100644 index 0000000000..4875d799e2 --- /dev/null +++ b/custom_hls/instrumentation_sim.template.tcl @@ -0,0 +1,67 @@ +# Copyright (c) 2023 Advanced Micro Devices, Inc. +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# * Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# * Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# * Neither the name of AMD nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +set fpga_part @FPGA_PART@ +#set output_root ".." 
+# path to IP folder for instrumentation wrapper, change as needed +#set instrwrp_ip_dir "$output_root/instrumentation_wrapper/project_instrwrap/sol1/impl/ip" +# path to IP folder for FINN IP, change as needed +#set finn_ip_dir "$output_root/stitched_ip/ip" + +create_project -force instr_sim_proj instr_sim_proj/ -part $fpga_part +create_bd_design "dut" +update_compile_order -fileset sources_1 +#set_property ip_repo_paths [list $instrwrp_ip_dir] [current_project] +set_property ip_repo_paths [concat [get_property ip_repo_paths [current_project]] @IP_DIRS_STR@] [current_project] +update_ip_catalog + + +create_bd_cell -type ip -vlnv xilinx_finn:finn:finn_design:1.0 finn_design_0 +create_bd_cell -type ip -vlnv xilinx.com:hls:instrumentation_wrapper:1.0 instrumentation_wrap_0 +connect_bd_intf_net [get_bd_intf_pins instrumentation_wrap_0/finnix] [get_bd_intf_pins finn_design_0/s_axis_0] +connect_bd_intf_net [get_bd_intf_pins finn_design_0/m_axis_0] [get_bd_intf_pins instrumentation_wrap_0/finnox] +make_bd_intf_pins_external [get_bd_intf_pins instrumentation_wrap_0/s_axi_ctrl] +make_bd_pins_external [get_bd_pins instrumentation_wrap_0/ap_clk] +make_bd_pins_external [get_bd_pins instrumentation_wrap_0/ap_rst_n] +connect_bd_net [get_bd_ports ap_clk_0] [get_bd_pins finn_design_0/ap_clk] +connect_bd_net [get_bd_ports ap_rst_n_0] [get_bd_pins finn_design_0/ap_rst_n] + +save_bd_design + +update_compile_order -fileset sources_1 +make_wrapper -files [get_files instr_sim_proj/instr_sim_proj.srcs/sources_1/bd/dut/dut.bd] -top +add_files -norecurse instr_sim_proj/instr_sim_proj.gen/sources_1/bd/dut/hdl/dut_wrapper.v + +set_property SOURCE_SET sources_1 [get_filesets sim_1] +add_files -fileset sim_1 ./instrwrap_testbench.sv +update_compile_order -fileset sim_1 + +set_property synth_checkpoint_mode None [get_files instr_sim_proj/instr_sim_proj.srcs/sources_1/bd/dut/dut.bd] +generate_target Simulation [get_files instr_sim_proj/instr_sim_proj.srcs/sources_1/bd/dut/dut.bd] 
+launch_simulation -simset sim_1 -mode behavioral +run all diff --git a/custom_hls/instrumentation_tb.template.sv b/custom_hls/instrumentation_tb.template.sv new file mode 100644 index 0000000000..933104c623 --- /dev/null +++ b/custom_hls/instrumentation_tb.template.sv @@ -0,0 +1,172 @@ +// Copyright (c) 2023 Advanced Micro Devices, Inc. +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// +// * Redistributions of source code must retain the above copyright notice, this +// list of conditions and the following disclaimer. +// +// * Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// +// * Neither the name of AMD nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +// DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +// FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +// DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +// SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ + +module tb #( + // sampling period (in cycles) for reading instrumentation wrapper registers + // TODO: make configurable or adjust automatically? + int unsigned INSTR_READ_PERIOD = 10000, + // 16-bit LFSR seed for generating fixed random data + int unsigned LFSR_SEED = 1 +)(); + + +// Clock & Reset +logic ap_clk = 0; +always #5ns ap_clk = !ap_clk; +logic ap_rst_n = 0; +uwire ap_rst = !ap_rst_n; + +// wires for instrumentation wrapper AXI lite interface +logic [31:0] axilite_ctrl_araddr = 'x; +uwire axilite_ctrl_arready; +logic axilite_ctrl_arvalid = 0; +logic [31:0] axilite_ctrl_awaddr = 'x; +uwire axilite_ctrl_awready; +logic axilite_ctrl_awvalid = 0; +uwire axilite_ctrl_bready = 1; +uwire [1:0]axilite_ctrl_bresp; +uwire axilite_ctrl_bvalid; +uwire [31:0]axilite_ctrl_rdata; +logic axilite_ctrl_rready = 1; +uwire [1:0]axilite_ctrl_rresp; +uwire axilite_ctrl_rvalid; +logic [31:0] axilite_ctrl_wdata = 'x; +uwire axilite_ctrl_wready; +uwire [3:0]axilite_ctrl_wstrb = 4'b1111; +logic axilite_ctrl_wvalid = 0; + + + + +dut_wrapper dut_wrapper_inst ( + .ap_clk_0(ap_clk), .ap_rst_n_0(ap_rst_n), + .s_axi_ctrl_0_araddr(axilite_ctrl_araddr), + .s_axi_ctrl_0_arready(axilite_ctrl_arready), + .s_axi_ctrl_0_arvalid(axilite_ctrl_arvalid), + .s_axi_ctrl_0_awaddr(axilite_ctrl_awaddr), + .s_axi_ctrl_0_awready(axilite_ctrl_awready), + .s_axi_ctrl_0_awvalid(axilite_ctrl_awvalid), + .s_axi_ctrl_0_bready(axilite_ctrl_bready), + .s_axi_ctrl_0_bresp(axilite_ctrl_bresp), + .s_axi_ctrl_0_bvalid(axilite_ctrl_bvalid), + .s_axi_ctrl_0_rdata(axilite_ctrl_rdata), + .s_axi_ctrl_0_rready(axilite_ctrl_rready), + .s_axi_ctrl_0_rresp(axilite_ctrl_rresp), + .s_axi_ctrl_0_rvalid(axilite_ctrl_rvalid), + .s_axi_ctrl_0_wdata(axilite_ctrl_wdata), + .s_axi_ctrl_0_wready(axilite_ctrl_wready), + .s_axi_ctrl_0_wstrb(axilite_ctrl_wstrb), + .s_axi_ctrl_0_wvalid(axilite_ctrl_wvalid) +); + +//--------------------------------------------------------------------------- + +initial begin + $timeformat(-9, 2, " 
ns"); + // perform reset + repeat(100) @(posedge ap_clk); + ap_rst_n <= 1; + $display("Reset complete"); + repeat(100) @(posedge ap_clk); + // instrumentation wrapper configuration: + // set up LFSR seed + start data generation + output sink + axilite_ctrl_awaddr <= 'h10; + axilite_ctrl_awvalid <= 1; + axilite_ctrl_wdata <= (LFSR_SEED << 16) | 'b11; + axilite_ctrl_wvalid <= 1; + repeat(8) begin + @(posedge ap_clk); + if(axilite_ctrl_wready && axilite_ctrl_awready) break; + end + axilite_ctrl_wvalid <= 0; + axilite_ctrl_awvalid <= 0; + axilite_ctrl_awaddr <= 'x; + axilite_ctrl_wdata <= 'x; + while(1) begin + axilite_ctrl_araddr <= 'h18; + axilite_ctrl_arvalid <= 1; + repeat(8) begin + @(posedge ap_clk); + if(axilite_ctrl_rvalid) begin + $display("[t=%0t] STATUS_I = %0d", $time, axilite_ctrl_rdata); + break; + end + end + axilite_ctrl_araddr <= 'h20; + axilite_ctrl_arvalid <= 1; + repeat(8) begin + @(posedge ap_clk); + if(axilite_ctrl_rvalid) begin + $display("[t=%0t] STATUS_O = %0d", $time, axilite_ctrl_rdata); + break; + end + end + axilite_ctrl_araddr <= 'h28; + axilite_ctrl_arvalid <= 1; + repeat(8) begin + @(posedge ap_clk); + if(axilite_ctrl_rvalid) begin + $display("[t=%0t] LATENCY = %0d", $time, axilite_ctrl_rdata); + break; + end + end + axilite_ctrl_araddr <= 'h38; + axilite_ctrl_arvalid <= 1; + repeat(8) begin + @(posedge ap_clk); + if(axilite_ctrl_rvalid) begin + $display("[t=%0t] INTERVAL = %0d", $time, axilite_ctrl_rdata); + break; + end + end + axilite_ctrl_araddr <= 'h48; + axilite_ctrl_arvalid <= 1; + repeat(8) begin + @(posedge ap_clk); + if(axilite_ctrl_rvalid) begin + $display("[t=%0t] CHECKSUM = %8x", $time, axilite_ctrl_rdata); + if(axilite_ctrl_rdata) begin + $display("Nonzero checksum detected, stopping simulation"); + $finish; + // TODO: simulate for configurable number of frames, like this: + // if(axilite_ctrl_rdata[31:24] == 47) begin + // $display("Frame number 48 detected, stopping simulation"); + // $finish; + // end + end + break; + 
end + end + axilite_ctrl_arvalid <= 0; + repeat(INSTR_READ_PERIOD) @(posedge ap_clk); + end +end + + +endmodule : tb diff --git a/src/finn/builder/build_dataflow_config.py b/src/finn/builder/build_dataflow_config.py index d6437a2e5c..08545ebc14 100644 --- a/src/finn/builder/build_dataflow_config.py +++ b/src/finn/builder/build_dataflow_config.py @@ -314,6 +314,10 @@ class DataflowBuildConfig: #: debug signals in the generated hardware) enable_hw_debug: Optional[bool] = False + #: Whether the accelerator will be simulated and synthesized with an + #: instrumentation wrapper attached to accurately measure performance. + enable_instrumentation: Optional[bool] = False + #: Whether pdb postmortem debuggig will be launched when the build fails enable_build_pdb_debug: Optional[bool] = True diff --git a/src/finn/builder/build_dataflow_steps.py b/src/finn/builder/build_dataflow_steps.py index 5163b2dbdb..a4481ed778 100644 --- a/src/finn/builder/build_dataflow_steps.py +++ b/src/finn/builder/build_dataflow_steps.py @@ -89,6 +89,7 @@ from finn.transformation.fpgadataflow.hlssynth_ip import HLSSynthIP from finn.transformation.fpgadataflow.insert_dwc import InsertDWC from finn.transformation.fpgadataflow.insert_fifo import InsertFIFO +from finn.transformation.fpgadataflow.insert_tlastmarker import InsertTLastMarker from finn.transformation.fpgadataflow.make_pynq_driver import MakePYNQDriver from finn.transformation.fpgadataflow.make_zynq_proj import ZynqBuild from finn.transformation.fpgadataflow.minimize_accumulator_width import ( @@ -644,6 +645,26 @@ def step_create_stitched_ip(model: ModelWrapper, cfg: DataflowBuildConfig): """Create stitched IP for a graph after all HLS IP blocks have been generated. 
Depends on the DataflowOutputType.STITCHED_IP output product.""" + # introduce tLAST marker, required for instrumentation + if cfg.enable_instrumentation: + model = model.transform( + InsertTLastMarker( + # only insert marker on output (input TLAST is ignored for these use-cases anyway) + both=False, + # use ap_axiu instead of qdma_axis + external=False, + # static number of iterations (based on what the compiler/folding sets up) + dynamic=False, + ) + ) + # give a proper name to the inserted node, important for codegen + # TODO: deal with multi-I/O accelerators? + model.graph.node[-1].name = "TLastMarker_0" + # re-run codegen and HLS IP gen, will affect only the new TLastMarker layer assuming + # all other IPs have been generated already + model = model.transform(PrepareIP(cfg._resolve_fpga_part(), cfg._resolve_hls_clk_period())) + model = model.transform(HLSSynthIP()) + if DataflowOutputType.STITCHED_IP in cfg.generate_outputs: stitched_ip_dir = cfg.output_dir + "/stitched_ip" model = model.transform( @@ -806,6 +827,7 @@ def step_synthesize_bitfile(model: ModelWrapper, cfg: DataflowBuildConfig): cfg.board, cfg.synth_clk_period_ns, cfg.enable_hw_debug, + cfg.enable_instrumentation, partition_model_dir=partition_model_dir, ) ) diff --git a/src/finn/transformation/fpgadataflow/floorplan.py b/src/finn/transformation/fpgadataflow/floorplan.py index b24145afcb..7d93ff88fc 100644 --- a/src/finn/transformation/fpgadataflow/floorplan.py +++ b/src/finn/transformation/fpgadataflow/floorplan.py @@ -99,9 +99,13 @@ def apply(self, model): # if we have SLR assignment already. 
use that if node_slr != -1: continue + # if available, use the SLR of the preceding node srcnode = model.find_producer(node.input[0]) - node_slr = getCustomOp(srcnode).get_nodeattr("slr") - node_inst.set_nodeattr("slr", node_slr) + if srcnode is not None: + node_slr = getCustomOp(srcnode).get_nodeattr("slr") + node_inst.set_nodeattr("slr", node_slr) + else: + node_inst.set_nodeattr("slr", default_slr) if unassigned_nodes > 0: warnings.warn( diff --git a/src/finn/transformation/fpgadataflow/instrumentation.py b/src/finn/transformation/fpgadataflow/instrumentation.py new file mode 100644 index 0000000000..7f37c5ed14 --- /dev/null +++ b/src/finn/transformation/fpgadataflow/instrumentation.py @@ -0,0 +1,203 @@ +import numpy as np +import os +import subprocess +from qonnx.custom_op.registry import getCustomOp +from qonnx.transformation.base import Transformation + +from finn.custom_op.fpgadataflow.templates import ipgentcl_template +from finn.util.basic import make_build_dir +from finn.util.hls import CallHLS + + +# TODO: duplicate function from make_zynq_proj.py +def collect_ip_dirs(model, ipstitch_path): + # collect list of all IP dirs + ip_dirs = [] + need_memstreamer = False + for node in model.graph.node: + node_inst = getCustomOp(node) + ip_dir_value = node_inst.get_nodeattr("ip_path") + assert os.path.isdir( + ip_dir_value + ), """The directory that should + contain the generated ip blocks doesn't exist.""" + ip_dirs += [ip_dir_value] + if node.op_type.startswith("MVAU") or node.op_type == "Thresholding_hls": + if node_inst.get_nodeattr("mem_mode") == "internal_decoupled": + need_memstreamer = True + ip_dirs += [ipstitch_path + "/ip"] + if need_memstreamer: + # add RTL streamer IP + ip_dirs.append("$::env(FINN_ROOT)/finn-rtllib/memstream") + return ip_dirs + + +class GenerateInstrumentationIP(Transformation): + def __init__( + self, + fpga_part, + clk_period_ns, + format="ip", # "ip" for Vivado (Zynq) or "xo" for Vitis (Alveo/Versal) + ): + super().__init__() + 
self.fpga_part = fpga_part + self.clk_period_ns = clk_period_ns + self.format = format + + def apply(self, model): + # Create directory for code-gen and HLS of instrumentation IP + wrapper_output_dir = make_build_dir(prefix="code_gen_ipgen_Instrumentation_") + model.set_metadata_prop("instrumentation_ipgen", wrapper_output_dir) + + # conservative max for pending feature maps: number of layers + pending = len(model.graph.node) + # query the parallelism-dependent folded input shape from the + # node consuming the graph input + inp_name = model.graph.input[0].name + inp_node = getCustomOp(model.find_consumer(inp_name)) + inp_shape_folded = list(inp_node.get_folded_input_shape()) + inp_stream_width = inp_node.get_instream_width_padded() + # number of beats per input is given by product of folded input + # shape except the last dim (which is the stream width) + ilen = np.prod(inp_shape_folded[:-1]) + ti = "ap_uint<%d>" % inp_stream_width + # perform the same for the output + out_name = model.graph.output[0].name + out_node = getCustomOp(model.find_producer(out_name)) + out_shape_folded = list(out_node.get_folded_output_shape()) + out_stream_width = out_node.get_outstream_width_padded() + olen = np.prod(out_shape_folded[:-1]) + to = "ap_uint<%d>" % out_stream_width + ko = out_shape_folded[-1] + # fill out instrumentation wrapper template + with open( + os.path.join(os.environ["FINN_ROOT"], "custom_hls", "instrumentation.template.cpp"), "r" + ) as f: + instrwrp_cpp = f.read() + instrwrp_cpp = instrwrp_cpp.replace("@PENDING@", str(pending)) + instrwrp_cpp = instrwrp_cpp.replace("@ILEN@", str(ilen)) + instrwrp_cpp = instrwrp_cpp.replace("@OLEN@", str(olen)) + instrwrp_cpp = instrwrp_cpp.replace("@TI@", str(ti)) + instrwrp_cpp = instrwrp_cpp.replace("@TO@", str(to)) + instrwrp_cpp = instrwrp_cpp.replace("@KO@", str(ko)) + with open(wrapper_output_dir + "/top_instrumentation_wrapper.cpp", "w") as f: + f.write(instrwrp_cpp) + # fill out HLS synthesis tcl template + prjname = 
"project_instrwrap" + ipgentcl = ipgentcl_template + ipgentcl = ipgentcl.replace("$PROJECTNAME$", prjname) + ipgentcl = ipgentcl.replace("$HWSRCDIR$", wrapper_output_dir) + ipgentcl = ipgentcl.replace("$TOPFXN$", "instrumentation_wrapper") + ipgentcl = ipgentcl.replace("$FPGAPART$", self.fpga_part) + ipgentcl = ipgentcl.replace("$CLKPERIOD$", str(self.clk_period_ns)) + ipgentcl = ipgentcl.replace("$DEFAULT_DIRECTIVES$", "") + if self.format == "xo": + # use Vitis RTL kernel (.xo) output instead of IP-XACT + ipgentcl = ipgentcl.replace("$EXTRA_DIRECTIVES$", "config_export -format xo") + ipgentcl = ipgentcl.replace( + "export_design -format ip_catalog", "export_design -format xo" + ) + else: + ipgentcl = ipgentcl.replace("$EXTRA_DIRECTIVES$", "") + with open(wrapper_output_dir + "/hls_syn.tcl", "w") as f: + f.write(ipgentcl) + # build bash script to launch HLS synth and call it + code_gen_dir = wrapper_output_dir + builder = CallHLS() + builder.append_tcl(code_gen_dir + "/hls_syn.tcl") + builder.set_ipgen_path(code_gen_dir + "/{}".format(prjname)) + builder.build(code_gen_dir) + ipgen_path = builder.ipgen_path + assert os.path.isdir(ipgen_path), "HLS IPGen failed: %s not found" % (ipgen_path) + ip_path = ipgen_path + "/sol1/impl/ip" + assert os.path.isdir(ip_path), "HLS IPGen failed: %s not found. 
Check log under %s" % ( + ip_path, + code_gen_dir, + ) + if self.format == "xo": + assert False, "Not implemented" + # TODO: export for use in VitisBuild or VersalBuild + # xo_dir = self.output_dir + "/xo" + # xo_dir = str(os.path.abspath(xo_dir)) + # os.makedirs(xo_dir, exist_ok=True) + # xo_path = code_gen_dir + "/{}/sol1/impl/export.xo".format(prjname) + # xo_instr_path = xo_dir + "/instrumentation_wrapper.xo" + # shutil.copy(xo_path, xo_instr_path) + else: + # shutil.move(ip_path, self.output_dir) + pass + + return (model, False) + + +class PrepareInstrumentationSim(Transformation): + def __init__(self, fpga_part): + super().__init__() + self.fpga_part = fpga_part + + def apply(self, model): + # Create directory for simulation of instrumentation IP + FINN IP + sim_output_dir = make_build_dir(prefix="sim_Instrumentation_") + model.set_metadata_prop("instrumentation_sim", sim_output_dir) + + # check if instrumentation IP was generated + instr_ip_dir = model.get_metadata_prop("instrumentation_ipgen") + if instr_ip_dir is None or (not os.path.isdir(instr_ip_dir)): + raise Exception( + "Instrumentation IP not generated, run GenerateInstrumentationIP first." 
+ ) + + # TODO: Support simulation with AXI-lite control interfaces (e.g., for dynamic pipelines) + # fill in testbench template + with open( + os.path.join(os.environ["FINN_ROOT"], "custom_hls", "instrumentation_tb.template.sv"), + "r", + ) as f: + testbench_sv = f.read() + with open(sim_output_dir + "/instrwrap_testbench.sv", "w") as f: + f.write(testbench_sv) + # fill in testbench project creator template + with open( + os.path.join(os.environ["FINN_ROOT"], "custom_hls", "instrumentation_sim.template.tcl"), + "r", + ) as f: + testbench_tcl = f.read() + + # collect ip repo paths for finn accelerator sub cores so Vivado can find them + ipstitch_path = model.get_metadata_prop("vivado_stitch_proj") + ip_dirs = ["list"] + ip_dirs += collect_ip_dirs(model, ipstitch_path) + ip_dirs += [instr_ip_dir] + ip_dirs_str = "[%s]" % (" ".join(ip_dirs)) + testbench_tcl = testbench_tcl.replace("@FPGA_PART@", self.fpga_part) + testbench_tcl = testbench_tcl.replace("@IP_DIRS_STR@", ip_dirs_str) + with open(sim_output_dir + "/make_instrwrap_sim_proj.tcl", "w") as f: + f.write(testbench_tcl) + + return (model, False) + + +class RunInstrumentationSim(Transformation): + def __init__(self): + super().__init__() + + def apply(self, model): + sim_output_dir = model.get_metadata_prop("instrumentation_sim") + if sim_output_dir is None or (not os.path.isdir(sim_output_dir)): + raise Exception( + "Instrumentation sim not prepared, run PrepareInstrumentationSim first." 
+ ) + + # Prepare bash script + bash_script = os.getcwd() + "/report_power.sh" + with open(bash_script, "w") as script: + script.write("#!/bin/bash\n") + script.write("cd %s\n" % (sim_output_dir)) + script.write("vivado -mode batch -source make_instrwrap_sim_proj.tcl\n") + + # Run script + print("Running Vivado simulation of instrumentation wrapper") + sub_proc = subprocess.Popen(["bash", bash_script]) + sub_proc.communicate() + + return (model, False) diff --git a/src/finn/transformation/fpgadataflow/make_zynq_proj.py b/src/finn/transformation/fpgadataflow/make_zynq_proj.py index 63ce2d3cbf..8192c09bae 100644 --- a/src/finn/transformation/fpgadataflow/make_zynq_proj.py +++ b/src/finn/transformation/fpgadataflow/make_zynq_proj.py @@ -45,6 +45,7 @@ from finn.transformation.fpgadataflow.insert_dwc import InsertDWC from finn.transformation.fpgadataflow.insert_fifo import InsertFIFO from finn.transformation.fpgadataflow.insert_iodma import InsertIODMA +from finn.transformation.fpgadataflow.instrumentation import GenerateInstrumentationIP from finn.transformation.fpgadataflow.prepare_ip import PrepareIP from finn.transformation.fpgadataflow.specialize_layers import SpecializeLayers from finn.util.basic import make_build_dir, pynq_native_port_width, pynq_part_map @@ -102,6 +103,42 @@ def apply(self, model): axilite_idx = 0 global_clk_ns = 0 instance_names = {} + + # instantiate instrumentation IP if it was generated + instr_ip_dir = model.get_metadata_prop("instrumentation_ipgen") + if instr_ip_dir is not None and os.path.isdir(instr_ip_dir): + use_instrumentation = True + # update IP repository + config.append( + "set_property ip_repo_paths " + "[concat [get_property ip_repo_paths [current_project]] [list %s]] " + "[current_project]" % instr_ip_dir + ) + config.append("update_ip_catalog -rebuild -scan_changes") + # create instance + config.append( + "create_bd_cell -type ip -vlnv %s %s" + % ("xilinx.com:hls:instrumentation_wrapper:1.0", "instrumentation_wrap_0") + ) + # 
connect clock % reset + config.append( + "connect_bd_net [get_bd_pins instrumentation_wrap_0/ap_clk] " + "[get_bd_pins smartconnect_0/aclk]" + ) + config.append( + "connect_bd_net [get_bd_pins instrumentation_wrap_0/ap_rst_n] " + "[get_bd_pins smartconnect_0/aresetn]" + ) + # connect AXI-lite control interface + config.append( + "connect_bd_intf_net [get_bd_intf_pins instrumentation_wrap_0/s_axi_ctrl] " + "[get_bd_intf_pins axi_interconnect_0/M%02d_AXI]" % (axilite_idx) + ) + config.append("assign_axi_addr_proc instrumentation_wrap_0/s_axi_ctrl") + axilite_idx += 1 + else: + use_instrumentation = False + for node in model.graph.node: assert node.op_type == "StreamingDataflowPartition", "Invalid link graph" sdp_node = getCustomOp(node) @@ -150,7 +187,8 @@ def apply(self, model): # define kernel instances # name kernels connected to graph inputs as idmaxx # name kernels connected to graph outputs as odmaxx - if (producer is None) or (consumer == []): + # do not expect IDMA/ODMA when instrumentation is enabled + if not use_instrumentation and ((producer is None) or (consumer == [])): # TODO not a good way of checking for external inp&out # should look at the list of top-level in/out instead if producer is None: @@ -228,6 +266,26 @@ def apply(self, model): ) ) + # connect first/last dataflow partition to instrumentation wrapper + if use_instrumentation: + if producer is None: + config.append( + "connect_bd_intf_net [get_bd_intf_pins %s/s_axis_0] " + "[get_bd_intf_pins instrumentation_wrap_0/finnix]" + % (instance_names[node.name]) + ) + if consumer == []: + config.append( + "connect_bd_intf_net [get_bd_intf_pins %s/m_axis_0] " + "[get_bd_intf_pins instrumentation_wrap_0/finnox]" + % (instance_names[node.name]) + ) + + # TODO: WORKAROUND, do not instantiate smartconnect when not needed! 
+ if use_instrumentation: + config.append("delete_bd_objs [get_bd_cells smartconnect_0]") + aximm_idx = 1 + # create a temporary folder for the project vivado_pynq_proj_dir = make_build_dir(prefix="vivado_zynq_proj_") model.set_metadata_prop("vivado_pynq_proj", vivado_pynq_proj_dir) @@ -305,6 +363,7 @@ def __init__( platform, period_ns, enable_debug=False, + enable_instrumentation=False, partition_model_dir=None, ): super().__init__() @@ -313,19 +372,27 @@ def __init__( self.period_ns = period_ns self.platform = platform self.enable_debug = enable_debug + self.enable_instrumentation = enable_instrumentation self.partition_model_dir = partition_model_dir def apply(self, model): # first infer layouts model = model.transform(InferDataLayouts()) # prepare at global level, then break up into kernels - prep_transforms = [ - InsertIODMA(self.axi_port_width), - InsertDWC(), - SpecializeLayers(self.fpga_part), - Floorplan(), - CreateDataflowPartition(partition_model_dir=self.partition_model_dir), - ] + if self.enable_instrumentation: + prep_transforms = [ + GenerateInstrumentationIP(self.fpga_part, self.period_ns), + Floorplan(), + CreateDataflowPartition(partition_model_dir=self.partition_model_dir), + ] + else: + prep_transforms = [ + InsertIODMA(self.axi_port_width), + InsertDWC(), + SpecializeLayers(self.fpga_part), + Floorplan(), + CreateDataflowPartition(partition_model_dir=self.partition_model_dir), + ] for trn in prep_transforms: model = model.transform(trn) model = model.transform(GiveUniqueNodeNames()) @@ -337,7 +404,10 @@ def apply(self, model): sdp_node = getCustomOp(sdp_node) dataflow_model_filename = sdp_node.get_nodeattr("model") kernel_model = ModelWrapper(dataflow_model_filename) - kernel_model = kernel_model.transform(InsertFIFO()) + # InsertFIFO at this stage interferes with tLastMarker + # TODO: is this really needed here at all? 
+ if not self.enable_instrumentation: + kernel_model = kernel_model.transform(InsertFIFO()) kernel_model = kernel_model.transform(SpecializeLayers(self.fpga_part)) kernel_model = kernel_model.transform(GiveUniqueNodeNames(prefix)) kernel_model.save(dataflow_model_filename) From 419e18f65d67e3b8f498a9f4620123f1170582bf Mon Sep 17 00:00:00 2001 From: Felix Jentzsch Date: Wed, 19 Feb 2025 16:10:48 +0000 Subject: [PATCH 035/125] Nest AXI interconnects if required --- .../fpgadataflow/make_zynq_proj.py | 94 +++++++++++++++++-- 1 file changed, 87 insertions(+), 7 deletions(-) diff --git a/src/finn/transformation/fpgadataflow/make_zynq_proj.py b/src/finn/transformation/fpgadataflow/make_zynq_proj.py index 8192c09bae..5e86a58b6e 100644 --- a/src/finn/transformation/fpgadataflow/make_zynq_proj.py +++ b/src/finn/transformation/fpgadataflow/make_zynq_proj.py @@ -27,6 +27,7 @@ # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+import math import os import subprocess from qonnx.core.modelwrapper import ModelWrapper @@ -100,6 +101,9 @@ def apply(self, model): idma_idx = 0 odma_idx = 0 aximm_idx = 0 + nested_interconnect_count = 0 + master_axilite_idx = 0 + axilite_interconnect_idx = 0 axilite_idx = 0 global_clk_ns = 0 instance_names = {} @@ -132,13 +136,62 @@ def apply(self, model): # connect AXI-lite control interface config.append( "connect_bd_intf_net [get_bd_intf_pins instrumentation_wrap_0/s_axi_ctrl] " - "[get_bd_intf_pins axi_interconnect_0/M%02d_AXI]" % (axilite_idx) + "[get_bd_intf_pins axi_interconnect_0/M%02d_AXI]" % (master_axilite_idx) ) config.append("assign_axi_addr_proc instrumentation_wrap_0/s_axi_ctrl") - axilite_idx += 1 + master_axilite_idx += 1 else: use_instrumentation = False + # instantiate nested AXI interconnects if required + # only the nested interconnects and all interfaces connected before this line + # will be connected to the original (master) interconnect + total_axilite_count = 0 + for node in model.graph.node: + sdp_node = getCustomOp(node) + dataflow_model_filename = sdp_node.get_nodeattr("model") + kernel_model = ModelWrapper(dataflow_model_filename) + ifnames = eval(kernel_model.get_metadata_prop("vivado_stitch_ifnames")) + total_axilite_count += len(ifnames["axilite"]) + if total_axilite_count > (64 - master_axilite_idx): + nested_interconnect_count = math.ceil(total_axilite_count / 64.0) + for i in range(1, nested_interconnect_count + 1): + # create instance + config.append( + "create_bd_cell -type ip -vlnv $interconnect_vlnv axi_interconnect_%d" % (i) + ) + # configure instance + config.append( + "set_property -dict [list CONFIG.NUM_MI %d] [get_bd_cells axi_interconnect_%d]" + % (max(64, total_axilite_count), i) + ) + # connect to master interconnect + config.append( + "connect_bd_intf_net [get_bd_intf_pins axi_interconnect_0/M%02d_AXI] -boundary_type upper [get_bd_intf_pins axi_interconnect_%d/S00_AXI]" + % (master_axilite_idx, i) + ) + # connect 
clocks TODO: suppport zynq_7000 + config.append( + "apply_bd_automation -rule xilinx.com:bd_rule:clkrst -config { Clk {/zynq_ps/pl_clk0} Freq {} Ref_Clk0 {} Ref_Clk1 {} Ref_Clk2 {}} [get_bd_pins axi_interconnect_%d/ACLK]" + % (i) + ) + config.append( + "apply_bd_automation -rule xilinx.com:bd_rule:clkrst -config { Clk {/zynq_ps/pl_clk0} Freq {} Ref_Clk0 {} Ref_Clk1 {} Ref_Clk2 {}} [get_bd_pins axi_interconnect_%d/S00_ACLK]" + % (i) + ) + # connect reset + config.append( + "connect_bd_net [get_bd_pins axi_interconnect_%d/ARESETN] [get_bd_pins axi_interconnect_0/ARESETN]" + % (i) + ) + master_axilite_idx += 1 + total_axilite_count = min(0, total_axilite_count - 64) + + assert total_axilite_count == 0, "Not all AXI-lite interfaces connected!" + + # start populating the first nested interconnect + axilite_interconnect_idx = 1 + for node in model.graph.node: assert node.op_type == "StreamingDataflowPartition", "Invalid link graph" sdp_node = getCustomOp(node) @@ -211,8 +264,13 @@ def apply(self, model): assert axilite_intf_name is not None config.append( "connect_bd_intf_net [get_bd_intf_pins %s/%s] " - "[get_bd_intf_pins axi_interconnect_0/M%02d_AXI]" - % (instance_names[node.name], axilite_intf_name, axilite_idx) + "[get_bd_intf_pins axi_interconnect_%d/M%02d_AXI]" + % ( + instance_names[node.name], + axilite_intf_name, + axilite_interconnect_idx, + axilite_idx, + ) ) # assign_bd_address with appropriate range/offset config.append( @@ -221,6 +279,11 @@ def apply(self, model): aximm_idx += 1 axilite_idx += 1 + if axilite_idx == 64: + axilite_interconnect_idx += 1 + axilite_idx = 0 + if axilite_interconnect_idx == 0: + master_axilite_idx += 1 else: instance_names[node.name] = node.name config.append( @@ -230,8 +293,13 @@ def apply(self, model): for axilite_intf_name in ifnames["axilite"]: config.append( "connect_bd_intf_net [get_bd_intf_pins %s/%s] " - "[get_bd_intf_pins axi_interconnect_0/M%02d_AXI]" - % (instance_names[node.name], axilite_intf_name, axilite_idx) + 
"[get_bd_intf_pins axi_interconnect_%d/M%02d_AXI]" + % ( + instance_names[node.name], + axilite_intf_name, + axilite_interconnect_idx, + axilite_idx, + ) ) # assign_bd_address with appropriate range/offset config.append( @@ -239,6 +307,11 @@ def apply(self, model): % (instance_names[node.name], axilite_intf_name) ) axilite_idx += 1 + if axilite_idx == 64: + axilite_interconnect_idx += 1 + axilite_idx = 0 + if axilite_interconnect_idx == 0: + master_axilite_idx += 1 sdp_node.set_nodeattr("instance_name", instance_names[node.name]) config.append( @@ -286,6 +359,13 @@ def apply(self, model): config.append("delete_bd_objs [get_bd_cells smartconnect_0]") aximm_idx = 1 + # finalize nested interconnect clock TODO: support zynq_7000 + for i in range(1, nested_interconnect_count + 1): + config.append( + "apply_bd_automation -rule xilinx.com:bd_rule:clkrst -config { Clk {/zynq_ps/pl_clk0} } [get_bd_pins axi_interconnect_%d/M*_ACLK]" + % (i) + ) + # create a temporary folder for the project vivado_pynq_proj_dir = make_build_dir(prefix="vivado_zynq_proj_") model.set_metadata_prop("vivado_pynq_proj", vivado_pynq_proj_dir) @@ -300,7 +380,7 @@ def apply(self, model): templates.custom_zynq_shell_template % ( fclk_mhz, - axilite_idx, + master_axilite_idx, aximm_idx, self.platform, pynq_part_map[self.platform], From 5628ab2a1a2505ad4014626e885ddc11c8e59238 Mon Sep 17 00:00:00 2001 From: Felix Jentzsch Date: Wed, 19 Feb 2025 16:25:07 +0000 Subject: [PATCH 036/125] Fix AXI interconnect connection --- src/finn/transformation/fpgadataflow/make_zynq_proj.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/finn/transformation/fpgadataflow/make_zynq_proj.py b/src/finn/transformation/fpgadataflow/make_zynq_proj.py index 5e86a58b6e..8c990a8b3d 100644 --- a/src/finn/transformation/fpgadataflow/make_zynq_proj.py +++ b/src/finn/transformation/fpgadataflow/make_zynq_proj.py @@ -191,6 +191,8 @@ def apply(self, model): # start populating the first nested interconnect 
axilite_interconnect_idx = 1 + else: + axilite_idx = master_axilite_idx for node in model.graph.node: assert node.op_type == "StreamingDataflowPartition", "Invalid link graph" From 0c57d1b373527337f80ede1714a739cb83771bad Mon Sep 17 00:00:00 2001 From: Felix Jentzsch Date: Wed, 19 Feb 2025 22:19:16 +0000 Subject: [PATCH 037/125] Make floorplan partitioning of AXI-lite interfaces more consistent --- .../transformation/fpgadataflow/floorplan.py | 39 ++++++++++++------- .../fpgadataflow/make_zynq_proj.py | 4 +- 2 files changed, 28 insertions(+), 15 deletions(-) diff --git a/src/finn/transformation/fpgadataflow/floorplan.py b/src/finn/transformation/fpgadataflow/floorplan.py index 7d93ff88fc..0b806ff44a 100644 --- a/src/finn/transformation/fpgadataflow/floorplan.py +++ b/src/finn/transformation/fpgadataflow/floorplan.py @@ -134,25 +134,27 @@ def apply(self, model): ) non_dma_nodes = list(filter(lambda x: x not in dyn_tlastmarker_nodes, non_dma_nodes)) + # assign every DMA node to its own partition for node in dma_nodes: node_inst = getCustomOp(node) node_inst.set_nodeattr("partition_id", partition_cnt) partition_cnt += 1 + # assign every dynamic tLastMarker node to its own partition for node in dyn_tlastmarker_nodes: node_inst = getCustomOp(node) node_inst.set_nodeattr("partition_id", partition_cnt) partition_cnt += 1 + # handle remaining nodes for node in non_dma_nodes: pre_node = model.find_producer(node.input[0]) node_inst = getCustomOp(node) if pre_node not in non_dma_nodes: - # input node + # input node -> start new partition node_inst.set_nodeattr("partition_id", partition_cnt) partition_cnt += 1 continue - elif not ( node.op_type.startswith("MVAU") and node_inst.get_nodeattr("mem_mode") is not None @@ -160,25 +162,36 @@ def apply(self, model): ): pre_nodes = model.find_direct_predecessors(node) else: + # exception for external weight MVAU: only consider primary input + # TODO: (why) is this necessary? should we consider such exceptions for other cases? 
pre_nodes = [pre_node] + axilite_intf_name = node_inst.get_verilog_top_module_intf_names()["axilite"] + if len(axilite_intf_name) != 0: + # This node has an AXI-Lite interface -> start new partition + node_inst.set_nodeattr("partition_id", partition_cnt) + partition_cnt += 1 + continue + + # examine all predecessor nodes to determine partition id for this node node_slr = node_inst.get_nodeattr("slr") + slr_mismatch_count = 0 for pre_node in pre_nodes: pre_inst = getCustomOp(pre_node) pre_slr = pre_inst.get_nodeattr("slr") if node_slr == pre_slr: - axilite_intf_name = pre_inst.get_verilog_top_module_intf_names()["axilite"] - if len(axilite_intf_name) != 0: - node_inst.set_nodeattr("partition_id", partition_cnt) - partition_cnt += 1 - else: - partition_id = pre_inst.get_nodeattr("partition_id") - node_inst.set_nodeattr("partition_id", partition_id) - + # Default case -> assign to same partition as predecessor + partition_id = pre_inst.get_nodeattr("partition_id") + node_inst.set_nodeattr("partition_id", partition_id) + break else: - # no matching, new partition - node_inst.set_nodeattr("partition_id", partition_cnt) - partition_cnt += 1 + # SLR mismatch with predecessor, can't assign same partition + slr_mismatch_count += 1 + + if slr_mismatch_count == len(pre_nodes): + # SLR mismatch with ALL predecessors -> start new partition + node_inst.set_nodeattr("partition_id", partition_cnt) + partition_cnt += 1 # save the updated floorplan floorplan = model.analysis(floorplan_params) diff --git a/src/finn/transformation/fpgadataflow/make_zynq_proj.py b/src/finn/transformation/fpgadataflow/make_zynq_proj.py index 8c990a8b3d..4d2ee3d50e 100644 --- a/src/finn/transformation/fpgadataflow/make_zynq_proj.py +++ b/src/finn/transformation/fpgadataflow/make_zynq_proj.py @@ -163,7 +163,7 @@ def apply(self, model): # configure instance config.append( "set_property -dict [list CONFIG.NUM_MI %d] [get_bd_cells axi_interconnect_%d]" - % (max(64, total_axilite_count), i) + % (min(64, 
total_axilite_count), i) ) # connect to master interconnect config.append( @@ -185,7 +185,7 @@ def apply(self, model): % (i) ) master_axilite_idx += 1 - total_axilite_count = min(0, total_axilite_count - 64) + total_axilite_count = max(0, total_axilite_count - 64) assert total_axilite_count == 0, "Not all AXI-lite interfaces connected!" From 684459c76189c22b9aa004a7c0028ee1c77a5a0d Mon Sep 17 00:00:00 2001 From: Felix Jentzsch Date: Wed, 19 Feb 2025 22:56:06 +0000 Subject: [PATCH 038/125] Add GPIO IP for reset --- .../transformation/fpgadataflow/make_zynq_proj.py | 14 +++++++++++--- src/finn/transformation/fpgadataflow/templates.py | 11 +++++++++++ 2 files changed, 22 insertions(+), 3 deletions(-) diff --git a/src/finn/transformation/fpgadataflow/make_zynq_proj.py b/src/finn/transformation/fpgadataflow/make_zynq_proj.py index 4d2ee3d50e..456441bca8 100644 --- a/src/finn/transformation/fpgadataflow/make_zynq_proj.py +++ b/src/finn/transformation/fpgadataflow/make_zynq_proj.py @@ -94,6 +94,7 @@ def __init__(self, platform, enable_debug=False): super().__init__() self.platform = platform self.enable_debug = 1 if enable_debug else 0 + self.enable_gpio_reset = 0 def apply(self, model): # create a config file and empty list of xo files @@ -112,6 +113,12 @@ def apply(self, model): instr_ip_dir = model.get_metadata_prop("instrumentation_ipgen") if instr_ip_dir is not None and os.path.isdir(instr_ip_dir): use_instrumentation = True + + # instantiate GPIO IP to trigger reset + self.enable_gpio_reset = 1 + # in the template this will connect to first port of interconnect_0 + master_axilite_idx += 1 + # update IP repository config.append( "set_property ip_repo_paths " @@ -170,7 +177,7 @@ def apply(self, model): "connect_bd_intf_net [get_bd_intf_pins axi_interconnect_0/M%02d_AXI] -boundary_type upper [get_bd_intf_pins axi_interconnect_%d/S00_AXI]" % (master_axilite_idx, i) ) - # connect clocks TODO: suppport zynq_7000 + # connect clocks/reset TODO: suppport zynq_7000 
config.append( "apply_bd_automation -rule xilinx.com:bd_rule:clkrst -config { Clk {/zynq_ps/pl_clk0} Freq {} Ref_Clk0 {} Ref_Clk1 {} Ref_Clk2 {}} [get_bd_pins axi_interconnect_%d/ACLK]" % (i) @@ -179,7 +186,7 @@ def apply(self, model): "apply_bd_automation -rule xilinx.com:bd_rule:clkrst -config { Clk {/zynq_ps/pl_clk0} Freq {} Ref_Clk0 {} Ref_Clk1 {} Ref_Clk2 {}} [get_bd_pins axi_interconnect_%d/S00_ACLK]" % (i) ) - # connect reset + # connect reset TODO: probably unneeded config.append( "connect_bd_net [get_bd_pins axi_interconnect_%d/ARESETN] [get_bd_pins axi_interconnect_0/ARESETN]" % (i) @@ -361,7 +368,7 @@ def apply(self, model): config.append("delete_bd_objs [get_bd_cells smartconnect_0]") aximm_idx = 1 - # finalize nested interconnect clock TODO: support zynq_7000 + # finalize nested interconnect clock/reset TODO: support zynq_7000 for i in range(1, nested_interconnect_count + 1): config.append( "apply_bd_automation -rule xilinx.com:bd_rule:clkrst -config { Clk {/zynq_ps/pl_clk0} } [get_bd_pins axi_interconnect_%d/M*_ACLK]" @@ -388,6 +395,7 @@ def apply(self, model): pynq_part_map[self.platform], config, self.enable_debug, + self.enable_gpio_reset, ) ) diff --git a/src/finn/transformation/fpgadataflow/templates.py b/src/finn/transformation/fpgadataflow/templates.py index ccf4e7a943..0f6ba7c3c4 100644 --- a/src/finn/transformation/fpgadataflow/templates.py +++ b/src/finn/transformation/fpgadataflow/templates.py @@ -218,6 +218,17 @@ ] } +# set up GPIO to trigger reset +if {%d == 1} { + create_bd_cell -type ip -vlnv xilinx.com:ip:axi_gpio:2.0 axi_gpio_0 + set_property -dict [list CONFIG.C_ALL_OUTPUTS {1} CONFIG.C_DOUT_DEFAULT {0x00000001} CONFIG.C_GPIO_WIDTH {1}] [get_bd_cells axi_gpio_0] + connect_bd_intf_net [get_bd_intf_pins axi_gpio_0/S_AXI] -boundary_type upper [get_bd_intf_pins axi_interconnect_0/M00_AXI] + assign_axi_addr_proc axi_gpio_0/S_AXI + connect_bd_net [get_bd_pins axi_gpio_0/s_axi_aclk] [get_bd_pins axi_interconnect_0/ACLK] + connect_bd_net 
[get_bd_pins axi_gpio_0/s_axi_aresetn] [get_bd_pins axi_interconnect_0/ARESETN] + connect_bd_net [get_bd_pins axi_gpio_0/gpio_io_o] [get_bd_pins rst_zynq_ps_*/aux_reset_in] +} + #finalize clock and reset connections for interconnects if {$ZYNQ_TYPE == "zynq_us+"} { apply_bd_automation -rule xilinx.com:bd_rule:clkrst -config { Clk {/zynq_ps/pl_clk0} } [get_bd_pins axi_interconnect_0/M*_ACLK] From 8d454886c16f7495106d4ec477c54f5ba99bcb3d Mon Sep 17 00:00:00 2001 From: Felix Jentzsch Date: Thu, 20 Feb 2025 07:55:52 +0000 Subject: [PATCH 039/125] Remove unneeded connect_bd_net --- src/finn/transformation/fpgadataflow/make_zynq_proj.py | 5 ----- 1 file changed, 5 deletions(-) diff --git a/src/finn/transformation/fpgadataflow/make_zynq_proj.py b/src/finn/transformation/fpgadataflow/make_zynq_proj.py index 456441bca8..d462dc9d6b 100644 --- a/src/finn/transformation/fpgadataflow/make_zynq_proj.py +++ b/src/finn/transformation/fpgadataflow/make_zynq_proj.py @@ -186,11 +186,6 @@ def apply(self, model): "apply_bd_automation -rule xilinx.com:bd_rule:clkrst -config { Clk {/zynq_ps/pl_clk0} Freq {} Ref_Clk0 {} Ref_Clk1 {} Ref_Clk2 {}} [get_bd_pins axi_interconnect_%d/S00_ACLK]" % (i) ) - # connect reset TODO: probably unneeded - config.append( - "connect_bd_net [get_bd_pins axi_interconnect_%d/ARESETN] [get_bd_pins axi_interconnect_0/ARESETN]" - % (i) - ) master_axilite_idx += 1 total_axilite_count = max(0, total_axilite_count - 64) From 960a7f46a48519d4d63183a4de234bd0b12857bf Mon Sep 17 00:00:00 2001 From: Felix Jentzsch Date: Thu, 20 Feb 2025 18:01:02 +0000 Subject: [PATCH 040/125] Fix redundant bd_automation --- src/finn/transformation/fpgadataflow/make_zynq_proj.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/src/finn/transformation/fpgadataflow/make_zynq_proj.py b/src/finn/transformation/fpgadataflow/make_zynq_proj.py index d462dc9d6b..846d95a11b 100644 --- a/src/finn/transformation/fpgadataflow/make_zynq_proj.py +++ 
b/src/finn/transformation/fpgadataflow/make_zynq_proj.py @@ -182,10 +182,6 @@ def apply(self, model): "apply_bd_automation -rule xilinx.com:bd_rule:clkrst -config { Clk {/zynq_ps/pl_clk0} Freq {} Ref_Clk0 {} Ref_Clk1 {} Ref_Clk2 {}} [get_bd_pins axi_interconnect_%d/ACLK]" % (i) ) - config.append( - "apply_bd_automation -rule xilinx.com:bd_rule:clkrst -config { Clk {/zynq_ps/pl_clk0} Freq {} Ref_Clk0 {} Ref_Clk1 {} Ref_Clk2 {}} [get_bd_pins axi_interconnect_%d/S00_ACLK]" - % (i) - ) master_axilite_idx += 1 total_axilite_count = max(0, total_axilite_count - 64) From 76ef35d988611261142395633eb2eeb28886f9c8 Mon Sep 17 00:00:00 2001 From: Felix Jentzsch Date: Fri, 21 Feb 2025 11:12:12 +0000 Subject: [PATCH 041/125] Remove tcl.collectionResultDisplayLimit --- src/finn/transformation/fpgadataflow/templates.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/finn/transformation/fpgadataflow/templates.py b/src/finn/transformation/fpgadataflow/templates.py index 0f6ba7c3c4..d9040d83f2 100644 --- a/src/finn/transformation/fpgadataflow/templates.py +++ b/src/finn/transformation/fpgadataflow/templates.py @@ -100,6 +100,10 @@ set FPGA_PART %s create_project finn_zynq_link ./ -part $FPGA_PART +# Prevent limitation on number of elements for string representations of Vivado collections of objects +# Otherwise we might run into the default limit of 500 if we have many IP_REPO_PATHS +set_param tcl.collectionResultDisplayLimit 0 + # set board part repo paths to find PYNQ-Z1/Z2 set paths_prop [get_property BOARD_PART_REPO_PATHS [current_project]] set paths_param [get_param board.repoPaths] From 9c6c3cd8439ee162c3c5f153ec2123ea6591211a Mon Sep 17 00:00:00 2001 From: Felix Jentzsch Date: Sat, 22 Feb 2025 22:46:47 +0000 Subject: [PATCH 042/125] Add driver for iterative live FIFO-sizing --- driver/iterative_live_fifosizing_driver.ipynb | 833 ++++++++++++++++++ 1 file changed, 833 insertions(+) create mode 100644 driver/iterative_live_fifosizing_driver.ipynb diff --git 
a/driver/iterative_live_fifosizing_driver.ipynb b/driver/iterative_live_fifosizing_driver.ipynb new file mode 100644 index 0000000000..83a329d263 --- /dev/null +++ b/driver/iterative_live_fifosizing_driver.ipynb @@ -0,0 +1,833 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "id": "0ee21ecb", + "metadata": {}, + "outputs": [ + { + "data": { + "application/javascript": [ + "\n", + "try {\n", + "require(['notebook/js/codecell'], function(codecell) {\n", + " codecell.CodeCell.options_default.highlight_modes[\n", + " 'magic_text/x-csrc'] = {'reg':[/^%%microblaze/]};\n", + " Jupyter.notebook.events.one('kernel_ready.Kernel', function(){\n", + " Jupyter.notebook.get_cells().map(function(cell){\n", + " if (cell.cell_type == 'code'){ cell.auto_highlight(); } }) ;\n", + " });\n", + "});\n", + "} catch (e) {};\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "application/javascript": [ + "\n", + "try {\n", + "require(['notebook/js/codecell'], function(codecell) {\n", + " codecell.CodeCell.options_default.highlight_modes[\n", + " 'magic_text/x-csrc'] = {'reg':[/^%%pybind11/]};\n", + " Jupyter.notebook.events.one('kernel_ready.Kernel', function(){\n", + " Jupyter.notebook.get_cells().map(function(cell){\n", + " if (cell.cell_type == 'code'){ cell.auto_highlight(); } }) ;\n", + " });\n", + "});\n", + "} catch (e) {};\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "import time\n", + "import json\n", + "import matplotlib as mpl\n", + "import matplotlib.pyplot as plt\n", + "from IPython.display import clear_output\n", + "import numpy as np\n", + "from pynq import Overlay\n", + "\n", + "path = \"bitstreams/resnet50/live_instrumentation\"\n", + "bitstream = path + \"/finn-accel.bit\"\n", + "\n", + "# Program FPGA\n", + "ol = Overlay(bitstream, download=True, device=None)" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "f476fd87", + "metadata": {}, + "outputs": [ + 
{ + "name": "stdout", + "output_type": "stream", + "text": [ + "#FIFO IP detected: 266\n", + "#FIFO width information found: 266\n" + ] + } + ], + "source": [ + "### Sanity checks\n", + "# We expect 3 AXI-Lite peripherals next to the virtual FIFOs: instrumentation_wrap_0, axi_gpio_0 (for reset), zynq_ps\n", + "# We don't expect any additional FINN SDPs with AXI-Lite interface, such as runtime-writable weights\n", + "print(\"#FIFO IP detected: %d\" % (len(ol.ip_dict.keys()) - 3))\n", + "\n", + "# We expect a fifo_widths.json file exported by FINN listing the width of each FIFO, e.g.,\n", + "# {'fifo_widths': {'StreamingFIFO_hls_0': 8, 'StreamingFIFO_hls_1': 32, 'StreamingFIFO_hls_2': 24}}\n", + "with open(path + \"/fifo_widths.json\", \"r\") as f:\n", + " fifo_info = json.load(f)\n", + "print(\"#FIFO width information found: %d\" % len(fifo_info[\"fifo_widths\"]))" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "e419656f", + "metadata": {}, + "outputs": [], + "source": [ + "### Instrumentation driver\n", + "# Register map\n", + "#ap_uint<32> cfg, \t// [0] - 0:hold, 1:lfsr; [31:16] - LFSR seed\n", + "#ap_uint<32> &status,\t// [0] - timestamp overflow; [1] - timestamp underflow\n", + "#ap_uint<32> &latency,\n", + "#ap_uint<32> &interval,\n", + "#ap_uint<32> &checksum,\n", + "#ap_uint<32> &min_latency\n", + "\n", + "def read_register(ol, name):\n", + " return ol.instrumentation_wrap_0.read(offset=ol.ip_dict[\"instrumentation_wrap_0\"][\"registers\"][name][\"address_offset\"])\n", + "\n", + "def write_register(ol, name, value):\n", + " return ol.instrumentation_wrap_0.write(offset=ol.ip_dict[\"instrumentation_wrap_0\"][\"registers\"][name][\"address_offset\"], value=value)\n", + "\n", + "def observe_instrumentation(debug_print=True):\n", + " status_reg = read_register(ol, \"status\")\n", + " chksum_reg = read_register(ol, \"checksum\")\n", + " min_latency = read_register(ol, \"min_latency\")\n", + " latency = read_register(ol, \"latency\")\n", + " 
interval = read_register(ol, \"interval\")\n", + "\n", + " frame = (chksum_reg >> 24) & 0x000000ff\n", + " checksum = chksum_reg & 0x00ffffff\n", + " overflow_err = (status_reg & 0x00000001) != 0\n", + " underflow_err = (status_reg & 0x00000002) != 0\n", + "\n", + " if debug_print:\n", + " print(\"---INSTRUMENTATION_REPORT---\")\n", + " if overflow_err or underflow_err:\n", + " print(\"Status ERROR\")\n", + " print(\"Overflow error: %s\" % overflow_err)\n", + " print(\"Underflow error: %s\" % underflow_err)\n", + " else:\n", + " print(\"Status OK\")\n", + " print(\"Frame number (8-bit): %d\" % frame)\n", + " print(\"Checksum: 0x%06x\" % checksum)\n", + " print(\"Min Latency (cycles): %d\" % min_latency)\n", + " print(\"Latency (cycles): %d\" % latency)\n", + " print(\"Interval (cycles): %d\" % interval)\n", + " print(\"----------------------------\")\n", + "\n", + " return (overflow_err, underflow_err, frame, checksum, min_latency, latency, interval)\n", + "\n", + "def start_accelerator():\n", + " lfsr_seed = 0x00010000 # upper 16 bits\n", + " write_register(ol, \"cfg\", lfsr_seed + 1) # start operation\n", + "\n", + "### Virtual FIFO driver\n", + "# Register map\n", + "mode_offset = 0x10\n", + "depth_offset = 0x18\n", + "occupancy_offset = 0x20\n", + "occupancy_ctrl_offset = 0x24\n", + "max_occupancy_offset = 0x30\n", + "max_occupancy_ctrl_offset = 0x34\n", + "\n", + "def configure_fifo(ol, i, mode, depth = 2):\n", + " ip_name = \"StreamingDataflowPartition_%d\" % i\n", + " getattr(ol, ip_name).write(offset=mode_offset, value = mode)\n", + " getattr(ol, ip_name).write(offset=depth_offset, value = depth)\n", + "\n", + "def total_fifo_size(depths):\n", + " # Assuming FIFO SDP/AXI-Lite interfaces are ordered consistently with FIFO IDs\n", + " total_size_bits = 0\n", + " for i, depth in enumerate(depths):\n", + " total_size_bits += depth * fifo_info[\"fifo_widths\"][\"StreamingFIFO_hls_%d\" % i]\n", + " total_size_kB = total_size_bits / 8.0 / 1000.0\n", + " return 
total_size_kB\n", + "\n", + "### GPIO Reset Driver\n", + "def reset_accelerator():\n", + " ol.axi_gpio_0.write(offset=ol.ip_dict[\"axi_gpio_0\"][\"registers\"][\"GPIO_DATA\"][\"address_offset\"], value=0)" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "2e2a4b88", + "metadata": {}, + "outputs": [], + "source": [ + "### Iterative FIFO-sizing function\n", + "def size_iteratively(start_depth, iteration_runtime, reduction_factor = 0.5):\n", + " num_fifos = len(fifo_info[\"fifo_widths\"])\n", + " fifo_minimum_reached = [False] * num_fifos\n", + " \n", + " if isinstance(start_depth, list):\n", + " # Individual start depth for each FIFO has been supplied\n", + " fifo_depths = start_depth\n", + " else:\n", + " # Initialize all depths to the same start depth\n", + " fifo_depths = [start_depth] * num_fifos\n", + " \n", + " # Reset accelerator and configure FIFOs\n", + " reset_accelerator()\n", + " for i in range(0, num_fifos):\n", + " configure_fifo(ol, i, mode = 1, depth = fifo_depths[i])\n", + "\n", + " # Run once to determine target interval\n", + " start_accelerator()\n", + " time.sleep(1)\n", + " (overflow_err, underflow_err, frame, checksum, min_latency, latency, interval) = observe_instrumentation(False)\n", + " log_total_fifo_size = [int(total_fifo_size(fifo_depths))]\n", + " log_interval = [interval]\n", + " log_min_latency = [min_latency]\n", + " log_latency = [latency]\n", + " target_interval = interval\n", + " \n", + " # Iteratively reduce FIFO depth until all FIFOs are minimized\n", + " iteration = 0\n", + " start_time = time.time()\n", + " while not all(fifo_minimum_reached):\n", + " for fifo_id in range(0, num_fifos):\n", + " if not fifo_minimum_reached[fifo_id]:\n", + " fifo_depth_before = fifo_depths[fifo_id]\n", + " fifo_depths[fifo_id] = int(fifo_depths[fifo_id] * reduction_factor)\n", + "\n", + " # Reset accelerator\n", + " reset_accelerator()\n", + "\n", + " # Configure all FIFOs\n", + " for i in range(0, num_fifos):\n", + " 
configure_fifo(ol, i, mode = 1, depth = fifo_depths[i])\n", + "\n", + " # Start accelerator\n", + " start_accelerator()\n", + "\n", + " # Let it run\n", + " time.sleep(iteration_runtime)\n", + "\n", + " # Check if throughput dropped or deadlock occured \n", + " (overflow_err, underflow_err, frame, checksum, min_latency, latency, interval) = observe_instrumentation(False)\n", + "\n", + " if interval > target_interval or interval == 0 or overflow_err or underflow_err:\n", + " # Revert depth reduction and mark FIFO as minimized\n", + " fifo_depths[fifo_id] = fifo_depth_before\n", + " fifo_minimum_reached[fifo_id] = True\n", + " else:\n", + " log_total_fifo_size.append(int(total_fifo_size(fifo_depths)))\n", + " log_interval.append(interval)\n", + " log_min_latency.append(min_latency)\n", + " log_latency.append(latency) \n", + "\n", + " if fifo_depths[fifo_id] == 1:\n", + " fifo_minimum_reached[fifo_id] = True\n", + "\n", + " # Report status\n", + " clear_output(wait=True)\n", + " print(\"Iteration: %d\" % iteration)\n", + " print(\"Reducing depth of FIFO: %d/%d\" % (fifo_id, num_fifos))\n", + " print(\"Numer of minimized FIFOs: %d/%d\" % (sum(fifo_minimum_reached), num_fifos))\n", + " print(\"Interval: %d\" % log_interval[-1])\n", + " print(\"Min. 
latency / latency: %d/%d\" % (log_min_latency[-1], log_latency[-1]))\n", + " print(\"Total FIFO Size (kB): %d\" % log_total_fifo_size[-1])\n", + "\n", + " iteration += 1\n", + "\n", + " end_time = time.time()\n", + " print(\"Done (%d seconds)\" % int(end_time - start_time))\n", + " \n", + " return fifo_depths, log_total_fifo_size, log_interval, log_min_latency, log_latency" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "2ebb2aa3", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Testing start depth of 64\n", + "---INSTRUMENTATION_REPORT---\n", + "Status OK\n", + "Frame number (8-bit): 0\n", + "Checksum: 0x000000\n", + "Min Latency (cycles): 4294967295\n", + "Latency (cycles): 0\n", + "Interval (cycles): 0\n", + "----------------------------\n", + "Testing start depth of 128\n", + "---INSTRUMENTATION_REPORT---\n", + "Status OK\n", + "Frame number (8-bit): 0\n", + "Checksum: 0x000000\n", + "Min Latency (cycles): 4294967295\n", + "Latency (cycles): 0\n", + "Interval (cycles): 0\n", + "----------------------------\n", + "Testing start depth of 256\n", + "---INSTRUMENTATION_REPORT---\n", + "Status OK\n", + "Frame number (8-bit): 0\n", + "Checksum: 0x000000\n", + "Min Latency (cycles): 4294967295\n", + "Latency (cycles): 0\n", + "Interval (cycles): 0\n", + "----------------------------\n", + "Testing start depth of 512\n", + "---INSTRUMENTATION_REPORT---\n", + "Status OK\n", + "Frame number (8-bit): 0\n", + "Checksum: 0x000000\n", + "Min Latency (cycles): 4294967295\n", + "Latency (cycles): 0\n", + "Interval (cycles): 0\n", + "----------------------------\n", + "Testing start depth of 1024\n", + "---INSTRUMENTATION_REPORT---\n", + "Status OK\n", + "Frame number (8-bit): 0\n", + "Checksum: 0x000000\n", + "Min Latency (cycles): 4294967295\n", + "Latency (cycles): 0\n", + "Interval (cycles): 0\n", + "----------------------------\n", + "Testing start depth of 2048\n", + "---INSTRUMENTATION_REPORT---\n", 
+ "Status OK\n", + "Frame number (8-bit): 0\n", + "Checksum: 0x000000\n", + "Min Latency (cycles): 4294967295\n", + "Latency (cycles): 0\n", + "Interval (cycles): 0\n", + "----------------------------\n", + "Testing start depth of 4096\n", + "---INSTRUMENTATION_REPORT---\n", + "Status OK\n", + "Frame number (8-bit): 0\n", + "Checksum: 0x000000\n", + "Min Latency (cycles): 4294967295\n", + "Latency (cycles): 0\n", + "Interval (cycles): 0\n", + "----------------------------\n", + "Testing start depth of 8192\n", + "---INSTRUMENTATION_REPORT---\n", + "Status OK\n", + "Frame number (8-bit): 108\n", + "Checksum: 0x000000\n", + "Min Latency (cycles): 2548522\n", + "Latency (cycles): 5030984\n", + "Interval (cycles): 903174\n", + "----------------------------\n", + "Testing start depth of 16384\n", + "---INSTRUMENTATION_REPORT---\n", + "Status OK\n", + "Frame number (8-bit): 108\n", + "Checksum: 0x000000\n", + "Min Latency (cycles): 2548522\n", + "Latency (cycles): 7496520\n", + "Interval (cycles): 903174\n", + "----------------------------\n", + "Determined start depth for all FIFOs: 8192\n", + "Determined iteration runtime based on performance: 0.127426 s\n" + ] + } + ], + "source": [ + "### Attempt to determine start depth for all FIFOs automatically\n", + "# If it doesn't find a working setting, start depth must be set manually, potentially on per-FIFO basis\n", + "start_depth = 64\n", + "last_interval = 0\n", + "start_depth_found = False\n", + "\n", + "while not start_depth_found:\n", + " print(\"Testing start depth of %d\" % start_depth)\n", + " reset_accelerator()\n", + "\n", + " # Configure FIFOs\n", + " num_fifos = len(fifo_info[\"fifo_widths\"])\n", + " for i in range(0, num_fifos):\n", + " configure_fifo(ol, i, mode = 1, depth = start_depth)\n", + " \n", + " # Start accelerator and let it run for a long time\n", + " start_accelerator()\n", + " time.sleep(1)\n", + " \n", + " # Examine performance\n", + " (overflow_err, underflow_err, frame, checksum, 
min_latency, latency, interval) = observe_instrumentation()\n", + " if interval > 0 and interval == last_interval and not overflow_err and not underflow_err:\n", + " # Accelerator runs with stable interval, reset to previous start depth\n", + " start_depth_found = True\n", + " start_depth = last_start_depth\n", + " else:\n", + " # Start depth is still too small, increase for next try\n", + " last_start_depth = start_depth\n", + " start_depth = start_depth * 2\n", + " \n", + " last_interval = interval\n", + " \n", + "# Determine runtime per iteration based on performance, so that stable-state is guaranteed\n", + "# Use a simple overestimation for now to be safe\n", + "iteration_runtime = max(0.01, (min_latency * 5) * 10 / 1000 / 1000 / 1000)\n", + "\n", + "print(\"Determined start depth for all FIFOs: %d\" % start_depth)\n", + "print(\"Determined iteration runtime based on performance: %f s\" % iteration_runtime)" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "4ba40f96", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Iteration: 12\n", + "Reducing depth of FIFO: 265/266\n", + "Numer of minimized FIFOs: 266/266\n", + "Interval: 903174\n", + "Min. 
latency / latency: 2549314/2580777\n", + "Total FIFO Size (kB): 244\n", + "Done (389 seconds)\n" + ] + } + ], + "source": [ + "### First pass\n", + "(fifo_depths,\n", + " log_total_fifo_size,\n", + " log_interval,\n", + " log_min_latency,\n", + " log_latency) = size_iteratively(start_depth, iteration_runtime)" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "ebf027a4", + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "iVBORw0KGgoAAAANSUhEUgAAAdgAAAE3CAYAAAAJy1DOAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjUuMSwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy/YYfK9AAAACXBIWXMAAAxOAAAMTgF/d4wjAABNoElEQVR4nO3dd5wU5f3A8c+ze527oyPlhKHpDjZEUEFRMRjLGjTRoCZijMZIJImKbWPys0XjGiOaWGLFCnZAdAELitgQVCAis1KXKkXKHe3a7vz+mNljOa7M7u3eXvm+X6993e48U76znnzveeYpyjRNhBBCCJFcrnQHIIQQQrREkmCFEEKIFJAEK4QQQqSAJFghhBAiBSTBCiGEECkgCVYIIYRIAUmwQgghRApkpDsAIYQQLYPmC/wHGAX0Ao4K+b1LHByTDTwAnAmUAwtDfu+lKQ20kUgNVgghRLK8AZwMrInjGD8QAQ4L+b1HADelIrB0UDKTkxBCiGTSfIEQcG60Bqv5Av2Bh4AuQBbwRMjvfUzzBdoAG4CikN+7O03hpow0EQshhEgZzRdwA5OBMSG/N6j5AnnAPM0XmAdUAtuAv2m+wEhgH3BHyO+dnb6Ik0eaiIUQQqTS4cARwCuaL7AI+BwoAAYAmUAfYGnI7x0M/NHer3OaYk0qqcEKIYRIJQX8GPJ7B1Yv0HyBTljPXycBhPzexZovsBorIc9pxBhTQmqwQgghUul7YK/mC1wW3aD5Av00X6BDyO/9EZiN1YMYzRfoBfS2j2n2pJOTEEKIpNB8gUeB84CuwI/A7pDf28/u5PQg0BNwA1uBX4f83g2aL9AHmAh0BMLAnSG/d2pabiDJJMEKIYQQKSBNxEIIIUQKtKpOTkop0+VK8G8K07ReiR4vhBAiLpFIBNM0VbrjSFSrSrAul4twOJzQsfv+9z9Coy+i0zV/oPOf/5zkyIQQQlSnlErsH+wmQqpjDmX17AlA5dYf0xyJEEKI5kASrEOuNm0ACO/cmd5AhBBCNAuSYB1SmZlk9evLrg8+YNfsFjGLlxBCiBRqVcN03G63megzWIDydetYecZPyR8xgkP/+1gSIxMtkWmaVS8hxMGUUtTV8VQpFTZNs9n2FWq2gadD1qGHkqVp7J0/n/DOnbjbtUt3SKIJikQibNmyhZ07d0pyFaIemZmZ9OzZk6ysrHSHknSSYOPU7sIL2PKvB9j+wgvSm1jUaM2aNbhcLjRNIzMzM93hCNFkmabJtm3bWLt2Lf369Ut3OEknCTZO7UaPZsu/HqBy69Z0hyKaoEgkQmlpKf379ycjQ/73EqI+HTt2ZPv27UQikTqbi5ujlnU3jcCVlwdAxcYf0hyJaIqiTcJKNdux8UI0quj/Ky3xcYokWIciEZPSijCmy01G926Ur1mT7pCEEEI0YZJgHZrw/jI8/zeLVT/uIW/gQCrWr2fntGnpDksIRzRNo0uXLlRUVFRt+/DDD1FKceONNwIwffp0brrppnrPtXHjRkaMGJGyWBMxfvx4XnnlFQAWLFjAsGHDyMv
L48ILL3R0/LRp05g/f36t5StXrmTQoEEce+yxPPvss0mJOV433HADL7/8cq3lp512Gu+8805c57zjjjsoLy9vaGiiFvKQyKEMt9WMURmJ0OWWW9i74Cu2PvgQ7c4/P72BCeFQz549mT59OhdccAEAEydOZPDgwVXlo0aNYtSoUfWep3v37nz00UcpizNeGzZsYObMmTzwwAMAdOvWjYceeoiFCxfy/vvvOzrHtGnTGDx4MMcff3yN5W+88QZDhw7l0UcfPaissrKyUZ6333LLLQwfPpyLLrooac8q77zzTm688cZm34NX8wVCQKn9Arg35Pe+WsN+VwI+rMrlbOCakN9bmaq4JME6lOm2fqErwyaZPQ4hb8hgSmbMZO/CheQde2yaoxNN2aiXR7Fyx8qUnLtv+75Mv2S6o32vuOIKJk6cyAUXXEBxcTHz5s3jkksuYd++fQA899xzvPPOO7zxxhvMmTOH6667jmHDhvHZZ59RWVnJ888/z+DBgwmFQgwePJgff7SmDVVK8Y9//IOpU6fy448/8uSTTzJ79mxmzZpFeXk5r732GkcccQRz5szhxhtv5KuvvgJgyZIlnHvuuYRCoapzjh07lkAgwL59+3jppZd48sknmTdvHjk5OUybNo3u3bsfdF8TJ07kwgsvrHqWV1RURFFREUuXLj1o33nz5jFu3DjC4TCVlZWMGzeOXr16MX36dD744AOefvpp/vjHP/K73/2u6pgXXniBBx98kEgkwmeffcbkyZO55pprOOmkk5g3bx4A7777Ll6vl23btrFv3z4GDhzIU089RV5eHs899xyTJ0+mQ4cOLFq0iO7du/Pwww9z8803s3z5cgYNGsTkyZNxuVzs2rWL8ePHs3jxYkpLSxk2bBgPP/wwmZmZdOnShd69ezN79mzOOOMMx78jEyZM4OWXX6ayspLMzEwefvhhTjjhBMaOHQvAsGHDcLlcvPfee+Tm5tZ6/dNOO40TTjiBzz//nI0bN3LGGWfw+OOPA1BcXMwNN9zAl19+icvl4rjjjuOxxx5D0zQWLFjAoYceCsBf/vIXIpEI9913n+P443BhyO9dUluh5gv0Bv4OHAtsAd4CrgSeSEUwIE3EjmW4rP95K8IRANpfeikAJTNmpi0mIeJxyimnsGrVKjZs2MDLL7/ML3/5S9xud637f/fdd1xxxRUsXryYP/3pT/z1r3+tdd/CwkLmz5/Pfffdx3nnncfJJ5/MwoUL+c1vfsM999zjKL5t27YxdOhQFi5cyJVXXsnIkSO55ppr+N///sfgwYN55JFHajxuzpw5DBs2zNE17r33Xm644QYWLVrEkiVLuPjiiznnnHMYNWoUPp+PRYsWHZBcAS677DLGjh3LZZddxqJFixgwYAAAixYtYtasWcyePRu3283kyZP56quvWLJkCYWFhTz22P7JaBYsWMC//vUvgsEgeXl5/OpXv2Ly5MksXbqUpUuX8sEHHwBWM/App5zC/PnzWbx4MZWVlQfc97Bhw5gd50xyY8aMYcGCBSxcuJD//Oc/XHnllQBVyfHzzz9n0aJFdOnSpd7rr1y5kjlz5rBkyRLeffddvvjiCwCuu+46cnNzWbx4MYsXL+a+++4jJyeHK6+8kieesPJXWVkZzz77LH/4wx/iij+JLgSmhvzezSG/1wQeBy5J5QWlButQRrQGG7F6uuUcfjgAkV270haTaB6c1jAbw5gxY3j++eeZNm0akyZNYtKkSbXue/jhh1c1IQ8dOpR//etfte570UUXATBo0CBcLhderxeA4447jilTpjiKLT8/v+q4QYMGUVRUxMCBA6vOU1tz7/r16+natauja4wYMYK7776bFStWcPrpp3PyySc7Oq4mY8aMqRrnbJomDz74IIFAgMrKSoqLiznllFOq9j3ppJMoKioC4Nhjj0XTNNq2bQvAMcccw6pVqwCrqXrevHlVzd379u07oPm2a9euzJ07N644Fy5cyD333MO2bdvIyMh
g6dKllJeX19gsXN/1L774YtxuN7m5uQwcOJCVK1cydOhQ3nnnHb7++uuqpuvOnTsDcM0113DCCSdw22238corr3DCCSegaVo84buUUutjPk8wTXNCLftO0nwBF/Al8JeQ31t9LGVPILZ3asjeljKSYB3KdB9Yg1V5eZCZSdmKFekMS4i4XH755QwaNIjDDjuM/v3717lvTk5O1Xu3201lZe2PqqL7ut1usrOzazwuIyPjgOUiS0tLDzhH9eOcXj8vL6+qmbs+1113HaNGjWL27NnceuutHHnkkQfUNOORn59f9X7y5Ml8/PHHzJ07l4KCAv7zn/8ckAir30tt92aaJtOmTaNPnz41XrO0tJTc3FzHMZaXl3PBBRcwZ84cjjvuOEpKSmjbtm2tCba+68fzOwHQo0cPhg8fzhtvvMGjjz7quDUjRsQ0zSIH+50S8nvXar5AJnA38DxwTg37xY4FSvlYOmkidijDtf8ZLFjPnbL79aNi3Toi1f6hEKKp6t69O/fee2+qnoHVqXfv3qxevZpt27YB8OKLLyblvEcffTTBYNDRvt9//z19+vThqquu4tZbb616hlpYWEhxcXHCMezYsYOOHTtSUFDArl27eO655xI6z6hRo/D7/VWJa8eOHayI+SPeMAyOOeYYx+crLS2loqKi6hnoww8/fEB5QUHBAfdd3/Xrivv+++8nErEqIFtjJuK59tprueWWWygpKWHkyJGOY49HyO9da/+sAB4Chtew21pAi/ncy96WMpJgHYrtRRxVeOaZhIuL2XTHnekKS4i4/fa3v2Xo0KGNft0ePXpw4403MnjwYEaMGEG7JM3lfeGFFzJz5v6+ECtXrqSoqIjx48czY8YMioqKqmqpDz/8MEcccQTHHnssf/vb36qaQseMGcPkyZMZOHAgTz/9dNwxXHbZZezevZsBAwbwi1/8guHDa/r3vX4PPfQQGRkZDBw4kKOPPpqRI0cSCoUAq3Y5e/ZszjvvvFqPv/zyy6s6eRUVFfHdd99x1113cfzxx3PKKacc0EoA1jPf008/nYEDB7Jly5Y6r1+XBx98kL1793LkkUcycOBAbr311qqyE088kXbt2jFu3LiUTMCi+QJtNF+gXcymS4CFNez6JvBzzRc4RPMFFDAWeCXpAcWQ1XQcmrpwPde/upgnxhzHmUdYz3vMSITQL0dTtmwZhy9aiKqjw4hoHcLhMMuWLeOwww6rswORSJ5IJMKQIUN46623qp5ztkSzZs1i0qRJSav5N5Z169Zx/PHHs2zZMgoKCg4qr+v/GSer6Wi+QB+s5OnGavZdBVwb8ntDmi/wNDA95PdOt/e9CrgFq3L5IfAHu9abEvIM1qHqTcQAyuUi74QTKP3uO3a88godfv3rdIUnRKvlcrl44oknCIVCLTrBFhcXp6VpvyFuu+02Jk6ciN/vrzG5JkPI712FNfSmprLfVfv8FPBUSgKpgSRYhzJraCIG6HDZGLZPnCidnYRIo9gJM1qqaE/t5uSuu+7irrvuSncYaSPPYB2K1mArwgc2qbvbtwcgvG17o8ckhBCi6Up5Ddbw6NnAA8CZQDmwUA8alxoevQvwAtAXKAPG6kHjU/uYPOAZYAgQAXx60Jhil7mAf2N1wTaBCXrQSKyffRyqOjmFD6zBqqwsXHl5lK9NaWc00Uy05JVBhEiFlrwCVWM0EfuxkuRhetAwDY/eLWb7PD1onGV49CHAG4ZH76sHjUrgRqBMDxr9DI/eG/jC8Ogf6UFjB3ApMAA4DGgLfGN49A/1oOGsn36ColMlVkQO/IdTKUWWphHeuTOVlxfNhMvlIicnhw0bNnDIIYfIgutC1CG64HpmZmaLWwsWUpxgDY/eBvgtUKQHDRNADxrRhVRHA73tbQsMj74ZOBmYA1wEXG6XrTY8+lzgPOA5u+xxPWiEge2GR38NuBi4I5X3kpNp9W7bU3bwwGpXQQFlq1dTsWkTmQ5nlBEtV69evdiyZQuhUEhqskL
UIzMzk549UzqhUtqkugbbF9gG/M3w6COBfViJcBHg0oNG7FRWIfZPW1XXlFY1ldXYw0EpNR4YH/M5kXsAoGtbawaTLSVlB5W1u+AX7P3ySzaMvwFtcu1Tz4nWweVy0bVrVw455BBM05QkK0QtlFItsuYaleoEmwn0AZbqQcNnePRjgA+AIzlwyio4eNqquqa0cjTdlT1nZdW8lW63O+F/6dzRZ2sHhQ1tR41i+wsvUr56daKnFy2QUqpFPlcSQjiT6j8d1mA9f50EoAeNxcBqQAcwPHrnmH1jp62qa0qrRp/uCsBeTIfaKiMZnToR3rEDsyJlY5aFEEI0IylNsHrQ+BFrUdszAQyP3gvruev3wOvAOHv7EKAr8Kl9aGxZb+BUYHpM2dWGR3cbHr0D1jPZgxbWTbqqBFtzhs3qa02OXTJrVspDEUII0fQ1RuP3WOBmw6N/i7XA7e/tjk63AMMMj74cq/PSGLsHMcD9QK7h0VcA7wLj9KARHWj6IlaCXgYsAO7Xg4aR6ptQRJuIa1ZoL4BcuXlzqkMRQgjRDKR8mI4eNFYBp9WwfTPw01qO2YNVM62pLIxdu21Mqp4mYldhIQAVm7c0UkRCCCGaspbbfSvJol1VaurkBJDZvTuuwkJKZs2ssVwIIUTrIgnWIVfVDD21lOfk0OaE4wn/uE2GZQghhJAE61S0iThSR+505ReAacqsTkIIISTBOqViGolrk9XLmgujLJjSWRuFEEI0A5JgnaqnkxNA7rGDAPjx0ccwy8sbISghhBBNlSRYh+rrRQyQd/wQ2l10EXu/+oo9X85vnMCEEEI0SZJgHXLVMVVilFKKwjOtkUfFU6c0SlxCCCGaJkmwDkWfwNbVyQkg74QTyOrbl10fzUl1SEIIIZowSbAOOWkiBlBuN9mH9cfctw+z8uCl7YQQQrQOkmAd2j9VYv1jXN1t2wJQsWFDSmMSQgjRdEmCdUjVP0qnSnb//gDsW7QoZfEIIYRo2iTBxsnJHE0Fp52Gq6CATX+/mwqZ/F8IIVolSbAORXsRRxxMg5jZowddbryRyO7d7Pns81SHJoQQogmSBOuQ005OUXlDBgOw/dlnUxSREEKIpkwSrENxPIIFILtPH9qcdBLloVCKIhJCCNGUSYJ1SFWtpuN8pRx3+/aYFRVEyspSFZYQQogmShKsQ/HWYAEyunQBoMwwkh6PEEKIpk0SrEPxDNOJyj36aABKZs5KfkBCCCGaNEmwDqk4ehFH5Z8+gpwBA9j+/POULl2aqtCEEEI0QZJg46CU817EAK6sLDr89nIASmbOTE1QQgghmiRJsHFQOJsqMVb+iBG4O3Zk5xtvpiYoIYQQTZIk2DgopeKqwQK48/PJG3Qs4eLiuHogCyGEaN4kwcZBUf9ydTVxtcmHSITw9u1Jj0kIIUTTJAk2Dm2yM9hdVhH3cVl9+gBQulSG6wghRGshCTYO3drmsKm4NO7j2px4AgA/3H4b4ZKSZIclhBCiCZIEG4eubXP4obg07mepuUcfTYcrrqBy4w+yhJ0QQrQSkmDj0LUwh7LKCDv3xt9MnHvUkQBE9uxJdlhCCCGaoIx0B9CctMm2vq59FWHax3msu2NHAHbN/pDCs89OcmRCCNG6ab7A7cAdwFEhv3dJtbLTgBnAspjNQ0N+775UxiQJNg5ZGVaFv7wyEvexeYMH4+7UibJly+rfWQghhGOaLzAIOBFYW8duS0N+7+BGCglohARrePQQUGq/AO7Vg8arhkfvArwA9AXKgLF60PjUPiYPeAYYAkQAnx40pthlLuDfwDlYMwNP0IPGY6m+D4Ast5VgyxJIsMrlIvOQQ6jcIUN1hBAiWTRfIBt4FPgV8FGawzlAYz2DvVAPGgPt16v2Nj8wTw8a/YHfApMMjx5N+DcCZXrQ6AecCTxmePRoq+ylwADgMOB44GbDo3sa4yYaUoMFyOjalcqNP1D6vdRihRDCAZdSan3Ma3wN+9wFvBTye1fXc67DNV/
gG80XWKD5AtekINaDpLOT02isvzrQg8YCYDNwsl12UUzZamAucF5M2eN60AjrQWM78BpwcWMEnB1NsOFwQscXnnUWAGXfB5MWkxBCtGAR0zSLYl4TYgs1X2AoVktnfa2Y3wBFIb93EPBzYKzmC4xOTcj7NVaCnWR49G8Nj/604dE7Gx69I+DSg8bWmH1CQE/7fU9gTQJlB1BKjY/966ehUxVGE2wiTcQAGZ07AVC5TZqJhRAiCU4FPMBqzRcIAUXAu5ovcEBP0pDfWxLye4vt9+uBl4HhqQ6uMTo5naIHjbWGR88E7gaeB8Zw8MqqqtpnM8Gy/TtZf+1U/cXjdrsblGEb2kSc2a2bdfyqVQ0JQwghBBDye/1YjxsBsJPsuTX0Iu4GbA75vRHNFygAzsXq55NSKa/B6kFjrf2zAngIGK4HjW0AhkfvHLNrL/b3AFsLaAmUpVSDE2zPnqjsbPZ+9RXh3TIeVgghUkXzBZ7WfIFR9scLgG81X2AxMA94H3g21TGktAZrePQ2QKYeNHbamy4BFtrvXwfGAXcYHn0I0BX4tFrZ5YZH743VDDA2puxqw6NPAdpiPZM9K5X3EZXldgNQHk4swSql6PSHsWx96N/seOlFOo0dW/9BQgghHAn5vVrM+9/FvH8EeKSx40l1DfYQ4CPDo//P8OjfYiXKy+yyW4BhhkdfDjwHjNGDRqVddj+Qa3j0FcC7wDi7QxPAi8D3WAOGFwD360GjUWbRz8m0vq7SisQSLED7S8cAsOvDjzAjiZ9HCCFE05bSGqweNFYBx9ZSthn4aS1le7BqpjWVhbFqt40uJ9Oqwe6rSKwXMYA7vw35I3/C7g9mU7Z8OTmHH56s8IQQQjQhMhdxHKIJtmRf/HMRx2pz4lAAwjuLGxyTEEKIpkkSbBzy7bmIF4QaNszGXZAPQPmaUENDEkII0URJgo1DUftcADq2yW7QebLtZuHy0Jp69hRCCNFcSYKNg7JH3JoHDeGNT1afPmR07syOSZMoW748CZEJIYRoaiTBxkFRlWEbxJWVRdc778QsK2PXh01qbmohhBBJUm8vYntlm/pE9KBRWv9uzZuqdc6o+OUeOxCVnc32l16k45VXoDJk5UAhhGhJnNRgdwO77J/VX9HtK1MVYFPUwAosABnt21Po9RLe+iPhYulNLIQQLY2TatNiPWjUOJY1yvDoC+sqFzXL6NgBgHBxMRkdO6Y5GiGEEMnkpAb7pyTt02I0dFWeqIzO1lTMZd9/n5TzCSGEaDrqTbB60Pg0Gfu0BCo5fZyq5Bx1FAAl776XtKQthBCiaXDSySkXuBzYgbW4+T+BM7HmA75WDxobUhlgU6JqXxkvIbnHHEP+iBHsmjWLfZddRt6gOlvihRBCNCNOmoifAs4Bfg+8B7QDbgZWA4+nLLImLFmVTeVy0e7CCwDY9e67yTmpEEKIJsFJgh2kB42fYSXZwcDv9aAxUw8aNwG9UxpdE5PsJmKANiedREbXrhS//XYSzyqEECLdnCTYMgB7nOtqPWjErrFWnpKomqjkNhBbXDk55AwYQHjXLnkOK4QQLYiTYTrZhkfXsfJL7HuAnJRF1oQlOxG6CwqgooLwtm1kdOqU1HMLIYRwTvMFtjjYbVPI7z26vp2cJNg8YEbM5xm17djSqWRO5RQju38/AEq/+478U09NyTWEEEI4shXrkWhtFDDdyYnqTbB60NCcxdR6JLsht83w4fDQv9l05130mTUTV1ZWkq8ghBDCoTtDfm+dS51pvsDdTk7keLJ/w6OfWcO2sU6Pbwmq6q9JzrA5hx9Oh8suo2LjRsqWLk3uyYUQQjgW8ntfS8Y+EN9qOvcbHv2o6AfDo48Brojj+GYvRS3EALQZeiIA2yY+m7qLCCGEcETzBe7SfIF2mi+gNF8goPkCP2q+wAXxnCOeBHsxMNnw6N0Nj/4L4Ebg7Hgu1lI0dD3YmuQPH05G926Ur5FF2IUQogk4L+T37gRGApXAScBf4zm
B4wSrB42lwJ+xJpv4O3CmHjS2xXOx5i5VnZyiMtq1p3KLkw5sQgghUiw6JPVU4PWQ3xv3pPFOpkr8Z7VNlcByYLzh0dGDxs3xXrS5S9Vw1YyuXSldupRwcTHutm1TcxEhhBBO7NF8AR9W6+1Jmi/gAuLqgeqkBrun2msqsCTmc6uTqgSbe7Q1rGrnlKmpuYAQQginLge6AjeH/N7NQB9gUjwnUK1p9iC3222Gw+EGnaP3XwKcOaArj485LklR7RfeuZPVoy+icssWDv/ma5QrnkfkQgjRsiilwqZpOpmvISU0X8ANHBrye0OJHF/vv+CGR6+3p7CTfVqSVHRyAnC3a0f+Kadglpay+6OPUnINIYQQ9dN8geHAGmCu/XmI5gu8GM85nPxlcKPh0b+g7ql4rwMmxnPh5kqRuiZigA6X/podL71ESWAGBT/5SeouJIQQoi7/xOrg9AZAyO9doPkCg+I5QSJTJdZkazwXbc5S3ZM4S9NQOTmEd+9K6XWEEELUKSPk967UfIHYbXEtcCNTJSYg1U+t3W3bUr5iZYqvIoQQog6lmi+Qj/1PvuYLHAGUxnMC6UUTp9TWXy3Z/fpRsWULlTt2NMLVhBBC1ODvwLtAd80XeA6YDfxfPCdotN5Zhke/HbgDOEoPGksMj94FeAHoi7Xm7Fg9aHxq75sHPAMMwRrs69ODxhS7zAX8G2u1AxOYoAeNxxrrPiC1z2ABCs85hz2ffca6q36P9uorKLc7tRcUQghxgJDf+57mCywHzsKqW90d8ntXxHOORkmwhkcfBJwIrI3Z7Afm6UHjLMOjDwHeMDx6Xz1oVGJNw1imB41+hkfvDXxhePSP9KCxA7gUGAAcBrQFvjE8+od60Ag2xr1Yj2BTm2HbXfAL9s7/kuK3plPxww9kFRWl9HpCCCEOFvJ7VwP/TfT4uBOs4dEz7CTodP9s4FHgV0Ds2JPRQG8APWgsMDz6ZuBkYA5wEdYgX/Sgsdrw6HOB84Dn7LLH9aARBrYbHv01rJk27oj3XhKhGqWRGDJ79gQgsnt3o1xPCCEEaL7AAuqoRYX83uOdnstxgjU8+hFYs1h0BA41PPpxwGg9aNxSz6F3AS/ZiTJ6ro6ASw8asb2PQ0BP+31PrPFHTssG13RhpdR4YHzM53pCdaYx5ubI6NQZgF0fzCbH40n9BYUQQoDVgpoU8XRyegT4I/Cj/fkbwFvXAYZHH4r1HLWmZ6TV01T17GcmWLZ/J9OcYJpmUfSVlATbOBVYCr3WV1u2Iq4mfyGEEA0Q8ns/Dvm9HwNfAnNjPn9ib3MsngRbEO2EBKAHDROoqOeYUwEPsNrw6CGgCKtX1vEAhkfvHLNvL/Y/o10LaAmUNYrGmFzSnd8GlZdHZJeMhxVCiDT4ECiM+VwAfBDPCeJ5BltpePRM7PxiePQi9i/nUyM9aPixOjNhHxMCzrV7Eb8OjAPusDs5dQWiCTxadrndyelUYGxM2dWGR5+C1cnpIqxeXo3CmsmpceZvzuzShb1ffSWr6wghRD00X6BqpErI711SQ/mVgA+rYjkbuCbk99bVnygv5PcWRz+E/N5izRdoE09M8TYRTwU6GR79Dqz5Ge+P52LV3AIMMzz6cqzOS2NiOk/dD+QaHn0FVo13nB40tttlLwLfA8uABcD9etAwGhBHXFI8kdMBCs4+C7OsjPJ16xvvokII0czYUxhWH6kSW94ba1zryUA/rArdlfWc1hWbUDVfoADIjCcuxzVYPWi8ZHj0VVi9efOA3+hB45N4LhY7K5QeNDYDP61lvz1YNdOaysJYtdu0aaz1hzI6dQIgLBNOCCFEjTRfoLaRKrEuBKbay86h+QKPAzcDT9Rx6knAe5ovEB2m8wfg+Xhic1yDNTz6cKxxq7foQeNmPWh8Yo9vbVUUqlF6EQNkdLYeUZevkmkThRCtkksptT7mNb6Gfe4CXrLHrNamrpEpNQr5vfcBTwKj7Nd/Q35vXK228TyD/Qh4z/DoF+p
BY6+97WmgVSXZxmwizj3iCAB2f/wx7X/9a1RG2pZFFEKIdIiYplnrTDuaLxAdqeJzcC5Ho09izt0u5Pc+T5y11ljxPIP9FqsT0lzDox9ib2vEdNN0NFYTcWaPHhSO+hl7Pv+CXR/MbqSrCiFEs1E1UkXzBULYI1U0X+DsavslMvpkueYLPKX5AkcnGlw8VSJTDxr/MDz6Wqwkez6Nl2uajMb+i6LDmDGUTH+bXe+9S+FZZzby1YUQoukK+b0HjFSxk+y5NfQifhP4VPMF7gK2YI1KeaWe0/fD6gj1puYLbAIeBt4M+b1hp/HFU4NVYHV2wnrYOwPoEcfxLUZjDdMByBkwgOz+/SmZOYtIaVwrJQkhRKul+QJPa77AKICQ37sKuB34DFiJlWSfqev4kN9bHPJ7J4T83v5YSfxfwFrNF/ir0+E68dRgH4m+0YPGh4ZH/xkx0xC2FqlecP2g67nd5A0ZQtny5UR278aVk9Oo1xdCiOYi5PdqMe9/V63sKeCpeM5nD825HLgG+M4+/ifALGB4fcfHM0znmWqflwBXxBFri5COh86uggIAKjZurBq6I4QQInXsoTznYTUvnx/ye7+3i6ZovoCjuRfqTbCGR39RDxpjDI9e4woDetBwvLJAS9GILcQAZPftA0DZ8uXkHp3w83YhhBDOrQA8sbM5xTjdyQmc1GAfsn8mbYWBZi0NVdi8445DZWWx5YEJ5J9+Ohnt2zd+EEII0bp8TMx0wJovUAgcFvJ7vwr5vT84OUG9CVYPGl/bPz+ObjM8ejs9aOyMO9wWwmzkztOZPXrQ+dpr2XL//ez9cr70JhZCiNR7AmuMbdRee9txTk9Qby9iw6NfZ3h03X7vMjz621gLnW+1l6NrVazJ/hv/unnHWy3x2597rvEvLoQQrY8rdkiOvTBAXLP9OBmm8zusbs0Av8QaG9QNq2fVffFcrCVo7F7EUblHHUnuscdStrqu2cCEEEIkSbnmC/SNftB8gX7Uv0TrAZxk40o9aJTb738CvGhP1B8wPPrd8VyspUhHDRbA3aEDkYULMU0zbYleCCFaiTuxJqcI2J/Ppv4VeA7gpAabYXj06L/mQ4HPY8riWrqnJVCq8Z/BRmV0tobolK9alZbrCyFEaxHyewPAKcA39uuUkN87K55zOKnBzgZeNjz6JqwFzj8FMDx6V6AsrohbgHTWG3OPOoqdr7zKrtkfkt23b/0HCCGESFjI710OLE/0eCc12BuA+fb7s2IWRe8PTEj0ws1ZupqIC848k8yiIrY9/TSRslb3t40QQqSc5gtMS8Y+4GyYTiU1JNJ4F1tvKdL57NOdn0+HMZey+V4/u97/gLbnetMWixBCtFBDNV/gn/Xsc4STE8Uz2b+wpasGC1A4ahRkZrJj0qT0BSGEEC3XY8Ceel6POzmRrOAdJ5eC+aHtLN+8C5crHbXZTHaOPJfIzGlUbt9ORocOaYhBCCFappDfe2eyzqUac+m1dHO73WY47Hgpvxqd+/AnLNlQkqSIEnfR97O5as939Jk2FVdeXrrDEUKIpFNKhU3TbLYVQccJ1vDoI4BB9sdv9KDxUcqiSpFkJNhNxaW8umAdlZFI/TungGnCIx+t4Az3Dsa/eQ+F55xDjwkPpCUWIYRIpRafYA2PXggEAA34GmukyiBgDXCOHjTSX51zKBkJtinod+sMTuvXgVteupXKjT/Q//PPpKlYCNHiNPcE66ST0z+BhUAfPWicrweN84C+9rZ/pTI4UbOcTDdlpqLtz0YBsO2JJ9IckRBCtCyaL3C15gs06PmbkwQ7ErhODxpVczDaUydejzV1omhkOZlu9pWH6TDmUlyFhWx//gV2zZ6d7rCEEKIlORVYrfkCD9rzEMfNSYKt0IPGQQ8c7fGx5TXsL1IsN8vFvoowGZ060XPiRAC2PPhgmqMSQoiWI+T3/go4BtgJfKT5AjM0X+CceM7hJMHuMjz60dU3Gh79GKzxQKKR5WS4Ka2wniXnHnk
EOUccQfmKlZQtT3hGLyGEENWE/N5N9rCdXwNHAi9pvkBQ8wUctd46eXh8F/tXzpkHmMAw4G/AHxILWzREXnYGq7furlpVp91Fo9l02+2sueIK+r37rgzbEUKIBtJ8gRzgV8A4oBS4CXgDa8H117A6/tbJyVSJ7xgevRL4K/unTPwauEoPGjMTilw0SI92OSxet5OKsElWhqL96NHsW7iI4qlT2f7CC3QaOzbdIQohRHMXAt4Hxob83gUx2+drvsD7Tk6Q8okmDI/+HtAViAC7gD/pQWOR4dG7AC9g9UguA8bqQSO6Uk8e8AwwxD7OpweNKXaZC/g3cA5WbXqCHjQecxJLSxmmc/2ri5i6cAPBv59FTqYbgModO1g+dBi5AweivfJymiMUQoiGS+cwHc0X6Bbye39oyDnqfQZrePTHYt6fl8A1RutB42g9aAwEHgAm2tv9wDw9aPQHfgtMMjx69Iu8ESjTg0Y/4EzgMcOjt7fLLgUGAIcBxwM3Gx7dk0BczVZ0vYHYv40y2rfHlZ9PeSjEvv/9Lz2BCSFEyzFW8wU6Rj9ovkAnzRe4PZ4TOOnkdGLM+7hODqAHjZ0xH9ti1UgBRgOP2vssADYDJ9tlF8WUrQbmAufFlD2uB42wHjS2Y7WFXxxvXM2Zy86w4WqtD11uuonwnj388H+3pSMsIYRoSc4L+b3boh9Cfu+PwPnxnMBJ1VvV8t4xw6O/AIywP55lePSOgEsPGltjdgsBPe33PbFminJaNrim6yqlxgPjYz4nEn6T47bvI1Itwba/aDS73nuPPZ99Run335Nz+OHpCE8IIVqCmhJGZjwncJJgsw2PrtsXi30PgB40ltZ3Aj1oXAZgePTfAPcDY7Cen8aqfjNmgmX7dzLNCcSsZet2u1vEygYuu93BrGE65EKvlz2ffUbxlKnk/MXXuIEJIUTLsUzzBcYDD2LlmeuBYDwncNJEnAfMwJqPODfmfQB4J56L6UHjefbXZDE8eueY4l7AWvv9Wg7sAu20rFVQtdRgAQrOGAmZmeycMqWxwxJCiJbkWuBcYB/WnA9nAX+K5wROhuloiUQGVQsF5OtBY6P9+efANmA78DrW+KI7DI8+BKun8af2odGyyw2P3htryqqxMWVXGx59CtYz3YuwbrzViC5DW/0ZLIC7oICC005j1/vvs+eLL2gzdGgjRyeEEM1fyO/dCJyu+QJt7M9xT6yU6u7PbYE3DY+ei9W5aStwrh40TMOj3wK8aHj05VhTLo6xp18Eqxl5ouHRV9jHjbM7NAG8iDV8Z1l0Xz1oGCm+jyaltmewUW1/fj673n+fdWP/QL8P3iejc+ca9xNCCFE7zRfoBvQGMjRfAICQ3zvX6fFOlqvbysHPS8Fqkzb1oNHFcbRp1lLGwd4x/Tue+zzEl7f+hEMKc2rc58ennmLrAxNoN3o03e66s5EjFEKIhkvzONi/Ys3etAqIJg4z5Pce7/QcTgKvsYeuSB9XPTVYgI6/+x0/PvIoJTNmcMjf/oorK6uxwhNCiJbgCqCfPTwnIU4S7B49aCR8AZF8Vc9gI7UnWKUUhed6KX5zChtvupmifz/UOMEJIUTLsKkhyRWcJdj3gEEAhkd/Rg8aVzbkgqLhXHaGrW+Wy663386ez79g17vvsmfel7Q58YRGiE4IIVqEdzVf4AFgEtZk/wCE/N56h6ZGORmmEzvO9FjnsYlUcdJEDODKyqLr3/4KQPHUqSmPSwghWpDfAr8A3iTBoalOarAtYnKGliTaRFxHC3GVNsOGAVD81lsUnnsu+cNPrucIIYQQIb+3d0PP4STB9jA8+j9reA+AHjRubmgQIj6FudZsXT/uLqN3pzZ17uvKzeXQp55i3VVXsfHmm+k35yNc2dmNEaYQQjRrmi9wHuAJ+b33ab5Ad6BjyO/91unxTpqIH8OaxWJPtffRl2hknq4FACxet9PR/vnDT6bQ6yW8Ywf7Fi1OYWRCCNEyaL7AHVgTHEX7HZnA4/G
cw8lMTjKIsokZ0L0QgNA253/f5I8YQUkgwIZrr6X31ClkduuWqvCEEKJRab7AQeuOh/zeRdX2OQ1rqt9lMZuHhvzefbWc9nzgOOArgJDf+4PmCxTEE1daBvCKhinMsZqId5dW1rNnzDHecyhbtoxtTz7Jjskv0+WG8fUfJIQQzcPokN+7E0DzBc7HWnd8UA37LQ35vU7ndigN+b3h6AxOiXDSRCyamJxMN1luF7viSLBKKTpc/hsAiqdPT1VoQgjR6KLJ1Ra77nhDrNF8gZMBU/MFXJov8DfA8fNXkBpss1WQkxFXggXI6NCB/JE/YfcHs9m3eDG5xxyTouiEECIpXEqp9TGfJ9jLkB5E8wUOWHe8lvMdrvkC32BNffhsyO99rI5r/xl4HjgS2At8AlwaV/Dx7Cyajra5mezYWx73ce0uvBCAHZMnJzskIYRItohpmkUxrxqTK0DI770s5PceCvwNa8GY6r4BikJ+7yDg58BYzRcYXcf5Nof83rOAdkCnkN97Rsjv3RxP8E4m+19AHWNh9aDheOLjdGspk/0DXPncAuYu38rSu84i0+3876TIvn18f+wgXHl5aK++Qnb//imMUgghEpfoZP+aL7APK5luq2OfvwDdQ35vjWu8ar7A/OoT+9e0rS5OAr/R6clE4+nWLoeKsMnOvRV0LnA+rtWVm0vX229j0513sfbK39F31kxceXkpjFQIIVJH8wUKgXx7/VY0XyB23fHY/boBm0N+b8TuDXwu8Ewdpz4gP2q+gBvIjyc2J8N0Po7nhKJxRKdLNBOYaKv9JZew95uFlLz9NjunTaPDr36V7PCEEKKxtAXe1HyBA9YdD/m9puYLPA1MD/m904ELgD9ovkAlVu57HXi2+sk0X+Am4GagreYLbIkpysOal9ixepuIowyP3gm4HTgGqFqEVJqI0+P2t5bw/BdrmH/rT+hSy5qwdSk1DFb//Bdk6zraKy/L7E5CiCYnHevBar5AW6A98F+siSaiSkJ+7454zhVP4BOBz4AzgRuAq4GF8VxMJI+qqsEmJkfXyR6gU7bUYNNtt9P9Pn/yghNCiGYq5PcWA8XA2Q09VzwJtqceNEYZHv3XetB42/Do7wIzGxqASIydX+tdsq4u2ssvs3z4KRS/9RYdf38V2X37Jic4IYRo5jRfoC/wENVabUN+bxen54hnmE50TEiZ4dE7AJVAURzHiyRSOFuyri6u7Gy6jLdmdNr55pSkxCWEEC3E08BLWFMv/gSYhpVwHYsnwX5vJ9aXgHnAl0gTcdpU1WAbeJ78U4aDUmyfOJGS999vcFxCCNFCtA35va8CEXsFnauBM+I5geMEqweNMXrQ2K4HjX9jLUR7JyDdT9PEzq847aRWm8zu3en5rNWRbtP/3UakrKyBkQkhRItQYf/cpfkCvYBsoFc8J3CcYA2PXjWllB40PtODxjvAI/FcTCSPy151vYH5FYA2J55A4aifEd65U5azE0IIy8eaL9ABK899BawA4prIPZ5OTifWsG1oPBcTybO/Bpuc8+Wfciol099mw4030Gf6dDLat0/OiYUQohkK+b03228na77AJ1jjbbfXcchB6k2whkf/JTAa0AyP/lpMUVtkwfX0qXoGm5wMW+g9h73z57PztdfYePMt9HzqyaScVwghmruQ37sOWKf5AmuBnk6Pc1KDXQYEgOPtn1ElwOx4ghTJE+1FnKwarFKKrrffxu5PP2HPJ59glpejsrKSc3IhhGgZVP277OdkqsTFwGLDowf0oLE14bBEUrmS1Is4lnK7yT/lFHa+8io/3HEn3e65u2pCCyGEEPH9kxvPM9gMw6O/w/719mYDV+tB44d4LiiSI5r3GjIOtiadr72WPV98QfGUKeQceYTMUyyEaFU0X2BAHcVxTdsYzzjYJ4HPgR7263N7m0iDZDcRR2W0b0/PZyYC8ON//5vckwshRNMXqONVGs+J4snGh+pB42cxn/2GR18Uz8VE8uxvuU1yhgW
yinqQO3Ag+xYtomTWuxSedWbSryGEEE1RyO/tnaxzxZNgXYZH76oHjU0AhkfvQj0PfA2PngO8AgwA9gKbgLF60AjZx78A9AXK7O2f2sflYa3TNwRr+SGfHjSm2GUu4N/AOVjZZYIeNB6jlama7D/5+RWATn8Yy/o/X8uG668nu+9bsjC7EELEqd4mYsOjv2y/vR9YaHj0Jw2P/gTwtb2tPk8Ch+tBYyDwDvublf3APD1o9MeaGWqS4dGjCf9GoEwPGv2wVu95zPDo0YGZl2Il7MOwejbfbHh0j4M4WpToXzaRFCXY/FNPpdvdfwfTZMerr9V/gBBCiAM4eQbrAdCDxotY8zD+D1gCnKkHjZfqOlAPGqV60JihB41oGpgH9LHfjwYetfdbAGwGTrbLLoopWw3MBc6LKXtcDxphPWhsB14DLnZwHy2KSvI42JoUjBwJQMnMmQ2eklEIIVobJ03EVf+y6kFjCVZyTdSfgbcNj94RcFUb9hNi/wDensCaOMoG13QxpdR4YHzM58Qjb2JS1ckplis3l/yRP2H3B7PZ++WXtDmxpsm8hBBC1MRJgj3K8OhbatiuAFMPGo7WxjM8+q1Af6wV4nM5uHdO9exnJli2fyfTnABMiH52u90tphpWNQ42xXfU/pe/ZPcHs9n52uuSYIUQIg5OmoiXYXU2qv4abP+sl+HRbwR+AZytB429etDYZm/vHLNbL2Ct/X4toCVQ1mqkahxsdXlDhkBmJiUzZlDy7nspvZYQQrQkTmqwZXrQWFP/bjUzPPp44BJgpB40dsYUvQ6MA+4wPPoQoCvwabWyyw2P3hs4FavmGy272vDoU7DmQ74IOCvR+JqrxmruduXl0WviM6wZcxmbbr+dghGnyRSKQgjhgJMabML/khsevQh4AGgHfGR49EWGR//SLr4FGGZ49OXAc8AYPWhU2mX3A7mGR18BvAuMszs0AbwIfI9Vs14A3K8HDSPRGJu7xuh7lDdkCIU/s5az27toUeovKIQQLYBqTb1D3W63GQ6H0x1GUvx3zkrumxXkrXEnccyh7VJ+vZKZM9lw/XgyunSh76yZuPLyUn5NIUTrppQKm6YZ1/SETUk8UyWKJkSlYLL/uhSefTZtL/gFlVu2sOmeexrpqkII0Xw1278MWrtou/0bX69LeUenKPOKa1n76f8w53xD//U7GdC9LW5Xyxn6JIQQySRNxM3Uu99t4uoXv05rDHeddwSXDdXSGoMQouVq7k3EzTbw1u7MI7ry+KXHsW773ka9bnjXLta/OY2Xugxm5aIgSIIVQogaSYJtxs46smtarrtNb8dLTy7hh7mfU6KZFJ59dlriEEKIpkw6OYm4tdd6ojDZlZnHhuvHs/7P16Y7JCGEaHIkwYq4uVyKdnlZbDliMK7CQna99x4bb7mFcElJukMTQogmQxKsSEhR+zx2VULv114lW9cpfms66676PWZ5ebpDE0KIJkESrEhIXpabyohJlqbR+43XyezZk32LF7P81NMoX78h3eEJIUTaSYIVCcl0u6iojACg3G56T5lCm2HDCO/YwborryS8e0+aIxRCiPSSBCsSkuFWVEQiVZ/d+W049KknAShfs4ZVo36GWVlZ2+FCCNHiSYIVCcl0u6gMHzhJiXK76f/pJ6isLCo3/sDme/1pik4IIdJPEqxISKZbURkxqT4TWEanTvSdOQOAHZMmsW3is5gtZPYsIYSIhyRYkZCC7EwAtu4uO6gss0cPDn3qKVz5+Wz55z/ZLIsDCCFaIUmwIiHHae0BmBPcWmN5/vCT6fPO26AUOya/zJrLfkOk7OBkLIQQLZUkWJGQY+01aNftqH0u5MyuXekzI4C7Uyf2zp/P+mvGyWQUQohWQ+YiFgk5pG0OAJuKS+vcL7t3b/q8PZ1VZ5/Dns8+Y8WI0zls/pcot7sxwhRCtAKaL/Ae0BWIALuAP4X83kU17Hcl4MOqXM4Grgn5vSkb7iA1WJGQguwM8rMzmLlkU73
7ZrRvT/9P5pIzYACRPXtYNuwkSr9f1ghRCiFaidEhv/fokN87EHgAmFh9B80X6A38HTgZ6IeVkK9MZVCSYEVClFKUVoSpjBkLW+f+mZn0evEF8k87jUhxMWt+/WsipXXXfoUQwomQ37sz5mNbrJpsdRcCU0N+7+aQ32sCjwOXpDIuSbAiYcP7d4prf1ebNhQ9+giZvXoS2b2bjTfdhFlRkaLohBAtgEsptT7mNb62HTVf4AXNF1gH3A38poZdegJrYj6H7G0pIwlWJMylFA4rsFWU24328suonBx2vf8B3x9/gnR8EkLUJmKaZlHMa0JtO4b83stCfu+hwN+A+2vZLXbgvkpmoDWRBCsS5nIpItUmmnAio0MH+n88h5yjj8bct481v76Ufd99l4IIhRCtTcjvfR4YofkCHasVrQW0mM+97G0pIwlWJMylIJxAggVwt21Lr+efI//00ylbvpzQBRdStmpVkiMUQrR0mi9QqPkC3WM+/xzYBmyvtuubwM81X+AQzRdQwFjglVTGJglWJMztUpgmB02X6JQrN5dDH3uUTtf8AYDV5/+c7ZMmJTNEIUTL1xaYpvkC32q+wGJgHHBuyO81NV/gac0XGAUQ8ntXAbcDnwErgS3AM6kMTCX6j2Nz5Ha7zbDMi5s04yZ/Q+B/P7DyH+fgdjXscUbx22+z8aabAejmv5e2552HUil/RCKEaMKUUmHTNJvtfA1SgxUJc9sJMJHnsNW1/dnPKHr8v+By8YPvL6y94ooGn1MIIdJJEqxIWLTSGo4kpxWk4LTT6PfB+7jy89n7xTzW/OZyKjZvTsq5hRCisUmCFQlz2Rk2mU8ZMrt3p9ekSWT378/eL78kdNHFVGyqf7YoIYRoaiTBioS57CbiRHsS1ybn8MPoPf0tsj0eKjdtYsVpI9j37bdJvYYQQqRayh8eGx79P8AorDFHR+lBY4m9vQvwAtAXKAPG6kHjU7ssD6t31xCsKa98etCYYpe5gH8D52ANGp6gB43HUn0f4mDJfAZbnVKK3q+9ykbfXyiZMYN1v7uKfp/MxZWVlfRrCSFEKjRGDfYNrMmV11Tb7gfm6UGjP/BbYJLh0aMJ/0agTA8a/YAzgccMj97eLrsUGAAcBhwP3Gx4dE+K70HUwGX/9phxzubklMrKoseEB8ju359wcTEbrrue8M6dqbmYEEIkWcprsHrQmAtgePTqRaOB3vY+CwyPvhkrEc8BLgIut8tWGx59LnAe8Jxd9rgeNMLAdsOjvwZcDNyR2jsR1UWH5pz177l0KcxJ3YVG3kBZn1WY+0pRf32dbF0HpcjNdPHPC46hZ8e81F1bCCESlJbxRYZH7wi49KCxNWZziP0TL9c1KXNNZYNruo49MfT4mM8NiFpU9/NjezB/9XZ2l1aytSTFK+N07Unltm2Y5eWwch1bswsBmLpwA9eO7J/aawshRALSOYC3+oO76tmvrkmZHU3YbE8MXTU5tNvtbj2zajSC43p14L3rT22060X27GHT3fdQPHUquws78MvTb+WzlT9KghVCNElp6UWsB41tAIZH7xyzOXbi5bomZW70CZtF0+Bq04bu9/6DtuefT37JdrLCFexZt57ydevSHZoQQhwkncN0XseaMxLDow/BWl3+0xrKegOnAtNjyq42PLrb8OgdsJ7JvtqIcYs063bvPyj672N0qdjFnm07WXnGT1n1s1Hs/uRTTJkKUwjRRKQ8wRoe/VHDo68HioAPDI++wi66BRhmePTlWJ2XxuhBo9Iuux/Itfd9FxinB43oyggvAt8Dy4AFwP160DBSfR+i6VBKUTBiBG21nhR37Ia7a1fKli9n3VVXsWrUeRS/9VbCCxAIIUSyyGT/otn627RveWneWmb8+WS0jcvZdM8/KLP/1soZMIBOfxxHwemnpzlKIUSiZLJ/IdLknCO7AfDMpyHyBg+mz9Qp9PtwNrnHHUfp0qWsv2YcG/9yK2WrVqc5UiFEayQJVjRbQ3p3AMD
4oaRqW2b37miTXqLnC8/jatuW4qlTWXXOOay/7nr2fv11ukIVQrRCkmBFs5XpdtG7UxvKwwdPJdXm+OPpP/djekx4gMxDD2XXrFms+fWlrLnsN5TMmiXPaIUQKScJVjRr2RkuKmtIsACu7GwKzzmHvu+9S89nJ5J34onsnT+fDdddz7LBQyiZMaORoxVCtCaSYEWzlul2URGuuzaqlKLN0KH0eu5Zek95k/yRPyGyZw8bxt/A+j9fy+65cxspWiFEayIJVjRrGW5FZcT5agM5AwZw6COPoL3yMrnHHMOu995j3e+vZvUvR7Pj9dcx4ziXEELURRKsaNYyXS4q66nB1iR34EC0V1+hT+AdCs46i9Jvv2XT/93GsqHD2P7884SLi1MQrRCiNZFxsKJZ+9VT85i/ejunHd4FAKWsyamtn8r6ab+nqsyavlrF7B/ZvZuKUIjyVatQmCgTcjyHkd2nL67cnAPOhf0+w6W4bGgv+nUpSM/NC9HCNfdxsJJgRbM24b3vefzjVZiYmKa1CoRpmvbP1F//mKK2vPXHk1N/ISFaIUmwzYgk2NbJNM2qZFs9AUcTM9U+h8vK2Pna62yfPJnKLVsxFbS/4grannceGYd0xQRO8n9IRTjC8nvOlqUQhUgBSbDNiCRYkYidU6byw//9H9i/O+1Gj6bDby7jtoV7ePWrdTx+6SDOsmeVEkIkjyTYZkQSrEhUePduiqdOY9tTT1G5ZQsAG0aez+/yT6Z9tosF44eR0bZtmqMUomWRBNuMSIIVDWWaJiXvvMOOyS+zb/Firjn1Ola37c606T7yCvPJ7t+fzEMPJfeoI2kzfDiZPXpI87EQCZIE24xIghXJVLljB3+ZNJ/X11cy0/yC3NAKylasILJnT9U+GV27kjfoWLIPO4wsTSOrVy8yu3fHLbVdIeolCbYZkQQrku3v7yzlmU9Xc/7A7uRmuQGo3LWbyu07qNy6lYptPxIu2YU1GAhMBSYKd2Eh7q5dcbdvj6ugAJWTCy57H7s3tPXerHqP3QkLoF+XAq4f2V9qx6JFa+4JttkGLkRTcNgh+QBMW7SxhtLO0L4ztK/l4FLghzD8sBPYGeeVN+E9qhuHd5UxuEI0VVKDFaKBSkoriERMazILsCa02P/2oIktAKgMU75yBfu+XULFmjWUr1pF+Zo1VG7YgIqEq/bHNMk95hjyTzqJ7N69ye6jsUi145IXFjGsb0ce/dUg2rfJasS7FaLxNPcarCRYIZqQSHk5FevWUb5mLeWrVrJnwQL2fPIpxMyRHEHx91OuZl6Hfly1L8hv22wno2NHMjp3IuOQrmT26EFG506427bFlZOTxrsRomEkwTYjkmBFcxTevYeKdWspW7GS8lCI8jVr2LB2E6P7XMwhe7dTtGtL7Qe7XKjMTFwZGaisLFR2Nioz03plZaEyMlCZmZCRgSszA+XOiD4urtFPB3TlVyf0TP5NClEDSbDNiCRY0VJUhiP84r+fs3LLbsDqDIUZgYhp95IyqxaVN83922pjxmZVZTdrx75QlJrWPocXunG5XCiXwuV2oVwulNtt/VQqZj5odcC80Pvngj54nuiqJvUDjgGXUrTLyyIn8+B1SWq7m9pv8+CC2vatdXstV619f+fnr+3ccW6mtn/Ta9oaf9zOzw1w7U/6c9ghifcTkATbjEiCFa2ZGYkQ3r6d8M6dhHfupGLzZiK7dhEuLrG2FRcT2VVCeMdOyteupfLHHw9oml7YuT+v9R/BpjYdwFoSAdNe/MAE+7MCl/1Tueyfan+Zsn/an62zq6re1Sax57NeFbKCYLM1+aoTGNa3U8LHS4JtRiTBCuGcGYkQLi62knJxCeGSYsLbthMpK8UsL8fct4/I3r1E9uwlsncvZnkZkX2l1vuyMiKlMe/37iVSWgoVFXHHsTszh7CqeWVNFf3nK1qLzs6ymrzdLpQrA+VyQYbbaibPykS53Ci329rf5YKMDFR2JsqdiXIpcLlRbqsmjtu
FUi5UZhYq0z6X/QcEygUul33p/duVy4XKyrb3jWkJsHu+7X974PbYZaCiZSp6TIYblZlt3W9sT7mYRgdX9H3ssC0V7XZ34LYDNsXs79rfjLB/9/0njjmH/SnaeS+mEx/VzpFz5JFkduhQ4387J5p7gm22gScikhfhiMeOSHcYQrQcufbLKdNV1ZRd1XQdidhN2hH7s9XcbUbsOqxZbG0jegzW9mhzuDVAuFpTeMx+sWVV22PLIvv3jSmi0n4f/98EwpY5vweuvHh+QVqWVpVghRBpprBrfwf3pWoyU2bEJtuqZZdin5CaHPjQsXrSrvb5oHMf/KHm3avFUPuJ6lfn7g7OleDxKqt1DyGTJmIhhBBNkjQRCyGEEGmi+QI5wCvAAGAvsAkYG/J7Q9X2Ow2YASyL2Tw05PfuS1VskmCFEEI0d08CM0N+r6n5An+0P/+0hv2WhvzewY0VlCRYIYQQzVbI7y3FqplGzQOuS080B5IEK4QQoqlyKaXWx3yeYJrmhHqO+TPwdi1lh2u+wDdAGHg25Pc+lowga9NsE6zh0fsDzwOdsJYiuVwPGkvTGpQQQohkipimWeR0Z80XuBXoD4ytofgboCjk9xZrvkARMEPzBX4M+b2vJSnWg9Q8ert5eAJ4Ug8ahwH/BJ5JczxCCCHSRPMFbgR+AZwd8nv3Vi8P+b0lIb+32H6/HngZGJ7KmJplgjU8ehdgEPCSvelNoLfh0bW0BSWEECItNF9gPHAJcEbI791Zyz7dNF/AZb8vAM4FFqYyrubaRHwosFEPGpUAetAwDY++FugJhKI7KaXGA+NjD1RKNWQgrAuQmVEt8l1Y5HvYT74Li3wP+zX0u3DXt4Pd3PsAsAr4SPMFAMpCfu8Jmi/wNDA95PdOBy4A/qD5ApVYue914NkGxFavZjnRhOHRjwNe0IPGETHbFgA36EFjbqquq5RaH8/zgJZMvguLfA/7yXdhke9hv9b+XTTLJmJgHVBkePQMAMOjK6xa7dq0RiWEEELYmmWC1YPGFqy280vtTRcAIT1ohNIWlBBCCBGjuT6DBbgaeM7w6LcCJcBvGuGa9Y2/ak3ku7DI97CffBcW+R72a9XfRbN8BiuEEEI0dc2yiVgIIYRo6iTBCiGEECkgCVYIIYRIAUmwDiml+iulPldKLVNKzVdKDUh3TKmilAoppYJKqUX26yJ7exel1Cyl1HKl1BKl1Mkxx+QppV5WSq2wv6NfpO8OEqOU+o9976ZS6siY7Qndt1LKpZR6WCm10i6/prHvKVF1fBdzlFKrYn43ro8pa3HfhVIqRyk1zb6fRfbvgWaXtarfi3q+i1b1e+GYaZrycvACPgQut99fCHyR7phSeK8h4Mgatk8E7rDfDwHWABn259uA5+z3vbEWPW6f7nuJ875PAYqq33+i9w1cBszGmo2mg31eT7rvs4HfxRzg3FqOaXHfBZADnMP+DqF/BN5rjb8X9XwXrer3wulLarAOKKVqnPs4+tdbKzIaeBTANM0FwGYg+lf7RTFlq4G5wHlpiDFhpmnONU1zfQ1Fid73RcDjpmmGTdPcDrwGXJy6O0ieOr6LurS478I0zVLTNGeYdjbAWmu0j/2+Vf1e1PNd1KXFfRdOSYJ15lBgo2malQD2L1h07uOWapJS6lul1NNKqc5KqY6AyzTNrTH7hNj/HfTE+gu+prJmq4H33SK/E+B++3fjVaVU7D+wreG7+DPwtvxeAAevu9qafy9qJAnWueoDhlVaomgcp5imeQxWrX0b1rq7UP93YNZR1pw15L5b2ncyxjRNHTga+AR4p1p5i/0ulFLRtUb/am9qtb8XNXwXrfb3oi6SYJ1ZBxQppTIAlFIteu5j0zTX2j8rgIeA4aZpbgNQSnWO2bUX+7+DtYBWS1mz1cD7bnHfiWma6+yfpmmajwB97NoctODvQilVtdaoaZp7W/PvRfXvAlrv70V9JME6YJpmjXMfm6YZSltQKaKUaqOUahez6RL2r5n
4OjDO3m8I0BX4tIay3sCpwPRGCLkxJHrfrwNXK6XcSqkOWM+bXm3EuJNKKZWhlDok5vMFwOZosqGFfhfKWvbyEuAM0zR3xhS1ut+Lmr6L1vp74Ui6e1k1lxdwOPAFsAz4Cjgi3TGl6D77YCXU/wHfAm8Bml12CPAesBz4Djg15rg2WP9jrLC/owvTfS8J3PujwHqgEqun44qG3DdWz8hHgZX264/pvseGfBf2vX5l/14sxur9eUxL/i6welKbdsyL7NeXrfH3orbvojX+Xjh9yVzEQgghRApIE7EQQgiRApJghRBCiBSQBCuEEEKkgCRYIYQQIgUkwQohhBApIAlWiEZir05zpFLqcqXUYSk4fzul1M3Vtj2tlBqe7GsJIeonCVaIxnc5EHeCtZf2quv/2XbAAQnWNM3fmab5SbzXEkI0nCRYIRrXacBg4D/2upnngDX9nLLWGf5GKTVDKXWovf0OpdSLSqkpWAP7uyml7ldKLbCP/1gp1d8+9+NAO3v7V/bxc5RS59rvD1FKTbUnZF+ilPp9NCi7dn27stY8Xq2U+ltjfSFCtFQZ6Q5AiFZmDtasN/8yTfMdAKXUr7BqtENN0wwrpcYAj7B/Sa8RwCDTmrITpdR9pmneZL+/GHgQOBcYC3xlmubAWq79HyBomubP7SUYv1ZKLTJNc75d3s40zWH2/LorlFLPmqa5Ial3L0QrIglWiPQ7H6tW+7W1jgRuIBxT/k40udp+qpT6E1CA1QpV6PA6I4FjwJpf264V/wSIJthJdtlWpdQqrMWxJcEKkSBJsEKknwLuNk1zYi3lu6t2VKonVk30eNM0VymljgY+jONa1edGjf1cGvM+jPz7IESDyDNYIRpfCdA25vN04Bp7NRGUUplKqWNrObYtUA5sspdN/GO18+ZFl1WswQfA7+1rdAZ+TnzJWQgRB0mwQjS+J4Hbop2cTNN8EXgJmKOUWozVmWlETQeapvkt1hJf32E9z10bU7Ydq5n322gnp2r+DBytlPof8BFwT8zzVyFEkslqOkIIIUQKSA1WCCGESAFJsEIIIUQKSIIVQgghUkASrBBCCJECkmCFEEKIFJAEK4QQQqSAJFghhBAiBSTBCiGEECnw/9u1/1xcs4TpAAAAAElFTkSuQmCC\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "### Visualize results\n", + "mpl.rcParams['figure.dpi'] = 80\n", + "fig, ax1 = plt.subplots()\n", + "\n", + "color = 'tab:red'\n", + "ax1.set_xlabel('Iteration')\n", + "ax1.set_ylabel('Total FIFO Size [kB]', color=color)\n", + "ax1.plot(range(len(log_total_fifo_size)), log_total_fifo_size, color=color)\n", + "ax1.tick_params(axis='y', labelcolor=color)\n", + "ax1.set_ylim(0, max(log_total_fifo_size))\n", + " \n", + "ax2 = ax1.twinx() # instantiate a second axes that shares the same x-axis\n", + "\n", + "color = 'tab:blue'\n", + "ax2.set_ylabel('Latency [cycles]', color=color)\n", + "ax2.plot(range(len(log_total_fifo_size)), log_latency, color=color)\n", + "ax2.tick_params(axis='y', labelcolor=color)\n", + "#ax2.set_ylim(0, max(log_latency))\n", + "\n", + "ax2.axhline(log_min_latency[0], color=\"green\", label=\"Minimum (1st frame) Latency\")\n", + "ax2.legend()\n", + "\n", + "plt.tight_layout()\n", + "plt.savefig('fifo_iterative_graph.png', dpi = 300)\n", + "plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "466f818f", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Iteration: 11\n", + "Reducing depth of FIFO: 48/266\n", + "Numer of minimized FIFOs: 266/266\n", + "Interval: 903174\n", + "Min. 
latency / latency: 2549314/2580781\n", + "Total FIFO Size (kB): 226\n", + "Done (49 seconds)\n" + ] + } + ], + "source": [ + "### Optional second pass for fine-tuning\n", + "(fifo_depths,\n", + " log_total_fifo_size,\n", + " log_interval,\n", + " log_min_latency,\n", + " log_latency) = size_iteratively(fifo_depths, iteration_runtime, reduction_factor = 0.95)" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "2c707459", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "FIFO DEPTH | SIZE\n", + "FIFO 000: 1 | 24\n", + "FIFO 001: 2 | 48\n", + "FIFO 002: 2 | 48\n", + "FIFO 003: 16 | 2048\n", + "FIFO 004: 8 | 64\n", + "FIFO 005: 2 | 16\n", + "FIFO 006: 8 | 64\n", + "FIFO 007: 32 | 256\n", + "FIFO 008: 32 | 128\n", + "FIFO 009: 32 | 128\n", + "FIFO 010: 2 | 8\n", + "FIFO 011: 128 | 8192\n", + "FIFO 012: 1 | 32\n", + "FIFO 013: 1 | 2\n", + "FIFO 014: 16 | 128\n", + "FIFO 015: 256 | 2048\n", + "FIFO 016: 2 | 16\n", + "FIFO 017: 2 | 16\n", + "FIFO 018: 355 | 45440\n", + "FIFO 019: 1 | 4\n", + "FIFO 020: 4 | 256\n", + "FIFO 021: 1 | 8\n", + "FIFO 022: 1 | 10\n", + "FIFO 023: 1 | 8\n", + "FIFO 024: 4096 | 32768\n", + "FIFO 025: 1 | 8\n", + "FIFO 026: 1 | 4\n", + "FIFO 027: 4096 | 32768\n", + "FIFO 028: 1 | 64\n", + "FIFO 029: 256 | 1024\n", + "FIFO 030: 256 | 2048\n", + "FIFO 031: 2 | 16\n", + "FIFO 032: 2 | 16\n", + "FIFO 033: 288 | 36864\n", + "FIFO 034: 1 | 4\n", + "FIFO 035: 1 | 64\n", + "FIFO 036: 1 | 8\n", + "FIFO 037: 1 | 10\n", + "FIFO 038: 4 | 32\n", + "FIFO 039: 4 | 32\n", + "FIFO 040: 4096 | 32768\n", + "FIFO 041: 4096 | 32768\n", + "FIFO 042: 8 | 32\n", + "FIFO 043: 16 | 1024\n", + "FIFO 044: 256 | 1024\n", + "FIFO 045: 256 | 2048\n", + "FIFO 046: 2 | 16\n", + "FIFO 047: 2 | 16\n", + "FIFO 048: 288 | 36864\n", + "FIFO 049: 1 | 4\n", + "FIFO 050: 1 | 128\n", + "FIFO 051: 1 | 8\n", + "FIFO 052: 1 | 10\n", + "FIFO 053: 1 | 8\n", + "FIFO 054: 1 | 4\n", + "FIFO 055: 1 | 4\n", + "FIFO 056: 1 | 4\n", + 
"FIFO 057: 1 | 8\n", + "FIFO 058: 28 | 3584\n", + "FIFO 059: 1 | 4\n", + "FIFO 060: 1 | 8\n", + "FIFO 061: 1 | 8\n", + "FIFO 062: 114 | 14592\n", + "FIFO 063: 1 | 8\n", + "FIFO 064: 2 | 16\n", + "FIFO 065: 1 | 8\n", + "FIFO 066: 243 | 31104\n", + "FIFO 067: 1 | 4\n", + "FIFO 068: 2 | 128\n", + "FIFO 069: 1 | 8\n", + "FIFO 070: 1 | 10\n", + "FIFO 071: 1 | 8\n", + "FIFO 072: 1 | 8\n", + "FIFO 073: 4096 | 32768\n", + "FIFO 074: 4096 | 32768\n", + "FIFO 075: 1 | 4\n", + "FIFO 076: 6 | 384\n", + "FIFO 077: 60 | 240\n", + "FIFO 078: 128 | 1024\n", + "FIFO 079: 2 | 16\n", + "FIFO 080: 2 | 16\n", + "FIFO 081: 394 | 50432\n", + "FIFO 082: 1 | 4\n", + "FIFO 083: 1 | 64\n", + "FIFO 084: 15 | 120\n", + "FIFO 085: 15 | 150\n", + "FIFO 086: 16 | 128\n", + "FIFO 087: 16 | 128\n", + "FIFO 088: 4096 | 32768\n", + "FIFO 089: 4096 | 32768\n", + "FIFO 090: 16 | 64\n", + "FIFO 091: 32 | 2048\n", + "FIFO 092: 64 | 256\n", + "FIFO 093: 128 | 1024\n", + "FIFO 094: 32 | 256\n", + "FIFO 095: 2 | 16\n", + "FIFO 096: 394 | 50432\n", + "FIFO 097: 1 | 4\n", + "FIFO 098: 1 | 64\n", + "FIFO 099: 15 | 120\n", + "FIFO 100: 15 | 150\n", + "FIFO 101: 16 | 128\n", + "FIFO 102: 16 | 128\n", + "FIFO 103: 4096 | 32768\n", + "FIFO 104: 4096 | 32768\n", + "FIFO 105: 16 | 64\n", + "FIFO 106: 32 | 2048\n", + "FIFO 107: 64 | 256\n", + "FIFO 108: 128 | 1024\n", + "FIFO 109: 32 | 256\n", + "FIFO 110: 2 | 16\n", + "FIFO 111: 394 | 50432\n", + "FIFO 112: 1 | 4\n", + "FIFO 113: 1 | 64\n", + "FIFO 114: 1 | 8\n", + "FIFO 115: 8 | 80\n", + "FIFO 116: 8 | 64\n", + "FIFO 117: 8 | 32\n", + "FIFO 118: 1 | 4\n", + "FIFO 119: 8 | 32\n", + "FIFO 120: 1 | 8\n", + "FIFO 121: 16 | 2048\n", + "FIFO 122: 8 | 32\n", + "FIFO 123: 1 | 8\n", + "FIFO 124: 8 | 64\n", + "FIFO 125: 121 | 15488\n", + "FIFO 126: 1 | 8\n", + "FIFO 127: 2 | 16\n", + "FIFO 128: 1 | 8\n", + "FIFO 129: 243 | 31104\n", + "FIFO 130: 2 | 8\n", + "FIFO 131: 8 | 512\n", + "FIFO 132: 1 | 8\n", + "FIFO 133: 8 | 80\n", + "FIFO 134: 8 | 64\n", + "FIFO 135: 8 | 64\n", + 
"FIFO 136: 1024 | 8192\n", + "FIFO 137: 8192 | 65536\n", + "FIFO 138: 8 | 32\n", + "FIFO 139: 16 | 1024\n", + "FIFO 140: 4 | 16\n", + "FIFO 141: 8 | 64\n", + "FIFO 142: 2 | 16\n", + "FIFO 143: 2 | 16\n", + "FIFO 144: 512 | 65536\n", + "FIFO 145: 1 | 4\n", + "FIFO 146: 1 | 64\n", + "FIFO 147: 30 | 240\n", + "FIFO 148: 32 | 320\n", + "FIFO 149: 32 | 256\n", + "FIFO 150: 32 | 256\n", + "FIFO 151: 1024 | 8192\n", + "FIFO 152: 8192 | 65536\n", + "FIFO 153: 32 | 128\n", + "FIFO 154: 32 | 2048\n", + "FIFO 155: 32 | 128\n", + "FIFO 156: 32 | 256\n", + "FIFO 157: 2 | 16\n", + "FIFO 158: 2 | 16\n", + "FIFO 159: 512 | 65536\n", + "FIFO 160: 1 | 4\n", + "FIFO 161: 1 | 64\n", + "FIFO 162: 30 | 240\n", + "FIFO 163: 32 | 320\n", + "FIFO 164: 32 | 256\n", + "FIFO 165: 32 | 256\n", + "FIFO 166: 1024 | 8192\n", + "FIFO 167: 8192 | 65536\n", + "FIFO 168: 32 | 128\n", + "FIFO 169: 32 | 2048\n", + "FIFO 170: 32 | 128\n", + "FIFO 171: 32 | 256\n", + "FIFO 172: 2 | 16\n", + "FIFO 173: 2 | 16\n", + "FIFO 174: 512 | 65536\n", + "FIFO 175: 1 | 4\n", + "FIFO 176: 1 | 64\n", + "FIFO 177: 30 | 240\n", + "FIFO 178: 32 | 320\n", + "FIFO 179: 32 | 256\n", + "FIFO 180: 32 | 256\n", + "FIFO 181: 1024 | 8192\n", + "FIFO 182: 8192 | 65536\n", + "FIFO 183: 32 | 128\n", + "FIFO 184: 32 | 2048\n", + "FIFO 185: 32 | 128\n", + "FIFO 186: 32 | 256\n", + "FIFO 187: 2 | 16\n", + "FIFO 188: 2 | 16\n", + "FIFO 189: 512 | 65536\n", + "FIFO 190: 1 | 4\n", + "FIFO 191: 1 | 64\n", + "FIFO 192: 30 | 240\n", + "FIFO 193: 32 | 320\n", + "FIFO 194: 32 | 256\n", + "FIFO 195: 1024 | 8192\n", + "FIFO 196: 32 | 256\n", + "FIFO 197: 32 | 128\n", + "FIFO 198: 8192 | 65536\n", + "FIFO 199: 32 | 2048\n", + "FIFO 200: 32 | 128\n", + "FIFO 201: 32 | 256\n", + "FIFO 202: 2 | 16\n", + "FIFO 203: 2 | 16\n", + "FIFO 204: 512 | 65536\n", + "FIFO 205: 1 | 4\n", + "FIFO 206: 1 | 64\n", + "FIFO 207: 1 | 8\n", + "FIFO 208: 1 | 10\n", + "FIFO 209: 1 | 8\n", + "FIFO 210: 1 | 10\n", + "FIFO 211: 1 | 4\n", + "FIFO 212: 1 | 4\n", + "FIFO 
213: 1 | 4\n", + "FIFO 214: 1 | 8\n", + "FIFO 215: 8 | 1024\n", + "FIFO 216: 1 | 4\n", + "FIFO 217: 1 | 8\n", + "FIFO 218: 2 | 16\n", + "FIFO 219: 121 | 15488\n", + "FIFO 220: 1 | 8\n", + "FIFO 221: 2 | 16\n", + "FIFO 222: 1 | 8\n", + "FIFO 223: 218 | 27904\n", + "FIFO 224: 4 | 16\n", + "FIFO 225: 8 | 512\n", + "FIFO 226: 3 | 24\n", + "FIFO 227: 4 | 40\n", + "FIFO 228: 8 | 64\n", + "FIFO 229: 8 | 64\n", + "FIFO 230: 3696 | 29568\n", + "FIFO 231: 7782 | 62256\n", + "FIFO 232: 8 | 32\n", + "FIFO 233: 64 | 4096\n", + "FIFO 234: 16 | 64\n", + "FIFO 235: 16 | 128\n", + "FIFO 236: 2 | 16\n", + "FIFO 237: 2 | 16\n", + "FIFO 238: 512 | 65536\n", + "FIFO 239: 4 | 16\n", + "FIFO 240: 8 | 512\n", + "FIFO 241: 3 | 24\n", + "FIFO 242: 4 | 40\n", + "FIFO 243: 8 | 64\n", + "FIFO 244: 8 | 64\n", + "FIFO 245: 3696 | 29568\n", + "FIFO 246: 7782 | 62256\n", + "FIFO 247: 8 | 32\n", + "FIFO 248: 64 | 4096\n", + "FIFO 249: 16 | 64\n", + "FIFO 250: 16 | 128\n", + "FIFO 251: 2 | 16\n", + "FIFO 252: 2 | 16\n", + "FIFO 253: 512 | 65536\n", + "FIFO 254: 4 | 16\n", + "FIFO 255: 8 | 512\n", + "FIFO 256: 2 | 16\n", + "FIFO 257: 2 | 20\n", + "FIFO 258: 2 | 16\n", + "FIFO 259: 2 | 20\n", + "FIFO 260: 4 | 80\n", + "FIFO 261: 2 | 40\n", + "FIFO 262: 1 | 16\n", + "FIFO 263: 1 | 20\n", + "FIFO 264: 1 | 21\n", + "FIFO 265: 1 | 16\n" + ] + } + ], + "source": [ + "### Display resulting FIFO depths\n", + "print(\"FIFO DEPTH | SIZE\")\n", + "for fifo, depth in enumerate(fifo_depths):\n", + " size = depth * fifo_info[\"fifo_widths\"][\"StreamingFIFO_hls_%d\" % fifo]\n", + " print(\"FIFO %03d: \"%(fifo) + (\"%d\"%(depth)).rjust(7) + \" | %d\"%(size))" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "64c444f5", + "metadata": {}, + "outputs": [], + "source": [ + "### Export for use in FINN\n", + "fifo_depth_export = {}\n", + "for fifo, depth in enumerate(fifo_depths):\n", + " fifo_depth_export[\"StreamingFIFO_rtl_%d\" % fifo] = {}\n", + " # Try to account for additional registers 
introduced by virtual FIFO HLS implementation\n", + " fifo_depth_export[\"StreamingFIFO_rtl_%d\" % fifo][\"depth\"] = depth + 4\n", + "\n", + "with open(\"fifo_depth_export.json\", \"w\") as f:\n", + " json.dump(fifo_depth_export, f, indent=2)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.4" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} From b394bba7d4603f149e034c82ef296db93fc575f5 Mon Sep 17 00:00:00 2001 From: Felix Jentzsch Date: Wed, 26 Feb 2025 10:25:07 +0000 Subject: [PATCH 043/125] Initialize DVC --- .dvc/.gitignore | 3 +++ .dvc/config | 9 +++++++++ .dvcignore | 4 ++++ requirements.txt | 3 ++- 4 files changed, 18 insertions(+), 1 deletion(-) create mode 100644 .dvc/.gitignore create mode 100644 .dvc/config create mode 100644 .dvcignore diff --git a/.dvc/.gitignore b/.dvc/.gitignore new file mode 100644 index 0000000000..528f30c71c --- /dev/null +++ b/.dvc/.gitignore @@ -0,0 +1,3 @@ +/config.local +/tmp +/cache diff --git a/.dvc/config b/.dvc/config new file mode 100644 index 0000000000..000da4310e --- /dev/null +++ b/.dvc/config @@ -0,0 +1,9 @@ +[core] + remote = public +['remote "push"'] + url = webdavs://uni-paderborn.sciebo.de/public.php/webdav + user = XKrfO8JuRmm9pBo +['remote "public"'] + url = webdavs://uni-paderborn.sciebo.de/public.php/webdav + user = zkYThpsdAk69ZOb + password = "" diff --git a/.dvcignore b/.dvcignore new file mode 100644 index 0000000000..be35ed42ab --- /dev/null +++ b/.dvcignore @@ -0,0 +1,4 @@ +# Add patterns of files dvc should ignore, which could improve +# the performance. 
Learn more at +# https://dvc.org/doc/user-guide/dvcignore +__pycache__ diff --git a/requirements.txt b/requirements.txt index 1683695576..8233f97a54 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,6 +1,7 @@ bitstring==3.1.7 clize==5.0.1 dataclasses-json==0.5.7 +dvc[webdav]~=3.59.1 gspread==3.6.0 importlib-resources==6.1.0 ipython==8.12.2 @@ -11,7 +12,7 @@ onnxruntime==1.18.1 pre-commit==3.3.2 protobuf==3.20.3 psutil==5.9.4 -pyscaffold==4.4 +pyscaffold==4.6 scipy==1.10.1 setupext-janitor>=1.1.2 sigtools==4.0.1 From d8bc10d9f6a86ab1dd6c5eda1d6ba8f71b28f87c Mon Sep 17 00:00:00 2001 From: Felix Jentzsch Date: Wed, 26 Feb 2025 17:49:25 +0000 Subject: [PATCH 044/125] fix metafi, test dvc --- benchmarking/bench-ci.yml | 1 + benchmarking/cfg/metafi_fifosizing_test.json | 4 ++-- benchmarking/cfg/metafi_test.json | 2 +- benchmarking/collect.py | 22 ++++++++++++++++++++ requirements.txt | 3 ++- 5 files changed, 28 insertions(+), 4 deletions(-) diff --git a/benchmarking/bench-ci.yml b/benchmarking/bench-ci.yml index c3c40d4b0e..c7803e27ec 100644 --- a/benchmarking/bench-ci.yml +++ b/benchmarking/bench-ci.yml @@ -51,6 +51,7 @@ Result Collection: - image_build script: - python benchmarking/collect.py bench_artifacts/tasks_output bench_results.json + - dvc exp push -r origin artifacts: name: "bench_results" when: always diff --git a/benchmarking/cfg/metafi_fifosizing_test.json b/benchmarking/cfg/metafi_fifosizing_test.json index f61ec93217..c61d1265fa 100644 --- a/benchmarking/cfg/metafi_fifosizing_test.json +++ b/benchmarking/cfg/metafi_fifosizing_test.json @@ -20,7 +20,7 @@ { "dut": ["metafi"], "model_path": ["/pc2/groups/hpc-prf-ekiapp/felix/ci_models/metafi/model.onnx"], - "folding_path": ["/pc2/groups/hpc-prf-ekiapp/felix/ci_models/metafi/auto_folding_config _metaFi_f25.json"], + "folding_path": ["/pc2/groups/hpc-prf-ekiapp/felix/ci_models/metafi/auto_folding_config_metaFi_f25.json"], "board": ["RFSoC2x2"], "clock_period_ns": [10], @@ -39,7 +39,7 @@ { "dut": 
["metafi"], "model_path": ["/pc2/groups/hpc-prf-ekiapp/felix/ci_models/metafi/model.onnx"], - "folding_path": ["/pc2/groups/hpc-prf-ekiapp/felix/ci_models/metafi/auto_folding_config _metaFi_f25.json"], + "folding_path": ["/pc2/groups/hpc-prf-ekiapp/felix/ci_models/metafi/auto_folding_config_metaFi_f25.json"], "board": ["RFSoC2x2"], "clock_period_ns": [10], diff --git a/benchmarking/cfg/metafi_test.json b/benchmarking/cfg/metafi_test.json index 0ee1339441..6475f1aadd 100644 --- a/benchmarking/cfg/metafi_test.json +++ b/benchmarking/cfg/metafi_test.json @@ -2,7 +2,7 @@ { "dut": ["metafi"], "model_path": ["/pc2/groups/hpc-prf-ekiapp/felix/ci_models/metafi/model.onnx"], - "folding_path": ["/pc2/groups/hpc-prf-ekiapp/felix/ci_models/metafi/auto_folding_config _metaFi_f25.json"], + "folding_path": ["/pc2/groups/hpc-prf-ekiapp/felix/ci_models/metafi/auto_folding_config_metaFi_f25.json"], "board": ["RFSoC2x2"], "clock_period_ns": [10], diff --git a/benchmarking/collect.py b/benchmarking/collect.py index 3bc9aaf04b..ffe2222f73 100644 --- a/benchmarking/collect.py +++ b/benchmarking/collect.py @@ -3,6 +3,7 @@ import os import sys import time +from dvclive import Live def merge_dicts(a: dict, b: dict): for key in b: @@ -79,6 +80,27 @@ def wait_for_power_measurements(): print("Consolidating synthesis results from all sub-jobs of the array") consolidate_logs(sys.argv[1], sys.argv[2]) + # TEST DVC + # TODO: proper metric collection directly from .jsons in report build dir + combined_log = [] + with open(sys.argv[2], "r") as f: + combined_log = json.load(f) + + for run in combined_log: + with Live(exp_message="Job result collected by GitLab CI", cache_images=True) as live: + metadata = { + "run_id": run["run_id"], + "task_id": run["task_id"], + "status": run["status"], + "total_time": run["total_time"] + } + live.log_params(metadata) + live.log_params(run["params"]) + + if "builder" in run["output"]: + for key in run["output"]["builder"]: + live.log_metric("Resources/" + key, 
run["output"]["builder"][key], plot=False) + # TODO: disabled for now, update accordingly to new runner-based measurement setup # wait_for_power_measurements() # power_log_path = os.path.join("/mnt/pfs/hpc-prf-radioml/felix/jobs/", diff --git a/requirements.txt b/requirements.txt index 8233f97a54..c553f637e6 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,7 +1,8 @@ bitstring==3.1.7 clize==5.0.1 dataclasses-json==0.5.7 -dvc[webdav]~=3.59.1 +dvc[webdav]==3.59.1 +dvclive[image]==3.48.2 gspread==3.6.0 importlib-resources==6.1.0 ipython==8.12.2 From 8324083aa297f736cf16996f97c38eb8ef5709c2 Mon Sep 17 00:00:00 2001 From: Felix Jentzsch Date: Thu, 27 Feb 2025 13:49:05 +0000 Subject: [PATCH 045/125] Fix ResNet-50 streamlining --- benchmarking/dut/resnet50_custom_steps.py | 95 ++++++++++++----------- 1 file changed, 48 insertions(+), 47 deletions(-) diff --git a/benchmarking/dut/resnet50_custom_steps.py b/benchmarking/dut/resnet50_custom_steps.py index ddf8b0d0de..90deae5721 100644 --- a/benchmarking/dut/resnet50_custom_steps.py +++ b/benchmarking/dut/resnet50_custom_steps.py @@ -27,75 +27,65 @@ # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+from qonnx.core.datatype import DataType from qonnx.core.modelwrapper import ModelWrapper - +from qonnx.transformation.batchnorm_to_affine import BatchNormToAffine +from qonnx.transformation.composed import ComposedTransformation +from qonnx.transformation.double_to_single_float import DoubleToSingleFloat from qonnx.transformation.fold_constants import FoldConstants - from qonnx.transformation.general import ( - ConvertSubToAdd, + ApplyConfig, ConvertDivToMul, + ConvertSubToAdd, GiveReadableTensorNames, GiveUniqueNodeNames, - SortGraph, - RemoveUnusedTensors, GiveUniqueParameterTensors, RemoveStaticGraphInputs, - ApplyConfig, + RemoveUnusedTensors, + SortGraph, ) +from qonnx.transformation.infer_data_layouts import InferDataLayouts +from qonnx.transformation.infer_datatypes import InferDataTypes +from qonnx.transformation.infer_shapes import InferShapes +from qonnx.transformation.insert_topk import InsertTopK +from qonnx.transformation.lower_convs_to_matmul import LowerConvsToMatMul +from qonnx.transformation.remove import RemoveIdentityOps +import finn.transformation.fpgadataflow.convert_to_hw_layers as to_hw +from finn.builder.build_dataflow_config import DataflowBuildConfig, ShellFlowType +from finn.transformation.move_reshape import RemoveCNVtoFCFlatten from finn.transformation.streamline.absorb import ( - AbsorbScalarMulAddIntoTopK, - AbsorbAddIntoMultiThreshold, - AbsorbMulIntoMultiThreshold, - FactorOutMulSignMagnitude, - Absorb1BitMulIntoMatMul, Absorb1BitMulIntoConv, + Absorb1BitMulIntoMatMul, + AbsorbAddIntoMultiThreshold, AbsorbConsecutiveTransposes, + AbsorbMulIntoMultiThreshold, + AbsorbScalarMulAddIntoTopK, AbsorbTransposeIntoMultiThreshold, + FactorOutMulSignMagnitude, ) - from finn.transformation.streamline.collapse_repeated import ( CollapseRepeatedAdd, CollapseRepeatedMul, ) +# just for not linear from finn.transformation.streamline.reorder import ( + MoveAddPastConv, MoveAddPastMul, - MoveScalarMulPastMatMul, + MoveLinearPastEltwiseAdd, + 
MoveLinearPastFork, + MoveMaxPoolPastMultiThreshold, MoveScalarAddPastMatMul, - MoveAddPastConv, - MoveScalarMulPastConv, MoveScalarLinearPastInvariants, - MoveMaxPoolPastMultiThreshold, + MoveScalarMulPastConv, + MoveScalarMulPastMatMul, + MoveTransposePastEltwise, + MoveTransposePastFork, + MoveTransposePastJoinAdd, ) - from finn.transformation.streamline.round_thresholds import RoundAndClipThresholds from finn.transformation.streamline.sign_to_thres import ConvertSignToThres -from qonnx.transformation.batchnorm_to_affine import BatchNormToAffine - -# just for not linear -from finn.transformation.streamline.reorder import ( - MoveLinearPastEltwiseAdd, - MoveLinearPastFork, -) - -from qonnx.transformation.double_to_single_float import DoubleToSingleFloat -from qonnx.transformation.remove import RemoveIdentityOps -from qonnx.core.datatype import DataType - -from qonnx.transformation.infer_shapes import InferShapes -from qonnx.transformation.infer_datatypes import InferDataTypes -from qonnx.transformation.infer_data_layouts import InferDataLayouts -from qonnx.transformation.insert_topk import InsertTopK -import finn.transformation.fpgadataflow.convert_to_hw_layers as to_hw -from qonnx.transformation.lower_convs_to_matmul import LowerConvsToMatMul - -from finn.builder.build_dataflow_config import ( - DataflowBuildConfig, - ShellFlowType, -) - -from finn.transformation.move_reshape import RemoveCNVtoFCFlatten def step_resnet50_tidy(model: ModelWrapper, cfg: DataflowBuildConfig): @@ -170,6 +160,19 @@ def step_resnet50_streamline(model: ModelWrapper, cfg: DataflowBuildConfig): model = model.transform(DoubleToSingleFloat()) + # Lower convolutions and streamline resulting transposes + model = model.transform(LowerConvsToMatMul()) + model = model.transform( + ComposedTransformation( + [ + MoveTransposePastJoinAdd(), + MoveTransposePastFork(), + MoveTransposePastEltwise(), + AbsorbConsecutiveTransposes(), + AbsorbTransposeIntoMultiThreshold(), + ] + ) + ) return model @@ 
-181,17 +184,15 @@ def step_resnet50_convert_to_hw(model: ModelWrapper, cfg: DataflowBuildConfig): model = model.transform(SortGraph()) to_hw_transformations = [ - to_hw.InferAddStreamsLayer, - LowerConvsToMatMul, to_hw.InferChannelwiseLinearLayer, to_hw.InferPool, - AbsorbTransposeIntoMultiThreshold, + AbsorbConsecutiveTransposes, RoundAndClipThresholds, to_hw.InferQuantizedMatrixVectorActivation, to_hw.InferThresholdingLayer, - AbsorbConsecutiveTransposes, to_hw.InferConvInpGen, to_hw.InferDuplicateStreamsLayer, + to_hw.InferAddStreamsLayer, to_hw.InferLabelSelectLayer, ] for trn in to_hw_transformations: @@ -249,4 +250,4 @@ def step_resnet50_slr_floorplan(model: ModelWrapper, cfg: DataflowBuildConfig): # comment: apply floorplan to model # model = model.transform(ApplyConfig(floorplan)) # print("SLR floorplanning applied from partitioner") - return model \ No newline at end of file + return model From 66a9c6e6e6e01850577e43d535322dc8a6a10add Mon Sep 17 00:00:00 2001 From: Felix Jentzsch Date: Thu, 27 Feb 2025 13:50:04 +0000 Subject: [PATCH 046/125] Remove transformer debug streamlining code --- benchmarking/bench-ci.yml | 4 ++-- benchmarking/dut/transformer_custom_steps.py | 5 ----- 2 files changed, 2 insertions(+), 7 deletions(-) diff --git a/benchmarking/bench-ci.yml b/benchmarking/bench-ci.yml index c7803e27ec..206d395839 100644 --- a/benchmarking/bench-ci.yml +++ b/benchmarking/bench-ci.yml @@ -50,8 +50,8 @@ Result Collection: tags: - image_build script: - - python benchmarking/collect.py bench_artifacts/tasks_output bench_results.json - - dvc exp push -r origin + - python3.10 benchmarking/collect.py bench_artifacts/tasks_output bench_results.json + - dvc exp push origin artifacts: name: "bench_results" when: always diff --git a/benchmarking/dut/transformer_custom_steps.py b/benchmarking/dut/transformer_custom_steps.py index 1a96117e22..4ff497b892 100644 --- a/benchmarking/dut/transformer_custom_steps.py +++ b/benchmarking/dut/transformer_custom_steps.py @@ 
-276,11 +276,6 @@ def step_streamline(model: ModelWrapper, cfg: DataflowBuildConfig): # Note: Contains some sets of nested exhaustive transformations meant for # particular architectural patterns, e.g., residual topologies. model = model.transform(Streamline()) - # DEBUG for streamlining after moving to MoveLinearPastFork with workaround applied - model = model.transform(MoveMulPastAdd()) - model = model.transform(AbsorbMulIntoMultiThreshold()) - model = model.transform(AbsorbAddIntoMultiThreshold()) - model = model.transform(MoveAddPastMul()) # If configured, run a verification of the transformed model on some # sample inputs if VerificationStepType.STREAMLINED_PYTHON in cfg._resolve_verification_steps(): # noqa From c1696d9f82c6506c586cf3fe09cd1fd0cbba39d2 Mon Sep 17 00:00:00 2001 From: Felix Jentzsch Date: Thu, 27 Feb 2025 14:27:13 +0000 Subject: [PATCH 047/125] Enable live fifosizing option --- benchmarking/bench_base.py | 17 +++++++++++++++-- benchmarking/dut/metafi.py | 2 -- benchmarking/dut/resnet50.py | 3 +-- benchmarking/dut/transformer.py | 5 ----- 4 files changed, 16 insertions(+), 11 deletions(-) diff --git a/benchmarking/bench_base.py b/benchmarking/bench_base.py index 9493a12786..a97054aca9 100644 --- a/benchmarking/bench_base.py +++ b/benchmarking/bench_base.py @@ -24,7 +24,8 @@ from finn.analysis.fpgadataflow.hls_synth_res_estimation import hls_synth_res_estimation from finn.analysis.fpgadataflow.res_estimation import res_estimation from finn.transformation.fpgadataflow.make_zynq_proj import collect_ip_dirs -from finn.util.basic import make_build_dir, pynq_native_port_width, part_map +import finn.builder.build_dataflow_config as build_cfg +from finn.util.basic import make_build_dir, pynq_native_port_width, part_map, alveo_default_platform, alveo_part_map from templates import template_open, template_single_test, template_sim_power, template_switching_simulation_tb, zynq_harness_template from util import summarize_table, summarize_section, 
power_xml_to_dict, prepare_inputs, delete_dir_contents from finn.transformation.fpgadataflow.replace_verilog_relpaths import ( @@ -1065,6 +1066,11 @@ def steps_full_build_flow(self): # TODO: set as much as possible here, e.g. verbose, debug, force_python, vitisopt, shell_flow cfg = self.step_build_setup() cfg.board = self.board + if self.board in alveo_part_map: + cfg.shell_flow_type=build_cfg.ShellFlowType.VITIS_ALVEO + cfg.vitis_platform=alveo_default_platform[self.board] + else: + cfg.shell_flow_type=build_cfg.ShellFlowType.VIVADO_ZYNQ cfg.verbose = False cfg.enable_build_pdb_debug = False cfg.force_python_rtlsim = False @@ -1072,10 +1078,17 @@ def steps_full_build_flow(self): #cfg.default_swg_exception #cfg.large_fifo_mem_style - # "manual or "characterize" or "largefifo_rtlsim" + # "manual or "characterize" or "largefifo_rtlsim" or "live" if "fifo_method" in self.params: if self.params["fifo_method"] == "manual": cfg.auto_fifo_depths = False + elif self.params["fifo_method"] == "live": + cfg.auto_fifo_depths = False + cfg.live_fifo_sizing = True + cfg.enable_instrumentation = True + # Overwrite output products + # TODO: make configurable directly via JSON/YAML cfg + cfg.generate_outputs = [build_cfg.DataflowOutputType.BITFILE] else: cfg.auto_fifo_depths = True cfg.auto_fifo_strategy = self.params["fifo_method"] diff --git a/benchmarking/dut/metafi.py b/benchmarking/dut/metafi.py index 7808f11856..b4bd4246b7 100644 --- a/benchmarking/dut/metafi.py +++ b/benchmarking/dut/metafi.py @@ -49,8 +49,6 @@ def step_build_setup(self): steps=steps, target_fps=None, #23 - shell_flow_type=build_cfg.ShellFlowType.VIVADO_ZYNQ, # TODO: generalize/adapt to new back-end - #vitis_platform=vitis_platform, split_large_fifos=True, # probably needed #TODO: account for this in FIFO reduction test diff --git a/benchmarking/dut/resnet50.py b/benchmarking/dut/resnet50.py index 87c6e04e2e..ec03e44a8b 100644 --- a/benchmarking/dut/resnet50.py +++ b/benchmarking/dut/resnet50.py @@ -39,9 
+39,8 @@ def step_build_setup(self): output_dir = self.build_inputs["build_dir"], synth_clk_period_ns = self.clock_period_ns, steps=resnet50_build_steps, - shell_flow_type=build_cfg.ShellFlowType.VITIS_ALVEO, # TODO: generalize/adapt to new back-end + split_large_fifos=True, - vitis_platform=alveo_default_platform[self.board], # TODO: generalize/adapt to new back-end # enable extra performance optimizations (physopt) vitis_opt_strategy=build_cfg.VitisOptStrategy.PERFORMANCE_BEST, diff --git a/benchmarking/dut/transformer.py b/benchmarking/dut/transformer.py index 014da2e13e..91c73bbffe 100644 --- a/benchmarking/dut/transformer.py +++ b/benchmarking/dut/transformer.py @@ -897,10 +897,6 @@ def step_build_setup(self): with open("folding.yaml", "w") as f: f.write(template_folding_yaml) - if self.board in alveo_part_map: - shell_flow = "vitis_alveo" - else: - shell_flow = "vivado_zynq" # Create a configuration for building the scaled dot-product attention # operator to a hardware accelerator @@ -910,7 +906,6 @@ def step_build_setup(self): output_dir = self.build_inputs["build_dir"], stitched_ip_gen_dcp = False, # only needed for further manual integration synth_clk_period_ns = self.clock_period_ns, - shell_flow_type = shell_flow, folding_config_file = "folding.yaml", specialize_layers_config_file = "specialize_layers.json", standalone_thresholds = True, From 01d5551f1d93a6c97760e3a4335018e314f9de6b Mon Sep 17 00:00:00 2001 From: Felix Jentzsch Date: Thu, 27 Feb 2025 15:37:31 +0000 Subject: [PATCH 048/125] Generate FIFO size report as part of step_set_fifo_depths --- src/finn/builder/build_dataflow_steps.py | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/src/finn/builder/build_dataflow_steps.py b/src/finn/builder/build_dataflow_steps.py index fe0cb68a88..ef90cba0b6 100644 --- a/src/finn/builder/build_dataflow_steps.py +++ b/src/finn/builder/build_dataflow_steps.py @@ -656,6 +656,23 @@ def step_set_fifo_depths(model: ModelWrapper, cfg: 
DataflowBuildConfig): model = model.transform(SplitLargeFIFOs()) model = model.transform(RemoveShallowFIFOs()) + # generate a dedicated report about final FIFO sizes + fifo_info = {} + fifo_info["fifo_depths"] = {} + fifo_info["fifo_sizes"] = {} + total_fifo_size = 0 + for node in model.get_nodes_by_op_type("StreamingFIFO_rtl"): + node_inst = getCustomOp(node) + fifo_info["fifo_depths"][node.name] = node_inst.get_nodeattr("depth") + fifo_info["fifo_sizes"][ + node.name + ] = node_inst.get_instream_width() * node_inst.get_nodeattr("depth") + total_fifo_size += fifo_info["fifo_sizes"][node.name] + fifo_info["total_fifo_size_kB"] = int(total_fifo_size / 8.0 / 1000.0) + + with open(cfg.output_dir + "/report/fifo_sizing.json", "w") as f: + json.dump(fifo_info, f, indent=2) + # after FIFOs are ready to go, call PrepareIP and HLSSynthIP again # this will only run for the new nodes (e.g. FIFOs and DWCs) model = model.transform(PrepareIP(cfg._resolve_fpga_part(), cfg._resolve_hls_clk_period())) From 3598501532ede834cf894439bab9793dc49a853f Mon Sep 17 00:00:00 2001 From: Felix Jentzsch Date: Thu, 27 Feb 2025 17:49:47 +0000 Subject: [PATCH 049/125] Add PYNQ driver for ZYNQ platforms --- src/finn/builder/build_dataflow_steps.py | 10 +- .../driver/driver_instrumentation.py | 143 ++++++++++++++++++ .../fpgadataflow/make_pynq_driver.py | 33 +++- 3 files changed, 183 insertions(+), 3 deletions(-) create mode 100644 src/finn/qnn-data/templates/driver/driver_instrumentation.py diff --git a/src/finn/builder/build_dataflow_steps.py b/src/finn/builder/build_dataflow_steps.py index a4481ed778..96f3bd7c63 100644 --- a/src/finn/builder/build_dataflow_steps.py +++ b/src/finn/builder/build_dataflow_steps.py @@ -90,7 +90,10 @@ from finn.transformation.fpgadataflow.insert_dwc import InsertDWC from finn.transformation.fpgadataflow.insert_fifo import InsertFIFO from finn.transformation.fpgadataflow.insert_tlastmarker import InsertTLastMarker -from 
finn.transformation.fpgadataflow.make_pynq_driver import MakePYNQDriver +from finn.transformation.fpgadataflow.make_pynq_driver import ( + MakePYNQDriverIODMA, + MakePYNQDriverInstrumentation, +) from finn.transformation.fpgadataflow.make_zynq_proj import ZynqBuild from finn.transformation.fpgadataflow.minimize_accumulator_width import ( MinimizeAccumulatorWidth, @@ -782,7 +785,10 @@ def step_make_pynq_driver(model: ModelWrapper, cfg: DataflowBuildConfig): if DataflowOutputType.PYNQ_DRIVER in cfg.generate_outputs: driver_dir = cfg.output_dir + "/driver" - model = model.transform(MakePYNQDriver(cfg._resolve_driver_platform())) + if cfg.enable_instrumentation: + model = model.transform(MakePYNQDriverInstrumentation(cfg._resolve_driver_platform(), cfg.synth_clk_period_ns)) + else: + model = model.transform(MakePYNQDriverIODMA(cfg._resolve_driver_platform())) shutil.copytree(model.get_metadata_prop("pynq_driver_dir"), driver_dir, dirs_exist_ok=True) print("PYNQ Python driver written into " + driver_dir) return model diff --git a/src/finn/qnn-data/templates/driver/driver_instrumentation.py b/src/finn/qnn-data/templates/driver/driver_instrumentation.py new file mode 100644 index 0000000000..fea9446bf5 --- /dev/null +++ b/src/finn/qnn-data/templates/driver/driver_instrumentation.py @@ -0,0 +1,143 @@ +import time +import json +import argparse +import matplotlib as mpl +import matplotlib.pyplot as plt +from IPython.display import clear_output +import numpy as np +from pynq import Overlay +from pynq.ps import Clocks +from pynq.pl_server.device import Device + +### Instrumentation wrapper register map ### +#ap_uint<32> cfg, // [0] - 0:hold, 1:lfsr; [31:16] - LFSR seed +#ap_uint<32> &status, // [0] - timestamp overflow; [1] - timestamp underflow +#ap_uint<32> &latency, +#ap_uint<32> &interval, +#ap_uint<32> &checksum, +#ap_uint<32> &min_latency + +class FINNInstrumentationOverlay(Overlay): + def __init__( + self, + bitfile_name, + platform = "zynq", + fclk_mhz = 100.0, + 
device = None, + download = True, + seed = 1, + ): + super().__init__(bitfile_name, download=download, device=device) + + self.platform = platform + self.fclk_mhz = fclk_mhz + self.seed = seed + + # configure clock (for ZYNQ platforms) + if self.platform == "zynq": + if self.fclk_mhz > 0: + Clocks.fclk0_mhz = self.fclk_mhz + self.fclk_mhz_actual = Clocks.fclk0_mhz + + def instrumentation_read(self, name): + return self.instrumentation_wrap_0.read(offset=self.ip_dict["instrumentation_wrap_0"]["registers"][name]["address_offset"]) + + def instrumentation_write(self, name, value): + return self.instrumentation_wrap_0.write(offset=self.ip_dict["instrumentation_wrap_0"]["registers"][name]["address_offset"], value=value) + + def reset_accelerator(self): + self.axi_gpio_0.write(offset=self.ip_dict["axi_gpio_0"]["registers"]["GPIO_DATA"]["address_offset"], value=0) + + def start_accelerator(self): + lfsr_seed = (self.seed << 16) & 0xffff0000 # upper 16 bits + self.instrumentation_write("cfg", lfsr_seed + 1) # start operation + + def observe_instrumentation(self, debug_print=True): + status_reg = self.instrumentation_read("status") + chksum_reg = self.instrumentation_read("checksum") + min_latency = self.instrumentation_read("min_latency") + latency = self.instrumentation_read("latency") + interval = self.instrumentation_read("interval") + + frame = (chksum_reg >> 24) & 0x000000ff + checksum = chksum_reg & 0x00ffffff + overflow_err = (status_reg & 0x00000001) != 0 + underflow_err = (status_reg & 0x00000002) != 0 + + if debug_print: + print("---INSTRUMENTATION_REPORT---") + if overflow_err or underflow_err: + print("Status ERROR") + print("Overflow error: %s" % overflow_err) + print("Underflow error: %s" % underflow_err) + else: + print("Status OK") + print("Frame number (8-bit): %d" % frame) + print("Checksum: 0x%06x" % checksum) + print("Min Latency (cycles): %d" % min_latency) + print("Latency (cycles): %d" % latency) + print("Interval (cycles): %d" % interval) + 
print("----------------------------") + + return (overflow_err, underflow_err, frame, checksum, min_latency, latency, interval) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description='Profile performance of FINN-generated accelerator using instrumentation wrapper') + parser.add_argument('--runtime', help='Runtime in seconds', type=int, default=10) + parser.add_argument('--frequency', help='FPGA clock frequency in MHz', type=float, default=100.0) + parser.add_argument('--seed', help='LFSR seed for input data generation', type=int, default=1) + parser.add_argument('--device', help='FPGA device to be used', type=int, default=0) + parser.add_argument('--bitfile', help='Name of bitfile', default="finn-accel.bit") + parser.add_argument('--reportfile', help='Name of output .json report file', type=str, default="measured_performance.json") + parser.add_argument('--settingsfile', help='Name of optional input .json settings file', type=str, default="") + # parse arguments + args = parser.parse_args() + runtime = args.runtime + frequency = args.frequency + seed = args.seed + bitfile = args.bitfile + reportfile = args.reportfile + settingsfile = args.settingsfile + devID = args.device + device = Device.devices[devID] + + # overwrite frequency if specified in settings file + if settingsfile != "": + with open(settingsfile, "r") as f: + settings = json.load(f) + if "fclk_mhz" in settings: + frequency = settings["fclk_mhz"] + + # instantiate FINN accelerator driver and pass batchsize and bitfile + print("Programming FPGA..") + accel = FINNInstrumentationOverlay(bitfile_name = bitfile, device = device, fclk_mhz = frequency, seed = seed) + + # start accelerator + print("Running accelerator..") + accel.start_accelerator() + + # let it run for specified runtime + time.sleep(runtime) + + # read measurement from instrumentation + (overflow_err, underflow_err, frame, checksum, min_latency, latency, interval) = accel.observe_instrumentation() + + # write report to 
file + report = { + "error": overflow_err or underflow_err or interval == 0, + "checksum": checksum, + "min_latency_cycles": min_latency, + "latency_cycles": latency, + "interval_cycles": interval, + "frequency_mhz": round(accel.fclk_mhz_actual), + "min_latency_ms": round(min_latency * (1 / (accel.fclk_mhz_actual * 1e6)) * 1e3, 6), + "latency_ms": round(latency * (1 / (accel.fclk_mhz_actual * 1e6)) * 1e3, 6), + "throughput_fps": round(1 / (interval * (1 / (accel.fclk_mhz_actual * 1e6)))), + "min_pipeline_depth": round(min_latency / interval, 2), + "pipeline_depth" : round(latency / interval, 2), + } + with open(reportfile, "w") as f: + json.dump(report, f, indent=2) + + print("Done.") diff --git a/src/finn/transformation/fpgadataflow/make_pynq_driver.py b/src/finn/transformation/fpgadataflow/make_pynq_driver.py index ea9bd2aa26..b935f5eea0 100644 --- a/src/finn/transformation/fpgadataflow/make_pynq_driver.py +++ b/src/finn/transformation/fpgadataflow/make_pynq_driver.py @@ -28,6 +28,7 @@ import numpy as np import os +import json import qonnx import shutil import warnings @@ -62,7 +63,7 @@ def to_external_tensor(init, w_dtype): return ext_weight -class MakePYNQDriver(Transformation): +class MakePYNQDriverIODMA(Transformation): """Create PYNQ Python code to correctly interface the generated accelerator, including data packing/unpacking. 
Should be called after conversion to HLS layers, folding and the creation of @@ -302,4 +303,34 @@ def apply(self, model): else: continue + +class MakePYNQDriverInstrumentation(Transformation): + def __init__(self, platform, clk_period_ns): + super().__init__() + self.platform = platform + self.clk_period_ns = clk_period_ns + + def apply(self, model): + # TODO: support runtime-writable and external weights + # TODO: support Alveo and Versal platforms + + # create a temporary folder for the generated driver + pynq_driver_dir = make_build_dir(prefix="pynq_driver_") + model.set_metadata_prop("pynq_driver_dir", pynq_driver_dir) + + # create (copy) the static instrumentation driver + driver_template = ( + os.environ["FINN_ROOT"] + "/src/finn/qnn-data/templates/driver/driver_instrumentation.py" + ) + driver_py = pynq_driver_dir + "/driver.py" + shutil.copy(driver_template, driver_py) + + # write default settings to driver config file + settings = { + "fclk_mhz": (1.0 / self.clk_period_ns) * 1e3, + } + settingsfile = pynq_driver_dir + "/settings.json" + with open(settingsfile, "w") as f: + json.dump(settings, f, indent=2) + return (model, False) From f32e884b81ba4f916a96a80b8d30b0bf44b8613a Mon Sep 17 00:00:00 2001 From: Felix Jentzsch Date: Thu, 27 Feb 2025 21:09:27 +0000 Subject: [PATCH 050/125] Add non-interactive driver --- src/finn/builder/build_dataflow_steps.py | 2 +- .../templates/driver/driver_fifosizing.py | 320 ++++++++++++++++++ .../fpgadataflow/make_pynq_driver.py | 24 +- 3 files changed, 343 insertions(+), 3 deletions(-) create mode 100644 src/finn/qnn-data/templates/driver/driver_fifosizing.py diff --git a/src/finn/builder/build_dataflow_steps.py b/src/finn/builder/build_dataflow_steps.py index dd50e8880f..2f05886afd 100644 --- a/src/finn/builder/build_dataflow_steps.py +++ b/src/finn/builder/build_dataflow_steps.py @@ -826,7 +826,7 @@ def step_make_pynq_driver(model: ModelWrapper, cfg: DataflowBuildConfig): if DataflowOutputType.PYNQ_DRIVER in 
cfg.generate_outputs: driver_dir = cfg.output_dir + "/driver" if cfg.enable_instrumentation: - model = model.transform(MakePYNQDriverInstrumentation(cfg._resolve_driver_platform(), cfg.synth_clk_period_ns)) + model = model.transform(MakePYNQDriverInstrumentation(cfg._resolve_driver_platform(), cfg.synth_clk_period_ns, cfg.live_fifo_sizing)) else: model = model.transform(MakePYNQDriverIODMA(cfg._resolve_driver_platform())) shutil.copytree(model.get_metadata_prop("pynq_driver_dir"), driver_dir, dirs_exist_ok=True) diff --git a/src/finn/qnn-data/templates/driver/driver_fifosizing.py b/src/finn/qnn-data/templates/driver/driver_fifosizing.py new file mode 100644 index 0000000000..560959991f --- /dev/null +++ b/src/finn/qnn-data/templates/driver/driver_fifosizing.py @@ -0,0 +1,320 @@ +import time +import json +import os +import argparse +import matplotlib as mpl +import matplotlib.pyplot as plt +import numpy as np +from pynq.pl_server.device import Device + +from driver_instrumentation import FINNInstrumentationOverlay + + +class FINNLiveFIFOOverlay(FINNInstrumentationOverlay): + def __init__( + self, + bitfile_name, + platform = "zynq", + fclk_mhz = 100.0, + device = None, + download = True, + seed = 1, + fifo_widths = {}, + ): + super().__init__(bitfile_name, platform = platform, fclk_mhz = fclk_mhz, seed = seed, download = download, device = device) + + self.error = False + self.fifo_widths = fifo_widths + self.num_fifos = len(self.fifo_widths) + # Try to account for additional registers introduced by virtual FIFO HLS implementation + self.fifo_depth_offset = 4 + + # Sanity check + # We expect 3 AXI-Lite peripherals next to the virtual FIFOs: instrumentation_wrap_0, axi_gpio_0 (for reset), zynq_ps + # We don't expect any additional FINN SDPs with AXI-Lite interface, such as runtime-writable weights + if (len(self.ip_dict.keys()) - 3) != self.num_fifos: + self.error = True + + def configure_fifo(self, i, mode, depth = 2): + ### Virtual FIFO register map ### + 
mode_offset = 0x10 + depth_offset = 0x18 + occupancy_offset = 0x20 + occupancy_ctrl_offset = 0x24 + max_occupancy_offset = 0x30 + max_occupancy_ctrl_offset = 0x34 + + ip_name = "StreamingDataflowPartition_%d" % i + getattr(self, ip_name).write(offset=mode_offset, value = mode) + getattr(self, ip_name).write(offset=depth_offset, value = depth) + + def total_fifo_size(self, depths): + # Assuming FIFO SDP/AXI-Lite interfaces are ordered consistently with FIFO IDs + total_size_bits = 0 + for i, depth in enumerate(depths): + total_size_bits += (depth + self.fifo_depth_offset) * self.fifo_widths["StreamingFIFO_hls_%d" % i] + total_size_kB = total_size_bits / 8.0 / 1000.0 + return total_size_kB + + def size_iteratively(self, start_depth, iteration_runtime, reduction_factor = 0.5): + ### Iterative FIFO-sizing function ### + fifo_minimum_reached = [False] * self.num_fifos + + if isinstance(start_depth, list): + # Individual start depth for each FIFO has been supplied + fifo_depths = start_depth + else: + # Initialize all depths to the same start depth + fifo_depths = [start_depth] * self.num_fifos + + # Reset accelerator and configure FIFOs + self.reset_accelerator() + for i in range(0, self.num_fifos): + self.configure_fifo(i, mode = 1, depth = fifo_depths[i]) + + # Run once to determine target interval + self.start_accelerator() + time.sleep(1) + (overflow_err, underflow_err, frame, checksum, min_latency, latency, interval) = self.observe_instrumentation(False) + log_total_fifo_size = [int(self.total_fifo_size(fifo_depths))] + log_interval = [interval] + log_min_latency = [min_latency] + log_latency = [latency] + target_interval = interval + + # Iteratively reduce FIFO depth until all FIFOs are minimized + iteration = 0 + start_time = time.time() + while not all(fifo_minimum_reached): + for fifo_id in range(0, self.num_fifos): + if not fifo_minimum_reached[fifo_id]: + fifo_depth_before = fifo_depths[fifo_id] + fifo_depths[fifo_id] = int(fifo_depths[fifo_id] * 
reduction_factor) + + # Reset accelerator + self.reset_accelerator() + + # Configure all FIFOs + for i in range(0, self.num_fifos): + self.configure_fifo(i, mode = 1, depth = fifo_depths[i]) + + # Start accelerator + self.start_accelerator() + + # Let it run + time.sleep(iteration_runtime) + + # Check if throughput dropped or deadlock occured + (overflow_err, underflow_err, frame, checksum, min_latency, latency, interval) = self.observe_instrumentation(False) + + if interval > target_interval or interval == 0 or overflow_err or underflow_err: + # Revert depth reduction and mark FIFO as minimized + fifo_depths[fifo_id] = fifo_depth_before + fifo_minimum_reached[fifo_id] = True + else: + log_total_fifo_size.append(int(self.total_fifo_size(fifo_depths))) + log_interval.append(interval) + log_min_latency.append(min_latency) + log_latency.append(latency) + + if fifo_depths[fifo_id] == 1: + fifo_minimum_reached[fifo_id] = True + + # Report status + print("Iteration: %d" % iteration) + print("Numer of minimized FIFOs: %d/%d" % (sum(fifo_minimum_reached), self.num_fifos)) + print("Interval: %d" % log_interval[-1]) + print("Min. 
latency / latency: %d/%d" % (log_min_latency[-1], log_latency[-1])) + print("Total FIFO Size (kB): %d" % log_total_fifo_size[-1]) + + iteration += 1 + + end_time = time.time() + duration = int(end_time - start_time) + print("Done (%d seconds)" % duration) + + return fifo_depths, log_total_fifo_size, log_interval, log_min_latency, log_latency, duration + + def determine_start_depth(self, ): + ### Attempt to determine start depth for all FIFOs automatically ### + # If it doesn't find a working setting, start depth must be set manually, potentially on per-FIFO basis + start_depth = 64 + last_interval = 0 + start_depth_found = False + + while not start_depth_found and not self.error: + print("Testing start depth of %d" % start_depth) + self.reset_accelerator() + + # Configure FIFOs + for i in range(0, self.num_fifos): + self.configure_fifo(i, mode = 1, depth = start_depth) + + # Start accelerator and let it run for a long time + self.start_accelerator() + time.sleep(1) + + # Examine performance + (overflow_err, underflow_err, frame, checksum, min_latency, latency, interval) = self.observe_instrumentation() + if interval > 0 and interval == last_interval and not overflow_err and not underflow_err: + # Accelerator runs with stable interval, reset to previous start depth + start_depth_found = True + start_depth = last_start_depth + else: + # Start depth is still too small, increase for next try + last_start_depth = start_depth + start_depth = start_depth * 2 + + last_interval = interval + + if start_depth > 1000000: + print("Couldn't find a working start depth, please set manually") + self.error = True + + # Determine runtime per iteration based on performance, so that stable-state is guaranteed + # Use a simple overestimation for now to be safe + iteration_runtime = max(0.01, (min_latency * 5) * 10 / 1000 / 1000 / 1000) + + print("Determined start depth for all FIFOs: %d" % start_depth) + print("Determined iteration runtime based on performance: %f s" % 
iteration_runtime) + return (start_depth, iteration_runtime) + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description='Profile performance of FINN-generated accelerator using instrumentation wrapper') + parser.add_argument('--runtime', help='Runtime in seconds', type=int, default=10) + parser.add_argument('--frequency', help='FPGA clock frequency in MHz', type=float, default=100.0) + parser.add_argument('--seed', help='LFSR seed for input data generation', type=int, default=1) + parser.add_argument('--device', help='FPGA device to be used', type=int, default=0) + parser.add_argument('--bitfile', help='Name of bitfile', default="finn-accel.bit") + parser.add_argument('--reportfile', help='Name of output .json report file', type=str, default="measured_performance.json") + parser.add_argument('--settingsfile', help='Name of optional input .json settings file', type=str, default="") + # parse arguments + args = parser.parse_args() + runtime = args.runtime + frequency = args.frequency + seed = args.seed + bitfile = args.bitfile + reportfile = args.reportfile + report_dir = os.path.dirname(reportfile) + settingsfile = args.settingsfile + devID = args.device + device = Device.devices[devID] + + # overwrite frequency if specified in settings file + if settingsfile != "": + with open(settingsfile, "r") as f: + settings = json.load(f) + if "fclk_mhz" in settings: + frequency = settings["fclk_mhz"] + + # For live FIFO-sizing, we also expect a fifo_widths.json file exported by FINN listing the width of each FIFO, e.g., + # {'fifo_widths': {'StreamingFIFO_hls_0': 8, 'StreamingFIFO_hls_1': 32, 'StreamingFIFO_hls_2': 24}} + fifo_widths = settings["fifo_widths"] + + + print("Programming FPGA..") + accel = FINNLiveFIFOOverlay(bitfile_name = bitfile, device = device, fclk_mhz = frequency, seed = seed, fifo_widths = fifo_widths) + + (start_depth, iteration_runtime) = accel.determine_start_depth() + + ### First pass + print("Starting first pass..") + pass1_result = 
accel.size_iteratively(start_depth, iteration_runtime) + (fifo_depths, + log_total_fifo_size, + log_interval, + log_min_latency, + log_latency, + duration) = pass1_result + + ### Visualize results and save as "fifo_sizing_graph.png" + fig, ax1 = plt.subplots() + + color = 'tab:red' + ax1.set_xlabel('Iteration') + ax1.set_ylabel('Total FIFO Size [kB]', color=color) + ax1.plot(range(len(log_total_fifo_size)), log_total_fifo_size, color=color) + ax1.tick_params(axis='y', labelcolor=color) + ax1.set_ylim(0, max(log_total_fifo_size)) + + ax2 = ax1.twinx() # instantiate a second axes that shares the same x-axis + + color = 'tab:blue' + ax2.set_ylabel('Latency [cycles]', color=color) + ax2.plot(range(len(log_total_fifo_size)), log_latency, color=color) + ax2.tick_params(axis='y', labelcolor=color) + #ax2.set_ylim(0, max(log_latency)) + + ax2.axhline(log_min_latency[0], color="green", label="Minimum (1st frame) Latency") + ax2.legend() + + plt.tight_layout() + plt.savefig(os.path.join(report_dir, "fifo_sizing_graph.png"), dpi = 300) + + ### Second pass for fine-tuning + print("Starting second pass..") + pass2_result = accel.size_iteratively(fifo_depths, iteration_runtime, reduction_factor = 0.95) + (fifo_depths, + log_total_fifo_size, + log_interval, + log_min_latency, + log_latency, + duration) = pass2_result + + ### Generate fifo_sizing_report.json + fifo_report = { + "error": accel.error, + "fifo_size_total_kB": log_total_fifo_size[-1], + "fifo_depths": {}, + "fifo_sizes": {}, + "pass_1": { + "duration": pass1_result[5], + "log_total_fifo_size": pass1_result[1], + "log_interval": pass1_result[2], + "log_min_latency": pass1_result[3], + "log_latency": pass1_result[4], + }, + "pass_2": { + "duration": pass2_result[5], + "log_total_fifo_size": pass2_result[1], + "log_interval": pass2_result[2], + "log_min_latency": pass2_result[3], + "log_latency": pass2_result[4], + }, + } + for fifo, depth in enumerate(fifo_depths): + size = (depth + accel.fifo_depth_offset) * 
accel.fifo_widths["StreamingFIFO_hls_%d" % fifo] + fifo_report["fifo_depths"][fifo] = depth + accel.fifo_depth_offset + fifo_report["fifo_sizes"][fifo] = size + with open(os.path.join(report_dir, "fifo_sizing_report.json"), "w") as f: + json.dump(fifo_report, f, indent=2) + + ### Generate fifo_depth_export.json to export FIFO depths for use in FINN + fifo_depth_export = {} + for fifo, depth in enumerate(fifo_depths): + fifo_depth_export["StreamingFIFO_rtl_%d" % fifo] = {} + fifo_depth_export["StreamingFIFO_rtl_%d" % fifo]["depth"] = depth + accel.fifo_depth_offset + with open(os.path.join(report_dir, "fifo_depth_export.json"), "w") as f: + json.dump(fifo_depth_export, f, indent=2) + + ### Generate the usual instrumentation performance report based on final state + min_latency = log_min_latency[-1] + latency = log_latency[-1] + interval = log_interval[-1] + report = { + "error": accel.error, + "checksum": 0, + "min_latency_cycles": min_latency, + "latency_cycles": latency, + "interval_cycles": interval, + "frequency_mhz": round(accel.fclk_mhz_actual), + "min_latency_ms": round(min_latency * (1 / (accel.fclk_mhz_actual * 1e6)) * 1e3, 6), + "latency_ms": round(latency * (1 / (accel.fclk_mhz_actual * 1e6)) * 1e3, 6), + "throughput_fps": round(1 / (interval * (1 / (accel.fclk_mhz_actual * 1e6)))), + "min_pipeline_depth": round(min_latency / interval, 2), + "pipeline_depth" : round(latency / interval, 2), + } + with open(reportfile, "w") as f: + json.dump(report, f, indent=2) + + print("Done.") \ No newline at end of file diff --git a/src/finn/transformation/fpgadataflow/make_pynq_driver.py b/src/finn/transformation/fpgadataflow/make_pynq_driver.py index b935f5eea0..93c0e45e6c 100644 --- a/src/finn/transformation/fpgadataflow/make_pynq_driver.py +++ b/src/finn/transformation/fpgadataflow/make_pynq_driver.py @@ -305,10 +305,11 @@ def apply(self, model): class MakePYNQDriverInstrumentation(Transformation): - def __init__(self, platform, clk_period_ns): + def __init__(self, 
platform, clk_period_ns, live_fifo_sizing): super().__init__() self.platform = platform self.clk_period_ns = clk_period_ns + self.live_fifo_sizing = live_fifo_sizing def apply(self, model): # TODO: support runtime-writable and external weights @@ -322,13 +323,32 @@ def apply(self, model): driver_template = ( os.environ["FINN_ROOT"] + "/src/finn/qnn-data/templates/driver/driver_instrumentation.py" ) - driver_py = pynq_driver_dir + "/driver.py" + if self.live_fifo_sizing: + driver_py = pynq_driver_dir + "/driver_instrumentation.py" + else: + driver_py = pynq_driver_dir + "/driver.py" shutil.copy(driver_template, driver_py) + # add-on driver for live fifosizing + if self.live_fifo_sizing: + driver_template = ( + os.environ["FINN_ROOT"] + "/src/finn/qnn-data/templates/driver/driver_fifosizing.py" + ) + driver_py = pynq_driver_dir + "/driver.py" + shutil.copy(driver_template, driver_py) + # write default settings to driver config file settings = { "fclk_mhz": (1.0 / self.clk_period_ns) * 1e3, } + if self.live_fifo_sizing: + # export FIFO widths to the settings file as well + fifo_widths = {} + for node in model.get_nodes_by_op_type("StreamingFIFO_hls"): + node_inst = getCustomOp(node) + fifo_widths[node.name] = node_inst.get_instream_width() + settings["fifo_widths"] = fifo_widths + settingsfile = pynq_driver_dir + "/settings.json" with open(settingsfile, "w") as f: json.dump(settings, f, indent=2) From 0bc66389add4249f00f055772e1c39993197331e Mon Sep 17 00:00:00 2001 From: Felix Jentzsch Date: Thu, 27 Feb 2025 21:13:58 +0000 Subject: [PATCH 051/125] DVC push fix --- benchmarking/bench-ci.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/benchmarking/bench-ci.yml b/benchmarking/bench-ci.yml index 206d395839..f62f2eb35a 100644 --- a/benchmarking/bench-ci.yml +++ b/benchmarking/bench-ci.yml @@ -51,7 +51,7 @@ Result Collection: - image_build script: - python3.10 benchmarking/collect.py bench_artifacts/tasks_output bench_results.json - - dvc exp push 
origin + - dvc exp push git@github.com:eki-project/finn-plus.git artifacts: name: "bench_results" when: always From cd66c9211c1a32159cdfb5da47f4b11d2990ea97 Mon Sep 17 00:00:00 2001 From: Felix Jentzsch Date: Fri, 28 Feb 2025 11:14:27 +0000 Subject: [PATCH 052/125] Refactor and remove old code --- .gitlab-ci.yml | 7 +- benchmarking/bench_base.py | 567 +----------------- benchmarking/cfg/fifosizing_test.json | 23 - benchmarking/cfg/metafi_fifosizing_test.json | 57 -- benchmarking/cfg/metafi_test.json | 4 +- benchmarking/cfg/mvau_test.json | 4 +- .../cfg/resnet50_fifosizing_test.json | 66 -- benchmarking/cfg/resnet50_test.json | 18 +- benchmarking/cfg/synthetic_fifotest.json | 64 ++ benchmarking/cfg/transformer_gpt_all.json | 26 +- benchmarking/cfg/transformer_radioml_all.json | 14 +- benchmarking/cfg/transformer_test.json | 5 +- benchmarking/collect.py | 10 +- benchmarking/dut/metafi.py | 14 - benchmarking/dut/resnet50.py | 13 - benchmarking/dut/synthetic_nonlinear.py | 11 - benchmarking/dut/transformer.py | 42 +- benchmarking/harness/sink/ip/component.xml | 256 -------- .../harness/sink/ip/src/harness_sink.v | 39 -- .../sink/ip/xgui/harness_sink_v1_0.tcl | 25 - benchmarking/harness/vector_xor.v | 32 - src/finn/builder/build_dataflow.py | 5 +- src/finn/builder/build_dataflow_config.py | 6 +- 23 files changed, 142 insertions(+), 1166 deletions(-) delete mode 100644 benchmarking/cfg/fifosizing_test.json delete mode 100644 benchmarking/cfg/metafi_fifosizing_test.json delete mode 100644 benchmarking/cfg/resnet50_fifosizing_test.json create mode 100644 benchmarking/cfg/synthetic_fifotest.json delete mode 100644 benchmarking/harness/sink/ip/component.xml delete mode 100644 benchmarking/harness/sink/ip/src/harness_sink.v delete mode 100644 benchmarking/harness/sink/ip/xgui/harness_sink_v1_0.tcl delete mode 100644 benchmarking/harness/vector_xor.v diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index c19da1d908..a82ad24eeb 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ 
-211,14 +211,9 @@ Bench: PARENT_PIPELINE_ID: $CI_PIPELINE_ID parallel: matrix: - - BENCH_CFG: [mvau_test, resnet50_test, metafi_test, transformer_test, transformer_radioml_all] - -#dev: mvau_test -#fifo: fifosizing_test, metafi_fifosizing_test, resnet50_fifosizing_test -#transformer: transformer_test, transformer_radioml_all + - BENCH_CFG: [mvau_test, resnet50_test, metafi_test, transformer_test, transformer_radioml_all, synthetic_fifotest] #TODO: add selector for none, reduced, full benchmark suite - #TODO: introduce result collect job on parent level for easier visualization/excel interfacing #TODO: more control via (optional) variables #TODO: move power measurement from polling-based script to its own job/runner diff --git a/benchmarking/bench_base.py b/benchmarking/bench_base.py index a97054aca9..636af6bb5e 100644 --- a/benchmarking/bench_base.py +++ b/benchmarking/bench_base.py @@ -42,422 +42,6 @@ import pandas as pd import onnxruntime as ort -class MakeZYNQHarnessProject(Transformation): - """Based on MakeZYNQProject transformation, but integrates IP into test harness instead of DMA shell.""" - - def __init__(self, platform, output_dir, dut_duplication=1, clock_period_ns=10): - super().__init__() - self.platform = platform - self.output_dir = output_dir - self.dut_duplication = dut_duplication - self.clock_period_ns = clock_period_ns - - def apply(self, model): - # create a config file and empty list of xo files - config = [] - idma_idx = 0 - odma_idx = 0 - aximm_idx = 0 - axilite_idx = 0 - global_clk_ns = 0 - - # assume single stitched-ip (previously dataflowpartition) as DUT - # assume single primary input/output - input_tensor = model.graph.input[0] - output_tensor = model.graph.output[0] - input_node_inst = getCustomOp(model.find_consumer(input_tensor.name)) - output_node_inst = getCustomOp(model.find_producer(output_tensor.name)) - instream_width = input_node_inst.get_instream_width_padded() - outstream_width = 
output_node_inst.get_outstream_width_padded() - - # assert node.op_type == "StreamingDataflowPartition", "Invalid link graph" - # sdp_node = getCustomOp(node) - # dataflow_model_filename = sdp_node.get_nodeattr("model") - # kernel_model = ModelWrapper(dataflow_model_filename) - kernel_model = model - - ipstitch_path = kernel_model.get_metadata_prop("vivado_stitch_proj") - if ipstitch_path is None or (not os.path.isdir(ipstitch_path)): - raise Exception("No stitched IPI design found, apply CreateStitchedIP first.") - - vivado_stitch_vlnv = kernel_model.get_metadata_prop("vivado_stitch_vlnv") - if vivado_stitch_vlnv is None: - raise Exception("No vlnv found, apply CreateStitchedIP first.") - - ip_dirs = ["list"] - ip_dirs += collect_ip_dirs(kernel_model, ipstitch_path) - ip_dirs.append("$::env(FINN_ROOT)/benchmarking/harness/sink/ip") - ip_dirs_str = "[%s]" % (" ".join(ip_dirs)) - config.append( - "set_property ip_repo_paths " - "[concat [get_property ip_repo_paths [current_project]] %s] " - "[current_project]" % ip_dirs_str - ) - config.append("update_ip_catalog -rebuild -scan_changes") - config.append( - "import_files -fileset sources_1 -norecurse $::env(FINN_ROOT)/benchmarking/harness/vector_xor.v" - ) - - # get metadata property clk_ns to calculate clock frequency - clk_ns = float(kernel_model.get_metadata_prop("clk_ns")) - if clk_ns > global_clk_ns: - global_clk_ns = clk_ns - - ifnames = eval(kernel_model.get_metadata_prop("vivado_stitch_ifnames")) - - # instantiate DUT, TODO: switch to wrapper verilog file for (multiple-) DUT instantiation - for id in range(self.dut_duplication): - dut_instance_name = "finn_design_%d" % id - config.append( - "create_bd_cell -type ip -vlnv %s %s" % (vivado_stitch_vlnv, dut_instance_name) - ) - # sdp_node.set_nodeattr("instance_name", instance_names[node.name]) - config.append( - "connect_bd_net [get_bd_pins %s/ap_clk] [get_bd_pins axi_interconnect_0/aclk]" - % dut_instance_name - ) - config.append( - "connect_bd_net [get_bd_pins 
%s/ap_rst_n] [get_bd_pins axi_interconnect_0/aresetn]" - % dut_instance_name - ) - - # instantiate input harness - if instream_width > 8192: - print("ERROR: DUT input stream width > 8192") - raise Exception("ERROR: DUT input stream width > 8192") - elif instream_width > 4096: - num_sources = 8 - source_width = roundup_to_integer_multiple(instream_width / 8, 8) - elif instream_width > 2048: - num_sources = 4 - source_width = roundup_to_integer_multiple(instream_width / 4, 8) - elif instream_width > 1024: - num_sources = 2 - source_width = roundup_to_integer_multiple(instream_width / 2, 8) - else: - num_sources = 1 - source_width = instream_width - - if self.dut_duplication > 1: - if num_sources > 1: - print("ERROR: DUT duplication with >1024 stream width not supported!") - raise Exception("ERROR: DUT duplication with >1024 stream width not supported!") - - num_sources = self.dut_duplication # one source per DUT instance - seed = 0xABCD - for id in range(num_sources): - config.append( - "create_bd_cell -type ip -vlnv xilinx.com:ip:axi_traffic_gen:3.0 axi_traffic_gen_%d" - % id - ) - config.append( - "set_property -dict [list \ - CONFIG.C_ATG_MODE {AXI4-Stream} \ - CONFIG.C_ATG_STREAMING_MAX_LEN_BITS {1} \ - CONFIG.C_AXIS_SPARSE_EN {false} \ - CONFIG.C_AXIS_TDATA_WIDTH {%d} \ - CONFIG.C_AXIS_TDEST_WIDTH {0} \ - CONFIG.C_AXIS_TID_WIDTH {0} \ - CONFIG.C_AXIS_TUSER_WIDTH {0} \ - CONFIG.STRM_DATA_SEED {%s} \ - ] [get_bd_cells axi_traffic_gen_%d]" - % (source_width, "0x{:04X}".format(seed), id) - ) - config.append( - "connect_bd_net [get_bd_pins axi_traffic_gen_%d/s_axi_aclk] [get_bd_pins axi_interconnect_0/aclk]" - % id - ) - config.append( - "connect_bd_net [get_bd_pins axi_traffic_gen_%d/s_axi_aresetn] [get_bd_pins axi_interconnect_0/aresetn]" - % id - ) - seed = seed + 99 - - config.append( - "connect_bd_intf_net [get_bd_intf_pins axi_traffic_gen_%d/M_AXIS_MASTER] [get_bd_intf_pins finn_design_%d/s_axis_0]" - % (id, id) - ) - - else: - seed = 0xABCD - for id in 
range(num_sources): - config.append( - "create_bd_cell -type ip -vlnv xilinx.com:ip:axi_traffic_gen:3.0 axi_traffic_gen_%d" - % id - ) - config.append( - "set_property -dict [list \ - CONFIG.C_ATG_MODE {AXI4-Stream} \ - CONFIG.C_ATG_STREAMING_MAX_LEN_BITS {1} \ - CONFIG.C_AXIS_SPARSE_EN {false} \ - CONFIG.C_AXIS_TDATA_WIDTH {%d} \ - CONFIG.C_AXIS_TDEST_WIDTH {0} \ - CONFIG.C_AXIS_TID_WIDTH {0} \ - CONFIG.C_AXIS_TUSER_WIDTH {0} \ - CONFIG.STRM_DATA_SEED {%s} \ - ] [get_bd_cells axi_traffic_gen_%d]" - % (source_width, "0x{:04X}".format(seed), id) - ) - config.append( - "connect_bd_net [get_bd_pins axi_traffic_gen_%d/s_axi_aclk] [get_bd_pins axi_interconnect_0/aclk]" - % id - ) - config.append( - "connect_bd_net [get_bd_pins axi_traffic_gen_%d/s_axi_aresetn] [get_bd_pins axi_interconnect_0/aresetn]" - % id - ) - config.append( - "connect_bd_net [get_bd_pins finn_design_0/s_axis_0_tready] [get_bd_pins axi_traffic_gen_%d/m_axis_1_tready]" - % id - ) - seed = seed + 99 - - if num_sources > 1: - config.append( - "create_bd_cell -type ip -vlnv xilinx.com:ip:xlconcat:2.1 xlconcat_tdata" - ) - config.append( - "set_property CONFIG.NUM_PORTS {%d} [get_bd_cells xlconcat_tdata]" % num_sources - ) - - for id in range(num_sources): - config.append( - "connect_bd_net [get_bd_pins xlconcat_tdata/In%d] [get_bd_pins axi_traffic_gen_%d/m_axis_1_tdata]" - % (id, id) - ) - - config.append( - "connect_bd_net [get_bd_pins finn_design_0/s_axis_0_tdata] [get_bd_pins xlconcat_tdata/dout]" - ) - else: - config.append( - "connect_bd_net [get_bd_pins finn_design_0/s_axis_0_tdata] [get_bd_pins axi_traffic_gen_0/m_axis_1_tdata]" - ) - - # only connect valid from source 0 to DUT - config.append( - "connect_bd_net [get_bd_pins finn_design_0/s_axis_0_tvalid] [get_bd_pins axi_traffic_gen_0/m_axis_1_tvalid]" - ) - - # instantiate output harness - for id in range(self.dut_duplication): - config.append( - "create_bd_cell -type ip -vlnv xilinx.com:user:harness_sink:1.0 sink_%d" % id - ) - config.append( 
- "set_property -dict [list CONFIG.STREAM_WIDTH {%d}] [get_bd_cells sink_%d]" - % (outstream_width, id) - ) - config.append( - "connect_bd_intf_net [get_bd_intf_pins sink_%d/s_axis_0] [get_bd_intf_pins finn_design_%d/m_axis_0]" - % (id, id) - ) - - # GPIO control (TODO: connect interrupt) - config.append("create_bd_cell -type ip -vlnv xilinx.com:ip:axi_gpio:2.0 axi_gpio_0") - config.append( - "set_property -dict [list \ - CONFIG.C_ALL_INPUTS {0} \ - CONFIG.C_GPIO_WIDTH {5} \ - CONFIG.C_INTERRUPT_PRESENT {1} \ - ] [get_bd_cells axi_gpio_0]" - ) - config.append( - "connect_bd_intf_net [get_bd_intf_pins axi_gpio_0/S_AXI] " - "[get_bd_intf_pins axi_interconnect_0/M%02d_AXI]" % (axilite_idx) - ) - config.append("assign_axi_addr_proc axi_gpio_0/S_AXI") - axilite_idx += 1 - config.append("create_bd_cell -type ip -vlnv xilinx.com:ip:xlslice:1.0 xlslice_0") - config.append("create_bd_cell -type ip -vlnv xilinx.com:ip:xlslice:1.0 xlslice_1") - config.append("create_bd_cell -type ip -vlnv xilinx.com:ip:xlslice:1.0 xlslice_2") - config.append( - "set_property -dict [list \ - CONFIG.DIN_FROM {0} \ - CONFIG.DIN_TO {0} \ - CONFIG.DIN_WIDTH {5} \ - ] [get_bd_cells xlslice_0]" - ) - config.append( - "set_property -dict [list \ - CONFIG.DIN_FROM {1} \ - CONFIG.DIN_TO {1} \ - CONFIG.DIN_WIDTH {5} \ - ] [get_bd_cells xlslice_1]" - ) - config.append( - "set_property -dict [list \ - CONFIG.DIN_FROM {2} \ - CONFIG.DIN_TO {2} \ - CONFIG.DIN_WIDTH {5} \ - ] [get_bd_cells xlslice_2]" - ) - config.append("create_bd_cell -type ip -vlnv xilinx.com:ip:xlconcat:2.1 xlconcat_0") - config.append( - "set_property -dict [list CONFIG.IN1_WIDTH.VALUE_SRC USER CONFIG.IN2_WIDTH.VALUE_SRC USER CONFIG.IN0_WIDTH.VALUE_SRC USER] [get_bd_cells xlconcat_0]" - ) - config.append( - "set_property -dict [list \ - CONFIG.IN0_WIDTH {3} \ - CONFIG.NUM_PORTS {3} \ - ] [get_bd_cells xlconcat_0]" - ) - config.append("create_bd_cell -type ip -vlnv xilinx.com:ip:xlconstant:1.1 xlconstant_0") - config.append( - 
"set_property -dict [list \ - CONFIG.CONST_VAL {0} \ - CONFIG.CONST_WIDTH {3} \ - ] [get_bd_cells xlconstant_0]" - ) - config.append( - """ - connect_bd_net [get_bd_pins xlslice_0/Din] [get_bd_pins axi_gpio_0/gpio_io_o] - connect_bd_net [get_bd_pins xlslice_1/Din] [get_bd_pins axi_gpio_0/gpio_io_o] - connect_bd_net [get_bd_pins xlslice_2/Din] [get_bd_pins axi_gpio_0/gpio_io_o] - connect_bd_net [get_bd_pins xlconstant_0/dout] [get_bd_pins xlconcat_0/In0] - connect_bd_net [get_bd_pins axi_gpio_0/gpio_io_i] [get_bd_pins xlconcat_0/dout] - """ - ) - if self.dut_duplication > 1: - config.append("create_bd_cell -type ip -vlnv xilinx.com:ip:xlconcat:2.1 xlconcat_valid") - config.append( - "set_property CONFIG.NUM_PORTS {%d} [get_bd_cells xlconcat_valid]" - % self.dut_duplication - ) - config.append( - "create_bd_cell -type ip -vlnv xilinx.com:ip:xlconcat:2.1 xlconcat_checksum" - ) - config.append( - "set_property CONFIG.NUM_PORTS {%d} [get_bd_cells xlconcat_checksum]" - % self.dut_duplication - ) - - config.append("create_bd_cell -type module -reference vector_xor vector_xor_valid") - config.append( - "set_property CONFIG.WIDTH {%d} [get_bd_cells vector_xor_valid]" - % self.dut_duplication - ) - config.append("create_bd_cell -type module -reference vector_xor vector_xor_checksum") - config.append( - "set_property CONFIG.WIDTH {%d} [get_bd_cells vector_xor_checksum]" - % self.dut_duplication - ) - - config.append( - "connect_bd_net [get_bd_pins vector_xor_valid/in_data] [get_bd_pins xlconcat_valid/dout]" - ) - config.append( - "connect_bd_net [get_bd_pins vector_xor_checksum/in_data] [get_bd_pins xlconcat_checksum/dout]" - ) - config.append( - "connect_bd_net [get_bd_pins vector_xor_valid/out_data] [get_bd_pins xlconcat_0/In1]" - ) - config.append( - "connect_bd_net [get_bd_pins vector_xor_checksum/out_data] [get_bd_pins xlconcat_0/In2]" - ) - for id in range(self.dut_duplication): - config.append( - "connect_bd_net [get_bd_pins sink_%d/valid] [get_bd_pins 
xlconcat_valid/In%d]" - % (id, id) - ) - config.append( - "connect_bd_net [get_bd_pins sink_%d/checksum] [get_bd_pins xlconcat_checksum/In%d]" - % (id, id) - ) - else: - config.append("connect_bd_net [get_bd_pins sink_0/valid] [get_bd_pins xlconcat_0/In1]") - config.append( - "connect_bd_net [get_bd_pins sink_0/checksum] [get_bd_pins xlconcat_0/In2]" - ) - for id in range(self.dut_duplication): - config.append( - "connect_bd_net [get_bd_pins xlslice_2/Dout] [get_bd_pins sink_%d/enable]" % id - ) - for id in range(num_sources): - config.append( - "connect_bd_net [get_bd_pins xlslice_0/Dout] [get_bd_pins axi_traffic_gen_%d/core_ext_start]" - % id - ) - config.append( - "connect_bd_net [get_bd_pins xlslice_1/Dout] [get_bd_pins axi_traffic_gen_%d/core_ext_stop]" - % id - ) - - # create a temporary folder for the project - vivado_pynq_proj_dir = make_build_dir(prefix="vivado_zynq_proj_") - model.set_metadata_prop("vivado_pynq_proj", vivado_pynq_proj_dir) - - fclk_mhz = int(1 / (global_clk_ns * 0.001)) - - # create a TCL recipe for the project - ipcfg = vivado_pynq_proj_dir + "/ip_config.tcl" - config = "\n".join(config) + "\n" - with open(ipcfg, "w") as f: - f.write( - zynq_harness_template - % ( - fclk_mhz, - axilite_idx, - aximm_idx, - self.platform, - part_map[self.platform], - config, - ) - ) - - # create a TCL recipe for the project - synth_project_sh = vivado_pynq_proj_dir + "/synth_project.sh" - working_dir = os.environ["PWD"] - with open(synth_project_sh, "w") as f: - f.write("#!/bin/bash \n") - f.write("cd {}\n".format(vivado_pynq_proj_dir)) - f.write("vivado -mode batch -source %s\n" % ipcfg) - f.write("cd {}\n".format(working_dir)) - - # call the synthesis script - bash_command = ["bash", synth_project_sh] - process_compile = subprocess.Popen(bash_command, stdout=subprocess.PIPE) - process_compile.communicate() - - # collect results - os.makedirs(self.output_dir, exist_ok=True) - - bitfile_name = vivado_pynq_proj_dir + 
"/finn_zynq_link.runs/impl_1/top_wrapper.bit" - if not os.path.isfile(bitfile_name): - raise Exception( - "Synthesis failed, no bitfile found. Check logs under %s" % vivado_pynq_proj_dir - ) - hwh_name = vivado_pynq_proj_dir + "/finn_zynq_link.gen/sources_1/bd/top/hw_handoff/top.hwh" - if not os.path.isfile(hwh_name): - raise Exception( - "Synthesis failed, no hwh file found. Check logs under %s" % vivado_pynq_proj_dir - ) - synth_report_name = vivado_pynq_proj_dir + "/synth_report.xml" - model.set_metadata_prop("vivado_synth_rpt", synth_report_name) - model.set_metadata_prop("bitfile", bitfile_name) - model.set_metadata_prop("hw_handoff", hwh_name) - - shcopy(bitfile_name, self.output_dir) - shcopy(hwh_name, self.output_dir) - shcopy(synth_report_name, self.output_dir) - - post_synth_resources = model.analysis(post_synth_res) - with open(self.output_dir + "/post_synth_resources.json", "w") as f: - json.dump(post_synth_resources, f, indent=2) - - timing_rpt = ("%s/finn_zynq_link.runs/impl_1/top_wrapper_timing_summary_routed.rpt"% vivado_pynq_proj_dir) - shcopy(timing_rpt, self.output_dir + "/post_route_timing.rpt") - return (model, False) - -def step_synth_harness(model: ModelWrapper, cfg: DataflowBuildConfig): - # Build step version of above transformation (used for full builds) - model = model.transform(MakeZYNQHarnessProject( - platform=cfg.board, - output_dir=os.path.join(cfg.output_dir, "harness"), - #dut_duplication=dut_duplication, #TODO: enable for full builds - clock_period_ns=cfg.synth_clk_period_ns - )) - return model def start_test_batch_fast(results_path, project_path, run_target, pairs): # Prepare tcl script @@ -786,14 +370,14 @@ def step_synth_power(self): build_dir = "temp_output_harness_build" # TODO: replace hold harness with new instr wrapper implementation #TODO: if synth fails this could contain stale bitstreams which will be power tested - model = model.transform( - MakeZYNQHarnessProject( - platform=self.board, - output_dir=build_dir, - 
dut_duplication=dut_duplication, - clock_period_ns=self.clock_period_ns - ) - ) + # model = model.transform( + # MakeZYNQHarnessProject( + # platform=self.board, + # output_dir=build_dir, + # dut_duplication=dut_duplication, + # clock_period_ns=self.clock_period_ns + # ) + # ) # COPY bitstreams and other outputs # TODO: integrate better (e.g. as artifact) and remove redundant copy @@ -872,120 +456,6 @@ def step_parse_builder_output(self, build_dir): else: pass #TODO: warn/skip? - ### ANALYZE FIFOs ### - fifo_info = {} - # TODO: skip if not present - model_final = ModelWrapper(build_dir + "/intermediate_models/step_create_stitched_ip.onnx") - - fifo_info["fifo_depths"] = {} - fifo_info["fifo_sizes"] = {} - total_fifo_size = 0 - for node in model_final.get_nodes_by_op_type("StreamingFIFO_rtl"): - node_inst = getCustomOp(node) - fifo_info["fifo_depths"][node.name] = node_inst.get_nodeattr("depth") - fifo_info["fifo_sizes"][node.name] = node_inst.get_instream_width() * node_inst.get_nodeattr("depth") - total_fifo_size += fifo_info["fifo_sizes"][node.name] - fifo_info["total_fifo_size_kB"] = int(total_fifo_size / 8.0 / 1000.0) - - self.output_dict["fifos"] = fifo_info - - def step_fifotest(self, onnx_path, cfg, build_dir): - # requires certain output products (e.g., ESTIMATE_REPORTS, RTLSIM_PERFORMANCE) - # TODO: check them and skip/warn if missing - log = {} - # load performance reports - with open(build_dir + "/report/estimate_network_performance.json") as f: - est_data = json.load(f) - with open(build_dir + "/report/rtlsim_performance.json") as f: - sim_data = json.load(f) - - # check for deadlock - model_final = ModelWrapper(build_dir + "/intermediate_models/step_create_stitched_ip.onnx") - first_node = getCustomOp(model_final.find_consumer(model_final.graph.input[0].name)) - last_node = getCustomOp(model_final.find_producer(model_final.graph.output[0].name)) - input_txns_expected = np.prod(first_node.get_folded_input_shape()[:-1]) * self.params["rtlsim_n"] - 
output_txns_expected = np.prod(last_node.get_folded_output_shape()[:-1]) * self.params["rtlsim_n"] - deadlock = sim_data["N_IN_TXNS"] != input_txns_expected or sim_data["N_OUT_TXNS"] != output_txns_expected - log["deadlock"] = deadlock.tolist() - - # check rtlsim throughput - throughput = sim_data["throughput[images/s]"] - stable_throughput = sim_data["stable_throughput[images/s]"] - estimated_throughput = est_data["estimated_throughput_fps"] - throughput_factor = throughput / estimated_throughput - stable_throughput_factor = stable_throughput / estimated_throughput - - # TODO: Take throughput or stable_throughput? - throughput_pass = throughput_factor > self.params["fifo_throughput_factor_threshold"] - - log["throughput_pass"] = throughput_pass - log["throughput"] = throughput - log["stable_throughput"] = stable_throughput - log["estimated_throughput"] = estimated_throughput - - # reduce individual FIFO sizes by some amount and observe throughput drop or deadlock appear - fifo_reduction_pass = [] - log["fifo_reduction_results"] = {} - model_orig = ModelWrapper(build_dir + "/intermediate_models/step_hw_ipgen.onnx") - for node_orig in model_orig.get_nodes_by_op_type("StreamingFIFO_rtl"): - model = copy.deepcopy(model_orig) - node = model.get_node_from_name(node_orig.name) - node_inst = getCustomOp(node) - - # skip shallow FIFOs - # TODO: do we need to consider rounding-up of FIFO depths for impl_style=vivado? 
- if node_inst.get_nodeattr("depth") <= self.params["fifo_reduction_skip_threshold"]: - log["fifo_reduction_results"][node.name] = "skip" - continue - - # reduce depth of current FIFO and reset generated code - node_inst.set_nodeattr("depth", int(node_inst.get_nodeattr("depth") * self.params["fifo_reduction_factor"])) - node_inst.set_nodeattr("code_gen_dir_ipgen", "") - node_inst.set_nodeattr("ip_path", "") - node_inst.set_nodeattr("ipgen_path", "") - - # save model variation - tmp_output_dir_var = build_dir + "/variations/" + node.name - os.makedirs(tmp_output_dir_var) - model.save(tmp_output_dir_var + "/model.onnx") - - # build again, only re-run necessary steps to save time - cfg.output_dir = tmp_output_dir_var - cfg.steps = ["step_hw_codegen", "step_create_stitched_ip", "step_measure_rtlsim_performance"] - build.build_dataflow_cfg(tmp_output_dir_var + "/model.onnx", cfg) - - # load performance report - with open(tmp_output_dir_var + "/report/rtlsim_performance.json") as f: - sim_data = json.load(f) - - # check for deadlock - model_final = ModelWrapper(tmp_output_dir_var + "/intermediate_models/step_create_stitched_ip.onnx") - first_node = getCustomOp(model_final.find_consumer(model_final.graph.input[0].name)) - last_node = getCustomOp(model_final.find_producer(model_final.graph.output[0].name)) - input_txns_expected = np.prod(first_node.get_folded_input_shape()[:-1]) * self.params["rtlsim_n"] - output_txns_expected = np.prod(last_node.get_folded_output_shape()[:-1]) * self.params["rtlsim_n"] - var_deadlock = sim_data["N_IN_TXNS"] != input_txns_expected or sim_data["N_OUT_TXNS"] != output_txns_expected - - # check rtlsim throughput - var_throughput = sim_data["throughput[images/s]"] - var_stable_throughput = sim_data["stable_throughput[images/s]"] - # TODO: take throughput or stable_throughput? 
- throughput_drop = (throughput - var_throughput) / throughput - - if var_deadlock: - fifo_reduction_pass.append(True) - log["fifo_reduction_results"][node.name] = 1.0 - elif throughput_drop > self.params["fifo_reduction_throughput_drop_threshold"]: - fifo_reduction_pass.append(True) - log["fifo_reduction_results"][node.name] = throughput_drop - else: - fifo_reduction_pass.append(False) - log["fifo_reduction_results"][node.name] = "fail (no drop)" - - if "fifos" not in self.output_dict: - self.output_dict["fifos"] = {} - self.output_dict["fifos"]["fifotest"] = log - def steps_simple_model_flow(self): # Default step sequence for benchmarking a simple model (mostly single operators/custom_ops) do_hls = self.params["do_hls"] if "do_hls" in self.params else False @@ -1023,8 +493,8 @@ def steps_simple_model_flow(self): self.step_synthesis() if do_sim_power: self.step_sim_power() - if do_synth_power: - self.step_synth_power() + #if do_synth_power: + # self.step_synth_power() def steps_full_build_flow(self): # Default step sequence for benchmarking a full FINN builder flow @@ -1062,18 +532,24 @@ def steps_full_build_flow(self): self.build_inputs["floorplan_path"] = self.params["floorplan_path"] ### BUILD SETUP ### - # TODO: select output products here, depending on what shall be tested - # TODO: set as much as possible here, e.g. 
verbose, debug, force_python, vitisopt, shell_flow cfg = self.step_build_setup() + cfg.generate_outputs = self.params["output_products"] + cfg.output_dir = self.build_inputs["build_dir"] + cfg.synth_clk_period_ns = self.clock_period_ns cfg.board = self.board if self.board in alveo_part_map: cfg.shell_flow_type=build_cfg.ShellFlowType.VITIS_ALVEO cfg.vitis_platform=alveo_default_platform[self.board] else: cfg.shell_flow_type=build_cfg.ShellFlowType.VIVADO_ZYNQ + # enable extra performance optimizations (physopt) + cfg.vitis_opt_strategy=build_cfg.VitisOptStrategy.PERFORMANCE_BEST cfg.verbose = False cfg.enable_build_pdb_debug = False + cfg.stitched_ip_gen_dcp = False # only needed for further manual integration cfg.force_python_rtlsim = False + cfg.split_large_fifos = True + cfg.enable_instrumentation = True # no IODMA functional correctness/accuracy test yet #rtlsim_use_vivado_comps # TODO ? #cfg.default_swg_exception #cfg.large_fifo_mem_style @@ -1086,9 +562,6 @@ def steps_full_build_flow(self): cfg.auto_fifo_depths = False cfg.live_fifo_sizing = True cfg.enable_instrumentation = True - # Overwrite output products - # TODO: make configurable directly via JSON/YAML cfg - cfg.generate_outputs = [build_cfg.DataflowOutputType.BITFILE] else: cfg.auto_fifo_depths = True cfg.auto_fifo_strategy = self.params["fifo_method"] @@ -1125,7 +598,3 @@ def steps_full_build_flow(self): ### ANALYSIS ### self.step_parse_builder_output(self.build_inputs["build_dir"]) - - # Only run in-depth FIFO test if selected - if "fifo_throughput_factor_threshold" in self.params: - self.step_fifotest(self.build_inputs["onnx_path"], cfg, self.build_inputs["build_dir"]) diff --git a/benchmarking/cfg/fifosizing_test.json b/benchmarking/cfg/fifosizing_test.json deleted file mode 100644 index cf49aa80a7..0000000000 --- a/benchmarking/cfg/fifosizing_test.json +++ /dev/null @@ -1,23 +0,0 @@ -[ - { - "dut": ["synthetic_nonlinear"], - "dim": [32], - "kernel_size": [5], - "ch": [4], - "simd": [4], - "pe": 
[4], - "parallel_window": [1], - - "lb_num_layers": [1], - "rb_num_layers": [3], - - "fifo_method": ["characterize"], - "fifo_strategy": ["analytical", "rtlsim"], - - "rtlsim_n": [10], - "fifo_throughput_factor_threshold": [0.9], - "fifo_reduction_skip_threshold": [64], - "fifo_reduction_factor": [0.5], - "fifo_reduction_throughput_drop_threshold": [0.01] - } - ] \ No newline at end of file diff --git a/benchmarking/cfg/metafi_fifosizing_test.json b/benchmarking/cfg/metafi_fifosizing_test.json deleted file mode 100644 index c61d1265fa..0000000000 --- a/benchmarking/cfg/metafi_fifosizing_test.json +++ /dev/null @@ -1,57 +0,0 @@ -[ - { - "dut": ["metafi"], - "model_path": ["/pc2/groups/hpc-prf-ekiapp/felix/ci_models/metafi/model.onnx"], - "folding_path": ["/pc2/groups/hpc-prf-ekiapp/felix/ci_models/metafi/metafi_fifosizing_xsi_n2.json"], - - "board": ["RFSoC2x2"], - "clock_period_ns": [10], - - "rtlsim_n": [10], - - "fifo_method": ["manual"], - - "fifo_rtlsim_n": [2], - "fifo_throughput_factor_threshold": [0.9], - "fifo_reduction_skip_threshold": [99999999999], - "fifo_reduction_factor": [0.5], - "fifo_reduction_throughput_drop_threshold": [0.01] - }, - { - "dut": ["metafi"], - "model_path": ["/pc2/groups/hpc-prf-ekiapp/felix/ci_models/metafi/model.onnx"], - "folding_path": ["/pc2/groups/hpc-prf-ekiapp/felix/ci_models/metafi/auto_folding_config_metaFi_f25.json"], - - "board": ["RFSoC2x2"], - "clock_period_ns": [10], - - "rtlsim_n": [5], - - "fifo_method": ["largefifo_rtlsim"], - - "fifo_rtlsim_n": [2, 4, 8], - "fifo_throttle_factor": [0.5, 2], - "fifo_throughput_factor_threshold": [0.9], - "fifo_reduction_skip_threshold": [99999999999], - "fifo_reduction_factor": [0.5], - "fifo_reduction_throughput_drop_threshold": [0.01] - }, - { - "dut": ["metafi"], - "model_path": ["/pc2/groups/hpc-prf-ekiapp/felix/ci_models/metafi/model.onnx"], - "folding_path": ["/pc2/groups/hpc-prf-ekiapp/felix/ci_models/metafi/auto_folding_config_metaFi_f25.json"], - - "board": ["RFSoC2x2"], - 
"clock_period_ns": [10], - - "rtlsim_n": [5], - - "fifo_method": ["characterize"], - "fifo_strategy": ["rtlsim", "analytical"], - - "fifo_throughput_factor_threshold": [0.9], - "fifo_reduction_skip_threshold": [99999999999], - "fifo_reduction_factor": [0.5], - "fifo_reduction_throughput_drop_threshold": [0.01] - } - ] \ No newline at end of file diff --git a/benchmarking/cfg/metafi_test.json b/benchmarking/cfg/metafi_test.json index 6475f1aadd..bc10f857c3 100644 --- a/benchmarking/cfg/metafi_test.json +++ b/benchmarking/cfg/metafi_test.json @@ -7,8 +7,8 @@ "board": ["RFSoC2x2"], "clock_period_ns": [10], - "fifo_method": ["manual"], + "fifo_method": ["live"], - "rtlsim_n": [3] + "output_products": [["bitfile", "pynq_driver", "deployment_package"]] } ] \ No newline at end of file diff --git a/benchmarking/cfg/mvau_test.json b/benchmarking/cfg/mvau_test.json index e9fc3358b5..d4cb2072be 100644 --- a/benchmarking/cfg/mvau_test.json +++ b/benchmarking/cfg/mvau_test.json @@ -25,6 +25,8 @@ "do_sim_power": [true], "do_synth_power": [true], - "dut_duplication": [1] + "dut_duplication": [1], + + "output_products": [["bitfile", "pynq_driver", "deployment_package"]] ## } ] diff --git a/benchmarking/cfg/resnet50_fifosizing_test.json b/benchmarking/cfg/resnet50_fifosizing_test.json deleted file mode 100644 index 075acda981..0000000000 --- a/benchmarking/cfg/resnet50_fifosizing_test.json +++ /dev/null @@ -1,66 +0,0 @@ -[ - { - "dut": ["resnet50"], - - "model_path": ["/pc2/groups/hpc-prf-ekiapp/felix/ci_models/ResNet-50/resnet50_w1a2_exported.onnx"], - "folding_path": ["/pc2/groups/hpc-prf-ekiapp/felix/ci_models/ResNet-50/rn-50_fifosizing_xsi_n2.json"], - "specialize_path": ["/pc2/groups/hpc-prf-ekiapp/felix/ci_models/ResNet-50/U250_specialize_layers.json"], - "floorplan_path": ["/pc2/groups/hpc-prf-ekiapp/felix/ci_models/ResNet-50/floorplan_resnet50.json"], - - "board": ["U250"], - "clock_period_ns": [4], - - "rtlsim_n": [10], - - "fifo_method": ["manual"], - - "fifo_rtlsim_n": 
[2], - "fifo_throughput_factor_threshold": [0.9], - "fifo_reduction_skip_threshold": [99999999999], - "fifo_reduction_factor": [0.5], - "fifo_reduction_throughput_drop_threshold": [0.01] - }, - { - "dut": ["resnet50"], - - "model_path": ["/pc2/groups/hpc-prf-ekiapp/felix/ci_models/ResNet-50/resnet50_w1a2_exported.onnx"], - "folding_path": ["/pc2/groups/hpc-prf-ekiapp/felix/ci_models/ResNet-50/U250_folding_config.json"], - "specialize_path": ["/pc2/groups/hpc-prf-ekiapp/felix/ci_models/ResNet-50/U250_specialize_layers.json"], - "floorplan_path": ["/pc2/groups/hpc-prf-ekiapp/felix/ci_models/ResNet-50/floorplan_resnet50.json"], - - "board": ["U250"], - "clock_period_ns": [4], - - "rtlsim_n": [5], - - "fifo_method": ["largefifo_rtlsim"], - - "fifo_rtlsim_n": [2, 4, 8], - "fifo_throttle_factor": [0.5, 2], - "fifo_throughput_factor_threshold": [0.9], - "fifo_reduction_skip_threshold": [99999999999], - "fifo_reduction_factor": [0.5], - "fifo_reduction_throughput_drop_threshold": [0.01] - }, - { - "dut": ["resnet50"], - - "model_path": ["/pc2/groups/hpc-prf-ekiapp/felix/ci_models/ResNet-50/resnet50_w1a2_exported.onnx"], - "folding_path": ["/pc2/groups/hpc-prf-ekiapp/felix/ci_models/ResNet-50/U250_folding_config.json"], - "specialize_path": ["/pc2/groups/hpc-prf-ekiapp/felix/ci_models/ResNet-50/U250_specialize_layers.json"], - "floorplan_path": ["/pc2/groups/hpc-prf-ekiapp/felix/ci_models/ResNet-50/floorplan_resnet50.json"], - - "board": ["U250"], - "clock_period_ns": [4], - - "rtlsim_n": [5], - - "fifo_method": ["characterize"], - "fifo_strategy": ["rtlsim", "analytical"], - - "fifo_throughput_factor_threshold": [0.9], - "fifo_reduction_skip_threshold": [99999999999], - "fifo_reduction_factor": [0.5], - "fifo_reduction_throughput_drop_threshold": [0.01] - } - ] \ No newline at end of file diff --git a/benchmarking/cfg/resnet50_test.json b/benchmarking/cfg/resnet50_test.json index 4937cb8395..06a96729ab 100644 --- a/benchmarking/cfg/resnet50_test.json +++ 
b/benchmarking/cfg/resnet50_test.json @@ -12,6 +12,22 @@ "fifo_method": ["manual"], - "rtlsim_n": [3] + "rtlsim_n": [5], + "output_products": [["estimate_reports", "rtlsim_performance", "stitched_ip", "out_of_context_synth"]] + }, + { + "dut": ["resnet50"], + + "model_path": ["/pc2/groups/hpc-prf-ekiapp/felix/ci_models/ResNet-50/resnet50_w1a2_exported.onnx"], + "folding_path": ["/pc2/groups/hpc-prf-ekiapp/felix/ci_models/ResNet-50/U250_folding_config.json"], + "specialize_path": ["/pc2/groups/hpc-prf-ekiapp/felix/ci_models/ResNet-50/U250_specialize_layers.json"], + "floorplan_path": ["/pc2/groups/hpc-prf-ekiapp/felix/ci_models/ResNet-50/floorplan_resnet50.json"], + + "board": ["RFSoC2x2"], + "clock_period_ns": [10], + + "fifo_method": ["live"], + + "output_products": [["bitfile", "pynq_driver", "deployment_package"]] } ] \ No newline at end of file diff --git a/benchmarking/cfg/synthetic_fifotest.json b/benchmarking/cfg/synthetic_fifotest.json new file mode 100644 index 0000000000..1b40feb9e8 --- /dev/null +++ b/benchmarking/cfg/synthetic_fifotest.json @@ -0,0 +1,64 @@ +[ + { + "dut": ["synthetic_nonlinear"], + "dim": [32], + "kernel_size": [5], + "ch": [4], + "simd": [4], + "pe": [4], + "parallel_window": [1], + + "lb_num_layers": [1], + "rb_num_layers": [3], + + "board": ["RFSoC2x2"], + "clock_period_ns": [10], + + "rtlsim_n": [5], + + "fifo_method": ["live"], + "output_products": [["bitfile", "pynq_driver", "deployment_package"]] + }, + { + "dut": ["synthetic_nonlinear"], + "dim": [32], + "kernel_size": [5], + "ch": [4], + "simd": [4], + "pe": [4], + "parallel_window": [1], + + "lb_num_layers": [1], + "rb_num_layers": [3], + + "board": ["RFSoC2x2"], + "clock_period_ns": [10], + + "rtlsim_n": [5], + + "fifo_method": ["characterize"], + "fifo_strategy": ["analytical", "rtlsim"], + "output_products": [["rtlsim_performance", "bitfile", "pynq_driver", "deployment_package"]] + }, + { + "dut": ["synthetic_nonlinear"], + "dim": [32], + "kernel_size": [5], + "ch": [4], + 
"simd": [4], + "pe": [4], + "parallel_window": [1], + + "lb_num_layers": [1], + "rb_num_layers": [3], + + "board": ["RFSoC2x2"], + "clock_period_ns": [10], + + "rtlsim_n": [5], + + "fifo_method": ["largefifo_rtlsim"], + "fifo_rtlsim_n": [2], + "output_products": [["rtlsim_performance", "bitfile", "pynq_driver", "deployment_package"]] + } +] \ No newline at end of file diff --git a/benchmarking/cfg/transformer_gpt_all.json b/benchmarking/cfg/transformer_gpt_all.json index 4b1ee011c1..b0b70fb0aa 100644 --- a/benchmarking/cfg/transformer_gpt_all.json +++ b/benchmarking/cfg/transformer_gpt_all.json @@ -2,25 +2,11 @@ { "dut": ["transformer"], "seed": [12], - "model_dir": ["/pc2/groups/hpc-prf-ekiapp/felix/ci_models/gpt_a"], - "dut_duplication": [1] - }, - { - "dut": ["transformer"], - "seed": [12], - "model_dir": ["/pc2/groups/hpc-prf-ekiapp/felix/ci_models/gpt_b"], - "dut_duplication": [1] - }, - { - "dut": ["transformer"], - "seed": [12], - "model_dir": ["/pc2/groups/hpc-prf-ekiapp/felix/ci_models/gpt_c"], - "dut_duplication": [1] - }, - { - "dut": ["transformer"], - "seed": [12], - "model_dir": ["/pc2/groups/hpc-prf-ekiapp/felix/ci_models/gpt_d"], - "dut_duplication": [1] + "model_dir": ["/pc2/groups/hpc-prf-ekiapp/felix/ci_models/gpt_a", "/pc2/groups/hpc-prf-ekiapp/felix/ci_models/gpt_b", "/pc2/groups/hpc-prf-ekiapp/felix/ci_models/gpt_c", "/pc2/groups/hpc-prf-ekiapp/felix/ci_models/gpt_d"], + + "board": ["U280"], + "clock_period_ns": [10], + + "output_products": [["estimate_reports", "stitched_ip", "out_of_context_synth"]] } ] diff --git a/benchmarking/cfg/transformer_radioml_all.json b/benchmarking/cfg/transformer_radioml_all.json index f2c8733c20..5eeea031b2 100644 --- a/benchmarking/cfg/transformer_radioml_all.json +++ b/benchmarking/cfg/transformer_radioml_all.json @@ -3,12 +3,20 @@ "dut": ["transformer"], "seed": [12], "model_dir": ["/pc2/groups/hpc-prf-ekiapp/felix/ci_models/radioml_0"], - "dut_duplication": [1] + + "board": ["RFSoC2x2"], + "clock_period_ns": 
[10], + + "output_products": [["estimate_reports", "bitfile", "pynq_driver", "deployment_package"]] }, { "dut": ["transformer"], "seed": [12], "model_dir": ["/pc2/groups/hpc-prf-ekiapp/felix/ci_models/radioml_convformer"], - "dut_duplication": [1] + + "board": ["RFSoC2x2"], + "clock_period_ns": [10], + + "output_products": [["estimate_reports", "bitfile", "pynq_driver", "deployment_package"]] } -] +] \ No newline at end of file diff --git a/benchmarking/cfg/transformer_test.json b/benchmarking/cfg/transformer_test.json index a740a447b6..e0fcbc160d 100644 --- a/benchmarking/cfg/transformer_test.json +++ b/benchmarking/cfg/transformer_test.json @@ -16,6 +16,9 @@ "model_mask": ["none"], "model_positional_encoding": ["binary"], - "dut_duplication": [1] + "board": ["RFSoC2x2"], + "clock_period_ns": [10], + + "output_products": [["estimate_reports", "bitfile", "pynq_driver", "deployment_package"]] } ] diff --git a/benchmarking/collect.py b/benchmarking/collect.py index ffe2222f73..7ba7dc4cb0 100644 --- a/benchmarking/collect.py +++ b/benchmarking/collect.py @@ -89,10 +89,12 @@ def wait_for_power_measurements(): for run in combined_log: with Live(exp_message="Job result collected by GitLab CI", cache_images=True) as live: metadata = { - "run_id": run["run_id"], - "task_id": run["task_id"], - "status": run["status"], - "total_time": run["total_time"] + "metadata": { + "run_id": run["run_id"], + "task_id": run["task_id"], + "status": run["status"], + "total_time": run["total_time"], + } } live.log_params(metadata) live.log_params(run["params"]) diff --git a/benchmarking/dut/metafi.py b/benchmarking/dut/metafi.py index b4bd4246b7..4c9dec2521 100644 --- a/benchmarking/dut/metafi.py +++ b/benchmarking/dut/metafi.py @@ -44,14 +44,8 @@ def step_build_setup(self): ] cfg = build_cfg.DataflowBuildConfig( - output_dir = self.build_inputs["build_dir"], - synth_clk_period_ns = self.clock_period_ns, steps=steps, - target_fps=None, #23 - - split_large_fifos=True, # probably needed 
#TODO: account for this in FIFO reduction test - # folding_config_file=folding_config_file, # folding_config_file="/home/rz/project/finn-examples/build/vgg10-radioml/folding_config/auto_folding_config.json", # specialize_layers_config_file = "output_%s_%s" % (model_name, release_platform_name) + "/template_specialize_layers_config.json", @@ -59,14 +53,6 @@ def step_build_setup(self): #large_fifo_mem_style=build_cfg.LargeFIFOMemStyle.AUTO, # standalone_thresholds=True, - # enable extra performance optimizations (physopt) - vitis_opt_strategy=build_cfg.VitisOptStrategy.PERFORMANCE_BEST, - generate_outputs=[ - build_cfg.DataflowOutputType.ESTIMATE_REPORTS, - build_cfg.DataflowOutputType.STITCHED_IP, - build_cfg.DataflowOutputType.RTLSIM_PERFORMANCE, - build_cfg.DataflowOutputType.OOC_SYNTH, # not required for FIFO test, include for general testing - ], ) # where is this used and why? diff --git a/benchmarking/dut/resnet50.py b/benchmarking/dut/resnet50.py index ec03e44a8b..bf5aed8ab4 100644 --- a/benchmarking/dut/resnet50.py +++ b/benchmarking/dut/resnet50.py @@ -36,20 +36,7 @@ def step_build_setup(self): ] cfg = build_cfg.DataflowBuildConfig( - output_dir = self.build_inputs["build_dir"], - synth_clk_period_ns = self.clock_period_ns, steps=resnet50_build_steps, - - split_large_fifos=True, - - # enable extra performance optimizations (physopt) - vitis_opt_strategy=build_cfg.VitisOptStrategy.PERFORMANCE_BEST, - generate_outputs=[ - build_cfg.DataflowOutputType.ESTIMATE_REPORTS, - build_cfg.DataflowOutputType.STITCHED_IP, - build_cfg.DataflowOutputType.RTLSIM_PERFORMANCE, - build_cfg.DataflowOutputType.OOC_SYNTH, # not required for FIFO test, include for general testing - ], ) return cfg \ No newline at end of file diff --git a/benchmarking/dut/synthetic_nonlinear.py b/benchmarking/dut/synthetic_nonlinear.py index 19ba3a6ce0..4eb59ef7b2 100644 --- a/benchmarking/dut/synthetic_nonlinear.py +++ b/benchmarking/dut/synthetic_nonlinear.py @@ -289,19 +289,8 @@ def 
step_build_setup(self): # create build config for synthetic test models cfg = build_cfg.DataflowBuildConfig( - output_dir = self.build_inputs["build_dir"], - synth_clk_period_ns = self.clock_period_ns, - - split_large_fifos=False, # manual folding target_fps=None, - - shell_flow_type=build_cfg.ShellFlowType.VIVADO_ZYNQ, # TODO: generalize/adapt to new back-end - generate_outputs=[ - build_cfg.DataflowOutputType.ESTIMATE_REPORTS, - build_cfg.DataflowOutputType.STITCHED_IP, - build_cfg.DataflowOutputType.RTLSIM_PERFORMANCE, - ], ) return cfg diff --git a/benchmarking/dut/transformer.py b/benchmarking/dut/transformer.py index 91c73bbffe..87522ad2e5 100644 --- a/benchmarking/dut/transformer.py +++ b/benchmarking/dut/transformer.py @@ -901,27 +901,11 @@ def step_build_setup(self): # Create a configuration for building the scaled dot-product attention # operator to a hardware accelerator cfg = build_cfg.DataflowBuildConfig( - # Unpack the build configuration parameters - #**params["build"]["finn"], - output_dir = self.build_inputs["build_dir"], - stitched_ip_gen_dcp = False, # only needed for further manual integration - synth_clk_period_ns = self.clock_period_ns, folding_config_file = "folding.yaml", specialize_layers_config_file = "specialize_layers.json", standalone_thresholds = True, max_multithreshold_bit_width = 16, mvau_wwidth_max = 2048, - split_large_fifos = True, - - generate_outputs=[ - build_cfg.DataflowOutputType.ESTIMATE_REPORTS, - build_cfg.DataflowOutputType.STITCHED_IP, # required for HarnessBuild, OOC_SYNTH, and RTLSIM - #build_cfg.DataflowOutputType.PYNQ_DRIVER, #TODO: currently broken (assert i_consumer.op_type == "StreamingDataflowPartition"), might be useful for functional verification on hw later - #build_cfg.DataflowOutputType.OOC_SYNTH, # requires stitched-ip, not needed because ZynqBuild/HarnessBuild is performed - #build_cfg.DataflowOutputType.BITFILE, # does not require stitched-ip, not needed because HarnessBuild is performed - 
#build_cfg.DataflowOutputType.RTLSIM_PERFORMANCE, # not possible due to float components TODO: try with pyXSI - #build_cfg.DataflowOutputType.DEPLOYMENT_PACKAGE # not needed, just a copy operation - ], verify_steps=[ # Verify the model after converting to the FINN onnx dialect @@ -1006,30 +990,12 @@ def step_build_setup(self): # Only for debugging for now, does not work if "vivado" style # StreamingFIFOs are used # node_by_node_rtlsim, - - #test_step_insert_tlastmarker, # required for instrumentation_wrapper - "step_create_stitched_ip", - # "step_measure_rtlsim_performance", # not possible due to float components - - step_synth_harness, #TODO: replace with instr wrapper (or port it into this step) - - #"step_out_of_context_synthesis", # for synthesis results (e.g. utilization) - - # normal deployment TODO: replace with instr wrapper (or port it into this step as an option) - #"step_synthesize_bitfile", - #"step_make_pynq_driver", - #"step_deployment_package", - - #test_step_gen_vitis_xo, # preparation step for original instr wrapper integration - #test_step_gen_instrumentation_wrapper, # preparation step for original instr wrapper integration - - #test_step_gen_instrwrap_sim, # preparation step for simulation of original instr wrapper integration - #test_step_run_instrwrap_sim, # simulation with instr wrapper, disabled for now due to extreme runtime - - #test_step_export_xo, # preparation step for original instr wrapper integration - #test_step_build_platform # synthesis with instr wrapper + "step_out_of_context_synthesis", # for synthesis results (e.g. 
utilization) + "step_synthesize_bitfile", + "step_make_pynq_driver", + "step_deployment_package", ] ) diff --git a/benchmarking/harness/sink/ip/component.xml b/benchmarking/harness/sink/ip/component.xml deleted file mode 100644 index cb20a9abad..0000000000 --- a/benchmarking/harness/sink/ip/component.xml +++ /dev/null @@ -1,256 +0,0 @@ - - - xilinx.com - user - harness_sink - 1.0 - - - s_axis_0 - - - - - - - TDATA - - - s_axis_0_tdata - - - - - TVALID - - - s_axis_0_tvalid - - - - - TREADY - - - s_axis_0_tready - - - - - - - - - xilinx_anylanguagesynthesis - Synthesis - :vivado.xilinx.com:synthesis - Verilog - harness_sink - - xilinx_anylanguagesynthesis_view_fileset - - - - viewChecksum - 18b9f9a4 - - - - - xilinx_anylanguagebehavioralsimulation - Simulation - :vivado.xilinx.com:simulation - Verilog - harness_sink - - xilinx_anylanguagebehavioralsimulation_view_fileset - - - - viewChecksum - 18b9f9a4 - - - - - xilinx_xpgui - UI Layout - :vivado.xilinx.com:xgui.ui - - xilinx_xpgui_view_fileset - - - - viewChecksum - 6955aee3 - - - - - - - enable - - in - - - std_logic - xilinx_anylanguagesynthesis - xilinx_anylanguagebehavioralsimulation - - - - - - valid - - out - - - std_logic - xilinx_anylanguagesynthesis - xilinx_anylanguagebehavioralsimulation - - - - - - checksum - - out - - - std_logic - xilinx_anylanguagesynthesis - xilinx_anylanguagebehavioralsimulation - - - - - - s_axis_0_tdata - - in - - 7 - 0 - - - - std_logic_vector - xilinx_anylanguagesynthesis - xilinx_anylanguagebehavioralsimulation - - - - 0 - - - - - s_axis_0_tvalid - - in - - - std_logic - xilinx_anylanguagesynthesis - xilinx_anylanguagebehavioralsimulation - - - - - - s_axis_0_tready - - out - - - std_logic - xilinx_anylanguagesynthesis - xilinx_anylanguagebehavioralsimulation - - - - - - - - STREAM_WIDTH - Stream Width - 8 - - - - - - xilinx_anylanguagesynthesis_view_fileset - - src/harness_sink.v - verilogSource - CHECKSUM_18b9f9a4 - IMPORTED_FILE - - - - 
xilinx_anylanguagebehavioralsimulation_view_fileset - - src/harness_sink.v - verilogSource - IMPORTED_FILE - - - - xilinx_xpgui_view_fileset - - xgui/harness_sink_v1_0.tcl - tclSource - CHECKSUM_6955aee3 - XGUI_VERSION_2 - - - - harness_sink_v1_0 - - - STREAM_WIDTH - Stream Width - 8 - - - Component_Name - harness_sink_v1_0 - - - - - - zynq - qzynq - azynq - zynquplus - - - /UserIP - - harness_sink_v1_0 - level_0 - package_project - 2 - 2023-08-22T13:34:35Z - - - 2022.2 - - - - - - - - - - - - - diff --git a/benchmarking/harness/sink/ip/src/harness_sink.v b/benchmarking/harness/sink/ip/src/harness_sink.v deleted file mode 100644 index e6b95e7797..0000000000 --- a/benchmarking/harness/sink/ip/src/harness_sink.v +++ /dev/null @@ -1,39 +0,0 @@ -`timescale 1ns / 1ps -////////////////////////////////////////////////////////////////////////////////// -// Company: -// Engineer: -// -// Create Date: 08/22/2023 02:19:08 PM -// Design Name: -// Module Name: harness_sink -// Project Name: -// Target Devices: -// Tool Versions: -// Description: -// -// Dependencies: -// -// Revision: -// Revision 0.01 - File Created -// Additional Comments: -// -////////////////////////////////////////////////////////////////////////////////// - - -module harness_sink #( - parameter STREAM_WIDTH=8 -)( - input enable, - output valid, - output checksum, - input [STREAM_WIDTH-1:0] s_axis_0_tdata, - input s_axis_0_tvalid, - output s_axis_0_tready -); - -assign s_axis_0_tready = enable; - -assign valid = s_axis_0_tvalid; -assign checksum = ^s_axis_0_tdata; - -endmodule diff --git a/benchmarking/harness/sink/ip/xgui/harness_sink_v1_0.tcl b/benchmarking/harness/sink/ip/xgui/harness_sink_v1_0.tcl deleted file mode 100644 index eb752d53a5..0000000000 --- a/benchmarking/harness/sink/ip/xgui/harness_sink_v1_0.tcl +++ /dev/null @@ -1,25 +0,0 @@ -# Definitional proc to organize widgets for parameters. 
-proc init_gui { IPINST } { - ipgui::add_param $IPINST -name "Component_Name" - #Adding Page - set Page_0 [ipgui::add_page $IPINST -name "Page 0"] - ipgui::add_param $IPINST -name "STREAM_WIDTH" -parent ${Page_0} - - -} - -proc update_PARAM_VALUE.STREAM_WIDTH { PARAM_VALUE.STREAM_WIDTH } { - # Procedure called to update STREAM_WIDTH when any of the dependent parameters in the arguments change -} - -proc validate_PARAM_VALUE.STREAM_WIDTH { PARAM_VALUE.STREAM_WIDTH } { - # Procedure called to validate STREAM_WIDTH - return true -} - - -proc update_MODELPARAM_VALUE.STREAM_WIDTH { MODELPARAM_VALUE.STREAM_WIDTH PARAM_VALUE.STREAM_WIDTH } { - # Procedure called to set VHDL generic/Verilog parameter value(s) based on TCL parameter value - set_property value [get_property value ${PARAM_VALUE.STREAM_WIDTH}] ${MODELPARAM_VALUE.STREAM_WIDTH} -} - diff --git a/benchmarking/harness/vector_xor.v b/benchmarking/harness/vector_xor.v deleted file mode 100644 index 3361860ab8..0000000000 --- a/benchmarking/harness/vector_xor.v +++ /dev/null @@ -1,32 +0,0 @@ -`timescale 1ns / 1ps -////////////////////////////////////////////////////////////////////////////////// -// Company: -// Engineer: -// -// Create Date: 08/22/2023 02:19:08 PM -// Design Name: -// Module Name: harness_sink -// Project Name: -// Target Devices: -// Tool Versions: -// Description: -// -// Dependencies: -// -// Revision: -// Revision 0.01 - File Created -// Additional Comments: -// -////////////////////////////////////////////////////////////////////////////////// - - -module vector_xor #( - parameter WIDTH=8 -)( - input [WIDTH-1:0] in_data, - output out_data -); - -assign out_data = ^in_data; - -endmodule diff --git a/src/finn/builder/build_dataflow.py b/src/finn/builder/build_dataflow.py index 284cd2baa3..baada9d1d2 100644 --- a/src/finn/builder/build_dataflow.py +++ b/src/finn/builder/build_dataflow.py @@ -160,7 +160,7 @@ def build_dataflow_cfg(model_filename, cfg: DataflowBuildConfig): # restore stdout/stderr 
sys.stdout = stdout_orig sys.stderr = stderr_orig - time_per_step[step_name] = step_end - step_start + time_per_step[step_name] = round(step_end - step_start) chkpt_name = "%s.onnx" % (step_name) if cfg.save_intermediate_models: intermediate_model_dir = cfg.output_dir + "/intermediate_models" @@ -183,7 +183,8 @@ def build_dataflow_cfg(model_filename, cfg: DataflowBuildConfig): print("Build failed") return -1 - with open(cfg.output_dir + "/time_per_step.json", "w") as f: + time_per_step["total_build_time"] = sum(time_per_step.values()) + with open(cfg.output_dir + "/report/time_per_step.json", "w") as f: json.dump(time_per_step, f, indent=2) print("Completed successfully") return 0 diff --git a/src/finn/builder/build_dataflow_config.py b/src/finn/builder/build_dataflow_config.py index 448c6e5c4e..a3db23a714 100644 --- a/src/finn/builder/build_dataflow_config.py +++ b/src/finn/builder/build_dataflow_config.py @@ -161,16 +161,16 @@ class DataflowBuildConfig: """ #: Directory where the final build outputs will be written into - output_dir: str + output_dir: Optional[str] = None #: Target clock frequency (in nanoseconds) for Vivado synthesis. #: e.g. synth_clk_period_ns=5.0 will target a 200 MHz clock. #: If hls_clk_period_ns is not specified it will default to this value. - synth_clk_period_ns: float + synth_clk_period_ns: Optional[float] = None #: Which output(s) to generate from the build flow. See documentation of #: DataflowOutputType for available options. - generate_outputs: List[DataflowOutputType] + generate_outputs: Optional[List[DataflowOutputType]] = None #: (Optional) Path to configuration JSON file in which user can specify #: a preferred implementation style (HLS or RTL) for each node. 
From 6e2c379c095723489e21ba6c82967fc10284eb6a Mon Sep 17 00:00:00 2001 From: Felix Jentzsch Date: Fri, 28 Feb 2025 13:21:42 +0000 Subject: [PATCH 053/125] Refactor microbenchmarks to use normal builder flow --- benchmarking/bench.py | 8 +- benchmarking/bench_base.py | 372 +++++++++-------------- benchmarking/cfg/mvau_test.json | 2 +- benchmarking/cfg/synthetic_fifotest.json | 4 +- benchmarking/collect.py | 8 + benchmarking/dut/mvau.py | 47 ++- benchmarking/dut/transformer.py | 8 +- 7 files changed, 194 insertions(+), 255 deletions(-) diff --git a/benchmarking/bench.py b/benchmarking/bench.py index 686c97ddc2..485c64bb76 100644 --- a/benchmarking/bench.py +++ b/benchmarking/bench.py @@ -10,19 +10,15 @@ from dut.resnet50 import bench_resnet50 from dut.metafi import bench_metafi from dut.synthetic_nonlinear import bench_synthetic_nonlinear +from dut.transformer import bench_transformer dut = dict() dut["mvau"] = bench_mvau dut["resnet50"] = bench_resnet50 dut["metafi"] = bench_metafi dut["synthetic_nonlinear"] = bench_synthetic_nonlinear +dut["transformer"] = bench_transformer -# TODO: remove guard once transformer support has been fully merged -try: - from dut.transformer import bench_transformer - dut["transformer"] = bench_transformer -except ImportError: - pass def main(config_name): exit_code = 0 diff --git a/benchmarking/bench_base.py b/benchmarking/bench_base.py index 636af6bb5e..edc2e67d4d 100644 --- a/benchmarking/bench_base.py +++ b/benchmarking/bench_base.py @@ -126,6 +126,7 @@ def __init__(self, params, task_id, run_id, artifacts_dir, save_dir, debug=True) self.debug = debug #TODO: setup a logger so output can go to console (with task id prefix) and log simultaneously + #TODO: coordinate with new builder loggin setup # General configuration # TODO: do not allow multiple targets in a single bench job due to measurement? 
@@ -199,204 +200,153 @@ def save_local_artifacts_collection(self): # this should be called upon successful or failed completion of a run for (name, source_path) in self.local_artifacts_collection: self.save_local_artifact(name, source_path) - - # only used in simple flow (TODO: unify) - def step_make_model(self): - pass - # only used in full build flow + # must be defined by subclass def step_export_onnx(self): pass - # only used in full build flow + # must be defined by subclass def step_build_setup(self): pass - # defaults to full build flow - # may be overwritten by subclass (e.g., to call simple flow instead) + # defaults to normal build flow, may be overwritten by subclass def run(self): self.steps_full_build_flow() - def step_finn_estimate(self): - # Gather FINN estimates - print("Gathering FINN estimates") - - model = self.model_initial - finn_resources_model = res_estimation(model, fpgapart=self.part) - finn_cycles_model = model.analysis(exp_cycles_per_layer) - if self.target_node: - node = model.get_nodes_by_op_type(self.target_node)[0] - finn_resources = finn_resources_model[node.name] - finn_cycles = finn_cycles_model[node.name] - else: - finn_resources = finn_resources_model # TODO: aggregate? - finn_cycles = 0 # TODO: aggregate or drop - finn_estimates = finn_resources - finn_estimates["CYCLES"] = finn_cycles - self.output_dict["finn_estimates"] = finn_estimates - - def step_hls(self): - # Perform Vitis HLS synthesis for HLS resource/performance reports - start_time = time.time() - print("Performing Vitis HLS synthesis") - model = self.model_initial - model = model.transform(PrepareIP(self.part, self.clock_period_ns)) - model = model.transform(HLSSynthIP()) - - hls_resources_model = model.analysis(hls_synth_res_estimation) - if self.target_node: - node = model.get_nodes_by_op_type(self.target_node)[0] - hls_resources = hls_resources_model[node.name] - else: - hls_resources = hls_resources_model # TODO: aggregate? 
- self.output_dict["hls_estimates"] = hls_resources - self.output_dict["hls_time"] = int(time.time() - start_time) - - self.model_step_hls = copy.deepcopy(model) - - def step_rtlsim(self): - # Perform RTL simulation for performance measurement - start_time = time.time() - print("Performing Verilator RTL simulation (n=1)") - # Prepare - model = self.model_step_hls - model = model.transform(SetExecMode("rtlsim")) - model = model.transform(PrepareRTLSim()) - # Generate input data - input_tensor = model.graph.input[0] - input_shape = model.get_tensor_shape(input_tensor.name) - input_dtype = model.get_tensor_datatype(input_tensor.name) - x = gen_finn_dt_tensor(input_dtype, input_shape) - input_dict = prepare_inputs(x, input_dtype, None) # TODO: fix Bipolar conversion case - # Run - oxe.execute_onnx(model, input_dict)["outp"] # do not check output for correctness TODO: add functional verification throughout benchmarking steps - # Log result - node = model.get_nodes_by_op_type("MVAU_hls")[0] - inst = getCustomOp(node) - rtlsim_cycles = inst.get_nodeattr("cycles_rtlsim") - self.output_dict["rtlsim_cycles"] = rtlsim_cycles - self.output_dict["rtlsim_time"] = int(time.time() - start_time) - - def step_synthesis(self): - # Perform Vivado synthesis for accurate resource/timing and inaccurate power reports - # TODO: avoid duplicate synthesis by using shell build also for post_synth_resources and power sim? - # TODO: check OMX synth strategy again! 
- start_time = time.time() - print("Performing Vivado (stitched-ip, out-of-context) synthesis") - model = self.model_step_hls - model = model.transform(ReplaceVerilogRelPaths()) - model = model.transform(CreateStitchedIP(self.part, self.clock_period_ns)) - model = model.transform(SynthOutOfContext(part=self.part, clk_period_ns=self.clock_period_ns)) - ooc_synth_results = eval(model.get_metadata_prop("res_total_ooc_synth")) - - start_test_batch_fast( - results_path=self.artifacts_dir_power, - project_path=os.path.join( - ooc_synth_results["vivado_proj_folder"], "vivadocompile", "vivadocompile.xpr" - ), - run_target="impl_1", - pairs=[(25, 0.5), (50, 0.5), (75, 0.5)], - ) - - # Log most important power results directly (refer to detailed logs for more) - for reportname in ["25_0.5", "50_0.5", "75_0.5"]: - with open(os.path.join(self.artifacts_dir_power, "%s.json" % reportname), "r") as f: - report = json.load(f) - power = float(report["Summary"]["tables"][0]["Total On-Chip Power (W)"][0]) - power_dyn = float(report["Summary"]["tables"][0]["Dynamic (W)"][0]) - ooc_synth_results["power_%s" % reportname] = power - ooc_synth_results["power_dyn_%s" % reportname] = power_dyn - - self.output_dict["ooc_synth"] = ooc_synth_results - self.output_dict["ooc_synth_time"] = int(time.time() - start_time) - - # Save model for logging purposes - model.save(os.path.join(self.artifacts_dir_models, "model_%d_synthesis.onnx" % (self.run_id))) - self.model_step_synthesis = copy.deepcopy(model) - - def step_sim_power(self): - # Perform Vivado simulation for accurate power report - start_time = time.time() - if "ooc_synth" not in self.output_dict: - print("ERROR: step_sim_power requires step_synthesis") - print("Performing Vivado simulation for power report") - if "rtlsim_cycles" in self.output_dict: - sim_duration_ns = self.output_dict["rtlsim_cycles"] * 3 * self.clock_period_ns - else: - sim_duration_ns = self.output_dict["finn_estimates"]["CYCLES"] * 3 * self.clock_period_ns - - model = 
self.model_step_synthesis - input_tensor = model.graph.input[0] - output_tensor = model.graph.output[0] - input_node_inst = getCustomOp(model.find_consumer(input_tensor.name)) - output_node_inst = getCustomOp(model.find_producer(output_tensor.name)) - sim_power_report( - results_path=self.artifacts_dir_power, - project_path=os.path.join( - self.output_dict["ooc_synth"]["vivado_proj_folder"], "vivadocompile", "vivadocompile.xpr" - ), - in_width=input_node_inst.get_instream_width(), - out_width=output_node_inst.get_outstream_width(), - dtype_width=model.get_tensor_datatype(input_tensor.name).bitwidth(), - sim_duration_ns=sim_duration_ns, - ) - - # Log most important power results directly (refer to detailed logs for more) - for reportname in ["sim"]: - with open(os.path.join(self.artifacts_dir_power, "%s.json" % reportname), "r") as f: - report = json.load(f) - power = float(report["Summary"]["tables"][0]["Total On-Chip Power (W)"][0]) - power_dyn = float(report["Summary"]["tables"][0]["Dynamic (W)"][0]) - self.output_dict["power_%s" % reportname] = power - self.output_dict["power_dyn%s" % reportname] = power_dyn - - self.output_dict["sim_power_time"] = int(time.time() - start_time) - - def step_synth_power(self): - # Perform Vivado synthesis for on-hardware power measurement - start_time = time.time() - if self.model_step_hls is None: - print("ERROR: step_synth_power requires step_hls") - print("Performing Vivado synthesis with test harness integration for power measurement") - - if "dut_duplication" in self.params: - dut_duplication = self.params["dut_duplication"] - else: - dut_duplication = 1 - - model = self.model_step_hls.transform(ReplaceVerilogRelPaths()) - model = model.transform(CreateStitchedIP(self.part, self.clock_period_ns)) - - build_dir = "temp_output_harness_build" - # TODO: replace hold harness with new instr wrapper implementation - #TODO: if synth fails this could contain stale bitstreams which will be power tested - # model = model.transform( - # 
MakeZYNQHarnessProject( - # platform=self.board, - # output_dir=build_dir, - # dut_duplication=dut_duplication, - # clock_period_ns=self.clock_period_ns - # ) - # ) - - # COPY bitstreams and other outputs - # TODO: integrate better (e.g. as artifact) and remove redundant copy - # TODO: make this more configurable or switch to job/artifact based power measurement - shcopy(os.path.join(build_dir, "top_wrapper.bit"), - os.path.join(self.save_dir_bitstreams, "run_%d.bit" % self.run_id)) - shcopy(os.path.join(build_dir, "top.hwh"), - os.path.join(self.save_dir_bitstreams, "run_%d.hwh" % self.run_id)) - shcopy(os.path.join(build_dir, "synth_report.xml"), - os.path.join(self.save_dir_bitstreams, "run_%d.xml" % self.run_id)) - clock_period_mhz = int(1.0 / self.clock_period_ns * 1000.0) - measurement_settings = {"freq_mhz": clock_period_mhz} - with open(os.path.join(self.save_dir_bitstreams, "run_%d_settings.json"%self.run_id), "w") as f: - json.dump(measurement_settings, f, indent=2) - - self.output_dict["synth_power_time"] = int(time.time() - start_time) - - # Save model for logging purposes - model.save(os.path.join(self.artifacts_dir_models, "model_%d_synth_power.onnx" % (self.run_id))) + # def step_finn_estimate(self): + # # Gather FINN estimates + # print("Gathering FINN estimates") + + # model = self.model_initial + # finn_resources_model = res_estimation(model, fpgapart=self.part) + # finn_cycles_model = model.analysis(exp_cycles_per_layer) + # if self.target_node: + # node = model.get_nodes_by_op_type(self.target_node)[0] + # finn_resources = finn_resources_model[node.name] + # finn_cycles = finn_cycles_model[node.name] + # else: + # finn_resources = finn_resources_model # TODO: aggregate? 
+ # finn_cycles = 0 # TODO: aggregate or drop + # finn_estimates = finn_resources + # finn_estimates["CYCLES"] = finn_cycles + # self.output_dict["finn_estimates"] = finn_estimates + + # def step_hls(self): + # # Perform Vitis HLS synthesis for HLS resource/performance reports + # start_time = time.time() + # print("Performing Vitis HLS synthesis") + # model = self.model_initial + # model = model.transform(PrepareIP(self.part, self.clock_period_ns)) + # model = model.transform(HLSSynthIP()) + + # hls_resources_model = model.analysis(hls_synth_res_estimation) + # if self.target_node: + # node = model.get_nodes_by_op_type(self.target_node)[0] + # hls_resources = hls_resources_model[node.name] + # else: + # hls_resources = hls_resources_model # TODO: aggregate? + # self.output_dict["hls_estimates"] = hls_resources + # self.output_dict["hls_time"] = int(time.time() - start_time) + + # self.model_step_hls = copy.deepcopy(model) + + # def step_rtlsim(self): + # # Perform RTL simulation for performance measurement + # start_time = time.time() + # print("Performing Verilator RTL simulation (n=1)") + # # Prepare + # model = self.model_step_hls + # model = model.transform(SetExecMode("rtlsim")) + # model = model.transform(PrepareRTLSim()) + # # Generate input data + # input_tensor = model.graph.input[0] + # input_shape = model.get_tensor_shape(input_tensor.name) + # input_dtype = model.get_tensor_datatype(input_tensor.name) + # x = gen_finn_dt_tensor(input_dtype, input_shape) + # input_dict = prepare_inputs(x, input_dtype, None) # TODO: fix Bipolar conversion case + # # Run + # oxe.execute_onnx(model, input_dict)["outp"] # do not check output for correctness TODO: add functional verification throughout benchmarking steps + # # Log result + # node = model.get_nodes_by_op_type("MVAU_hls")[0] + # inst = getCustomOp(node) + # rtlsim_cycles = inst.get_nodeattr("cycles_rtlsim") + # self.output_dict["rtlsim_cycles"] = rtlsim_cycles + # self.output_dict["rtlsim_time"] = 
int(time.time() - start_time) + +# TODO: re-introduce simple Vivado power estimation as new builder step + # def step_synthesis(self): + # # Perform Vivado synthesis for accurate resource/timing and inaccurate power reports + # start_time = time.time() + # print("Performing Vivado (stitched-ip, out-of-context) synthesis") + # model = self.model_step_hls + # model = model.transform(ReplaceVerilogRelPaths()) + # model = model.transform(CreateStitchedIP(self.part, self.clock_period_ns)) + # model = model.transform(SynthOutOfContext(part=self.part, clk_period_ns=self.clock_period_ns)) + # ooc_synth_results = eval(model.get_metadata_prop("res_total_ooc_synth")) + + # start_test_batch_fast( + # results_path=self.artifacts_dir_power, + # project_path=os.path.join( + # ooc_synth_results["vivado_proj_folder"], "vivadocompile", "vivadocompile.xpr" + # ), + # run_target="impl_1", + # pairs=[(25, 0.5), (50, 0.5), (75, 0.5)], + # ) + + # # Log most important power results directly (refer to detailed logs for more) + # for reportname in ["25_0.5", "50_0.5", "75_0.5"]: + # with open(os.path.join(self.artifacts_dir_power, "%s.json" % reportname), "r") as f: + # report = json.load(f) + # power = float(report["Summary"]["tables"][0]["Total On-Chip Power (W)"][0]) + # power_dyn = float(report["Summary"]["tables"][0]["Dynamic (W)"][0]) + # ooc_synth_results["power_%s" % reportname] = power + # ooc_synth_results["power_dyn_%s" % reportname] = power_dyn + + # self.output_dict["ooc_synth"] = ooc_synth_results + # self.output_dict["ooc_synth_time"] = int(time.time() - start_time) + + # # Save model for logging purposes + # model.save(os.path.join(self.artifacts_dir_models, "model_%d_synthesis.onnx" % (self.run_id))) + # self.model_step_synthesis = copy.deepcopy(model) + +# TODO: re-introduce sim-based Vivado power estimation as new builder step + # def step_sim_power(self): + # # Perform Vivado simulation for accurate power report + # start_time = time.time() + # if "ooc_synth" not in 
self.output_dict: + # print("ERROR: step_sim_power requires step_synthesis") + # print("Performing Vivado simulation for power report") + # if "rtlsim_cycles" in self.output_dict: + # sim_duration_ns = self.output_dict["rtlsim_cycles"] * 3 * self.clock_period_ns + # else: + # sim_duration_ns = self.output_dict["finn_estimates"]["CYCLES"] * 3 * self.clock_period_ns + + # model = self.model_step_synthesis + # input_tensor = model.graph.input[0] + # output_tensor = model.graph.output[0] + # input_node_inst = getCustomOp(model.find_consumer(input_tensor.name)) + # output_node_inst = getCustomOp(model.find_producer(output_tensor.name)) + # sim_power_report( + # results_path=self.artifacts_dir_power, + # project_path=os.path.join( + # self.output_dict["ooc_synth"]["vivado_proj_folder"], "vivadocompile", "vivadocompile.xpr" + # ), + # in_width=input_node_inst.get_instream_width(), + # out_width=output_node_inst.get_outstream_width(), + # dtype_width=model.get_tensor_datatype(input_tensor.name).bitwidth(), + # sim_duration_ns=sim_duration_ns, + # ) + + # # Log most important power results directly (refer to detailed logs for more) + # for reportname in ["sim"]: + # with open(os.path.join(self.artifacts_dir_power, "%s.json" % reportname), "r") as f: + # report = json.load(f) + # power = float(report["Summary"]["tables"][0]["Total On-Chip Power (W)"][0]) + # power_dyn = float(report["Summary"]["tables"][0]["Dynamic (W)"][0]) + # self.output_dict["power_%s" % reportname] = power + # self.output_dict["power_dyn%s" % reportname] = power_dyn + + # self.output_dict["sim_power_time"] = int(time.time() - start_time) def step_parse_builder_output(self, build_dir): # Used to parse selected reports/logs into the output json dict for DUTs that use a full FINN builder flow @@ -456,46 +406,6 @@ def step_parse_builder_output(self, build_dir): else: pass #TODO: warn/skip? 
- def steps_simple_model_flow(self): - # Default step sequence for benchmarking a simple model (mostly single operators/custom_ops) - do_hls = self.params["do_hls"] if "do_hls" in self.params else False - do_rtlsim = self.params["do_rtlsim"] if "do_rtlsim" in self.params else False - do_synthesis = self.params["do_synthesis"] if "do_synthesis" in self.params else False - do_sim_power = self.params["do_sim_power"] if "do_sim_power" in self.params else False - do_synth_power = self.params["do_synth_power"] if "do_synth_power" in self.params else False - - # Perform steps - make_model_result = self.step_make_model() - if make_model_result is None: - return - else: - model, dut_info = make_model_result - - # Save model for logging purposes - # TODO: benchmarking infrastructure could be integrated deeper into ONNX IR and FINN custom_op/transformation infrastructure - # E.g. parameters and paths could be stored as onnx attributes and benchmarking steps as generic or specialized custom_op transformations - model.save(os.path.join(self.artifacts_dir_models, "model_%d_initial.onnx" % (self.run_id))) - - # Save model for use in other steps - self.model_initial = model - - # Log dict reported by DUT-specific scripts to overall result dict - # E.g. 
this could contain SIMD/PE derived from folding factors or weight distribution information - self.output_dict["info"] = dut_info - - self.step_finn_estimate() - - if do_hls: - self.step_hls() - if do_rtlsim: - self.step_rtlsim() - if do_synthesis: - self.step_synthesis() - if do_sim_power: - self.step_sim_power() - #if do_synth_power: - # self.step_synth_power() - def steps_full_build_flow(self): # Default step sequence for benchmarking a full FINN builder flow @@ -510,6 +420,7 @@ def steps_full_build_flow(self): self.local_artifacts_collection.append(("build_output", self.build_inputs["build_dir"])) ### MODEL CREATION/IMPORT ### + # TODO: track fixed input onnx models with DVC if "model_dir" in self.params: # input ONNX model and verification input/output pairs are provided model_dir = self.params["model_dir"] @@ -521,7 +432,9 @@ def steps_full_build_flow(self): else: # input ONNX model (+ optional I/O pair for verification) will be generated self.build_inputs["onnx_path"] = os.path.join(tmp_buildflow_dir, "model_export.onnx") - self.step_export_onnx(self.build_inputs["onnx_path"]) + if self.step_export_onnx(self.build_inputs["onnx_path"]) == "skipped": + # microbenchmarks might skip because no valid model can be generated for given params + return self.save_local_artifact("model_step_export", self.build_inputs["onnx_path"]) if "folding_path" in self.params: @@ -543,6 +456,7 @@ def steps_full_build_flow(self): else: cfg.shell_flow_type=build_cfg.ShellFlowType.VIVADO_ZYNQ # enable extra performance optimizations (physopt) + # TODO: check OMX synth strategy again! 
cfg.vitis_opt_strategy=build_cfg.VitisOptStrategy.PERFORMANCE_BEST cfg.verbose = False cfg.enable_build_pdb_debug = False diff --git a/benchmarking/cfg/mvau_test.json b/benchmarking/cfg/mvau_test.json index d4cb2072be..07fd52cc2f 100644 --- a/benchmarking/cfg/mvau_test.json +++ b/benchmarking/cfg/mvau_test.json @@ -27,6 +27,6 @@ "dut_duplication": [1], - "output_products": [["bitfile", "pynq_driver", "deployment_package"]] ## + "output_products": [["estimate_reports", "stitched_ip", "rtlsim_performance", "out_of_context_synth", "bitfile", "pynq_driver", "deployment_package"]] } ] diff --git a/benchmarking/cfg/synthetic_fifotest.json b/benchmarking/cfg/synthetic_fifotest.json index 1b40feb9e8..dfc63c6240 100644 --- a/benchmarking/cfg/synthetic_fifotest.json +++ b/benchmarking/cfg/synthetic_fifotest.json @@ -38,7 +38,7 @@ "fifo_method": ["characterize"], "fifo_strategy": ["analytical", "rtlsim"], - "output_products": [["rtlsim_performance", "bitfile", "pynq_driver", "deployment_package"]] + "output_products": [["stitched_ip", "rtlsim_performance", "bitfile", "pynq_driver", "deployment_package"]] }, { "dut": ["synthetic_nonlinear"], @@ -59,6 +59,6 @@ "fifo_method": ["largefifo_rtlsim"], "fifo_rtlsim_n": [2], - "output_products": [["rtlsim_performance", "bitfile", "pynq_driver", "deployment_package"]] + "output_products": [["stitched_ip", "rtlsim_performance", "bitfile", "pynq_driver", "deployment_package"]] } ] \ No newline at end of file diff --git a/benchmarking/collect.py b/benchmarking/collect.py index 7ba7dc4cb0..fa582d399a 100644 --- a/benchmarking/collect.py +++ b/benchmarking/collect.py @@ -88,6 +88,7 @@ def wait_for_power_measurements(): for run in combined_log: with Live(exp_message="Job result collected by GitLab CI", cache_images=True) as live: + #TODO: add pipeline info to metadata (or as metric or other annotation?) 
metadata = { "metadata": { "run_id": run["run_id"], @@ -99,6 +100,13 @@ def wait_for_power_measurements(): live.log_params(metadata) live.log_params(run["params"]) + # TODO: for microbenchmarks, only summarize results for target node (or surrounding SDP?) (see old step_finn_estimate) + + # OOC synth resource report (step_out_of_context_synthesis) + + # shell synth resource report (step_synthesize_bitfile) + + if "builder" in run["output"]: for key in run["output"]["builder"]: live.log_metric("Resources/" + key, run["output"]["builder"][key], plot=False) diff --git a/benchmarking/dut/mvau.py b/benchmarking/dut/mvau.py index a41eec694b..f62c6b59a7 100644 --- a/benchmarking/dut/mvau.py +++ b/benchmarking/dut/mvau.py @@ -1,6 +1,7 @@ import math import numpy as np +import json from onnx import TensorProto, helper from qonnx.core.datatype import DataType from qonnx.core.modelwrapper import ModelWrapper @@ -19,6 +20,8 @@ from finn.transformation.fpgadataflow.minimize_weight_bit_width import ( MinimizeWeightBitWidth, ) +import finn.builder.build_dataflow_config as build_cfg + from bench_base import bench class bench_mvau(bench): @@ -123,7 +126,7 @@ def _make_single_mvau_model( return model - def step_make_model(self): + def step_export_onnx(self, onnx_export_path): # Read params idt = self.params["idt"] wdt = self.params["wdt"] @@ -157,10 +160,10 @@ def step_make_model(self): pe = mh // nf if mw % simd != 0 or mh % pe != 0: print("Invalid simd/pe configuration, skipping") - return + return "skipped" if m > 1 and (simd != mw or pe != mh): print("M > 1 not possible for non-max simd/pe, skipping") - return + return "skipped" output_dict["simd"] = simd output_dict["pe"] = pe @@ -178,11 +181,11 @@ def step_make_model(self): if "sparsity_amount" in self.params: if self.params["sparsity_amount"] > 0: print("sparsity amount > 0 not applicable for none sparsity, skipping") - return + return "skipped" else: if self.params["sparsity_amount"] == 0: print("sparsity amount = 0 not 
applicable for selected sparsity, skipping") - return + return "skipped" if sparsity_type == "unstructured": idx = np.random.choice( mw * mh, size=int(self.params["sparsity_amount"] * mw * mh), replace=False @@ -207,7 +210,7 @@ def step_make_model(self): ) else: print("regular sparsity only applicable for amount 0.25/0.5/0.75, skipping") - return + return "skipped" W[idx_mw, :] = 0.0 elif sparsity_type == "cols_regular": if self.params["sparsity_amount"] == 0.25: @@ -220,7 +223,7 @@ def step_make_model(self): ) else: print("regular sparsity only applicable for amount 0.25/0.5/0.75, skipping") - return + return "skipped" W[:, idx_mh] = 0.0 else: @@ -289,7 +292,31 @@ def step_make_model(self): inst = getCustomOp(node) self.target_node = "MVAU_hls" # display results of analysis passes only for the first occurence of this op type - return model, output_dict - def run(self): - self.steps_simple_model_flow() + # log additional info about the generated model (e.g. SIMD/PE or sparsity) + with open(self.build_inputs["build_dir"] + "/report/dut_info.json", "w") as f: + json.dump(output_dict, f, indent=2) + + # TODO: also generate golden I/O pair for further verification steps + model.save(onnx_export_path) + + def step_build_setup(self): + # create build config for synthetic microbenchmark models + cfg = build_cfg.DataflowBuildConfig( + # manual folding + target_fps=None, + steps=[ + "step_create_dataflow_partition", + "step_minimize_bit_width", + "step_generate_estimate_reports", + "step_hw_codegen", + "step_hw_ipgen", + "step_create_stitched_ip", + "step_measure_rtlsim_performance", + "step_out_of_context_synthesis", + "step_synthesize_bitfile", + "step_make_pynq_driver", + "step_deployment_package", + ] + ) + return cfg diff --git a/benchmarking/dut/transformer.py b/benchmarking/dut/transformer.py index 87522ad2e5..2beca913c7 100644 --- a/benchmarking/dut/transformer.py +++ b/benchmarking/dut/transformer.py @@ -12,22 +12,16 @@ QuantLinear, QuantReLU ) -import os -from 
qonnx.core.modelwrapper import ModelWrapper # Progressbar from tqdm import trange import numpy as np from brevitas.export import export_qonnx import random import json -import subprocess -from util import summarize_table, summarize_section, power_xml_to_dict, prepare_inputs, delete_dir_contents # FINN dataflow builder -import finn.builder.build_dataflow as build import finn.builder.build_dataflow_config as build_cfg from finn.builder.build_dataflow_config import AutoFIFOSizingMethod -from bench_base import bench, step_synth_harness -from finn.util.basic import alveo_part_map +from bench_base import bench # Range information structure for seeding the range analysis for converting # quantized activations to MultiThreshold From c19289b060330047f6b1cd399f0f56b4bf49ccb1 Mon Sep 17 00:00:00 2001 From: Felix Jentzsch Date: Fri, 28 Feb 2025 15:54:59 +0000 Subject: [PATCH 054/125] Refactor artifact handling and upload of key metrics --- benchmarking/bench-ci.yml | 3 + benchmarking/bench.py | 7 +- benchmarking/bench_base.py | 113 ++++++----------- benchmarking/collect.py | 147 ++++++++++++++++++++--- src/finn/builder/build_dataflow_steps.py | 3 +- 5 files changed, 178 insertions(+), 95 deletions(-) diff --git a/benchmarking/bench-ci.yml b/benchmarking/bench-ci.yml index f62f2eb35a..c6d2c6bc91 100644 --- a/benchmarking/bench-ci.yml +++ b/benchmarking/bench-ci.yml @@ -49,6 +49,9 @@ Result Collection: stage: collect tags: - image_build + rules: + # Also run on failure of previous tasks to collect partial results + - when: always script: - python3.10 benchmarking/collect.py bench_artifacts/tasks_output bench_results.json - dvc exp push git@github.com:eki-project/finn-plus.git diff --git a/benchmarking/bench.py b/benchmarking/bench.py index 485c64bb76..fb890332b9 100644 --- a/benchmarking/bench.py +++ b/benchmarking/bench.py @@ -161,11 +161,14 @@ def get_default_session_options_new(): log_dict["total_time"] = int(time.time() - start_time) log_dict["output"] = 
bench_object.output_dict log.append(log_dict) + # TODO: save this meta data into run-level reports dir insted of task*.json # overwrite output log file every time to allow early abort with open(log_path, "w") as f: json.dump(log, f, indent=2) - - # save local artifacts of this run (e.g., detailed debug info) + + # save GitLab artifacts of this run (e.g., reports and deployment package) + bench_object.save_artifacts_collection() + # save local artifacts of this run (e.g., full build dir, detailed debug info) bench_object.save_local_artifacts_collection() print("Stopping job") return exit_code diff --git a/benchmarking/bench_base.py b/benchmarking/bench_base.py index edc2e67d4d..eef9edd721 100644 --- a/benchmarking/bench_base.py +++ b/benchmarking/bench_base.py @@ -6,6 +6,7 @@ import time import traceback import glob +import shutil import numpy as np from shutil import copy as shcopy from shutil import copytree @@ -134,6 +135,7 @@ def __init__(self, params, task_id, run_id, artifacts_dir, save_dir, debug=True) self.board = params["board"] else: self.board = "RFSoC2x2" + self.params["board"] = self.board if "part" in params: self.part = params["part"] @@ -146,60 +148,53 @@ def __init__(self, params, task_id, run_id, artifacts_dir, save_dir, debug=True) self.clock_period_ns = params["clock_period_ns"] else: self.clock_period_ns = 10 + self.params["clock_period_ns"] = self.clock_period_ns # Clear FINN tmp build dir before every run (to avoid excessive ramdisk usage and duplicate debug artifacts) print("Clearing FINN BUILD DIR ahead of run") delete_dir_contents(os.environ["FINN_BUILD_DIR"]) - # Initialize output directories (might exist from other runs of the same job) - self.artifacts_dir_models = os.path.join(self.artifacts_dir, "models") - os.makedirs(self.artifacts_dir_models, exist_ok=True) - self.artifacts_dir_power = os.path.join(self.artifacts_dir, "power_vivado", "run_%d" % (self.run_id)) - os.makedirs(self.artifacts_dir_power, exist_ok=True) - - 
self.save_dir_bitstreams = os.path.join(self.save_dir, "bitstreams") - os.makedirs(self.save_dir_bitstreams, exist_ok=True) - - # Intermediate models saved between steps - # TODO: create setter functions for intermediate models or other artifacts that log them to gitlab artifacts or local dir automatically - self.model_initial = None - self.model_step_hls = None - self.model_step_synthesis = None - # Initialize dictionary to collect all benchmark results + # TODO: remove completely or only use for meta data, actual results go into run-specific .json files within /report self.output_dict = {} # Inputs (e.g., ONNX model, golden I/O pair, folding config, etc.) for custom FINN build flow self.build_inputs = {} - # Collect tuples of (name, source path) to save as local artifacts upon run completion or fail by exception + # Collect tuples of (name, source path, archive?) to save as pipeline artifacts upon run completion or fail by exception + self.artifacts_collection = [] + + # Collect tuples of (name, source path, archive?) 
to save as local artifacts upon run completion or fail by exception self.local_artifacts_collection = [] if self.debug: # Save entire FINN build dir and working dir # TODO: add option to only save upon exception (in FINN builder or benchmarking infrastructure) - self.local_artifacts_collection.append(("finn_tmp", os.environ["FINN_BUILD_DIR"])) - self.local_artifacts_collection.append(("finn_cwd", os.environ["FINN_ROOT"])) + self.local_artifacts_collection.append(("debug_finn_tmp", os.environ["FINN_BUILD_DIR"], False)) + self.local_artifacts_collection.append(("debug_finn_cwd", os.environ["FINN_ROOT"], False)) - def save_artifact(self, name, source_path): - target_path = os.path.join(self.artifacts_dir, name, "run_%d" % (self.run_id)) - os.makedirs(target_path, exist_ok=True) + def save_artifact(self, target_path, source_path, archive=False): if os.path.isdir(source_path): - copytree(source_path, target_path, dirs_exist_ok=True) - else: + if archive: + os.makedirs(os.path.dirname(target_path), exist_ok=True) + shutil.make_archive(target_path, "zip", source_path) + else: + os.makedirs(target_path, exist_ok=True) + copytree(source_path, target_path, dirs_exist_ok=True) + elif os.path.isfile(source_path): + os.makedirs(target_path, exist_ok=True) shcopy(source_path, target_path) - def save_local_artifact(self, name, source_path): - target_path = os.path.join(self.save_dir, name, "run_%d" % (self.run_id)) - os.makedirs(target_path, exist_ok=True) - if os.path.isdir(source_path): - copytree(source_path, target_path, dirs_exist_ok=True) - else: - shcopy(source_path, target_path) + def save_artifacts_collection(self): + # this should be called upon successful or failed completion of a run + for (name, source_path, archive) in self.artifacts_collection: + target_path = os.path.join(self.artifacts_dir, "runs_output", "run_%d" % (self.run_id), name) + self.save_artifact(target_path, source_path, archive) def save_local_artifacts_collection(self): # this should be called upon 
successful or failed completion of a run - for (name, source_path) in self.local_artifacts_collection: - self.save_local_artifact(name, source_path) + for (name, source_path, archive) in self.local_artifacts_collection: + target_path = os.path.join(self.save_dir, name, "run_%d" % (self.run_id)) + self.save_artifact(target_path, source_path, archive) # must be defined by subclass def step_export_onnx(self): @@ -349,26 +344,7 @@ def run(self): # self.output_dict["sim_power_time"] = int(time.time() - start_time) def step_parse_builder_output(self, build_dir): - # Used to parse selected reports/logs into the output json dict for DUTs that use a full FINN builder flow - - ### SAVE BITSTREAMS ### - if (os.path.exists(os.path.join(build_dir, "harness"))): - # TODO: integrate better (e.g. as artifact) and remove redundant copy - # TODO: make this more configurable or switch to job/artifact based power measurement - # TODO: make compatible to new instr wrapper (or however we generate these outputs) - shcopy(os.path.join(build_dir, "harness/top_wrapper.bit"), - os.path.join(self.save_dir_bitstreams, "run_%d.bit" % self.run_id)) - shcopy(os.path.join(build_dir, "harness/top.hwh"), - os.path.join(self.save_dir_bitstreams, "run_%d.hwh" % self.run_id)) - shcopy(os.path.join(build_dir, "harness/synth_report.xml"), - os.path.join(self.save_dir_bitstreams, "run_%d.xml" % self.run_id)) - clock_period_mhz = int(1.0 / self.clock_period_ns * 1000.0) - measurement_settings = {"freq_mhz": clock_period_mhz} - with open(os.path.join(self.save_dir_bitstreams, "run_%d_settings.json"%self.run_id), "w") as f: - json.dump(measurement_settings, f, indent=2) - else: - pass #TODO: warn/skip? 
- + # TODO: output as .json or even add as new build step ### CHECK FOR VERIFICATION STEP SUCCESS ### if (os.path.exists(os.path.join(build_dir, "verification_output"))): # Collect all verification output filenames @@ -381,30 +357,7 @@ def step_parse_builder_output(self, build_dir): # Construct a dictionary reporting the verification status as string self.output_dict["builder_verification"] = {"verification": {True: "success", False: "fail"}[status]} - # TODO: mark job as failed if verification fails - else: - pass #TODO: warn/skip? - - ### PARSE SYNTH RESOURCE REPORT ### - if (os.path.exists(os.path.join(build_dir, "harness/post_synth_resources.json"))): - report_path = os.path.join(build_dir, "harness/post_synth_resources.json") - # TODO: check multiple possible sources for this log (e.g. if OOC synth or Zynbuild was run) - report_filter = "(top)" - # Open the report file - with open(report_path) as file: - # Load the JSON formatted report - report = pd.read_json(file, orient="index") - # Filter the reported rows according to some regex filter rule - report = report.filter(regex=report_filter, axis="rows") - # Generate a summary of the total resources - summary = report.sum() - - #TODO: parse finn estimates, hls estimates, step times, rtlsim performance(rtlsim n=1, n=100) - #TODO: optional simulation of instr wrapper instead of running on hw - - self.output_dict["builder"] = summary.to_dict() - else: - pass #TODO: warn/skip? + # TODO: mark job as failed if verification fails? 
def steps_full_build_flow(self): # Default step sequence for benchmarking a full FINN builder flow @@ -417,7 +370,13 @@ def steps_full_build_flow(self): delete_dir_contents(tmp_buildflow_dir) self.build_inputs["build_dir"] = os.path.join(tmp_buildflow_dir, "build_output") os.makedirs(self.build_inputs["build_dir"], exist_ok=True) - self.local_artifacts_collection.append(("build_output", self.build_inputs["build_dir"])) + + # Save full build dir as local artifact + self.local_artifacts_collection.append(("build_output", self.build_inputs["build_dir"], False)) + # Save reports and deployment package as pipeline artifacts + self.artifacts_collection.append(("reports", os.path.join(self.build_inputs["build_dir"], "report"), False)) + self.artifacts_collection.append(("reports", os.path.join(self.build_inputs["build_dir"], "build_dataflow.log"), False)) + self.artifacts_collection.append(("deploy", os.path.join(self.build_inputs["build_dir"], "deploy"), True)) ### MODEL CREATION/IMPORT ### # TODO: track fixed input onnx models with DVC diff --git a/benchmarking/collect.py b/benchmarking/collect.py index fa582d399a..27a298acea 100644 --- a/benchmarking/collect.py +++ b/benchmarking/collect.py @@ -76,19 +76,45 @@ def wait_for_power_measurements(): time.sleep(60) print("Power measurement complete") +def open_json_report(id, report_name): + path = os.path.join("bench_artifacts", "runs_output", "run_%d" % (id), "reports", report_name) + if os.path.isfile(path): + with open(path, "r") as f: + report = json.load(f) + return report + else: + return None + +def log_metrics_from_report(id, live, report_name, keys, prefix=""): + report = open_json_report(id, report_name) + if report: + for key in keys: + if key in report: + live.log_metric(prefix + key, report[key], plot=False) + +def log_nested_metrics_from_report(id, live, report_name, key_top, keys, prefix=""): + report = open_json_report(id, report_name) + if report: + if key_top in report: + for key in keys: + if key in 
report[key_top]: + live.log_metric(prefix + key, report[key_top][key], plot=False) + if __name__ == "__main__": print("Consolidating synthesis results from all sub-jobs of the array") consolidate_logs(sys.argv[1], sys.argv[2]) + # TODO: remove task-level .json logs and GitLab artifacts of this job? - # TEST DVC - # TODO: proper metric collection directly from .jsons in report build dir + ### PUSH RESULTS TO DVC ### combined_log = [] with open(sys.argv[2], "r") as f: combined_log = json.load(f) for run in combined_log: + id = run["run_id"] with Live(exp_message="Job result collected by GitLab CI", cache_images=True) as live: - #TODO: add pipeline info to metadata (or as metric or other annotation?) + ### PARAMS ### + #TODO: add pipeline info and FINN configuration (e.g. tool versions) to metadata (or as metric or other annotation?) metadata = { "metadata": { "run_id": run["run_id"], @@ -98,18 +124,109 @@ def wait_for_power_measurements(): } } live.log_params(metadata) - live.log_params(run["params"]) - - # TODO: for microbenchmarks, only summarize results for target node (or surrounding SDP?) (see old step_finn_estimate) - - # OOC synth resource report (step_out_of_context_synthesis) - - # shell synth resource report (step_synthesize_bitfile) - - - if "builder" in run["output"]: - for key in run["output"]["builder"]: - live.log_metric("Resources/" + key, run["output"]["builder"][key], plot=False) + params = { + "params": run["params"] + } + live.log_params(params) + + ### METRICS ### + # TODO: for microbenchmarks, only summarize results for target node (or surrounding SDP?) (see old step_finn_estimate etc.) + # TODO: make all logs consistent at the point of generation (e.g. 
BRAM vs BRAM18 vs BRAM36) + + # estimate_layer_resources.json + log_nested_metrics_from_report(id, live, "estimate_layer_resources.json", "total", [ + "LUT", + "DSP", + "BRAM_18K", + "URAM", + ], prefix="estimate/resources/") + + # estimate_layer_resources_hls.json + log_nested_metrics_from_report(id, live, "estimate_layer_resources_hls.json", "total", [ + "LUT", + "FF", + "DSP", + "DSP48E", + "DSP58E", # TODO: aggregate/unify DSP reporting + "BRAM_18K", + "URAM", + ], prefix="hls_estimate/resources/") + + # estimate_network_performance.json + log_metrics_from_report(id, live, "estimate_network_performance.json", [ + "critical_path_cycles", + "max_cycles", + "max_cycles_node_name", + "estimated_throughput_fps", + "estimated_latency_ns", + ], prefix="estimate/performance/") + + # rtlsim_performance.json + log_metrics_from_report(id, live, "rtlsim_performance.json", [ + "N", + "TIMEOUT", + "latency_cycles", + "cycles", + "fclk[mhz]", + "throughput[images/s]", + "stable_throughput[images/s]", + # add INPUT_DONE, OUTPUT_DONE, number transactions? + ], prefix="rtlsim/performance/") + + # fifo_sizing.json + log_metrics_from_report(id, live, "fifo_sizing.json", ["total_fifo_size_kB"]) + + # ooc_synth_and_timing.json (OOC synth / step_out_of_context_synthesis) + log_metrics_from_report(id, live, "ooc_synth_and_timing.json", [ + "LUT", + "LUTRAM", + "FF", + "DSP", + "BRAM", + "BRAM_18K", + "BRAM_36K", + "URAM", + ], prefix="synth(ooc)/resources/") + log_metrics_from_report(id, live, "ooc_synth_and_timing.json", [ + "WNS", + "fmax_mhz", + # add TNS? what is "delay"? 
+ ], prefix="synth(ooc)/timing/") + + # post_synth_resources.json (shell synth / step_synthesize_bitfile) + log_nested_metrics_from_report(id, live, "post_synth_resources.json", "(top)", [ + "LUT", + "FF", + "SRL", + "DSP", + "BRAM_18K", + "BRAM_36K", + "URAM", + ], prefix="synth/resources/") + + # post synth timing report + # TODO: only exported as post_route_timing.rpt, not .json + + # verification steps + if "output" in run: + if "builder_verification" in run["output"]: + live.log_metric("verification", run["output"]["builder_verification"]["verification"], plot=False) + + # instrumentation measurement + # TODO + + # power measurement + # TODO + + # live fifosizing report + png + # TODO + + # time_per_step.json + log_metrics_from_report(id, live, "time_per_step.json", ["total_build_time"]) + + ### ARTIFACTS ### + # Build reports, as they come from GitLab artifact + live.log_artifact(os.path.join("bench_artifacts", "runs_output", "run_%d" % (id), "reports")) # TODO: disabled for now, update accordingly to new runner-based measurement setup # wait_for_power_measurements() diff --git a/src/finn/builder/build_dataflow_steps.py b/src/finn/builder/build_dataflow_steps.py index 68631346b9..c925a1ac05 100644 --- a/src/finn/builder/build_dataflow_steps.py +++ b/src/finn/builder/build_dataflow_steps.py @@ -530,6 +530,7 @@ def step_hw_ipgen(model: ModelWrapper, cfg: DataflowBuildConfig): report_dir = cfg.output_dir + "/report" os.makedirs(report_dir, exist_ok=True) estimate_layer_resources_hls = model.analysis(hls_synth_res_estimation) + estimate_layer_resources_hls["total"] = aggregate_dict_keys(estimate_layer_resources_hls) with open(report_dir + "/estimate_layer_resources_hls.json", "w") as f: json.dump(estimate_layer_resources_hls, f, indent=2) @@ -651,7 +652,7 @@ def step_set_fifo_depths(model: ModelWrapper, cfg: DataflowBuildConfig): "depth_trigger_uram", "depth_trigger_bram", ] - extract_model_config_to_json(model, cfg.output_dir + "/final_hw_config.json", hw_attrs) 
+ extract_model_config_to_json(model, cfg.output_dir + "/report/final_hw_config.json", hw_attrs) # perform FIFO splitting and shallow FIFO removal only after the final config # json file has been written. otherwise, since these transforms may add/remove From bd3b5de3bd2cad891999ec2532d4caad7a5b98eb Mon Sep 17 00:00:00 2001 From: Felix Jentzsch Date: Fri, 28 Feb 2025 18:01:22 +0000 Subject: [PATCH 055/125] Add basic measurement job --- .gitlab-ci.yml | 7 +++-- benchmarking/bench-ci.yml | 26 +++++++++++++++--- benchmarking/bench_base.py | 5 ++-- benchmarking/measure.py | 55 ++++++++++++++++++++++++++++++++++++++ 4 files changed, 84 insertions(+), 9 deletions(-) create mode 100644 benchmarking/measure.py diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index a82ad24eeb..31e963729b 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -18,6 +18,9 @@ variables: CPU_CORES: description: "Select number of CPU cores and test workers" value: "64" + CPU_CORES_BENCH: + description: "Select number of CPU cores for benchmark runs" + value: "32" PARALLEL_JOBS: description: "Number of parallel Slurm array jobs per Benchmark job" value: "2" @@ -26,7 +29,7 @@ variables: value: "2-0" # [days-hours] SLURM_PARTITION: description: "Slurm partition (e.g., normal, largemem, fpga, gpu)" - value: "largemem" + value: "normal" SLURM_QOS: description: "Optional QoS option (include --qos, e.g., --qos express)" value: "" @@ -154,7 +157,7 @@ FINN Test Suite 2022.2: paths: - deps variables: - SCHEDULER_PARAMETERS: "-A $PROJECT_ACCOUNT -p $SLURM_PARTITION -t $SLURM_TIMEOUT $SLURM_QOS --nodes 1 --ntasks 1 --cpus-per-task $CPU_CORES --exclusive" + SCHEDULER_PARAMETERS: "-A $PROJECT_ACCOUNT -p largemem -t $SLURM_TIMEOUT $SLURM_QOS --nodes 1 --ntasks 1 --cpus-per-task $CPU_CORES --exclusive" PYTEST_PARALLEL: "$CPU_CORES" FINN_XILINX_VERSION: "2022.2" before_script: diff --git a/benchmarking/bench-ci.yml b/benchmarking/bench-ci.yml index c6d2c6bc91..5c2771465a 100644 --- a/benchmarking/bench-ci.yml +++ 
b/benchmarking/bench-ci.yml @@ -20,9 +20,9 @@ FINN Build: - job: Fetch Repos pipeline: $PARENT_PIPELINE_ID variables: - SCHEDULER_PARAMETERS: "-A $PROJECT_ACCOUNT -p $SLURM_PARTITION -t $SLURM_TIMEOUT $SLURM_QOS --nodes 1 --ntasks 1 --cpus-per-task $CPU_CORES --mem 128G --array 0-$( expr $PARALLEL_JOBS - 1 )" - NUM_DEFAULT_WORKERS: "$CPU_CORES" - PYTEST_PARALLEL: "$CPU_CORES" + SCHEDULER_PARAMETERS: "-A $PROJECT_ACCOUNT -p $SLURM_PARTITION -t $SLURM_TIMEOUT $SLURM_QOS --nodes 1 --ntasks 1 --cpus-per-task $CPU_CORES_BENCH --exclusive --array 0-$( expr $PARALLEL_JOBS - 1 )" + NUM_DEFAULT_WORKERS: "$CPU_CORES_BENCH" + PYTEST_PARALLEL: "$CPU_CORES_BENCH" before_script: - cp -dfR .. $PATH_WORKDIR # Copy to working directory (e.g. RAMdisk) - cd $PATH_WORKDIR/finn-plus @@ -42,6 +42,24 @@ FINN Build: paths: - bench_artifacts/ +Measurement: + id_tokens: + CI_JOB_JWT: + aud: https://git.uni-paderborn.de + stage: measure + tags: + - board + rules: + # Also run on failure of previous tasks to measure partial results + - when: always + script: + - python benchmarking/measure.py + artifacts: + name: "bench_artifacts" + when: always + paths: + - bench_artifacts/ + Result Collection: id_tokens: CI_JOB_JWT: @@ -54,7 +72,7 @@ Result Collection: - when: always script: - python3.10 benchmarking/collect.py bench_artifacts/tasks_output bench_results.json - - dvc exp push git@github.com:eki-project/finn-plus.git + - dvc exp push -r push git@github.com:eki-project/finn-plus.git artifacts: name: "bench_results" when: always diff --git a/benchmarking/bench_base.py b/benchmarking/bench_base.py index eef9edd721..b39a8b0dde 100644 --- a/benchmarking/bench_base.py +++ b/benchmarking/bench_base.py @@ -369,7 +369,7 @@ def steps_full_build_flow(self): os.makedirs(tmp_buildflow_dir, exist_ok=True) delete_dir_contents(tmp_buildflow_dir) self.build_inputs["build_dir"] = os.path.join(tmp_buildflow_dir, "build_output") - os.makedirs(self.build_inputs["build_dir"], exist_ok=True) + 
os.makedirs(os.path.join(self.build_inputs["build_dir"], "report"), exist_ok=True) # Save full build dir as local artifact self.local_artifacts_collection.append(("build_output", self.build_inputs["build_dir"], False)) @@ -390,11 +390,10 @@ def steps_full_build_flow(self): self.build_inputs["onnx_path"] = self.params["model_path"] else: # input ONNX model (+ optional I/O pair for verification) will be generated - self.build_inputs["onnx_path"] = os.path.join(tmp_buildflow_dir, "model_export.onnx") + self.build_inputs["onnx_path"] = os.path.join(self.build_inputs["build_dir"], "model_export.onnx") if self.step_export_onnx(self.build_inputs["onnx_path"]) == "skipped": # microbenchmarks might skip because no valid model can be generated for given params return - self.save_local_artifact("model_step_export", self.build_inputs["onnx_path"]) if "folding_path" in self.params: self.build_inputs["folding_path"] = self.params["folding_path"] diff --git a/benchmarking/measure.py b/benchmarking/measure.py new file mode 100644 index 0000000000..6744eacedb --- /dev/null +++ b/benchmarking/measure.py @@ -0,0 +1,55 @@ +import os +import subprocess +import shutil + + +def delete_dir_contents(dir): + for filename in os.listdir(dir): + file_path = os.path.join(dir, filename) + try: + if os.path.isfile(file_path) or os.path.islink(file_path): + os.unlink(file_path) + elif os.path.isdir(file_path): + shutil.rmtree(file_path) + except Exception as e: + print('Failed to delete %s. 
Reason: %s' % (file_path, e)) + +if __name__ == "__main__": + print("Looking for deployment packages in artifacts..") + # Find deployment packages from artifacts + artifacts_dir = os.path.join("bench_artifacts", "runs_output") + for run in os.listdir(artifacts_dir): + run_dir = os.path.join(artifacts_dir, run) + reports_dir = os.path.join(run_dir, "reports") + deploy_archive = os.path.join(run_dir, "deploy.zip") + extract_dir = "measurement" + if os.path.isfile(deploy_archive): + print("Found deployment package in %s, extracting.." % run_dir) + + # Extract to temporary dir + shutil.unpack_archive(deploy_archive, extract_dir) + + # Run driver + print("Running driver..") + subprocess.run([f"python {extract_dir}/driver/driver.py", + f"--bitfile {extract_dir}/bitfile/finn-accel.bit", + f"--settingsfile {extract_dir}/driver/settings.json", + f"--reportfile {extract_dir}/measured_performance.json", + ]) + print("Driver finished.") + + # Copy results back to artifact directory + for report in ["measured_performance.json", + "fifo_sizing_report.json", + "fifo_depth_export.json", + "fifo_sizing_graph.png", + ]: + report_path = os.path.join(extract_dir, report) + if os.path.isfile(report_path): + print("Copying %s to %s" % (report_path, reports_dir)) + shutil.copy(report_path, reports_dir) + + print("Clearing temporary directory..") + # Clear temporary dir + delete_dir_contents(extract_dir) + print("Done.") From 8fa1483a9700ac90291e756106104ad7e1022664 Mon Sep 17 00:00:00 2001 From: Felix Jentzsch Date: Fri, 28 Feb 2025 21:26:04 +0000 Subject: [PATCH 056/125] Fixes to measurement and logging --- benchmarking/bench.py | 4 ++-- benchmarking/bench_base.py | 4 ++-- benchmarking/cfg/mvau_test.json | 6 ------ benchmarking/collect.py | 36 +++++++++++++++++++++++++-------- benchmarking/measure.py | 8 ++++---- 5 files changed, 36 insertions(+), 22 deletions(-) diff --git a/benchmarking/bench.py b/benchmarking/bench.py index fb890332b9..e7f38d0e29 100644 --- a/benchmarking/bench.py 
+++ b/benchmarking/bench.py @@ -145,8 +145,8 @@ def get_default_session_options_new(): start_time = time.time() try: - bench_object.run() - if not bench_object.output_dict: + result = bench_object.run() + if result == "skipped": log_dict["status"] = "skipped" print("Run skipped") else: diff --git a/benchmarking/bench_base.py b/benchmarking/bench_base.py index b39a8b0dde..7634ead091 100644 --- a/benchmarking/bench_base.py +++ b/benchmarking/bench_base.py @@ -206,7 +206,7 @@ def step_build_setup(self): # defaults to normal build flow, may be overwritten by subclass def run(self): - self.steps_full_build_flow() + return self.steps_full_build_flow() # def step_finn_estimate(self): # # Gather FINN estimates @@ -393,7 +393,7 @@ def steps_full_build_flow(self): self.build_inputs["onnx_path"] = os.path.join(self.build_inputs["build_dir"], "model_export.onnx") if self.step_export_onnx(self.build_inputs["onnx_path"]) == "skipped": # microbenchmarks might skip because no valid model can be generated for given params - return + return "skipped" if "folding_path" in self.params: self.build_inputs["folding_path"] = self.params["folding_path"] diff --git a/benchmarking/cfg/mvau_test.json b/benchmarking/cfg/mvau_test.json index 07fd52cc2f..c42b16782c 100644 --- a/benchmarking/cfg/mvau_test.json +++ b/benchmarking/cfg/mvau_test.json @@ -19,12 +19,6 @@ "ram_style": ["distributed"], "ram_style_thr": ["distributed"], - "do_hls": [true], - "do_rtlsim": [true], - "do_synthesis": [true], - "do_sim_power": [true], - "do_synth_power": [true], - "dut_duplication": [1], "output_products": [["estimate_reports", "stitched_ip", "rtlsim_performance", "out_of_context_synth", "bitfile", "pynq_driver", "deployment_package"]] diff --git a/benchmarking/collect.py b/benchmarking/collect.py index 27a298acea..fbc0118d79 100644 --- a/benchmarking/collect.py +++ b/benchmarking/collect.py @@ -85,6 +85,12 @@ def open_json_report(id, report_name): else: return None +def log_all_metrics_from_report(id, live, 
report_name, prefix=""): + report = open_json_report(id, report_name) + if report: + for key in report: + live.log_metric(prefix + key, report[key], plot=False) + def log_metrics_from_report(id, live, report_name, keys, prefix=""): report = open_json_report(id, report_name) if report: @@ -112,7 +118,10 @@ def log_nested_metrics_from_report(id, live, report_name, key_top, keys, prefix= for run in combined_log: id = run["run_id"] - with Live(exp_message="Job result collected by GitLab CI", cache_images=True) as live: + experiment_name = "CI_" + os.environ.get("CI_PIPELINE_ID") + "_" + id + experiment_msg = "[CI] " + os.environ.get("CI_PIPELINE_NAME") + #TODO: cache images once we switch to a cache provider that works with DVC Studio + with Live(exp_name = experiment_name, exp_message=experiment_msg, cache_images=False) as live: ### PARAMS ### #TODO: add pipeline info and FINN configuration (e.g. tool versions) to metadata (or as metric or other annotation?) metadata = { @@ -124,11 +133,15 @@ def log_nested_metrics_from_report(id, live, report_name, key_top, keys, prefix= } } live.log_params(metadata) - params = { - "params": run["params"] - } + params = {"params": run["params"]} live.log_params(params) + # dut_info.json (additional information about DUT generated during model generation) + dut_info_report = open_json_report(id, "dut_info.json") + if dut_info_report: + dut_info = {"dut_info": dut_info_report} + live.log_params(dut_info) + ### METRICS ### # TODO: for microbenchmarks, only summarize results for target node (or surrounding SDP?) (see old step_finn_estimate etc.) # TODO: make all logs consistent at the point of generation (e.g. 
BRAM vs BRAM18 vs BRAM36) @@ -174,7 +187,7 @@ def log_nested_metrics_from_report(id, live, report_name, key_top, keys, prefix= ], prefix="rtlsim/performance/") # fifo_sizing.json - log_metrics_from_report(id, live, "fifo_sizing.json", ["total_fifo_size_kB"]) + log_metrics_from_report(id, live, "fifo_sizing.json", ["total_fifo_size_kB"], prefix="fifosizing/") # ooc_synth_and_timing.json (OOC synth / step_out_of_context_synthesis) log_metrics_from_report(id, live, "ooc_synth_and_timing.json", [ @@ -213,13 +226,20 @@ def log_nested_metrics_from_report(id, live, report_name, key_top, keys, prefix= live.log_metric("verification", run["output"]["builder_verification"]["verification"], plot=False) # instrumentation measurement - # TODO + log_all_metrics_from_report(id, live, "measured_performance.json", prefix="measurement/performance/") # power measurement # TODO - # live fifosizing report + png - # TODO + # live fifosizing report + graph png + log_metrics_from_report(id, live, "fifo_sizing_report.json", [ + "error", + "fifo_size_total_kB", + ], prefix="fifosizing/live/") + + image = os.path.join("bench_artifacts", "runs_output", "run_%d" % (id), "reports", "fifo_sizing_graph.png") + if os.path.isfile(image): + live.log_image("fifosizing_pass_1", image) # time_per_step.json log_metrics_from_report(id, live, "time_per_step.json", ["total_build_time"]) diff --git a/benchmarking/measure.py b/benchmarking/measure.py index 6744eacedb..e0a5da0bfc 100644 --- a/benchmarking/measure.py +++ b/benchmarking/measure.py @@ -31,10 +31,10 @@ def delete_dir_contents(dir): # Run driver print("Running driver..") - subprocess.run([f"python {extract_dir}/driver/driver.py", - f"--bitfile {extract_dir}/bitfile/finn-accel.bit", - f"--settingsfile {extract_dir}/driver/settings.json", - f"--reportfile {extract_dir}/measured_performance.json", + subprocess.run(["python", f"{extract_dir}/driver/driver.py", + "--bitfile", f"{extract_dir}/bitfile/finn-accel.bit", + "--settingsfile", 
f"{extract_dir}/driver/settings.json", + "--reportfile", f"{extract_dir}/measured_performance.json", ]) print("Driver finished.") From 2a7c9c4ffedbdddbfd0cf9e9288cdeb0b31972ac Mon Sep 17 00:00:00 2001 From: Felix Jentzsch Date: Fri, 28 Feb 2025 22:34:19 +0000 Subject: [PATCH 057/125] Minor fixes --- .gitlab-ci.yml | 2 +- benchmarking/collect.py | 2 +- benchmarking/measure.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 31e963729b..decf20fe6c 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -10,7 +10,7 @@ variables: value: "" TEST_SUITE: description: "Select test suite to run" - value: "full" + value: "none" # DEBUG options: - "none" - "quicktest" diff --git a/benchmarking/collect.py b/benchmarking/collect.py index fbc0118d79..7abbd865d2 100644 --- a/benchmarking/collect.py +++ b/benchmarking/collect.py @@ -118,7 +118,7 @@ def log_nested_metrics_from_report(id, live, report_name, key_top, keys, prefix= for run in combined_log: id = run["run_id"] - experiment_name = "CI_" + os.environ.get("CI_PIPELINE_ID") + "_" + id + experiment_name = "CI_" + os.environ.get("CI_PIPELINE_ID") + "_" + str(id) experiment_msg = "[CI] " + os.environ.get("CI_PIPELINE_NAME") #TODO: cache images once we switch to a cache provider that works with DVC Studio with Live(exp_name = experiment_name, exp_message=experiment_msg, cache_images=False) as live: diff --git a/benchmarking/measure.py b/benchmarking/measure.py index e0a5da0bfc..543b48fff9 100644 --- a/benchmarking/measure.py +++ b/benchmarking/measure.py @@ -31,7 +31,7 @@ def delete_dir_contents(dir): # Run driver print("Running driver..") - subprocess.run(["python", f"{extract_dir}/driver/driver.py", + subprocess.run(["sudo", "python", f"{extract_dir}/driver/driver.py", "--bitfile", f"{extract_dir}/bitfile/finn-accel.bit", "--settingsfile", f"{extract_dir}/driver/settings.json", "--reportfile", f"{extract_dir}/measured_performance.json", From 
aa9f4e40acd2307c9c35843e0e136207b3067522 Mon Sep 17 00:00:00 2001 From: Felix Jentzsch Date: Sat, 1 Mar 2025 11:54:10 +0000 Subject: [PATCH 058/125] Fix pynq measurement issues --- benchmarking/bench-ci.yml | 3 ++- benchmarking/measure.py | 2 +- src/finn/qnn-data/templates/driver/driver_fifosizing.py | 2 -- src/finn/qnn-data/templates/driver/driver_instrumentation.py | 4 ---- 4 files changed, 3 insertions(+), 8 deletions(-) diff --git a/benchmarking/bench-ci.yml b/benchmarking/bench-ci.yml index 5c2771465a..695aa8a1a3 100644 --- a/benchmarking/bench-ci.yml +++ b/benchmarking/bench-ci.yml @@ -53,7 +53,8 @@ Measurement: # Also run on failure of previous tasks to measure partial results - when: always script: - - python benchmarking/measure.py + # Run as root and activate the PYNQ venv manually to use PYNQ outside of the typical Jupyter environment + - sudo bash -c "source /etc/profile.d/pynq_venv.sh && export XILINX_XRT=/usr && python benchmarking/measure.py" artifacts: name: "bench_artifacts" when: always diff --git a/benchmarking/measure.py b/benchmarking/measure.py index 543b48fff9..e0a5da0bfc 100644 --- a/benchmarking/measure.py +++ b/benchmarking/measure.py @@ -31,7 +31,7 @@ def delete_dir_contents(dir): # Run driver print("Running driver..") - subprocess.run(["sudo", "python", f"{extract_dir}/driver/driver.py", + subprocess.run(["python", f"{extract_dir}/driver/driver.py", "--bitfile", f"{extract_dir}/bitfile/finn-accel.bit", "--settingsfile", f"{extract_dir}/driver/settings.json", "--reportfile", f"{extract_dir}/measured_performance.json", diff --git a/src/finn/qnn-data/templates/driver/driver_fifosizing.py b/src/finn/qnn-data/templates/driver/driver_fifosizing.py index 560959991f..be1f20156a 100644 --- a/src/finn/qnn-data/templates/driver/driver_fifosizing.py +++ b/src/finn/qnn-data/templates/driver/driver_fifosizing.py @@ -2,9 +2,7 @@ import json import os import argparse -import matplotlib as mpl import matplotlib.pyplot as plt -import numpy as np from 
pynq.pl_server.device import Device from driver_instrumentation import FINNInstrumentationOverlay diff --git a/src/finn/qnn-data/templates/driver/driver_instrumentation.py b/src/finn/qnn-data/templates/driver/driver_instrumentation.py index fea9446bf5..5db2217d45 100644 --- a/src/finn/qnn-data/templates/driver/driver_instrumentation.py +++ b/src/finn/qnn-data/templates/driver/driver_instrumentation.py @@ -1,10 +1,6 @@ import time import json import argparse -import matplotlib as mpl -import matplotlib.pyplot as plt -from IPython.display import clear_output -import numpy as np from pynq import Overlay from pynq.ps import Clocks from pynq.pl_server.device import Device From f7ad385bce0c206af2fab3103e57ecb8a86d2aa3 Mon Sep 17 00:00:00 2001 From: Felix Jentzsch Date: Tue, 4 Mar 2025 18:35:20 +0000 Subject: [PATCH 059/125] Minor infrastructure improvements --- benchmarking/bench-ci.yml | 7 +- benchmarking/bench.py | 20 ++--- benchmarking/bench_base.py | 40 ++++----- benchmarking/collect.py | 103 ++++++++++++++---------- benchmarking/dut/synthetic_nonlinear.py | 3 + src/finn/builder/build_dataflow.py | 12 +++ 6 files changed, 106 insertions(+), 79 deletions(-) diff --git a/benchmarking/bench-ci.yml b/benchmarking/bench-ci.yml index 695aa8a1a3..1c03ecbd02 100644 --- a/benchmarking/bench-ci.yml +++ b/benchmarking/bench-ci.yml @@ -72,10 +72,5 @@ Result Collection: # Also run on failure of previous tasks to collect partial results - when: always script: - - python3.10 benchmarking/collect.py bench_artifacts/tasks_output bench_results.json + - python3.10 benchmarking/collect.py - dvc exp push -r push git@github.com:eki-project/finn-plus.git - artifacts: - name: "bench_results" - when: always - paths: - - bench_results.json diff --git a/benchmarking/bench.py b/benchmarking/bench.py index e7f38d0e29..2dbcdbe87f 100644 --- a/benchmarking/bench.py +++ b/benchmarking/bench.py @@ -56,8 +56,6 @@ def get_default_session_options_new(): artifacts_dir = os.path.join(experiment_dir, 
"bench_artifacts") print("Collecting results in path: %s" % artifacts_dir) - os.makedirs(os.path.join(artifacts_dir, "tasks_output"), exist_ok=True) - log_path = os.path.join(artifacts_dir, "tasks_output", "task_%d.json" % (task_id)) # local save dir for large artifacts (e.g., build output, tmp dir dump for debugging) if job_id == 0: @@ -71,13 +69,13 @@ def get_default_session_options_new(): # Gather benchmarking configs if config_name == "manual": - configs_path, config_select = os.path.split(os.environ.get("MANUAL_CFG_PATH")) + config_path = os.path.join(os.environ.get("LOCAL_CFG_DIR"), os.environ.get("MANUAL_CFG_PATH")) else: configs_path = os.path.join(os.path.dirname(__file__), "cfg") config_select = config_name + ".json" + config_path = os.path.join(configs_path, config_select) # Load config - config_path = os.path.join(configs_path, config_select) print("Loading config %s" % (config_path)) if os.path.exists(config_path): with open(config_path, "r") as f: @@ -118,9 +116,7 @@ def get_default_session_options_new(): # Run benchmark # TODO: integrate this loop (especially status logging) into the bench class - # TODO: log additional info as artifact or directly into info section of json (e.g. 
dut, versions, date) - # TODO: log stdout of individual tasks of the job array into seperate files as artifacts (GitLab web interface is not readable) - log = [] + # TODO: log stdout of individual tasks of the job array into seperate files as artifacts (GitLab web interface is not readable), coordinate with new logging for run, run_id in enumerate(selected_runs): print( "Starting run %d/%d (id %d of %d total runs)" @@ -143,7 +139,6 @@ def get_default_session_options_new(): print("ERROR: no DUT specified") return 1 - start_time = time.time() try: result = bench_object.run() if result == "skipped": @@ -158,13 +153,12 @@ def get_default_session_options_new(): exit_code = 1 # TODO: exception catch all in builder prevents internal failures from being caught here - log_dict["total_time"] = int(time.time() - start_time) log_dict["output"] = bench_object.output_dict - log.append(log_dict) - # TODO: save this meta data into run-level reports dir insted of task*.json - # overwrite output log file every time to allow early abort + + # log metadata of this run to its own report directory + log_path = os.path.join(bench_object.report_dir, "metadata_bench.json") with open(log_path, "w") as f: - json.dump(log, f, indent=2) + json.dump(log_dict, f, indent=2) # save GitLab artifacts of this run (e.g., reports and deployment package) bench_object.save_artifacts_collection() diff --git a/benchmarking/bench_base.py b/benchmarking/bench_base.py index 7634ead091..6a4bd63c51 100644 --- a/benchmarking/bench_base.py +++ b/benchmarking/bench_base.py @@ -42,7 +42,7 @@ from finn.builder.build_dataflow_config import DataflowBuildConfig import pandas as pd import onnxruntime as ort - +#TODO: merge this file into bench.py once most functionality has been moved to builder def start_test_batch_fast(results_path, project_path, run_target, pairs): # Prepare tcl script @@ -170,7 +170,25 @@ def __init__(self, params, task_id, run_id, artifacts_dir, save_dir, debug=True) # Save entire FINN build dir 
and working dir # TODO: add option to only save upon exception (in FINN builder or benchmarking infrastructure) self.local_artifacts_collection.append(("debug_finn_tmp", os.environ["FINN_BUILD_DIR"], False)) - self.local_artifacts_collection.append(("debug_finn_cwd", os.environ["FINN_ROOT"], False)) + #self.local_artifacts_collection.append(("debug_finn_cwd", os.environ["FINN_ROOT"], False)) + + ### SETUP ### + # Use a temporary dir for buildflow-related files (next to FINN_BUILD_DIR) + # Ensure it exists but is empty (clear potential artifacts from previous runs) + tmp_buildflow_dir = os.path.join(os.environ["PATH_WORKDIR"], "buildflow") + os.makedirs(tmp_buildflow_dir, exist_ok=True) + delete_dir_contents(tmp_buildflow_dir) + self.build_inputs["build_dir"] = os.path.join(tmp_buildflow_dir, "build_output") # TODO remove in favor of self.build_dir + self.build_dir = os.path.join(tmp_buildflow_dir, "build_output") + self.report_dir = os.path.join(self.build_dir, "report") + os.makedirs(self.report_dir, exist_ok=True) + + # Save full build dir as local artifact + self.local_artifacts_collection.append(("build_output", self.build_dir, False)) + # Save reports and deployment package as pipeline artifacts + self.artifacts_collection.append(("reports", self.report_dir, False)) + self.artifacts_collection.append(("reports", os.path.join(self.build_dir, "build_dataflow.log"), False)) + self.artifacts_collection.append(("deploy", os.path.join(self.build_dir, "deploy"), True)) def save_artifact(self, target_path, source_path, archive=False): if os.path.isdir(source_path): @@ -362,22 +380,6 @@ def step_parse_builder_output(self, build_dir): def steps_full_build_flow(self): # Default step sequence for benchmarking a full FINN builder flow - ### SETUP ### - # Use a temporary dir for buildflow-related files (next to FINN_BUILD_DIR) - # Ensure it exists but is empty (clear potential artifacts from previous runs) - tmp_buildflow_dir = os.path.join(os.environ["PATH_WORKDIR"], 
"buildflow") - os.makedirs(tmp_buildflow_dir, exist_ok=True) - delete_dir_contents(tmp_buildflow_dir) - self.build_inputs["build_dir"] = os.path.join(tmp_buildflow_dir, "build_output") - os.makedirs(os.path.join(self.build_inputs["build_dir"], "report"), exist_ok=True) - - # Save full build dir as local artifact - self.local_artifacts_collection.append(("build_output", self.build_inputs["build_dir"], False)) - # Save reports and deployment package as pipeline artifacts - self.artifacts_collection.append(("reports", os.path.join(self.build_inputs["build_dir"], "report"), False)) - self.artifacts_collection.append(("reports", os.path.join(self.build_inputs["build_dir"], "build_dataflow.log"), False)) - self.artifacts_collection.append(("deploy", os.path.join(self.build_inputs["build_dir"], "deploy"), True)) - ### MODEL CREATION/IMPORT ### # TODO: track fixed input onnx models with DVC if "model_dir" in self.params: @@ -403,6 +405,8 @@ def steps_full_build_flow(self): self.build_inputs["floorplan_path"] = self.params["floorplan_path"] ### BUILD SETUP ### + # TODO: convert to YAML-based builder config + # TODO: split up into default config, dut-specific config, and run-specific config cfg = self.step_build_setup() cfg.generate_outputs = self.params["output_products"] cfg.output_dir = self.build_inputs["build_dir"] diff --git a/benchmarking/collect.py b/benchmarking/collect.py index 7abbd865d2..7b568563fa 100644 --- a/benchmarking/collect.py +++ b/benchmarking/collect.py @@ -3,8 +3,11 @@ import os import sys import time +import shutil from dvclive import Live +from util import delete_dir_contents + def merge_dicts(a: dict, b: dict): for key in b: if key in a: @@ -76,6 +79,10 @@ def wait_for_power_measurements(): time.sleep(60) print("Power measurement complete") +def log_dvc_metric(live, prefix, name, value): + # sanitize '/' in name because DVC uses it to nest metrics (which we do via prefix) + live.log_metric(prefix + name.replace("/", "-"), value, plot=False) + def 
open_json_report(id, report_name): path = os.path.join("bench_artifacts", "runs_output", "run_%d" % (id), "reports", report_name) if os.path.isfile(path): @@ -89,14 +96,14 @@ def log_all_metrics_from_report(id, live, report_name, prefix=""): report = open_json_report(id, report_name) if report: for key in report: - live.log_metric(prefix + key, report[key], plot=False) + log_dvc_metric(live, prefix, key, report[key]) def log_metrics_from_report(id, live, report_name, keys, prefix=""): report = open_json_report(id, report_name) if report: for key in keys: if key in report: - live.log_metric(prefix + key, report[key], plot=False) + log_dvc_metric(live, prefix, key, report[key]) def log_nested_metrics_from_report(id, live, report_name, key_top, keys, prefix=""): report = open_json_report(id, report_name) @@ -104,39 +111,43 @@ def log_nested_metrics_from_report(id, live, report_name, key_top, keys, prefix= if key_top in report: for key in keys: if key in report[key_top]: - live.log_metric(prefix + key, report[key_top][key], plot=False) + log_dvc_metric(live, prefix, key, report[key_top][key]) if __name__ == "__main__": - print("Consolidating synthesis results from all sub-jobs of the array") - consolidate_logs(sys.argv[1], sys.argv[2]) - # TODO: remove task-level .json logs and GitLab artifacts of this job? 
- - ### PUSH RESULTS TO DVC ### - combined_log = [] - with open(sys.argv[2], "r") as f: - combined_log = json.load(f) - - for run in combined_log: - id = run["run_id"] + # Go through all runs found in the artifacts and log their results to DVC + run_dir_list = os.listdir(os.path.join("bench_artifacts", "runs_output")) + print("Looking for runs in %s" % run_dir_list) + run_ids = [] + for run_dir in run_dir_list: + if run_dir.startswith("run_"): + run_id = int(run_dir[4:]) + run_ids.append(run_id) + run_ids.sort() + print("Found %d runs" % len(run_ids)) + + for id in run_ids: + print("Processing run %d" % id) experiment_name = "CI_" + os.environ.get("CI_PIPELINE_ID") + "_" + str(id) experiment_msg = "[CI] " + os.environ.get("CI_PIPELINE_NAME") #TODO: cache images once we switch to a cache provider that works with DVC Studio with Live(exp_name = experiment_name, exp_message=experiment_msg, cache_images=False) as live: ### PARAMS ### - #TODO: add pipeline info and FINN configuration (e.g. tool versions) to metadata (or as metric or other annotation?) 
- metadata = { - "metadata": { - "run_id": run["run_id"], - "task_id": run["task_id"], - "status": run["status"], - "total_time": run["total_time"], - } - } - live.log_params(metadata) - params = {"params": run["params"]} + # input parameters logged by benchmarking infrastructure + metadata_bench = open_json_report(id, "metadata_bench.json") + params = {"params": metadata_bench["params"]} live.log_params(params) - # dut_info.json (additional information about DUT generated during model generation) + # optional metadata logged by builder + metadata_builder = open_json_report(id, "metadata_builder.json") + if metadata_builder: + metadata = { + "metadata": { + "tool_version": metadata_builder["tool_version"], + } + } + live.log_params(metadata) + + # optional dut_info.json (additional information about DUT generated during model generation) dut_info_report = open_json_report(id, "dut_info.json") if dut_info_report: dut_info = {"dut_info": dut_info_report} @@ -146,6 +157,21 @@ def log_nested_metrics_from_report(id, live, report_name, key_top, keys, prefix= # TODO: for microbenchmarks, only summarize results for target node (or surrounding SDP?) (see old step_finn_estimate etc.) # TODO: make all logs consistent at the point of generation (e.g. 
BRAM vs BRAM18 vs BRAM36) + # status + status = metadata_bench["status"] + if status == "ok": + # mark as failed if either bench or builder indicates failure + if metadata_builder: + status_builder = metadata_builder["status"] + if status_builder == "failed": + status = "failed" + log_dvc_metric(live, "", "status", status) + + # verification steps + if "output" in metadata_bench: + if "builder_verification" in metadata_bench["output"]: + log_dvc_metric(live, "", "verification", metadata_bench["output"]["builder_verification"]["verification"]) + # estimate_layer_resources.json log_nested_metrics_from_report(id, live, "estimate_layer_resources.json", "total", [ "LUT", @@ -220,11 +246,6 @@ def log_nested_metrics_from_report(id, live, report_name, key_top, keys, prefix= # post synth timing report # TODO: only exported as post_route_timing.rpt, not .json - # verification steps - if "output" in run: - if "builder_verification" in run["output"]: - live.log_metric("verification", run["output"]["builder_verification"]["verification"], plot=False) - # instrumentation measurement log_all_metrics_from_report(id, live, "measured_performance.json", prefix="measurement/performance/") @@ -245,15 +266,13 @@ def log_nested_metrics_from_report(id, live, report_name, key_top, keys, prefix= log_metrics_from_report(id, live, "time_per_step.json", ["total_build_time"]) ### ARTIFACTS ### - # Build reports, as they come from GitLab artifact - live.log_artifact(os.path.join("bench_artifacts", "runs_output", "run_%d" % (id), "reports")) - - # TODO: disabled for now, update accordingly to new runner-based measurement setup - # wait_for_power_measurements() - # power_log_path = os.path.join("/mnt/pfs/hpc-prf-radioml/felix/jobs/", - # "CI_" + os.environ.get("CI_PIPELINE_IID") + "_" + os.environ.get("CI_PIPELINE_NAME"), - # "power_measure.json") - # if os.path.isfile(power_log_path): - # print("Merging power measurement logs with remaining logs") - # merge_logs(sys.argv[2], power_log_path, 
sys.argv[2]) + # Log build reports as they come from GitLab artifacts, + # but copy them to a central dir first so all runs share the same path + run_report_dir = os.path.join("bench_artifacts", "runs_output", "run_%d" % (id), "reports") + dvc_report_dir = "reports" + os.makedirs(dvc_report_dir, exist_ok=True) + delete_dir_contents(dvc_report_dir) + shutil.copytree(run_report_dir, dvc_report_dir, dirs_exist_ok=True) + live.log_artifact(dvc_report_dir) + print("Done") diff --git a/benchmarking/dut/synthetic_nonlinear.py b/benchmarking/dut/synthetic_nonlinear.py index 4eb59ef7b2..759f31838b 100644 --- a/benchmarking/dut/synthetic_nonlinear.py +++ b/benchmarking/dut/synthetic_nonlinear.py @@ -190,6 +190,9 @@ def combine_blocks(lb, rb, ifm_dim, ch, pe): dup_config["PE"] = pe dup_config["NumOutputStreams"] = 2 dup_config["inputDataType"] = lb.get_tensor_datatype(lb_input.name).name + # We always need to set outFIFODepths explictly for DuplicateStreams + # because it has no default value that corresponds automatically to NumOutputStreams + dup_config["outFIFODepths"] = [2] * 2 add_config = {} add_config["domain"] = "finn.custom_op.fpgadataflow.hls" diff --git a/src/finn/builder/build_dataflow.py b/src/finn/builder/build_dataflow.py index baada9d1d2..8602fffa09 100644 --- a/src/finn/builder/build_dataflow.py +++ b/src/finn/builder/build_dataflow.py @@ -181,11 +181,23 @@ def build_dataflow_cfg(model_filename, cfg: DataflowBuildConfig): else: print("enable_build_pdb_debug not set in build config, exiting...") print("Build failed") + metadata = { + "status": "failed", + "tool_version": os.path.basename(os.environ.get("VIVADO_PATH")), + } + with open(cfg.output_dir + "/report/metadata_builder.json", "w") as f: + json.dump(metadata, f, indent=2) return -1 time_per_step["total_build_time"] = sum(time_per_step.values()) with open(cfg.output_dir + "/report/time_per_step.json", "w") as f: json.dump(time_per_step, f, indent=2) + metadata = { + "status": "ok", + "tool_version": 
os.path.basename(os.environ.get("VIVADO_PATH")), + } + with open(cfg.output_dir + "/report/metadata_builder.json", "w") as f: + json.dump(metadata, f, indent=2) print("Completed successfully") return 0 From c73b9c14c04f1a41090f21d4f85991864dbc925a Mon Sep 17 00:00:00 2001 From: Felix Jentzsch Date: Thu, 6 Mar 2025 09:45:49 +0000 Subject: [PATCH 060/125] Separate build & measure artifacts, fixes --- .gitlab-ci.yml | 8 --- benchmarking/bench-ci.yml | 12 ++-- benchmarking/bench.py | 8 ++- benchmarking/collect.py | 97 ++++++--------------------------- benchmarking/dut/transformer.py | 28 +--------- benchmarking/measure.py | 26 ++++----- benchmarking/util.py | 34 ++++++++++++ 7 files changed, 73 insertions(+), 140 deletions(-) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index decf20fe6c..79d772f65d 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -215,11 +215,3 @@ Bench: parallel: matrix: - BENCH_CFG: [mvau_test, resnet50_test, metafi_test, transformer_test, transformer_radioml_all, synthetic_fifotest] - -#TODO: add selector for none, reduced, full benchmark suite -#TODO: introduce result collect job on parent level for easier visualization/excel interfacing -#TODO: more control via (optional) variables -#TODO: move power measurement from polling-based script to its own job/runner -#TODO: ensure a freshly initialized workdir on job/runner level (e.g. created directories seem to stay there) -#TODO: (optionally) save ALL build artifacts/logs/temporary files to artifacts or PFS for debugging (maybe via Jacamar feature of setting individual persistent workdirs?) 
-#TODO: fix clock frequency discrepancies between setting, synth, and driver diff --git a/benchmarking/bench-ci.yml b/benchmarking/bench-ci.yml index 1c03ecbd02..99adf1e0dc 100644 --- a/benchmarking/bench-ci.yml +++ b/benchmarking/bench-ci.yml @@ -1,5 +1,5 @@ stages: - - synth + - build - measure - collect @@ -15,7 +15,7 @@ FINN Build: id_tokens: CI_JOB_JWT: aud: https://git.uni-paderborn.de - stage: synth + stage: build needs: - job: Fetch Repos pipeline: $PARENT_PIPELINE_ID @@ -37,10 +37,10 @@ FINN Build: paths: - deps artifacts: - name: "bench_artifacts" + name: "build_artifacts" when: always paths: - - bench_artifacts/ + - build_artifacts/ Measurement: id_tokens: @@ -56,10 +56,10 @@ Measurement: # Run as root and activate the PYNQ venv manually to use PYNQ outside of the typical Jupyter environment - sudo bash -c "source /etc/profile.d/pynq_venv.sh && export XILINX_XRT=/usr && python benchmarking/measure.py" artifacts: - name: "bench_artifacts" + name: "measurement_artifacts" when: always paths: - - bench_artifacts/ + - measurement_artifacts/ Result Collection: id_tokens: diff --git a/benchmarking/bench.py b/benchmarking/bench.py index 2dbcdbe87f..ea85082fc8 100644 --- a/benchmarking/bench.py +++ b/benchmarking/bench.py @@ -54,7 +54,8 @@ def get_default_session_options_new(): # experiment_dir = os.environ.get("EXPERIMENT_DIR") # original experiment dir (before potential copy to ramdisk) experiment_dir = os.environ.get("CI_PROJECT_DIR") - artifacts_dir = os.path.join(experiment_dir, "bench_artifacts") + artifacts_dir = os.path.join(experiment_dir, "build_artifacts") + os.makedirs(artifacts_dir, exist_ok=True) print("Collecting results in path: %s" % artifacts_dir) # local save dir for large artifacts (e.g., build output, tmp dir dump for debugging) @@ -151,7 +152,6 @@ def get_default_session_options_new(): log_dict["status"] = "failed" print("Run failed: " + traceback.format_exc()) exit_code = 1 - # TODO: exception catch all in builder prevents internal failures 
from being caught here log_dict["output"] = bench_object.output_dict @@ -164,9 +164,11 @@ def get_default_session_options_new(): bench_object.save_artifacts_collection() # save local artifacts of this run (e.g., full build dir, detailed debug info) bench_object.save_local_artifacts_collection() + + #TODO: examine verification result and builder status here to fail pipeline via exit code? + print("Stopping job") return exit_code - #TODO: add additional exit codes (e.g. when some verification within the run failed)? if __name__ == "__main__": exit_code = main(sys.argv[1]) diff --git a/benchmarking/collect.py b/benchmarking/collect.py index 7b568563fa..bcff28104c 100644 --- a/benchmarking/collect.py +++ b/benchmarking/collect.py @@ -1,92 +1,25 @@ -import itertools import json import os -import sys -import time import shutil from dvclive import Live from util import delete_dir_contents -def merge_dicts(a: dict, b: dict): - for key in b: - if key in a: - if isinstance(a[key], dict) and isinstance(b[key], dict): - merge_dicts(a[key], b[key]) - elif a[key] != b[key]: - raise Exception("ERROR: Dict merge conflict") - else: - a[key] = b[key] - return a - -def consolidate_logs(path, output_filepath): - log = [] - i = 0 - while (i < 1024): - if (os.path.isfile(os.path.join(path,"task_%d.json"%(i)))): - with open(os.path.join(path,"task_%d.json"%(i)), "r") as f: - log_task = json.load(f) - log.extend(log_task) - i = i + 1 - - with open(output_filepath, "w") as f: - json.dump(log, f, indent=2) - -def merge_logs(log_a, log_b, log_out): - # merges json log (list of nested dicts) b into a, not vice versa (TODO) - - with open(log_a, "r") as f: - a = json.load(f) - with open(log_b, "r") as f: - b = json.load(f) - - for idx, run_a in enumerate(a): - for run_b in b: - if run_a["run_id"] == run_b["run_id"]: - #a[idx] |= run_b # requires Python >= 3.9 - #a[idx] = {**run_a, **run_b} - a[idx] = merge_dicts(run_a, run_b) - break - - # also sort by run id - out = sorted(a, key=lambda x: 
x["run_id"]) - - with open(log_out, "w") as f: - json.dump(out, f, indent=2) - -def wait_for_power_measurements(): - # TODO: detect when no bitstreams are to be measured (e.g. for fifosizing) and skip - # TODO: make configurable, relative to some env variable due to different mountint points - bitstreams_path = os.path.join("/mnt/pfs/hpc-prf-radioml/felix/jobs/", - "CI_" + os.environ.get("CI_PIPELINE_IID") + "_" + os.environ.get("CI_PIPELINE_NAME"), - "bitstreams") - - power_log_path = os.path.join("/mnt/pfs/hpc-prf-radioml/felix/jobs/", - "CI_" + os.environ.get("CI_PIPELINE_IID") + "_" + os.environ.get("CI_PIPELINE_NAME"), - "power_measure.json") - - # count bitstreams to measure (can't rely on total number of runs since some of them could've failed) - files = os.listdir(bitstreams_path) - bitstream_count = len(list(filter(lambda x : ".bit" in x, files))) - - log = [] - print("Checking if all bitstreams of pipeline have been measured..") - while(len(log) < bitstream_count): - if os.path.isfile(power_log_path): - with open(power_log_path, "r") as f: - log = json.load(f) - print("Found measurements for %d/%d bitstreams"%(len(log),bitstream_count)) - time.sleep(60) - print("Power measurement complete") def log_dvc_metric(live, prefix, name, value): # sanitize '/' in name because DVC uses it to nest metrics (which we do via prefix) live.log_metric(prefix + name.replace("/", "-"), value, plot=False) def open_json_report(id, report_name): - path = os.path.join("bench_artifacts", "runs_output", "run_%d" % (id), "reports", report_name) - if os.path.isfile(path): - with open(path, "r") as f: + # look in both, build & measurement, artifacts + path1 = os.path.join("build_artifacts", "runs_output", "run_%d" % (id), "reports", report_name) + path2 = os.path.join("measurement_artifacts", "runs_output", "run_%d" % (id), "reports", report_name) + if os.path.isfile(path1): + with open(path1, "r") as f: + report = json.load(f) + return report + elif os.path.isfile(path2): + with 
open(path2, "r") as f: report = json.load(f) return report else: @@ -115,7 +48,7 @@ def log_nested_metrics_from_report(id, live, report_name, key_top, keys, prefix= if __name__ == "__main__": # Go through all runs found in the artifacts and log their results to DVC - run_dir_list = os.listdir(os.path.join("bench_artifacts", "runs_output")) + run_dir_list = os.listdir(os.path.join("build_artifacts", "runs_output")) print("Looking for runs in %s" % run_dir_list) run_ids = [] for run_dir in run_dir_list: @@ -258,7 +191,7 @@ def log_nested_metrics_from_report(id, live, report_name, key_top, keys, prefix= "fifo_size_total_kB", ], prefix="fifosizing/live/") - image = os.path.join("bench_artifacts", "runs_output", "run_%d" % (id), "reports", "fifo_sizing_graph.png") + image = os.path.join("measurement_artifacts", "runs_output", "run_%d" % (id), "reports", "fifo_sizing_graph.png") if os.path.isfile(image): live.log_image("fifosizing_pass_1", image) @@ -268,11 +201,15 @@ def log_nested_metrics_from_report(id, live, report_name, key_top, keys, prefix= ### ARTIFACTS ### # Log build reports as they come from GitLab artifacts, # but copy them to a central dir first so all runs share the same path - run_report_dir = os.path.join("bench_artifacts", "runs_output", "run_%d" % (id), "reports") + run_report_dir1 = os.path.join("build_artifacts", "runs_output", "run_%d" % (id), "reports") + run_report_dir2 = os.path.join("measurement_artifacts", "runs_output", "run_%d" % (id), "reports") dvc_report_dir = "reports" os.makedirs(dvc_report_dir, exist_ok=True) delete_dir_contents(dvc_report_dir) - shutil.copytree(run_report_dir, dvc_report_dir, dirs_exist_ok=True) + if os.path.isdir(run_report_dir1): + shutil.copytree(run_report_dir1, dvc_report_dir, dirs_exist_ok=True) + if os.path.isdir(run_report_dir2): + shutil.copytree(run_report_dir2, dvc_report_dir, dirs_exist_ok=True) live.log_artifact(dvc_report_dir) print("Done") diff --git a/benchmarking/dut/transformer.py 
b/benchmarking/dut/transformer.py index 2beca913c7..ea9713edfa 100644 --- a/benchmarking/dut/transformer.py +++ b/benchmarking/dut/transformer.py @@ -43,15 +43,7 @@ node_by_node_rtlsim, # noqa: Maybe unused, only for debugging node_by_node_cppsim, ) -# from performance.platform_build_steps import( -# test_step_gen_vitis_xo, -# test_step_gen_instrumentation_wrapper, -# test_step_gen_instrwrap_sim, -# test_step_insert_tlastmarker, -# test_step_export_xo, -# test_step_build_platform, -# test_step_run_instrwrap_sim -# ) + ### ADAPTED FROM utils.py # Seeds all relevant random number generators to the same seed for @@ -994,21 +986,3 @@ def step_build_setup(self): ) return cfg - - #def run(self): - # self.steps_full_build_flow() - # DEBUG code for live logging of long instr wrapper simulation: - # live_log_dir_path = os.path.join(self.save_dir, "vivado_sim_log", "run_%d" % (self.run_id), "vivado.log") - # os.makedirs(os.path.join(self.save_dir, "vivado_sim_log", "run_%d" % (self.run_id)), exist_ok=True) - # sim_output_dir = build_dir + "/instrwrap_sim" - # # Prepare bash script - # bash_script = os.getcwd() + "/run_vivado_sim.sh" - # with open(bash_script, "w") as script: - # script.write("#!/bin/bash\n") - # script.write("cd %s\n"%(sim_output_dir)) - # script.write("vivado -mode batch -source make_instrwrap_sim_proj.tcl &> %s\n"%(live_log_dir_path)) - # # Run script - # print("Running Vivado simulation of instrumentation wrapper") - # sub_proc = subprocess.Popen(["bash", bash_script]) - # sub_proc.communicate() - ####### diff --git a/benchmarking/measure.py b/benchmarking/measure.py index e0a5da0bfc..3accb734b9 100644 --- a/benchmarking/measure.py +++ b/benchmarking/measure.py @@ -2,29 +2,22 @@ import subprocess import shutil +from util import delete_dir_contents -def delete_dir_contents(dir): - for filename in os.listdir(dir): - file_path = os.path.join(dir, filename) - try: - if os.path.isfile(file_path) or os.path.islink(file_path): - os.unlink(file_path) - elif 
os.path.isdir(file_path): - shutil.rmtree(file_path) - except Exception as e: - print('Failed to delete %s. Reason: %s' % (file_path, e)) if __name__ == "__main__": print("Looking for deployment packages in artifacts..") # Find deployment packages from artifacts - artifacts_dir = os.path.join("bench_artifacts", "runs_output") - for run in os.listdir(artifacts_dir): - run_dir = os.path.join(artifacts_dir, run) - reports_dir = os.path.join(run_dir, "reports") - deploy_archive = os.path.join(run_dir, "deploy.zip") + artifacts_in_dir = os.path.join("build_artifacts", "runs_output") + artifacts_out_dir = os.path.join("measurement_artifacts", "runs_output") + for run in os.listdir(artifacts_in_dir): + run_in_dir = os.path.join(artifacts_in_dir, run) + run_out_dir = os.path.join(artifacts_out_dir, run) + reports_dir = os.path.join(run_out_dir, "reports") + deploy_archive = os.path.join(run_in_dir, "deploy.zip") extract_dir = "measurement" if os.path.isfile(deploy_archive): - print("Found deployment package in %s, extracting.." % run_dir) + print("Found deployment package in %s, extracting.." % run_in_dir) # Extract to temporary dir shutil.unpack_archive(deploy_archive, extract_dir) @@ -47,6 +40,7 @@ def delete_dir_contents(dir): report_path = os.path.join(extract_dir, report) if os.path.isfile(report_path): print("Copying %s to %s" % (report_path, reports_dir)) + os.makedirs(reports_dir, exist_ok=True) shutil.copy(report_path, reports_dir) print("Clearing temporary directory..") diff --git a/benchmarking/util.py b/benchmarking/util.py index 17dec02762..1b4363a707 100644 --- a/benchmarking/util.py +++ b/benchmarking/util.py @@ -1,5 +1,6 @@ # Utility functions for benchmarking import os, shutil +import json from qonnx.core.datatype import DataType import xml.etree.ElementTree as ET @@ -85,3 +86,36 @@ def delete_dir_contents(dir): shutil.rmtree(file_path) except Exception as e: print('Failed to delete %s. 
Reason: %s' % (file_path, e)) + +def merge_dicts(a: dict, b: dict): + for key in b: + if key in a: + if isinstance(a[key], dict) and isinstance(b[key], dict): + merge_dicts(a[key], b[key]) + elif a[key] != b[key]: + raise Exception("ERROR: Dict merge conflict") + else: + a[key] = b[key] + return a + +def merge_logs(log_a, log_b, log_out): + # merges json log (list of nested dicts) b into a, not vice versa (TODO) + + with open(log_a, "r") as f: + a = json.load(f) + with open(log_b, "r") as f: + b = json.load(f) + + for idx, run_a in enumerate(a): + for run_b in b: + if run_a["run_id"] == run_b["run_id"]: + #a[idx] |= run_b # requires Python >= 3.9 + #a[idx] = {**run_a, **run_b} + a[idx] = merge_dicts(run_a, run_b) + break + + # also sort by run id + out = sorted(a, key=lambda x: x["run_id"]) + + with open(log_out, "w") as f: + json.dump(out, f, indent=2) From b70ba9e1481faf426d187395f4072bdea6c0f4c0 Mon Sep 17 00:00:00 2001 From: Felix Jentzsch Date: Thu, 6 Mar 2025 09:47:56 +0000 Subject: [PATCH 061/125] Fix collection job import --- benchmarking/collect.py | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/benchmarking/collect.py b/benchmarking/collect.py index bcff28104c..5cbe5fbf41 100644 --- a/benchmarking/collect.py +++ b/benchmarking/collect.py @@ -3,8 +3,17 @@ import shutil from dvclive import Live -from util import delete_dir_contents +def delete_dir_contents(dir): + for filename in os.listdir(dir): + file_path = os.path.join(dir, filename) + try: + if os.path.isfile(file_path) or os.path.islink(file_path): + os.unlink(file_path) + elif os.path.isdir(file_path): + shutil.rmtree(file_path) + except Exception as e: + print('Failed to delete %s. 
Reason: %s' % (file_path, e)) def log_dvc_metric(live, prefix, name, value): # sanitize '/' in name because DVC uses it to nest metrics (which we do via prefix) From 4853d0b9c7d49404f109d46a75a135188e93de95 Mon Sep 17 00:00:00 2001 From: Felix Jentzsch Date: Thu, 6 Mar 2025 09:54:09 +0000 Subject: [PATCH 062/125] Fix util import --- benchmarking/bench_base.py | 2 +- benchmarking/collect.py | 11 +---------- benchmarking/dut/synthetic_nonlinear.py | 2 +- benchmarking/util.py | 11 ----------- 4 files changed, 3 insertions(+), 23 deletions(-) diff --git a/benchmarking/bench_base.py b/benchmarking/bench_base.py index 6a4bd63c51..61a999750c 100644 --- a/benchmarking/bench_base.py +++ b/benchmarking/bench_base.py @@ -28,7 +28,7 @@ import finn.builder.build_dataflow_config as build_cfg from finn.util.basic import make_build_dir, pynq_native_port_width, part_map, alveo_default_platform, alveo_part_map from templates import template_open, template_single_test, template_sim_power, template_switching_simulation_tb, zynq_harness_template -from util import summarize_table, summarize_section, power_xml_to_dict, prepare_inputs, delete_dir_contents +from util import summarize_table, summarize_section, power_xml_to_dict, delete_dir_contents from finn.transformation.fpgadataflow.replace_verilog_relpaths import ( ReplaceVerilogRelPaths, ) diff --git a/benchmarking/collect.py b/benchmarking/collect.py index 5cbe5fbf41..bcff28104c 100644 --- a/benchmarking/collect.py +++ b/benchmarking/collect.py @@ -3,17 +3,8 @@ import shutil from dvclive import Live +from util import delete_dir_contents -def delete_dir_contents(dir): - for filename in os.listdir(dir): - file_path = os.path.join(dir, filename) - try: - if os.path.isfile(file_path) or os.path.islink(file_path): - os.unlink(file_path) - elif os.path.isdir(file_path): - shutil.rmtree(file_path) - except Exception as e: - print('Failed to delete %s. 
Reason: %s' % (file_path, e)) def log_dvc_metric(live, prefix, name, value): # sanitize '/' in name because DVC uses it to nest metrics (which we do via prefix) diff --git a/benchmarking/dut/synthetic_nonlinear.py b/benchmarking/dut/synthetic_nonlinear.py index 759f31838b..eb91999b2e 100644 --- a/benchmarking/dut/synthetic_nonlinear.py +++ b/benchmarking/dut/synthetic_nonlinear.py @@ -24,7 +24,7 @@ import finn.builder.build_dataflow as build import finn.builder.build_dataflow_config as build_cfg from finn.util.basic import make_build_dir -from util import summarize_table, summarize_section, power_xml_to_dict, prepare_inputs, delete_dir_contents +from util import summarize_table, summarize_section, power_xml_to_dict, delete_dir_contents from finn.util.test import get_trained_network_and_ishape from finn.util.basic import alveo_default_platform diff --git a/benchmarking/util.py b/benchmarking/util.py index 1b4363a707..23ecc0a984 100644 --- a/benchmarking/util.py +++ b/benchmarking/util.py @@ -1,7 +1,6 @@ # Utility functions for benchmarking import os, shutil import json -from qonnx.core.datatype import DataType import xml.etree.ElementTree as ET def _find_rows_and_headers(table): @@ -14,7 +13,6 @@ def _find_rows_and_headers(table): break return (rows, headers) - def summarize_table(table): table_summary = {} table_summary["headers"] = [] @@ -40,7 +38,6 @@ def summarize_table(table): return table_summary - def summarize_section(section): section_summary = {} section_summary["tables"] = [] @@ -57,7 +54,6 @@ def summarize_section(section): return section_summary - def power_xml_to_dict(xml_path): tree = ET.parse(xml_path) root = tree.getroot() @@ -69,13 +65,6 @@ def power_xml_to_dict(xml_path): return result -def prepare_inputs(input_tensor, idt, wdt): - if wdt == DataType["BIPOLAR"] and idt == DataType["BIPOLAR"]: - # convert bipolar to binary - return {"inp": (input_tensor + 1) / 2} - else: - return {"inp": input_tensor} - def delete_dir_contents(dir): for filename in 
os.listdir(dir): file_path = os.path.join(dir, filename) From 0c812bc54fbc4a5df24141a48e1cf646a0c008e2 Mon Sep 17 00:00:00 2001 From: Felix Jentzsch Date: Thu, 6 Mar 2025 10:20:24 +0000 Subject: [PATCH 063/125] Nested interconnects for Zynq-7000, fixes --- .../driver/driver_instrumentation.py | 101 +++++++++++------- .../fpgadataflow/make_pynq_driver.py | 7 +- .../fpgadataflow/make_zynq_proj.py | 16 +-- .../transformation/fpgadataflow/templates.py | 2 + 4 files changed, 80 insertions(+), 46 deletions(-) diff --git a/src/finn/qnn-data/templates/driver/driver_instrumentation.py b/src/finn/qnn-data/templates/driver/driver_instrumentation.py index fea9446bf5..90a0ed5b89 100644 --- a/src/finn/qnn-data/templates/driver/driver_instrumentation.py +++ b/src/finn/qnn-data/templates/driver/driver_instrumentation.py @@ -1,31 +1,28 @@ -import time -import json import argparse -import matplotlib as mpl -import matplotlib.pyplot as plt -from IPython.display import clear_output -import numpy as np +import json +import time from pynq import Overlay -from pynq.ps import Clocks from pynq.pl_server.device import Device +from pynq.ps import Clocks + +# Instrumentation wrapper register map # +# ap_uint<32> cfg, // [0] - 0:hold, 1:lfsr; [31:16] - LFSR seed +# ap_uint<32> &status, // [0] - timestamp overflow; [1] - timestamp underflow +# ap_uint<32> &latency, +# ap_uint<32> &interval, +# ap_uint<32> &checksum, +# ap_uint<32> &min_latency -### Instrumentation wrapper register map ### -#ap_uint<32> cfg, // [0] - 0:hold, 1:lfsr; [31:16] - LFSR seed -#ap_uint<32> &status, // [0] - timestamp overflow; [1] - timestamp underflow -#ap_uint<32> &latency, -#ap_uint<32> &interval, -#ap_uint<32> &checksum, -#ap_uint<32> &min_latency class FINNInstrumentationOverlay(Overlay): def __init__( self, bitfile_name, - platform = "zynq", - fclk_mhz = 100.0, - device = None, - download = True, - seed = 1, + platform="zynq", + fclk_mhz=100.0, + device=None, + download=True, + seed=1, ): 
super().__init__(bitfile_name, download=download, device=device) @@ -40,27 +37,34 @@ def __init__( self.fclk_mhz_actual = Clocks.fclk0_mhz def instrumentation_read(self, name): - return self.instrumentation_wrap_0.read(offset=self.ip_dict["instrumentation_wrap_0"]["registers"][name]["address_offset"]) + return self.instrumentation_wrap_0.read( + offset=self.ip_dict["instrumentation_wrap_0"]["registers"][name]["address_offset"] + ) def instrumentation_write(self, name, value): - return self.instrumentation_wrap_0.write(offset=self.ip_dict["instrumentation_wrap_0"]["registers"][name]["address_offset"], value=value) + return self.instrumentation_wrap_0.write( + offset=self.ip_dict["instrumentation_wrap_0"]["registers"][name]["address_offset"], + value=value, + ) def reset_accelerator(self): - self.axi_gpio_0.write(offset=self.ip_dict["axi_gpio_0"]["registers"]["GPIO_DATA"]["address_offset"], value=0) + self.axi_gpio_0.write( + offset=self.ip_dict["axi_gpio_0"]["registers"]["GPIO_DATA"]["address_offset"], value=0 + ) def start_accelerator(self): - lfsr_seed = (self.seed << 16) & 0xffff0000 # upper 16 bits - self.instrumentation_write("cfg", lfsr_seed + 1) # start operation + lfsr_seed = (self.seed << 16) & 0xFFFF0000 # upper 16 bits + self.instrumentation_write("cfg", lfsr_seed + 1) # start operation def observe_instrumentation(self, debug_print=True): status_reg = self.instrumentation_read("status") chksum_reg = self.instrumentation_read("checksum") min_latency = self.instrumentation_read("min_latency") latency = self.instrumentation_read("latency") - interval = self.instrumentation_read("interval") + interval = self.instrumentation_read("interval") - frame = (chksum_reg >> 24) & 0x000000ff - checksum = chksum_reg & 0x00ffffff + frame = (chksum_reg >> 24) & 0x000000FF + checksum = chksum_reg & 0x00FFFFFF overflow_err = (status_reg & 0x00000001) != 0 underflow_err = (status_reg & 0x00000002) != 0 @@ -83,14 +87,25 @@ def observe_instrumentation(self, debug_print=True): 
if __name__ == "__main__": - parser = argparse.ArgumentParser(description='Profile performance of FINN-generated accelerator using instrumentation wrapper') - parser.add_argument('--runtime', help='Runtime in seconds', type=int, default=10) - parser.add_argument('--frequency', help='FPGA clock frequency in MHz', type=float, default=100.0) - parser.add_argument('--seed', help='LFSR seed for input data generation', type=int, default=1) - parser.add_argument('--device', help='FPGA device to be used', type=int, default=0) - parser.add_argument('--bitfile', help='Name of bitfile', default="finn-accel.bit") - parser.add_argument('--reportfile', help='Name of output .json report file', type=str, default="measured_performance.json") - parser.add_argument('--settingsfile', help='Name of optional input .json settings file', type=str, default="") + parser = argparse.ArgumentParser( + description="Profile FINN-generated accelerator using instrumentation wrapper" + ) + parser.add_argument("--runtime", help="Runtime in seconds", type=int, default=10) + parser.add_argument( + "--frequency", help="FPGA clock frequency in MHz", type=float, default=100.0 + ) + parser.add_argument("--seed", help="LFSR seed for input data generation", type=int, default=1) + parser.add_argument("--device", help="FPGA device to be used", type=int, default=0) + parser.add_argument("--bitfile", help="Name of bitfile", default="finn-accel.bit") + parser.add_argument( + "--reportfile", + help="Name of output .json report file", + type=str, + default="measured_performance.json", + ) + parser.add_argument( + "--settingsfile", help="Name of optional input .json settings file", type=str, default="" + ) # parse arguments args = parser.parse_args() runtime = args.runtime @@ -111,7 +126,9 @@ def observe_instrumentation(self, debug_print=True): # instantiate FINN accelerator driver and pass batchsize and bitfile print("Programming FPGA..") - accel = FINNInstrumentationOverlay(bitfile_name = bitfile, device = 
device, fclk_mhz = frequency, seed = seed) + accel = FINNInstrumentationOverlay( + bitfile_name=bitfile, device=device, fclk_mhz=frequency, seed=seed + ) # start accelerator print("Running accelerator..") @@ -121,7 +138,15 @@ def observe_instrumentation(self, debug_print=True): time.sleep(runtime) # read measurement from instrumentation - (overflow_err, underflow_err, frame, checksum, min_latency, latency, interval) = accel.observe_instrumentation() + ( + overflow_err, + underflow_err, + frame, + checksum, + min_latency, + latency, + interval, + ) = accel.observe_instrumentation() # write report to file report = { @@ -135,7 +160,7 @@ def observe_instrumentation(self, debug_print=True): "latency_ms": round(latency * (1 / (accel.fclk_mhz_actual * 1e6)) * 1e3, 6), "throughput_fps": round(1 / (interval * (1 / (accel.fclk_mhz_actual * 1e6)))), "min_pipeline_depth": round(min_latency / interval, 2), - "pipeline_depth" : round(latency / interval, 2), + "pipeline_depth": round(latency / interval, 2), } with open(reportfile, "w") as f: json.dump(report, f, indent=2) diff --git a/src/finn/transformation/fpgadataflow/make_pynq_driver.py b/src/finn/transformation/fpgadataflow/make_pynq_driver.py index b935f5eea0..c26fa845ed 100644 --- a/src/finn/transformation/fpgadataflow/make_pynq_driver.py +++ b/src/finn/transformation/fpgadataflow/make_pynq_driver.py @@ -26,9 +26,9 @@ # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+import json import numpy as np import os -import json import qonnx import shutil import warnings @@ -303,6 +303,8 @@ def apply(self, model): else: continue + return (model, False) + class MakePYNQDriverInstrumentation(Transformation): def __init__(self, platform, clk_period_ns): @@ -320,7 +322,8 @@ def apply(self, model): # create (copy) the static instrumentation driver driver_template = ( - os.environ["FINN_ROOT"] + "/src/finn/qnn-data/templates/driver/driver_instrumentation.py" + os.environ["FINN_ROOT"] + + "/src/finn/qnn-data/templates/driver/driver_instrumentation.py" ) driver_py = pynq_driver_dir + "/driver.py" shutil.copy(driver_template, driver_py) diff --git a/src/finn/transformation/fpgadataflow/make_zynq_proj.py b/src/finn/transformation/fpgadataflow/make_zynq_proj.py index 846d95a11b..98372b700f 100644 --- a/src/finn/transformation/fpgadataflow/make_zynq_proj.py +++ b/src/finn/transformation/fpgadataflow/make_zynq_proj.py @@ -174,13 +174,16 @@ def apply(self, model): ) # connect to master interconnect config.append( - "connect_bd_intf_net [get_bd_intf_pins axi_interconnect_0/M%02d_AXI] -boundary_type upper [get_bd_intf_pins axi_interconnect_%d/S00_AXI]" + "connect_bd_intf_net [get_bd_intf_pins axi_interconnect_0/M%02d_AXI] " + "-boundary_type upper [get_bd_intf_pins axi_interconnect_%d/S00_AXI]" % (master_axilite_idx, i) ) - # connect clocks/reset TODO: suppport zynq_7000 + # connect clocks/reset config.append( - "apply_bd_automation -rule xilinx.com:bd_rule:clkrst -config { Clk {/zynq_ps/pl_clk0} Freq {} Ref_Clk0 {} Ref_Clk1 {} Ref_Clk2 {}} [get_bd_pins axi_interconnect_%d/ACLK]" - % (i) + "apply_bd_automation -rule xilinx.com:bd_rule:clkrst -config " + "{ Clk {/zynq_ps/$zynq_ps_clkname} Freq {} " + "Ref_Clk0 {} Ref_Clk1 {} Ref_Clk2 {}} " + "[get_bd_pins axi_interconnect_%d/ACLK]" % (i) ) master_axilite_idx += 1 total_axilite_count = max(0, total_axilite_count - 64) @@ -359,10 +362,11 @@ def apply(self, model): config.append("delete_bd_objs 
[get_bd_cells smartconnect_0]") aximm_idx = 1 - # finalize nested interconnect clock/reset TODO: support zynq_7000 + # finalize nested interconnect clock/reset for i in range(1, nested_interconnect_count + 1): config.append( - "apply_bd_automation -rule xilinx.com:bd_rule:clkrst -config { Clk {/zynq_ps/pl_clk0} } [get_bd_pins axi_interconnect_%d/M*_ACLK]" + "apply_bd_automation -rule xilinx.com:bd_rule:clkrst -config " + "{ Clk {/zynq_ps/$zynq_ps_clkname} } [get_bd_pins axi_interconnect_%d/M*_ACLK]" % (i) ) diff --git a/src/finn/transformation/fpgadataflow/templates.py b/src/finn/transformation/fpgadataflow/templates.py index d9040d83f2..6cde5cfa66 100644 --- a/src/finn/transformation/fpgadataflow/templates.py +++ b/src/finn/transformation/fpgadataflow/templates.py @@ -146,6 +146,7 @@ create_bd_design "top" if {$ZYNQ_TYPE == "zynq_us+"} { set zynq_ps_vlnv [get_property VLNV [get_ipdefs "xilinx.com:ip:zynq_ultra_ps_e:*"]] + set zynq_ps_clkname "pl_clk0" create_bd_cell -type ip -vlnv $zynq_ps_vlnv zynq_ps apply_bd_automation -rule xilinx.com:bd_rule:zynq_ultra_ps_e -config {apply_board_preset "1" } [get_bd_cells zynq_ps] #activate one slave port, deactivate the second master port @@ -156,6 +157,7 @@ set_property -dict [list CONFIG.PSU__CRL_APB__PL0_REF_CTRL__FREQMHZ [expr int($FREQ_MHZ)]] [get_bd_cells zynq_ps] } elseif {$ZYNQ_TYPE == "zynq_7000"} { set zynq_ps_vlnv [get_property VLNV [get_ipdefs "xilinx.com:ip:processing_system7:*"]] + set zynq_ps_clkname "FCLK_CLK0" create_bd_cell -type ip -vlnv $zynq_ps_vlnv zynq_ps apply_bd_automation -rule xilinx.com:bd_rule:processing_system7 -config {make_external "FIXED_IO, DDR" apply_board_preset "1" Master "Disable" Slave "Disable" } [get_bd_cells zynq_ps] set_property -dict [list CONFIG.PCW_USE_S_AXI_HP0 {1}] [get_bd_cells zynq_ps] From 4bf21a295ee38d65b33212012c8952f167db03dc Mon Sep 17 00:00:00 2001 From: Felix Jentzsch Date: Thu, 6 Mar 2025 11:05:19 +0000 Subject: [PATCH 064/125] Force disable additional AXI-lite 
interfaces for live FIFO sizing --- src/finn/builder/build_dataflow_steps.py | 31 ++++++++++++++++++++++-- 1 file changed, 29 insertions(+), 2 deletions(-) diff --git a/src/finn/builder/build_dataflow_steps.py b/src/finn/builder/build_dataflow_steps.py index 2f05886afd..5dc971cf33 100644 --- a/src/finn/builder/build_dataflow_steps.py +++ b/src/finn/builder/build_dataflow_steps.py @@ -91,8 +91,8 @@ from finn.transformation.fpgadataflow.insert_fifo import InsertFIFO from finn.transformation.fpgadataflow.insert_tlastmarker import InsertTLastMarker from finn.transformation.fpgadataflow.make_pynq_driver import ( - MakePYNQDriverIODMA, MakePYNQDriverInstrumentation, + MakePYNQDriverIODMA, ) from finn.transformation.fpgadataflow.make_zynq_proj import ZynqBuild from finn.transformation.fpgadataflow.minimize_accumulator_width import ( @@ -555,6 +555,29 @@ def step_set_fifo_depths(model: ModelWrapper, cfg: DataflowBuildConfig): # Experimental live FIFO-sizing, overwrites all other FIFO-related behavior if cfg.live_fifo_sizing: + # Disable runtime-writable weights, external weights, and dynamic mode, + # as we don't support additional AXI-lite interfaces next to the FIFOs + for node in model.graph.node: + if node.domain.startswith("finn.custom_op.fpgadataflow"): + node_inst = getCustomOp(node) + try: + if node_inst.get_nodeattr("runtime_writeable_weights") == 1: + node_inst.set_nodeattr("runtime_writeable_weights", 0) + if node_inst.get_nodeattr("ram_style") == "ultra": + node_inst.set_nodeattr("ram_style", "block") + except AttributeError: + pass + try: + if node_inst.get_nodeattr("mem_mode") == "external": + node_inst.set_nodeattr("mem_mode", "internal_decoupled") + except AttributeError: + pass + try: + if node_inst.get_nodeattr("dynamic_mode") == 1: + node_inst.set_nodeattr("dynamic_mode", 0) + except AttributeError: + pass + # Create all DWCs and FIFOs normally model = model.transform(InsertDWC()) model = model.transform(InsertFIFO(create_shallow_fifos=True)) @@ -826,7 
+849,11 @@ def step_make_pynq_driver(model: ModelWrapper, cfg: DataflowBuildConfig): if DataflowOutputType.PYNQ_DRIVER in cfg.generate_outputs: driver_dir = cfg.output_dir + "/driver" if cfg.enable_instrumentation: - model = model.transform(MakePYNQDriverInstrumentation(cfg._resolve_driver_platform(), cfg.synth_clk_period_ns, cfg.live_fifo_sizing)) + model = model.transform( + MakePYNQDriverInstrumentation( + cfg._resolve_driver_platform(), cfg.synth_clk_period_ns, cfg.live_fifo_sizing + ) + ) else: model = model.transform(MakePYNQDriverIODMA(cfg._resolve_driver_platform())) shutil.copytree(model.get_metadata_prop("pynq_driver_dir"), driver_dir, dirs_exist_ok=True) From 5cc29245fdff75931bfd9f8feee86261f1231f46 Mon Sep 17 00:00:00 2001 From: Felix Jentzsch Date: Thu, 6 Mar 2025 17:11:31 +0000 Subject: [PATCH 065/125] Enable live fifosizing for transformer --- benchmarking/bench_base.py | 2 ++ benchmarking/cfg/synthetic_fifotest.json | 30 ++++++++++++------------ benchmarking/collect.py | 2 +- benchmarking/dut/transformer.py | 26 ++++++++++---------- 4 files changed, 30 insertions(+), 30 deletions(-) diff --git a/benchmarking/bench_base.py b/benchmarking/bench_base.py index 61a999750c..5f828ca4e4 100644 --- a/benchmarking/bench_base.py +++ b/benchmarking/bench_base.py @@ -426,6 +426,8 @@ def steps_full_build_flow(self): cfg.force_python_rtlsim = False cfg.split_large_fifos = True cfg.enable_instrumentation = True # no IODMA functional correctness/accuracy test yet + cfg.save_intermediate_models = True # Save the intermediate model graphs + cfg.verify_save_full_context = True, # Output full context dump for verification steps #rtlsim_use_vivado_comps # TODO ? 
#cfg.default_swg_exception #cfg.large_fifo_mem_style diff --git a/benchmarking/cfg/synthetic_fifotest.json b/benchmarking/cfg/synthetic_fifotest.json index dfc63c6240..7e362200af 100644 --- a/benchmarking/cfg/synthetic_fifotest.json +++ b/benchmarking/cfg/synthetic_fifotest.json @@ -1,15 +1,15 @@ [ { "dut": ["synthetic_nonlinear"], - "dim": [32], + "dim": [64], "kernel_size": [5], - "ch": [4], - "simd": [4], - "pe": [4], + "ch": [8], + "simd": [8], + "pe": [8], "parallel_window": [1], "lb_num_layers": [1], - "rb_num_layers": [3], + "rb_num_layers": [4], "board": ["RFSoC2x2"], "clock_period_ns": [10], @@ -21,15 +21,15 @@ }, { "dut": ["synthetic_nonlinear"], - "dim": [32], + "dim": [64], "kernel_size": [5], - "ch": [4], - "simd": [4], - "pe": [4], + "ch": [8], + "simd": [8], + "pe": [8], "parallel_window": [1], "lb_num_layers": [1], - "rb_num_layers": [3], + "rb_num_layers": [4], "board": ["RFSoC2x2"], "clock_period_ns": [10], @@ -42,15 +42,15 @@ }, { "dut": ["synthetic_nonlinear"], - "dim": [32], + "dim": [64], "kernel_size": [5], - "ch": [4], - "simd": [4], - "pe": [4], + "ch": [8], + "simd": [8], + "pe": [8], "parallel_window": [1], "lb_num_layers": [1], - "rb_num_layers": [3], + "rb_num_layers": [4], "board": ["RFSoC2x2"], "clock_period_ns": [10], diff --git a/benchmarking/collect.py b/benchmarking/collect.py index bcff28104c..45f6073d1b 100644 --- a/benchmarking/collect.py +++ b/benchmarking/collect.py @@ -49,7 +49,7 @@ def log_nested_metrics_from_report(id, live, report_name, key_top, keys, prefix= if __name__ == "__main__": # Go through all runs found in the artifacts and log their results to DVC run_dir_list = os.listdir(os.path.join("build_artifacts", "runs_output")) - print("Looking for runs in %s" % run_dir_list) + print("Looking for runs in build artifacts") run_ids = [] for run_dir in run_dir_list: if run_dir.startswith("run_"): diff --git a/benchmarking/dut/transformer.py b/benchmarking/dut/transformer.py index ea9713edfa..5d0566a476 100644 --- 
a/benchmarking/dut/transformer.py +++ b/benchmarking/dut/transformer.py @@ -910,15 +910,6 @@ def step_build_setup(self): verify_input_npy=self.build_inputs["input_npy_path"], # File with expected test outputs for verification verify_expected_output_npy=self.build_inputs["output_npy_path"], - # Output full context dump for verification steps - verify_save_full_context=True, - # Save the intermediate model graphs - save_intermediate_models=True, - # Avoid RTL simulation for setting the FIFO sizes - auto_fifo_strategy=AutoFIFOSizingMethod.CHARACTERIZE, - # Do not automatically set FIFO sizes as this requires RTL simulation - # not implemented for the attention operator - auto_fifo_depths=False, # Build steps to execute steps=[ # Prepares the QONNX graph to be consumed by FINN: Cleanup, lowering @@ -963,11 +954,6 @@ def step_build_setup(self): "step_generate_estimate_reports", "step_hw_codegen", "step_hw_ipgen", - # Set the attention- and residual-related FIFO depths insert FIFOs - # and apply folding configuration once again - # Note: Implement all FIFOs with a depth at least as deep as the - # sequence length in URAM. 
- set_fifo_depths(seq_len, emb_dim, uram_threshold=seq_len), # Run additional node-by-node verification in RTL simulation of the # model before creating the stitched IP # Note: end-to-end verification of the stitched IP in RTL simulation @@ -985,4 +971,16 @@ def step_build_setup(self): ] ) + # TESTING custom vs live FIFO-sizing + if self.params["fifo_method"] == "live": + # insert default FIFO-sizing step (behind step_generate_estimate_reports) + for i in range(len(cfg.steps)): + if cfg.steps[i] == "step_generate_estimate_reports": + cfg.steps.insert(i+1, "step_set_fifo_depths") + else: + # insert Christoph's custom FIFO-sizing step (behind step_hw_ipgen) + for i in range(len(cfg.steps)): + if cfg.steps[i] == "step_hw_ipgen": + cfg.steps.insert(i+1, set_fifo_depths(seq_len, emb_dim, uram_threshold=seq_len)) + return cfg From 05f72d2d28a11857ed10a411ce2970a5280320a0 Mon Sep 17 00:00:00 2001 From: Felix Jentzsch Date: Fri, 7 Mar 2025 10:46:25 +0000 Subject: [PATCH 066/125] Minor fixes for Transformer flow --- benchmarking/dut/transformer.py | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/benchmarking/dut/transformer.py b/benchmarking/dut/transformer.py index 5d0566a476..819b9b5fa2 100644 --- a/benchmarking/dut/transformer.py +++ b/benchmarking/dut/transformer.py @@ -21,6 +21,7 @@ # FINN dataflow builder import finn.builder.build_dataflow_config as build_cfg from finn.builder.build_dataflow_config import AutoFIFOSizingMethod +from qonnx.core.modelwrapper import ModelWrapper from bench_base import bench # Range information structure for seeding the range analysis for converting @@ -855,10 +856,14 @@ def step_build_setup(self): seq_len, emb_dim = self.params["model_seq_len"], self.params["model_emb_dim"] else: # for real input models - _, seq_len, emb_dim = np.load(self.build_inputs["input_npy_path"]).shape - # TODO: use the following to get dimensions for GPT models? 
- #model = ModelWrapper(self.build_inputs["onnx_path"]) - #_, emb_dim, seq_len = model.get_tensor_shape("/emb_add/input_quant/export_handler/Quant_output_0") + inp_shape = np.load(self.build_inputs["input_npy_path"]).shape + if len(inp_shape) == 3: + # for RadioML Transformers + _, seq_len, emb_dim = inp_shape + else: + # for GPTs (why is this different?) + model = ModelWrapper(self.build_inputs["onnx_path"]) + _, seq_len, emb_dim = model.get_tensor_shape("/emb_add/input_quant/export_handler/Quant_output_0") # Read the input value range information for the dataset from the parameters # Note: Consider calibrating this on the fly from the dataset @@ -972,7 +977,7 @@ def step_build_setup(self): ) # TESTING custom vs live FIFO-sizing - if self.params["fifo_method"] == "live": + if self.params.get("fifo_method") == "live": # insert default FIFO-sizing step (behind step_generate_estimate_reports) for i in range(len(cfg.steps)): if cfg.steps[i] == "step_generate_estimate_reports": From 230ac92471342c0a28e91168fc3b57895b0c8651 Mon Sep 17 00:00:00 2001 From: Felix Jentzsch Date: Fri, 7 Mar 2025 11:52:05 +0000 Subject: [PATCH 067/125] Fix clkname variable expansion --- src/finn/transformation/fpgadataflow/make_zynq_proj.py | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/src/finn/transformation/fpgadataflow/make_zynq_proj.py b/src/finn/transformation/fpgadataflow/make_zynq_proj.py index 98372b700f..c6449468cf 100644 --- a/src/finn/transformation/fpgadataflow/make_zynq_proj.py +++ b/src/finn/transformation/fpgadataflow/make_zynq_proj.py @@ -181,9 +181,7 @@ def apply(self, model): # connect clocks/reset config.append( "apply_bd_automation -rule xilinx.com:bd_rule:clkrst -config " - "{ Clk {/zynq_ps/$zynq_ps_clkname} Freq {} " - "Ref_Clk0 {} Ref_Clk1 {} Ref_Clk2 {}} " - "[get_bd_pins axi_interconnect_%d/ACLK]" % (i) + '"Clk /zynq_ps/$zynq_ps_clkname" [get_bd_pins axi_interconnect_%d/ACLK]' % (i) ) master_axilite_idx += 1 total_axilite_count = max(0, 
total_axilite_count - 64) @@ -366,8 +364,7 @@ def apply(self, model): for i in range(1, nested_interconnect_count + 1): config.append( "apply_bd_automation -rule xilinx.com:bd_rule:clkrst -config " - "{ Clk {/zynq_ps/$zynq_ps_clkname} } [get_bd_pins axi_interconnect_%d/M*_ACLK]" - % (i) + '"Clk /zynq_ps/$zynq_ps_clkname" [get_bd_pins axi_interconnect_%d/M*_ACLK]' % (i) ) # create a temporary folder for the project From 8dcf182129813a81221d1ca764f1045a7a24ac09 Mon Sep 17 00:00:00 2001 From: Felix Jentzsch Date: Fri, 7 Mar 2025 12:56:02 +0000 Subject: [PATCH 068/125] [Driver] Reset PYNQ cache before loading overlay --- src/finn/qnn-data/templates/driver/driver_fifosizing.py | 2 ++ src/finn/qnn-data/templates/driver/driver_instrumentation.py | 2 ++ 2 files changed, 4 insertions(+) diff --git a/src/finn/qnn-data/templates/driver/driver_fifosizing.py b/src/finn/qnn-data/templates/driver/driver_fifosizing.py index be1f20156a..778d74b21e 100644 --- a/src/finn/qnn-data/templates/driver/driver_fifosizing.py +++ b/src/finn/qnn-data/templates/driver/driver_fifosizing.py @@ -3,6 +3,7 @@ import os import argparse import matplotlib.pyplot as plt +from pynq import PL from pynq.pl_server.device import Device from driver_instrumentation import FINNInstrumentationOverlay @@ -211,6 +212,7 @@ def determine_start_depth(self, ): print("Programming FPGA..") + PL.reset() # reset PYNQ cache accel = FINNLiveFIFOOverlay(bitfile_name = bitfile, device = device, fclk_mhz = frequency, seed = seed, fifo_widths = fifo_widths) (start_depth, iteration_runtime) = accel.determine_start_depth() diff --git a/src/finn/qnn-data/templates/driver/driver_instrumentation.py b/src/finn/qnn-data/templates/driver/driver_instrumentation.py index 90a0ed5b89..51c85587cf 100644 --- a/src/finn/qnn-data/templates/driver/driver_instrumentation.py +++ b/src/finn/qnn-data/templates/driver/driver_instrumentation.py @@ -2,6 +2,7 @@ import json import time from pynq import Overlay +from pynq import PL from 
pynq.pl_server.device import Device from pynq.ps import Clocks @@ -126,6 +127,7 @@ def observe_instrumentation(self, debug_print=True): # instantiate FINN accelerator driver and pass batchsize and bitfile print("Programming FPGA..") + PL.reset() # reset PYNQ cache accel = FINNInstrumentationOverlay( bitfile_name=bitfile, device=device, fclk_mhz=frequency, seed=seed ) From 8e4a2095235bfdab0e47dfde8a51019143a8bfd0 Mon Sep 17 00:00:00 2001 From: Felix Jentzsch Date: Fri, 7 Mar 2025 14:34:53 +0000 Subject: [PATCH 069/125] Add VGG-10 and MobileNetV1 --- .gitlab-ci.yml | 2 +- benchmarking/bench.py | 4 + benchmarking/cfg/mobilenetv1_test.json | 32 +++++ benchmarking/cfg/vgg10_test.json | 32 +++++ benchmarking/dut/mobilenetv1.py | 160 +++++++++++++++++++++++++ benchmarking/dut/vgg10.py | 53 ++++++++ 6 files changed, 282 insertions(+), 1 deletion(-) create mode 100644 benchmarking/cfg/mobilenetv1_test.json create mode 100644 benchmarking/cfg/vgg10_test.json create mode 100644 benchmarking/dut/mobilenetv1.py create mode 100644 benchmarking/dut/vgg10.py diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 79d772f65d..074bc98f0c 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -214,4 +214,4 @@ Bench: PARENT_PIPELINE_ID: $CI_PIPELINE_ID parallel: matrix: - - BENCH_CFG: [mvau_test, resnet50_test, metafi_test, transformer_test, transformer_radioml_all, synthetic_fifotest] + - BENCH_CFG: [mvau_test, resnet50_test, metafi_test, transformer_test, transformer_radioml_all, synthetic_fifotest, vgg10_test, mobilenetv1_test] diff --git a/benchmarking/bench.py b/benchmarking/bench.py index ea85082fc8..41cfdbbbf7 100644 --- a/benchmarking/bench.py +++ b/benchmarking/bench.py @@ -11,6 +11,8 @@ from dut.metafi import bench_metafi from dut.synthetic_nonlinear import bench_synthetic_nonlinear from dut.transformer import bench_transformer +from dut.vgg10 import bench_vgg10 +from dut.mobilenetv1 import bench_mobilenetv1 dut = dict() dut["mvau"] = bench_mvau @@ -18,6 +20,8 @@ dut["metafi"] = 
bench_metafi dut["synthetic_nonlinear"] = bench_synthetic_nonlinear dut["transformer"] = bench_transformer +dut["vgg10"] = bench_vgg10 +dut["mobilenetv1"] = bench_mobilenetv1 def main(config_name): diff --git a/benchmarking/cfg/mobilenetv1_test.json b/benchmarking/cfg/mobilenetv1_test.json new file mode 100644 index 0000000000..d080638722 --- /dev/null +++ b/benchmarking/cfg/mobilenetv1_test.json @@ -0,0 +1,32 @@ +[ + { + "dut": ["mobilenetv1"], + + "model_path": ["/pc2/groups/hpc-prf-ekiapp/felix/ci_models/MobileNetV1/mobilenetv1-w4a4_pre_post_tidy_opset-11.onnx"], + "folding_path": ["/pc2/groups/hpc-prf-ekiapp/felix/ci_models/MobileNetV1/ZCU102_folding_config.json"], + "specialize_path": ["/pc2/groups/hpc-prf-ekiapp/felix/ci_models/MobileNetV1/ZCU102_specialize_layers.json"], + + "board": ["RFSoC2x2"], + "clock_period_ns": [10], + + "fifo_method": ["manual"], + + "rtlsim_n": [5], + "output_products": [["estimate_reports", "rtlsim_performance", "stitched_ip", "out_of_context_synth", "bitfile", "pynq_driver", "deployment_package"]] + }, + { + "dut": ["mobilenetv1"], + + "model_path": ["/pc2/groups/hpc-prf-ekiapp/felix/ci_models/MobileNetV1/mobilenetv1-w4a4_pre_post_tidy_opset-11.onnx"], + "folding_path": ["/pc2/groups/hpc-prf-ekiapp/felix/ci_models/MobileNetV1/ZCU102_folding_config.json"], + "specialize_path": ["/pc2/groups/hpc-prf-ekiapp/felix/ci_models/MobileNetV1/ZCU102_specialize_layers.json"], + + "board": ["RFSoC2x2"], + "clock_period_ns": [10], + + "fifo_method": ["live"], + + "rtlsim_n": [5], + "output_products": [["bitfile", "pynq_driver", "deployment_package"]] + } +] \ No newline at end of file diff --git a/benchmarking/cfg/vgg10_test.json b/benchmarking/cfg/vgg10_test.json new file mode 100644 index 0000000000..7a6e1a5deb --- /dev/null +++ b/benchmarking/cfg/vgg10_test.json @@ -0,0 +1,32 @@ +[ + { + "dut": ["vgg10"], + + "model_path": ["/pc2/groups/hpc-prf-ekiapp/felix/ci_models/VGG-10/radioml_w4a4_small_tidy.onnx"], + "folding_path": 
["/pc2/groups/hpc-prf-ekiapp/felix/ci_models/VGG-10/ZCU104_folding_config.json"], + "specialize_path": ["/pc2/groups/hpc-prf-ekiapp/felix/ci_models/VGG-10/ZCU104_specialize_layers.json"], + + "board": ["RFSoC2x2"], + "clock_period_ns": [10], + + "fifo_method": ["largefifo_rtlsim"], + + "rtlsim_n": [5], + "output_products": [["estimate_reports", "rtlsim_performance", "stitched_ip", "out_of_context_synth", "bitfile", "pynq_driver", "deployment_package"]] + }, + { + "dut": ["vgg10"], + + "model_path": ["/pc2/groups/hpc-prf-ekiapp/felix/ci_models/VGG-10/radioml_w4a4_small_tidy.onnx"], + "folding_path": ["/pc2/groups/hpc-prf-ekiapp/felix/ci_models/VGG-10/ZCU104_folding_config.json"], + "specialize_path": ["/pc2/groups/hpc-prf-ekiapp/felix/ci_models/VGG-10/ZCU104_specialize_layers.json"], + + "board": ["RFSoC2x2"], + "clock_period_ns": [10], + + "fifo_method": ["live"], + + "rtlsim_n": [5], + "output_products": [["bitfile", "pynq_driver", "deployment_package"]] + } +] \ No newline at end of file diff --git a/benchmarking/dut/mobilenetv1.py b/benchmarking/dut/mobilenetv1.py new file mode 100644 index 0000000000..56b13a6095 --- /dev/null +++ b/benchmarking/dut/mobilenetv1.py @@ -0,0 +1,160 @@ +from qonnx.core.modelwrapper import ModelWrapper +from finn.builder.build_dataflow_config import ( + DataflowBuildConfig, + ShellFlowType, + VerificationStepType, +) +from finn.builder.build_dataflow_steps import verify_step +from finn.transformation.streamline import Streamline +from qonnx.transformation.double_to_single_float import DoubleToSingleFloat +import finn.transformation.streamline.absorb as absorb +import finn.transformation.streamline.reorder as reorder +from qonnx.transformation.infer_data_layouts import InferDataLayouts +from finn.transformation.streamline.collapse_repeated import CollapseRepeatedMul +from qonnx.transformation.remove import RemoveIdentityOps +from finn.transformation.streamline.round_thresholds import RoundAndClipThresholds +from 
qonnx.transformation.lower_convs_to_matmul import LowerConvsToMatMul +from qonnx.transformation.general import ( + GiveReadableTensorNames, + GiveUniqueNodeNames, + ApplyConfig, +) +import finn.transformation.fpgadataflow.convert_to_hw_layers as to_hw +from qonnx.transformation.infer_shapes import InferShapes +from qonnx.transformation.change_datalayout import ChangeDataLayoutQuantAvgPool2d +from qonnx.transformation.infer_datatypes import InferDataTypes + +from bench_base import bench + + +def step_mobilenet_streamline(model: ModelWrapper, cfg: DataflowBuildConfig): + model = model.transform(Streamline()) + additional_streamline_transformations = [ + DoubleToSingleFloat(), + reorder.MoveMulPastDWConv(), + absorb.AbsorbMulIntoMultiThreshold(), + ChangeDataLayoutQuantAvgPool2d(), + InferDataLayouts(), + reorder.MoveTransposePastScalarMul(), + absorb.AbsorbTransposeIntoFlatten(), + reorder.MoveFlattenPastAffine(), + reorder.MoveFlattenPastTopK(), + reorder.MoveScalarMulPastMatMul(), + CollapseRepeatedMul(), + RemoveIdentityOps(), + RoundAndClipThresholds(), + ] + for trn in additional_streamline_transformations: + model = model.transform(trn) + model = model.transform(GiveUniqueNodeNames()) + model = model.transform(GiveReadableTensorNames()) + model = model.transform(InferDataTypes()) + + if VerificationStepType.STREAMLINED_PYTHON in cfg._resolve_verification_steps(): + verify_step(model, cfg, "streamlined_python", need_parent=False) + + return model + +def step_mobilenet_lower_convs(model: ModelWrapper, cfg: DataflowBuildConfig): + model = model.transform(LowerConvsToMatMul()) + model = model.transform(absorb.AbsorbTransposeIntoMultiThreshold()) + model = model.transform(absorb.AbsorbConsecutiveTransposes()) + model = model.transform(GiveUniqueNodeNames()) + model = model.transform(GiveReadableTensorNames()) + model = model.transform(InferDataTypes()) + model = model.transform(RoundAndClipThresholds()) + model = model.transform(InferDataLayouts()) + return model + 
+def step_mobilenet_convert_to_hw_layers(model: ModelWrapper, cfg: DataflowBuildConfig): + model = model.transform(to_hw.InferPool()) + model = model.transform(to_hw.InferConvInpGen()) + model = model.transform(to_hw.InferVectorVectorActivation()) + model = model.transform(to_hw.InferQuantizedMatrixVectorActivation()) + model = model.transform(to_hw.InferChannelwiseLinearLayer()) + model = model.transform(to_hw.InferLabelSelectLayer()) + model = model.transform(InferShapes()) + model = model.transform(GiveUniqueNodeNames()) + model = model.transform(GiveReadableTensorNames()) + return model + +def step_mobilenet_slr_floorplan(model: ModelWrapper, cfg: DataflowBuildConfig): + if cfg.shell_flow_type == ShellFlowType.VITIS_ALVEO: + try: + from finnexperimental.analysis.partitioning import partition + + # apply partitioning of the model, restricting the first and last layers + # to SLR0 + default_slr = 0 + abs_anchors = [(0, [default_slr]), (-1, [default_slr])] + floorplan = partition( + model, + cfg.synth_clk_period_ns, + cfg.board, + abs_anchors=abs_anchors, + multivariant=False, + )[0] + # apply floorplan to model + model = model.transform(ApplyConfig(floorplan)) + print("SLR floorplanning applied") + except Exception: + print("No SLR floorplanning applied") + return model + +def step_mobilenet_convert_to_hw_layers_separate_th(model: ModelWrapper, cfg: DataflowBuildConfig): + model = model.transform(to_hw.InferPool()) + model = model.transform(to_hw.InferConvInpGen()) + model = model.transform(to_hw.InferThresholdingLayer()) + model = model.transform(to_hw.InferVectorVectorActivation()) + model = model.transform(to_hw.InferQuantizedMatrixVectorActivation()) + model = model.transform(to_hw.InferChannelwiseLinearLayer()) + model = model.transform(to_hw.InferLabelSelectLayer()) + model = model.transform(InferShapes()) + model = model.transform(GiveUniqueNodeNames()) + model = model.transform(GiveReadableTensorNames()) + return model + +class bench_mobilenetv1(bench): + 
def step_build_setup(self): + # create build config for MobileNetV1 (based on finn-examples) + mobilenet_build_steps = [ + step_mobilenet_streamline, + step_mobilenet_lower_convs, + step_mobilenet_convert_to_hw_layers_separate_th, + "step_create_dataflow_partition", + "step_specialize_layers", + "step_apply_folding_config", + "step_minimize_bit_width", + "step_generate_estimate_reports", + "step_hw_codegen", + "step_hw_ipgen", + "step_set_fifo_depths", + "step_create_stitched_ip", + "step_synthesize_bitfile", + "step_make_pynq_driver", + "step_deployment_package", + ] + # mobilenet_build_steps_alveo = [ + # step_mobilenet_streamline, + # step_mobilenet_lower_convs, + # step_mobilenet_convert_to_hw_layers, + # "step_create_dataflow_partition", + # "step_specialize_layers", + # "step_apply_folding_config", + # "step_minimize_bit_width", + # "step_generate_estimate_reports", + # "step_hw_codegen", + # "step_hw_ipgen", + # "step_set_fifo_depths", + # "step_create_stitched_ip", + # step_mobilenet_slr_floorplan, + # "step_synthesize_bitfile", + # "step_make_pynq_driver", + # "step_deployment_package", + # ] + + cfg = DataflowBuildConfig( + steps=mobilenet_build_steps, + ) + + return cfg diff --git a/benchmarking/dut/vgg10.py b/benchmarking/dut/vgg10.py new file mode 100644 index 0000000000..e64a58fb2f --- /dev/null +++ b/benchmarking/dut/vgg10.py @@ -0,0 +1,53 @@ +from qonnx.core.modelwrapper import ModelWrapper +from finn.builder.build_dataflow_config import DataflowBuildConfig +from qonnx.transformation.change_3d_tensors_to_4d import Change3DTo4DTensors +from qonnx.transformation.general import GiveUniqueNodeNames +import finn.transformation.fpgadataflow.convert_to_hw_layers as to_hw +import finn.transformation.streamline.absorb as absorb + +from bench_base import bench + + +def step_pre_streamline(model: ModelWrapper, cfg: DataflowBuildConfig): + model = model.transform(Change3DTo4DTensors()) + model = model.transform(absorb.AbsorbScalarMulAddIntoTopK()) + return 
model + +def step_convert_final_layers(model: ModelWrapper, cfg: DataflowBuildConfig): + model = model.transform(to_hw.InferChannelwiseLinearLayer()) + model = model.transform(to_hw.InferLabelSelectLayer()) + model = model.transform(GiveUniqueNodeNames()) + return model + +class bench_vgg10(bench): + def step_build_setup(self): + # create build config for VGG-10 (based on finn-examples) + vgg10_build_steps = [ + "step_tidy_up", + step_pre_streamline, + "step_streamline", + "step_convert_to_hw", + step_convert_final_layers, + "step_create_dataflow_partition", + "step_specialize_layers", + "step_target_fps_parallelization", + "step_apply_folding_config", + "step_minimize_bit_width", + "step_generate_estimate_reports", + "step_set_fifo_depths", + "step_hw_codegen", + "step_hw_ipgen", + "step_create_stitched_ip", + "step_measure_rtlsim_performance", + "step_out_of_context_synthesis", + "step_synthesize_bitfile", + "step_make_pynq_driver", + "step_deployment_package", + ] + + cfg = DataflowBuildConfig( + steps=vgg10_build_steps, + standalone_thresholds=True, + ) + + return cfg From 68c41b8d5366f4809b07f11ab53f843e50afafcc Mon Sep 17 00:00:00 2001 From: Felix Jentzsch Date: Fri, 7 Mar 2025 17:17:33 +0000 Subject: [PATCH 070/125] Fix variable named range --- benchmarking/dut/transformer.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/benchmarking/dut/transformer.py b/benchmarking/dut/transformer.py index 819b9b5fa2..ec737ce6b8 100644 --- a/benchmarking/dut/transformer.py +++ b/benchmarking/dut/transformer.py @@ -867,8 +867,8 @@ def step_build_setup(self): # Read the input value range information for the dataset from the parameters # Note: Consider calibrating this on the fly from the dataset - range = [ -100, +100 ] # params["build"]["range"] # TODO: make configurable? - input_range = tuple(np.array([range]).T) + value_range = [ -100, +100 ] # params["build"]["range"] # TODO: make configurable? 
+ input_range = tuple(np.array([value_range]).T) # Construct the seed range information of the input tensor range_info = RangeInfo(shape=(1, seq_len, emb_dim), range=input_range) From 9884beff238d6a2a7d52f449b2600797fecf329a Mon Sep 17 00:00:00 2001 From: Felix Jentzsch Date: Sat, 8 Mar 2025 17:39:30 +0000 Subject: [PATCH 071/125] Reduce benchmark parallelism, force push exp --- .gitlab-ci.yml | 2 +- benchmarking/bench-ci.yml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 074bc98f0c..6b8e8369b8 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -20,7 +20,7 @@ variables: value: "64" CPU_CORES_BENCH: description: "Select number of CPU cores for benchmark runs" - value: "32" + value: "8" PARALLEL_JOBS: description: "Number of parallel Slurm array jobs per Benchmark job" value: "2" diff --git a/benchmarking/bench-ci.yml b/benchmarking/bench-ci.yml index 99adf1e0dc..73b91508d7 100644 --- a/benchmarking/bench-ci.yml +++ b/benchmarking/bench-ci.yml @@ -73,4 +73,4 @@ Result Collection: - when: always script: - python3.10 benchmarking/collect.py - - dvc exp push -r push git@github.com:eki-project/finn-plus.git + - dvc exp push -f -j 4 -r push git@github.com:eki-project/finn-plus.git From ef7b8cf5bb124b440129a4197a9a83b064d99397 Mon Sep 17 00:00:00 2001 From: Felix Jentzsch Date: Sat, 8 Mar 2025 18:10:59 +0000 Subject: [PATCH 072/125] Fix FIFO width export for driver --- .../templates/driver/driver_fifosizing.py | 222 ++++++++++++------ .../fpgadataflow/make_pynq_driver.py | 13 +- 2 files changed, 155 insertions(+), 80 deletions(-) diff --git a/src/finn/qnn-data/templates/driver/driver_fifosizing.py b/src/finn/qnn-data/templates/driver/driver_fifosizing.py index 560959991f..5aa116ebac 100644 --- a/src/finn/qnn-data/templates/driver/driver_fifosizing.py +++ b/src/finn/qnn-data/templates/driver/driver_fifosizing.py @@ -1,27 +1,32 @@ -import time -import json -import os import argparse -import matplotlib as mpl 
+import json import matplotlib.pyplot as plt -import numpy as np -from pynq.pl_server.device import Device - +import os +import sys +import time from driver_instrumentation import FINNInstrumentationOverlay +from pynq.pl_server.device import Device class FINNLiveFIFOOverlay(FINNInstrumentationOverlay): def __init__( self, bitfile_name, - platform = "zynq", - fclk_mhz = 100.0, - device = None, - download = True, - seed = 1, - fifo_widths = {}, + platform="zynq", + fclk_mhz=100.0, + device=None, + download=True, + seed=1, + fifo_widths=dict(), ): - super().__init__(bitfile_name, platform = platform, fclk_mhz = fclk_mhz, seed = seed, download = download, device = device) + super().__init__( + bitfile_name, + platform=platform, + fclk_mhz=fclk_mhz, + seed=seed, + download=download, + device=device, + ) self.error = False self.fifo_widths = fifo_widths @@ -33,9 +38,13 @@ def __init__( # We expect 3 AXI-Lite peripherals next to the virtual FIFOs: instrumentation_wrap_0, axi_gpio_0 (for reset), zynq_ps # We don't expect any additional FINN SDPs with AXI-Lite interface, such as runtime-writable weights if (len(self.ip_dict.keys()) - 3) != self.num_fifos: + print( + "Error: Number of expected FIFOs (%d) doesn't match number of AXI-Lite interfaces (%d)" + % (self.num_fifos, len(self.ip_dict.keys()) - 3) + ) self.error = True - def configure_fifo(self, i, mode, depth = 2): + def configure_fifo(self, i, mode, depth=2): ### Virtual FIFO register map ### mode_offset = 0x10 depth_offset = 0x18 @@ -45,43 +54,51 @@ def configure_fifo(self, i, mode, depth = 2): max_occupancy_ctrl_offset = 0x34 ip_name = "StreamingDataflowPartition_%d" % i - getattr(self, ip_name).write(offset=mode_offset, value = mode) - getattr(self, ip_name).write(offset=depth_offset, value = depth) + getattr(self, ip_name).write(offset=mode_offset, value=mode) + getattr(self, ip_name).write(offset=depth_offset, value=depth) def total_fifo_size(self, depths): # Assuming FIFO SDP/AXI-Lite interfaces are ordered 
consistently with FIFO IDs total_size_bits = 0 for i, depth in enumerate(depths): - total_size_bits += (depth + self.fifo_depth_offset) * self.fifo_widths["StreamingFIFO_hls_%d" % i] + total_size_bits += (depth + self.fifo_depth_offset) * self.fifo_widths[i] total_size_kB = total_size_bits / 8.0 / 1000.0 return total_size_kB - - def size_iteratively(self, start_depth, iteration_runtime, reduction_factor = 0.5): + + def size_iteratively(self, start_depth, iteration_runtime, reduction_factor=0.5): ### Iterative FIFO-sizing function ### fifo_minimum_reached = [False] * self.num_fifos - + if isinstance(start_depth, list): # Individual start depth for each FIFO has been supplied fifo_depths = start_depth else: # Initialize all depths to the same start depth fifo_depths = [start_depth] * self.num_fifos - + # Reset accelerator and configure FIFOs self.reset_accelerator() for i in range(0, self.num_fifos): - self.configure_fifo(i, mode = 1, depth = fifo_depths[i]) + self.configure_fifo(i, mode=1, depth=fifo_depths[i]) # Run once to determine target interval self.start_accelerator() time.sleep(1) - (overflow_err, underflow_err, frame, checksum, min_latency, latency, interval) = self.observe_instrumentation(False) + ( + overflow_err, + underflow_err, + frame, + checksum, + min_latency, + latency, + interval, + ) = self.observe_instrumentation(False) log_total_fifo_size = [int(self.total_fifo_size(fifo_depths))] log_interval = [interval] log_min_latency = [min_latency] log_latency = [latency] target_interval = interval - + # Iteratively reduce FIFO depth until all FIFOs are minimized iteration = 0 start_time = time.time() @@ -96,7 +113,7 @@ def size_iteratively(self, start_depth, iteration_runtime, reduction_factor = 0. 
# Configure all FIFOs for i in range(0, self.num_fifos): - self.configure_fifo(i, mode = 1, depth = fifo_depths[i]) + self.configure_fifo(i, mode=1, depth=fifo_depths[i]) # Start accelerator self.start_accelerator() @@ -104,8 +121,16 @@ def size_iteratively(self, start_depth, iteration_runtime, reduction_factor = 0. # Let it run time.sleep(iteration_runtime) - # Check if throughput dropped or deadlock occured - (overflow_err, underflow_err, frame, checksum, min_latency, latency, interval) = self.observe_instrumentation(False) + # Check if throughput dropped or deadlock occured + ( + overflow_err, + underflow_err, + frame, + checksum, + min_latency, + latency, + interval, + ) = self.observe_instrumentation(False) if interval > target_interval or interval == 0 or overflow_err or underflow_err: # Revert depth reduction and mark FIFO as minimized @@ -115,7 +140,7 @@ def size_iteratively(self, start_depth, iteration_runtime, reduction_factor = 0. log_total_fifo_size.append(int(self.total_fifo_size(fifo_depths))) log_interval.append(interval) log_min_latency.append(min_latency) - log_latency.append(latency) + log_latency.append(latency) if fifo_depths[fifo_id] == 1: fifo_minimum_reached[fifo_id] = True @@ -133,9 +158,18 @@ def size_iteratively(self, start_depth, iteration_runtime, reduction_factor = 0. 
duration = int(end_time - start_time) print("Done (%d seconds)" % duration) - return fifo_depths, log_total_fifo_size, log_interval, log_min_latency, log_latency, duration + return ( + fifo_depths, + log_total_fifo_size, + log_interval, + log_min_latency, + log_latency, + duration, + ) - def determine_start_depth(self, ): + def determine_start_depth( + self, + ): ### Attempt to determine start depth for all FIFOs automatically ### # If it doesn't find a working setting, start depth must be set manually, potentially on per-FIFO basis start_depth = 64 @@ -148,15 +182,28 @@ def determine_start_depth(self, ): # Configure FIFOs for i in range(0, self.num_fifos): - self.configure_fifo(i, mode = 1, depth = start_depth) - + self.configure_fifo(i, mode=1, depth=start_depth) + # Start accelerator and let it run for a long time self.start_accelerator() time.sleep(1) - + # Examine performance - (overflow_err, underflow_err, frame, checksum, min_latency, latency, interval) = self.observe_instrumentation() - if interval > 0 and interval == last_interval and not overflow_err and not underflow_err: + ( + overflow_err, + underflow_err, + frame, + checksum, + min_latency, + latency, + interval, + ) = self.observe_instrumentation() + if ( + interval > 0 + and interval == last_interval + and not overflow_err + and not underflow_err + ): # Accelerator runs with stable interval, reset to previous start depth start_depth_found = True start_depth = last_start_depth @@ -164,13 +211,13 @@ def determine_start_depth(self, ): # Start depth is still too small, increase for next try last_start_depth = start_depth start_depth = start_depth * 2 - + last_interval = interval if start_depth > 1000000: print("Couldn't find a working start depth, please set manually") self.error = True - + # Determine runtime per iteration based on performance, so that stable-state is guaranteed # Use a simple overestimation for now to be safe iteration_runtime = max(0.01, (min_latency * 5) * 10 / 1000 / 1000 / 1000) 
@@ -179,15 +226,27 @@ def determine_start_depth(self, ): print("Determined iteration runtime based on performance: %f s" % iteration_runtime) return (start_depth, iteration_runtime) + if __name__ == "__main__": - parser = argparse.ArgumentParser(description='Profile performance of FINN-generated accelerator using instrumentation wrapper') - parser.add_argument('--runtime', help='Runtime in seconds', type=int, default=10) - parser.add_argument('--frequency', help='FPGA clock frequency in MHz', type=float, default=100.0) - parser.add_argument('--seed', help='LFSR seed for input data generation', type=int, default=1) - parser.add_argument('--device', help='FPGA device to be used', type=int, default=0) - parser.add_argument('--bitfile', help='Name of bitfile', default="finn-accel.bit") - parser.add_argument('--reportfile', help='Name of output .json report file', type=str, default="measured_performance.json") - parser.add_argument('--settingsfile', help='Name of optional input .json settings file', type=str, default="") + parser = argparse.ArgumentParser( + description="Profile performance of FINN-generated accelerator using instrumentation wrapper" + ) + parser.add_argument("--runtime", help="Runtime in seconds", type=int, default=10) + parser.add_argument( + "--frequency", help="FPGA clock frequency in MHz", type=float, default=100.0 + ) + parser.add_argument("--seed", help="LFSR seed for input data generation", type=int, default=1) + parser.add_argument("--device", help="FPGA device to be used", type=int, default=0) + parser.add_argument("--bitfile", help="Name of bitfile", default="finn-accel.bit") + parser.add_argument( + "--reportfile", + help="Name of output .json report file", + type=str, + default="measured_performance.json", + ) + parser.add_argument( + "--settingsfile", help="Name of optional input .json settings file", type=str, default="" + ) # parse arguments args = parser.parse_args() runtime = args.runtime @@ -208,58 +267,67 @@ def 
determine_start_depth(self, ): frequency = settings["fclk_mhz"] # For live FIFO-sizing, we also expect a fifo_widths.json file exported by FINN listing the width of each FIFO, e.g., - # {'fifo_widths': {'StreamingFIFO_hls_0': 8, 'StreamingFIFO_hls_1': 32, 'StreamingFIFO_hls_2': 24}} + # {'fifo_widths': {0: 8, 1: 32, 2: 24}} fifo_widths = settings["fifo_widths"] - print("Programming FPGA..") - accel = FINNLiveFIFOOverlay(bitfile_name = bitfile, device = device, fclk_mhz = frequency, seed = seed, fifo_widths = fifo_widths) - + accel = FINNLiveFIFOOverlay( + bitfile_name=bitfile, device=device, fclk_mhz=frequency, seed=seed, fifo_widths=fifo_widths + ) + if accel.error: + print("Error: Accelerator initialization failed.") + sys.exit(1) + + print("Determining start depth..") (start_depth, iteration_runtime) = accel.determine_start_depth() ### First pass print("Starting first pass..") pass1_result = accel.size_iteratively(start_depth, iteration_runtime) - (fifo_depths, - log_total_fifo_size, - log_interval, - log_min_latency, - log_latency, - duration) = pass1_result + ( + fifo_depths, + log_total_fifo_size, + log_interval, + log_min_latency, + log_latency, + duration, + ) = pass1_result ### Visualize results and save as "fifo_sizing_graph.png" fig, ax1 = plt.subplots() - color = 'tab:red' - ax1.set_xlabel('Iteration') - ax1.set_ylabel('Total FIFO Size [kB]', color=color) + color = "tab:red" + ax1.set_xlabel("Iteration") + ax1.set_ylabel("Total FIFO Size [kB]", color=color) ax1.plot(range(len(log_total_fifo_size)), log_total_fifo_size, color=color) - ax1.tick_params(axis='y', labelcolor=color) + ax1.tick_params(axis="y", labelcolor=color) ax1.set_ylim(0, max(log_total_fifo_size)) - - ax2 = ax1.twinx() # instantiate a second axes that shares the same x-axis - color = 'tab:blue' - ax2.set_ylabel('Latency [cycles]', color=color) + ax2 = ax1.twinx() # instantiate a second axes that shares the same x-axis + + color = "tab:blue" + ax2.set_ylabel("Latency [cycles]", 
color=color) ax2.plot(range(len(log_total_fifo_size)), log_latency, color=color) - ax2.tick_params(axis='y', labelcolor=color) - #ax2.set_ylim(0, max(log_latency)) + ax2.tick_params(axis="y", labelcolor=color) + # ax2.set_ylim(0, max(log_latency)) ax2.axhline(log_min_latency[0], color="green", label="Minimum (1st frame) Latency") ax2.legend() plt.tight_layout() - plt.savefig(os.path.join(report_dir, "fifo_sizing_graph.png"), dpi = 300) + plt.savefig(os.path.join(report_dir, "fifo_sizing_graph.png"), dpi=300) ### Second pass for fine-tuning print("Starting second pass..") - pass2_result = accel.size_iteratively(fifo_depths, iteration_runtime, reduction_factor = 0.95) - (fifo_depths, - log_total_fifo_size, - log_interval, - log_min_latency, - log_latency, - duration) = pass2_result + pass2_result = accel.size_iteratively(fifo_depths, iteration_runtime, reduction_factor=0.95) + ( + fifo_depths, + log_total_fifo_size, + log_interval, + log_min_latency, + log_latency, + duration, + ) = pass2_result ### Generate fifo_sizing_report.json fifo_report = { @@ -283,7 +351,7 @@ def determine_start_depth(self, ): }, } for fifo, depth in enumerate(fifo_depths): - size = (depth + accel.fifo_depth_offset) * accel.fifo_widths["StreamingFIFO_hls_%d" % fifo] + size = (depth + accel.fifo_depth_offset) * accel.fifo_widths[fifo] fifo_report["fifo_depths"][fifo] = depth + accel.fifo_depth_offset fifo_report["fifo_sizes"][fifo] = size with open(os.path.join(report_dir, "fifo_sizing_report.json"), "w") as f: @@ -312,9 +380,9 @@ def determine_start_depth(self, ): "latency_ms": round(latency * (1 / (accel.fclk_mhz_actual * 1e6)) * 1e3, 6), "throughput_fps": round(1 / (interval * (1 / (accel.fclk_mhz_actual * 1e6)))), "min_pipeline_depth": round(min_latency / interval, 2), - "pipeline_depth" : round(latency / interval, 2), + "pipeline_depth": round(latency / interval, 2), } with open(reportfile, "w") as f: json.dump(report, f, indent=2) - print("Done.") \ No newline at end of file + 
print("Done.") diff --git a/src/finn/transformation/fpgadataflow/make_pynq_driver.py b/src/finn/transformation/fpgadataflow/make_pynq_driver.py index 9ccc0e08f8..c18adb8d14 100644 --- a/src/finn/transformation/fpgadataflow/make_pynq_driver.py +++ b/src/finn/transformation/fpgadataflow/make_pynq_driver.py @@ -346,10 +346,17 @@ def apply(self, model): } if self.live_fifo_sizing: # export FIFO widths to the settings file as well + # at this stage, the FIFOs are already wrapped in StreamingDataflowPartitions fifo_widths = {} - for node in model.get_nodes_by_op_type("StreamingFIFO_hls"): - node_inst = getCustomOp(node) - fifo_widths[node.name] = node_inst.get_instream_width() + for sdp_node in model.get_nodes_by_op_type("StreamingDataflowPartition"): + sdp_node_inst = getCustomOp(sdp_node) + sdp_id = sdp_node_inst.get_nodeattr("partition_id") + dataflow_model_filename = sdp_node_inst.get_nodeattr("model") + kernel_model = ModelWrapper(dataflow_model_filename) + for node in kernel_model.graph.node: + if node.op_type.startswith("StreamingFIFO"): + node_inst = getCustomOp(node) + fifo_widths[sdp_id] = node_inst.get_instream_width() settings["fifo_widths"] = fifo_widths settingsfile = pynq_driver_dir + "/settings.json" From 4cfc32adb6867e757f8a7f77a9b0d57e06f48b20 Mon Sep 17 00:00:00 2001 From: Felix Jentzsch Date: Sat, 8 Mar 2025 18:26:33 +0000 Subject: [PATCH 073/125] Transformer: disable cppsim for virtual fifosizing --- benchmarking/dut/transformer.py | 7 +++++-- benchmarking/dut/transformer_custom_steps.py | 22 +++++++++++++++++++- 2 files changed, 26 insertions(+), 3 deletions(-) diff --git a/benchmarking/dut/transformer.py b/benchmarking/dut/transformer.py index ec737ce6b8..1798ea1410 100644 --- a/benchmarking/dut/transformer.py +++ b/benchmarking/dut/transformer.py @@ -908,7 +908,7 @@ def step_build_setup(self): # converting to HLS build_cfg.VerificationStepType.TIDY_UP_PYTHON, # Verify the model after generating C++ HLS and applying folding - 
build_cfg.VerificationStepType.FOLDED_HLS_CPPSIM, + #build_cfg.VerificationStepType.FOLDED_HLS_CPPSIM, #only inserted if live FIFO-sizing is off # No RTL Simulation support for now ], # File with test inputs for verification @@ -963,7 +963,7 @@ def step_build_setup(self): # model before creating the stitched IP # Note: end-to-end verification of the stitched IP in RTL simulation # is still not possible due to missing float IPs - node_by_node_cppsim, + #node_by_node_cppsim, #only inserted if live FIFO-sizing is off # Only for debugging for now, does not work if "vivado" style # StreamingFIFOs are used # node_by_node_rtlsim, @@ -987,5 +987,8 @@ def step_build_setup(self): for i in range(len(cfg.steps)): if cfg.steps[i] == "step_hw_ipgen": cfg.steps.insert(i+1, set_fifo_depths(seq_len, emb_dim, uram_threshold=seq_len)) + # also enable cppsim, which doesn't work with virtual FIFOs + cfg.steps.insert(i+2, node_by_node_cppsim) + cfg.verify_steps.append(build_cfg.VerificationStepType.FOLDED_HLS_CPPSIM) return cfg diff --git a/benchmarking/dut/transformer_custom_steps.py b/benchmarking/dut/transformer_custom_steps.py index 4ff497b892..5b0d39c756 100644 --- a/benchmarking/dut/transformer_custom_steps.py +++ b/benchmarking/dut/transformer_custom_steps.py @@ -11,6 +11,8 @@ # YAML for loading experiment configurations import yaml +import json + # QONNX quantization data types from qonnx.core.datatype import DataType @@ -616,7 +618,8 @@ def step_set_fifo_depths(model: ModelWrapper, cfg: DataflowBuildConfig): del config[node.name] # Create/Open a YAML file to store the configuration for later reuse - with open(cfg.output_dir + "/final_hw_config.yaml", "w") as file: + # TODO: make consistent with .json report in default step + with open(cfg.output_dir + "/report/final_hw_config.yaml", "w") as file: # Store the configuration dictionary as YAML code yaml.safe_dump(config, file) @@ -628,6 +631,23 @@ def step_set_fifo_depths(model: ModelWrapper, cfg: DataflowBuildConfig): model = 
model.transform(SplitLargeFIFOs()) model = model.transform(RemoveShallowFIFOs()) + # generate a dedicated report about final FIFO sizes + fifo_info = {} + fifo_info["fifo_depths"] = {} + fifo_info["fifo_sizes"] = {} + total_fifo_size = 0 + for node in model.get_nodes_by_op_type("StreamingFIFO_rtl"): + node_inst = getCustomOp(node) + fifo_info["fifo_depths"][node.name] = node_inst.get_nodeattr("depth") + fifo_info["fifo_sizes"][ + node.name + ] = node_inst.get_instream_width() * node_inst.get_nodeattr("depth") + total_fifo_size += fifo_info["fifo_sizes"][node.name] + fifo_info["total_fifo_size_kB"] = int(total_fifo_size / 8.0 / 1000.0) + + with open(cfg.output_dir + "/report/fifo_sizing.json", "w") as f: + json.dump(fifo_info, f, indent=2) + # After FIFOs are ready to go, call PrepareIP and HLSSynthIP again # this will only run for the new nodes (e.g. FIFOs and DWCs) model = model.transform( From b0fb5f258c984f1f30aea20cefbed1f01b5a27e1 Mon Sep 17 00:00:00 2001 From: Felix Jentzsch Date: Sun, 9 Mar 2025 10:29:39 +0000 Subject: [PATCH 074/125] [Driver] Reset PYNQ cache before loading Overlay --- src/finn/qnn-data/templates/driver/driver_instrumentation.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/finn/qnn-data/templates/driver/driver_instrumentation.py b/src/finn/qnn-data/templates/driver/driver_instrumentation.py index 90a0ed5b89..aa5225eab6 100644 --- a/src/finn/qnn-data/templates/driver/driver_instrumentation.py +++ b/src/finn/qnn-data/templates/driver/driver_instrumentation.py @@ -1,7 +1,7 @@ import argparse import json import time -from pynq import Overlay +from pynq import PL, Overlay from pynq.pl_server.device import Device from pynq.ps import Clocks @@ -126,6 +126,7 @@ def observe_instrumentation(self, debug_print=True): # instantiate FINN accelerator driver and pass batchsize and bitfile print("Programming FPGA..") + PL.reset() # reset PYNQ cache accel = FINNInstrumentationOverlay( bitfile_name=bitfile, device=device, 
fclk_mhz=frequency, seed=seed ) From a08e2c4e12be4fa532f6b81ff428142dfaa757cd Mon Sep 17 00:00:00 2001 From: Felix Jentzsch Date: Sun, 9 Mar 2025 10:33:44 +0000 Subject: [PATCH 075/125] [Driver] Reset PYNQ cache, fix json int keys --- src/finn/qnn-data/templates/driver/driver_fifosizing.py | 6 ++++-- src/finn/transformation/fpgadataflow/make_pynq_driver.py | 3 ++- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/src/finn/qnn-data/templates/driver/driver_fifosizing.py b/src/finn/qnn-data/templates/driver/driver_fifosizing.py index 5aa116ebac..fc50314cf3 100644 --- a/src/finn/qnn-data/templates/driver/driver_fifosizing.py +++ b/src/finn/qnn-data/templates/driver/driver_fifosizing.py @@ -5,6 +5,7 @@ import sys import time from driver_instrumentation import FINNInstrumentationOverlay +from pynq import PL from pynq.pl_server.device import Device @@ -61,7 +62,7 @@ def total_fifo_size(self, depths): # Assuming FIFO SDP/AXI-Lite interfaces are ordered consistently with FIFO IDs total_size_bits = 0 for i, depth in enumerate(depths): - total_size_bits += (depth + self.fifo_depth_offset) * self.fifo_widths[i] + total_size_bits += (depth + self.fifo_depth_offset) * self.fifo_widths[str(i)] total_size_kB = total_size_bits / 8.0 / 1000.0 return total_size_kB @@ -271,6 +272,7 @@ def determine_start_depth( fifo_widths = settings["fifo_widths"] print("Programming FPGA..") + PL.reset() # reset PYNQ cache accel = FINNLiveFIFOOverlay( bitfile_name=bitfile, device=device, fclk_mhz=frequency, seed=seed, fifo_widths=fifo_widths ) @@ -351,7 +353,7 @@ def determine_start_depth( }, } for fifo, depth in enumerate(fifo_depths): - size = (depth + accel.fifo_depth_offset) * accel.fifo_widths[fifo] + size = (depth + accel.fifo_depth_offset) * accel.fifo_widths[str(fifo)] fifo_report["fifo_depths"][fifo] = depth + accel.fifo_depth_offset fifo_report["fifo_sizes"][fifo] = size with open(os.path.join(report_dir, "fifo_sizing_report.json"), "w") as f: diff --git 
a/src/finn/transformation/fpgadataflow/make_pynq_driver.py b/src/finn/transformation/fpgadataflow/make_pynq_driver.py index c18adb8d14..e7c947192a 100644 --- a/src/finn/transformation/fpgadataflow/make_pynq_driver.py +++ b/src/finn/transformation/fpgadataflow/make_pynq_driver.py @@ -350,7 +350,8 @@ def apply(self, model): fifo_widths = {} for sdp_node in model.get_nodes_by_op_type("StreamingDataflowPartition"): sdp_node_inst = getCustomOp(sdp_node) - sdp_id = sdp_node_inst.get_nodeattr("partition_id") + # JSON doesn't support int keys + sdp_id = str(sdp_node_inst.get_nodeattr("partition_id")) dataflow_model_filename = sdp_node_inst.get_nodeattr("model") kernel_model = ModelWrapper(dataflow_model_filename) for node in kernel_model.graph.node: From c0fcb10ab19a7ea3f22e132ac898d6fec8355cc6 Mon Sep 17 00:00:00 2001 From: Felix Jentzsch Date: Sun, 9 Mar 2025 11:29:14 +0000 Subject: [PATCH 076/125] Improve error propagation --- benchmarking/bench.py | 14 +++++++++++--- benchmarking/measure.py | 14 +++++++++++--- 2 files changed, 22 insertions(+), 6 deletions(-) diff --git a/benchmarking/bench.py b/benchmarking/bench.py index 41cfdbbbf7..3d0a575057 100644 --- a/benchmarking/bench.py +++ b/benchmarking/bench.py @@ -151,7 +151,7 @@ def get_default_session_options_new(): print("Run skipped") else: log_dict["status"] = "ok" - print("Run completed") + print("Run successfully completed") except Exception: log_dict["status"] = "failed" print("Run failed: " + traceback.format_exc()) @@ -159,6 +159,16 @@ def get_default_session_options_new(): log_dict["output"] = bench_object.output_dict + # examine status reported by builder (which catches all exceptions before they reach us here) + # we could also fail the pipeline if functional verification fails (TODO) + builder_log_path = os.path.join(bench_object.report_dir, "metadata_builder.json") + if os.path.isfile(builder_log_path): + with open(builder_log_path, "r") as f: + builder_log = json.load(f) + if builder_log["status"] == 
"failed": + print("Run failed (builder reported failure)") + exit_code = 1 + # log metadata of this run to its own report directory log_path = os.path.join(bench_object.report_dir, "metadata_bench.json") with open(log_path, "w") as f: @@ -169,8 +179,6 @@ def get_default_session_options_new(): # save local artifacts of this run (e.g., full build dir, detailed debug info) bench_object.save_local_artifacts_collection() - #TODO: examine verification result and builder status here to fail pipeline via exit code? - print("Stopping job") return exit_code diff --git a/benchmarking/measure.py b/benchmarking/measure.py index 3accb734b9..d0e5a64aa8 100644 --- a/benchmarking/measure.py +++ b/benchmarking/measure.py @@ -1,4 +1,5 @@ import os +import sys import subprocess import shutil @@ -6,6 +7,7 @@ if __name__ == "__main__": + exit_code = 0 print("Looking for deployment packages in artifacts..") # Find deployment packages from artifacts artifacts_in_dir = os.path.join("build_artifacts", "runs_output") @@ -24,12 +26,16 @@ # Run driver print("Running driver..") - subprocess.run(["python", f"{extract_dir}/driver/driver.py", + result = subprocess.run(["python", f"{extract_dir}/driver/driver.py", "--bitfile", f"{extract_dir}/bitfile/finn-accel.bit", "--settingsfile", f"{extract_dir}/driver/settings.json", "--reportfile", f"{extract_dir}/measured_performance.json", - ]) - print("Driver finished.") + ]) + if result.returncode != 0: + print("Driver reported error!") + exit_code = 1 + else: + print("Driver finished successfully.") # Copy results back to artifact directory for report in ["measured_performance.json", @@ -47,3 +53,5 @@ # Clear temporary dir delete_dir_contents(extract_dir) print("Done.") + print("Processed all deployment packages.") + sys.exit(exit_code) From f8bf6e7b20c8515f1126dfb493c531b26bb133a6 Mon Sep 17 00:00:00 2001 From: Felix Jentzsch Date: Tue, 11 Mar 2025 12:10:55 +0000 Subject: [PATCH 077/125] Zip debug artifacts --- benchmarking/bench_base.py | 2 +- 1 file 
changed, 1 insertion(+), 1 deletion(-) diff --git a/benchmarking/bench_base.py b/benchmarking/bench_base.py index 5f828ca4e4..9f6689dcd3 100644 --- a/benchmarking/bench_base.py +++ b/benchmarking/bench_base.py @@ -169,7 +169,7 @@ def __init__(self, params, task_id, run_id, artifacts_dir, save_dir, debug=True) if self.debug: # Save entire FINN build dir and working dir # TODO: add option to only save upon exception (in FINN builder or benchmarking infrastructure) - self.local_artifacts_collection.append(("debug_finn_tmp", os.environ["FINN_BUILD_DIR"], False)) + self.local_artifacts_collection.append(("debug_finn_tmp", os.environ["FINN_BUILD_DIR"], True)) #self.local_artifacts_collection.append(("debug_finn_cwd", os.environ["FINN_ROOT"], False)) ### SETUP ### From a63b4ae10b2ddaba82e320708762eb3e1dbe87d2 Mon Sep 17 00:00:00 2001 From: Felix Jentzsch Date: Fri, 14 Mar 2025 16:19:36 +0100 Subject: [PATCH 078/125] Fix MNV1 fifo step order --- benchmarking/dut/mobilenetv1.py | 41 ++++++++++++++++++--------------- 1 file changed, 23 insertions(+), 18 deletions(-) diff --git a/benchmarking/dut/mobilenetv1.py b/benchmarking/dut/mobilenetv1.py index 56b13a6095..06042816cf 100644 --- a/benchmarking/dut/mobilenetv1.py +++ b/benchmarking/dut/mobilenetv1.py @@ -1,4 +1,21 @@ +from bench_base import bench from qonnx.core.modelwrapper import ModelWrapper +from qonnx.transformation.change_datalayout import ChangeDataLayoutQuantAvgPool2d +from qonnx.transformation.double_to_single_float import DoubleToSingleFloat +from qonnx.transformation.general import ( + ApplyConfig, + GiveReadableTensorNames, + GiveUniqueNodeNames, +) +from qonnx.transformation.infer_data_layouts import InferDataLayouts +from qonnx.transformation.infer_datatypes import InferDataTypes +from qonnx.transformation.infer_shapes import InferShapes +from qonnx.transformation.lower_convs_to_matmul import LowerConvsToMatMul +from qonnx.transformation.remove import RemoveIdentityOps + +import 
finn.transformation.fpgadataflow.convert_to_hw_layers as to_hw +import finn.transformation.streamline.absorb as absorb +import finn.transformation.streamline.reorder as reorder from finn.builder.build_dataflow_config import ( DataflowBuildConfig, ShellFlowType, @@ -6,25 +23,8 @@ ) from finn.builder.build_dataflow_steps import verify_step from finn.transformation.streamline import Streamline -from qonnx.transformation.double_to_single_float import DoubleToSingleFloat -import finn.transformation.streamline.absorb as absorb -import finn.transformation.streamline.reorder as reorder -from qonnx.transformation.infer_data_layouts import InferDataLayouts from finn.transformation.streamline.collapse_repeated import CollapseRepeatedMul -from qonnx.transformation.remove import RemoveIdentityOps from finn.transformation.streamline.round_thresholds import RoundAndClipThresholds -from qonnx.transformation.lower_convs_to_matmul import LowerConvsToMatMul -from qonnx.transformation.general import ( - GiveReadableTensorNames, - GiveUniqueNodeNames, - ApplyConfig, -) -import finn.transformation.fpgadataflow.convert_to_hw_layers as to_hw -from qonnx.transformation.infer_shapes import InferShapes -from qonnx.transformation.change_datalayout import ChangeDataLayoutQuantAvgPool2d -from qonnx.transformation.infer_datatypes import InferDataTypes - -from bench_base import bench def step_mobilenet_streamline(model: ModelWrapper, cfg: DataflowBuildConfig): @@ -55,6 +55,7 @@ def step_mobilenet_streamline(model: ModelWrapper, cfg: DataflowBuildConfig): return model + def step_mobilenet_lower_convs(model: ModelWrapper, cfg: DataflowBuildConfig): model = model.transform(LowerConvsToMatMul()) model = model.transform(absorb.AbsorbTransposeIntoMultiThreshold()) @@ -66,6 +67,7 @@ def step_mobilenet_lower_convs(model: ModelWrapper, cfg: DataflowBuildConfig): model = model.transform(InferDataLayouts()) return model + def step_mobilenet_convert_to_hw_layers(model: ModelWrapper, cfg: 
DataflowBuildConfig): model = model.transform(to_hw.InferPool()) model = model.transform(to_hw.InferConvInpGen()) @@ -78,6 +80,7 @@ def step_mobilenet_convert_to_hw_layers(model: ModelWrapper, cfg: DataflowBuildC model = model.transform(GiveReadableTensorNames()) return model + def step_mobilenet_slr_floorplan(model: ModelWrapper, cfg: DataflowBuildConfig): if cfg.shell_flow_type == ShellFlowType.VITIS_ALVEO: try: @@ -101,6 +104,7 @@ def step_mobilenet_slr_floorplan(model: ModelWrapper, cfg: DataflowBuildConfig): print("No SLR floorplanning applied") return model + def step_mobilenet_convert_to_hw_layers_separate_th(model: ModelWrapper, cfg: DataflowBuildConfig): model = model.transform(to_hw.InferPool()) model = model.transform(to_hw.InferConvInpGen()) @@ -114,6 +118,7 @@ def step_mobilenet_convert_to_hw_layers_separate_th(model: ModelWrapper, cfg: Da model = model.transform(GiveReadableTensorNames()) return model + class bench_mobilenetv1(bench): def step_build_setup(self): # create build config for MobileNetV1 (based on finn-examples) @@ -126,9 +131,9 @@ def step_build_setup(self): "step_apply_folding_config", "step_minimize_bit_width", "step_generate_estimate_reports", + "step_set_fifo_depths", "step_hw_codegen", "step_hw_ipgen", - "step_set_fifo_depths", "step_create_stitched_ip", "step_synthesize_bitfile", "step_make_pynq_driver", From cce646dc091a61f4af8984fa05962062d27c45d6 Mon Sep 17 00:00:00 2001 From: Felix Jentzsch Date: Fri, 21 Mar 2025 11:52:18 +0100 Subject: [PATCH 079/125] Allow local test execution --- .gitignore | 5 +++ benchmarking/bench.py | 62 +++++++++++++++++++++++--------------- benchmarking/bench_base.py | 7 +++-- 3 files changed, 46 insertions(+), 28 deletions(-) diff --git a/.gitignore b/.gitignore index be61378730..f40370b443 100644 --- a/.gitignore +++ b/.gitignore @@ -96,3 +96,8 @@ MANIFEST # downloaded dep repos /deps/ + +bench_input +bench_output +bench_save +bench_work diff --git a/benchmarking/bench.py b/benchmarking/bench.py index 
3d0a575057..54788ac6a5 100644 --- a/benchmarking/bench.py +++ b/benchmarking/bench.py @@ -5,6 +5,9 @@ import time import traceback import onnxruntime as ort +import importlib + +from util import delete_dir_contents from dut.mvau import bench_mvau from dut.resnet50 import bench_resnet50 @@ -36,11 +39,36 @@ def get_default_session_options_new(): return _default_session_options ort.capi._pybind_state.get_default_session_options = get_default_session_options_new - # Gather job array info - job_id = int(os.environ["SLURM_JOB_ID"]) - #TODO: allow portable execution on any platform by making as many env vars as possible optional - print("Job launched with ID: %d" % (job_id)) try: + # Launched via SLURM, expect additional CI env vars + job_id = int(os.environ["SLURM_JOB_ID"]) + # experiment_dir = os.environ.get("EXPERIMENT_DIR") # original experiment dir (before potential copy to ramdisk) + experiment_dir = os.environ.get("CI_PROJECT_DIR") + save_dir = os.path.join(os.environ.get("LOCAL_ARTIFACT_DIR"), + "CI_" + os.environ.get("CI_PIPELINE_ID") + "_" + os.environ.get("CI_PIPELINE_NAME")) + work_dir = os.environ["PATH_WORKDIR"] + + # Gather benchmarking configs + if config_name == "manual": + config_path = os.path.join(os.environ.get("LOCAL_CFG_DIR"), os.environ.get("MANUAL_CFG_PATH")) + else: + configs_path = os.path.join(os.path.dirname(__file__), "cfg") + config_select = config_name + ".json" + config_path = os.path.join(configs_path, config_select) + print("Job launched with SLURM ID: %d" % (job_id)) + except KeyError: + # Launched without SLURM, assume test run on local machine + job_id = 0 + experiment_dir = "bench_output/" + time.strftime("%d_%H_%M") + save_dir = "bench_save/" + time.strftime("%d_%H_%M") + work_dir = "bench_work" + os.makedirs(work_dir, exist_ok=True) + delete_dir_contents(work_dir) + config_path = config_name # expect caller to provide direct path to a single config file + print("Local test job launched without SLURM") + + try: + # Launched as SLURM 
job array array_id = int(os.environ["SLURM_ARRAY_JOB_ID"]) task_id = int(os.environ["SLURM_ARRAY_TASK_ID"]) task_count = int(os.environ["SLURM_ARRAY_TASK_COUNT"]) @@ -49,36 +77,20 @@ def get_default_session_options_new(): % (array_id, task_id, task_count) ) except KeyError: + # Launched as single (SLURM or non-SLURM) job array_id = job_id task_id = 0 task_count = 1 print("Launched as single job") # Prepare result directory - # experiment_dir = os.environ.get("EXPERIMENT_DIR") # original experiment dir (before potential copy to ramdisk) - experiment_dir = os.environ.get("CI_PROJECT_DIR") - artifacts_dir = os.path.join(experiment_dir, "build_artifacts") os.makedirs(artifacts_dir, exist_ok=True) print("Collecting results in path: %s" % artifacts_dir) - - # local save dir for large artifacts (e.g., build output, tmp dir dump for debugging) - if job_id == 0: - #DEBUG mode - save_dir = experiment_dir + "_save" - else: - save_dir = os.path.join(os.environ.get("LOCAL_ARTIFACT_DIR"), - "CI_" + os.environ.get("CI_PIPELINE_ID") + "_" + os.environ.get("CI_PIPELINE_NAME")) - print("Saving additional artifacts in path: %s" % save_dir) - os.makedirs(save_dir, exist_ok=True) - # Gather benchmarking configs - if config_name == "manual": - config_path = os.path.join(os.environ.get("LOCAL_CFG_DIR"), os.environ.get("MANUAL_CFG_PATH")) - else: - configs_path = os.path.join(os.path.dirname(__file__), "cfg") - config_select = config_name + ".json" - config_path = os.path.join(configs_path, config_select) + # Prepare local save dir for large artifacts (e.g., build output, tmp dir dump for debugging) + os.makedirs(save_dir, exist_ok=True) + print("Saving additional artifacts in path: %s" % save_dir) # Load config print("Loading config %s" % (config_path)) @@ -136,7 +148,7 @@ def get_default_session_options_new(): # Create bench object for respective DUT if "dut" in params: if params["dut"] in dut: - bench_object = dut[params["dut"]](params, task_id, run_id, artifacts_dir, save_dir) + 
bench_object = dut[params["dut"]](params, task_id, run_id, work_dir, artifacts_dir, save_dir) else: print("ERROR: unknown DUT specified") return 1 diff --git a/benchmarking/bench_base.py b/benchmarking/bench_base.py index 9f6689dcd3..cc25fc7ff7 100644 --- a/benchmarking/bench_base.py +++ b/benchmarking/bench_base.py @@ -117,11 +117,12 @@ def sim_power_report(results_path, project_path, in_width, out_width, dtype_widt json_file.write(json.dumps(power_report_dict, indent=2)) class bench(): - def __init__(self, params, task_id, run_id, artifacts_dir, save_dir, debug=True): + def __init__(self, params, task_id, run_id, work_dir, artifacts_dir, save_dir, debug=True): super().__init__() self.params = params self.task_id = task_id self.run_id = run_id + self.work_dir = work_dir self.artifacts_dir = artifacts_dir self.save_dir = save_dir self.debug = debug @@ -175,7 +176,7 @@ def __init__(self, params, task_id, run_id, artifacts_dir, save_dir, debug=True) ### SETUP ### # Use a temporary dir for buildflow-related files (next to FINN_BUILD_DIR) # Ensure it exists but is empty (clear potential artifacts from previous runs) - tmp_buildflow_dir = os.path.join(os.environ["PATH_WORKDIR"], "buildflow") + tmp_buildflow_dir = os.path.join(self.work_dir, "buildflow") os.makedirs(tmp_buildflow_dir, exist_ok=True) delete_dir_contents(tmp_buildflow_dir) self.build_inputs["build_dir"] = os.path.join(tmp_buildflow_dir, "build_output") # TODO remove in favor of self.build_dir @@ -422,7 +423,7 @@ def steps_full_build_flow(self): cfg.vitis_opt_strategy=build_cfg.VitisOptStrategy.PERFORMANCE_BEST cfg.verbose = False cfg.enable_build_pdb_debug = False - cfg.stitched_ip_gen_dcp = False # only needed for further manual integration + #cfg.stitched_ip_gen_dcp = False # only needed for further manual integration cfg.force_python_rtlsim = False cfg.split_large_fifos = True cfg.enable_instrumentation = True # no IODMA functional correctness/accuracy test yet From 
2c9925d29bc9e39d0de2dbd02a8221ecd1f786ec Mon Sep 17 00:00:00 2001 From: Felix Jentzsch Date: Mon, 24 Mar 2025 09:17:48 +0100 Subject: [PATCH 080/125] Start search for start depths from 1 --- src/finn/qnn-data/templates/driver/driver_fifosizing.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/finn/qnn-data/templates/driver/driver_fifosizing.py b/src/finn/qnn-data/templates/driver/driver_fifosizing.py index fc50314cf3..ada6979db2 100644 --- a/src/finn/qnn-data/templates/driver/driver_fifosizing.py +++ b/src/finn/qnn-data/templates/driver/driver_fifosizing.py @@ -173,7 +173,7 @@ def determine_start_depth( ): ### Attempt to determine start depth for all FIFOs automatically ### # If it doesn't find a working setting, start depth must be set manually, potentially on per-FIFO basis - start_depth = 64 + start_depth = 1 last_interval = 0 start_depth_found = False From 9f3e7c73dd3d403b1b1fe51156b3c36bc2dd2e61 Mon Sep 17 00:00:00 2001 From: Felix Jentzsch Date: Mon, 24 Mar 2025 17:06:59 +0100 Subject: [PATCH 081/125] Let driver fill live FIFO sizes into complete folding config --- src/finn/builder/build_dataflow_steps.py | 53 ++++++++++++------- .../templates/driver/driver_fifosizing.py | 27 ++++++++-- .../fpgadataflow/make_pynq_driver.py | 9 +++- 3 files changed, 65 insertions(+), 24 deletions(-) diff --git a/src/finn/builder/build_dataflow_steps.py b/src/finn/builder/build_dataflow_steps.py index 5dc971cf33..6f8e1e7007 100644 --- a/src/finn/builder/build_dataflow_steps.py +++ b/src/finn/builder/build_dataflow_steps.py @@ -44,6 +44,7 @@ GiveUniqueNodeNames, RemoveStaticGraphInputs, RemoveUnusedTensors, + SortGraph, ) from qonnx.transformation.infer_data_layouts import InferDataLayouts from qonnx.transformation.infer_datatypes import InferDataTypes @@ -553,8 +554,40 @@ def step_set_fifo_depths(model: ModelWrapper, cfg: DataflowBuildConfig): `GiveUniqueNodeNames`. 
""" + hw_attrs = [ + "PE", + "SIMD", + "parallel_window", + "ram_style", + "depth", + "impl_style", + "resType", + "mem_mode", + "runtime_writeable_weights", + "inFIFODepths", + "outFIFODepths", + "depth_trigger_uram", + "depth_trigger_bram", + ] + # Experimental live FIFO-sizing, overwrites all other FIFO-related behavior if cfg.live_fifo_sizing: + # Create all DWCs and FIFOs normally + model = model.transform(InsertDWC()) + model = model.transform( + InsertFIFO(vivado_ram_style=cfg.large_fifo_mem_style, create_shallow_fifos=True) + ) + + # Clean up model + model = model.transform(SortGraph()) + model = model.transform(GiveUniqueNodeNames()) + model = model.transform(GiveReadableTensorNames()) + + # save original folding config before potentially modifying it + cfg_path = cfg.output_dir + "/report/folding_config_before_lfs.json" + extract_model_config_to_json(model, cfg_path, hw_attrs) + model.set_metadata_prop("folding_config_before_lfs", cfg_path) + # Disable runtime-writable weights, external weights, and dynamic mode, # as we don't support additional AXI-lite interfaces next to the FIFOs for node in model.graph.node: @@ -578,10 +611,6 @@ def step_set_fifo_depths(model: ModelWrapper, cfg: DataflowBuildConfig): except AttributeError: pass - # Create all DWCs and FIFOs normally - model = model.transform(InsertDWC()) - model = model.transform(InsertFIFO(create_shallow_fifos=True)) - # Specialize FIFOs to HLS back-end instead of default RTL back-end for node in model.get_nodes_by_op_type("StreamingFIFO"): node_inst = getCustomOp(node) @@ -594,6 +623,7 @@ def step_set_fifo_depths(model: ModelWrapper, cfg: DataflowBuildConfig): node_inst.set_nodeattr("impl_style", "virtual") # Clean up model + model = model.transform(SortGraph()) model = model.transform(GiveUniqueNodeNames()) model = model.transform(GiveReadableTensorNames()) @@ -659,21 +689,6 @@ def step_set_fifo_depths(model: ModelWrapper, cfg: DataflowBuildConfig): model = 
model.transform(ApplyConfig(cfg.folding_config_file)) # extract the final configuration and save it as json - hw_attrs = [ - "PE", - "SIMD", - "parallel_window", - "ram_style", - "depth", - "impl_style", - "resType", - "mem_mode", - "runtime_writeable_weights", - "inFIFODepths", - "outFIFODepths", - "depth_trigger_uram", - "depth_trigger_bram", - ] extract_model_config_to_json(model, cfg.output_dir + "/final_hw_config.json", hw_attrs) # perform FIFO splitting and shallow FIFO removal only after the final config diff --git a/src/finn/qnn-data/templates/driver/driver_fifosizing.py b/src/finn/qnn-data/templates/driver/driver_fifosizing.py index ada6979db2..1cbc5053cf 100644 --- a/src/finn/qnn-data/templates/driver/driver_fifosizing.py +++ b/src/finn/qnn-data/templates/driver/driver_fifosizing.py @@ -259,6 +259,7 @@ def determine_start_depth( settingsfile = args.settingsfile devID = args.device device = Device.devices[devID] + folding_config_lfs = None # overwrite frequency if specified in settings file if settingsfile != "": @@ -267,10 +268,15 @@ def determine_start_depth( if "fclk_mhz" in settings: frequency = settings["fclk_mhz"] - # For live FIFO-sizing, we also expect a fifo_widths.json file exported by FINN listing the width of each FIFO, e.g., - # {'fifo_widths': {0: 8, 1: 32, 2: 24}} + # For live FIFO-sizing, we also expect the FIFO widths (in bits) exported by FINN, e.g., + # {'fifo_widths': {"0": 8, "1": 32, "2": 24}} fifo_widths = settings["fifo_widths"] + # The settings can also contain the original folding config, + # into which we can insert the live FIFO sizes once we are done + if "folding_config_before_lfs" in settings: + folding_config_lfs = settings["folding_config_before_lfs"] + print("Programming FPGA..") PL.reset() # reset PYNQ cache accel = FINNLiveFIFOOverlay( @@ -362,11 +368,24 @@ def determine_start_depth( ### Generate fifo_depth_export.json to export FIFO depths for use in FINN fifo_depth_export = {} for fifo, depth in enumerate(fifo_depths): 
- fifo_depth_export["StreamingFIFO_rtl_%d" % fifo] = {} - fifo_depth_export["StreamingFIFO_rtl_%d" % fifo]["depth"] = depth + accel.fifo_depth_offset + fifo_name = "StreamingFIFO_rtl_%d" % fifo + fifo_depth_export[fifo_name] = {} + fifo_depth_export[fifo_name]["depth"] = depth + accel.fifo_depth_offset with open(os.path.join(report_dir, "fifo_depth_export.json"), "w") as f: json.dump(fifo_depth_export, f, indent=2) + # Also export directly into original folding config for convenience + if folding_config_lfs: + for key in list(folding_config_lfs.keys()): + if key.startswith("StreamingFIFO"): + fifo_name = "StreamingFIFO_rtl_%d" % int(key.removeprefix("StreamingFIFO_")) + # Rename FIFO from StreamingFIFO_* to StreamingFIFO_rtl_* + folding_config_lfs[fifo_name] = folding_config_lfs.pop(key) + folding_config_lfs[fifo_name]["depth"] = fifo_depth_export[fifo_name]["depth"] + folding_config_lfs[fifo_name]["impl_style"] = "rtl" + with open(os.path.join(report_dir, "folding_config_lfs.json"), "w") as f: + json.dump(folding_config_lfs, f, indent=2) + ### Generate the usual instrumentation performance report based on final state min_latency = log_min_latency[-1] latency = log_latency[-1] diff --git a/src/finn/transformation/fpgadataflow/make_pynq_driver.py b/src/finn/transformation/fpgadataflow/make_pynq_driver.py index e7c947192a..e065641b27 100644 --- a/src/finn/transformation/fpgadataflow/make_pynq_driver.py +++ b/src/finn/transformation/fpgadataflow/make_pynq_driver.py @@ -313,7 +313,7 @@ def __init__(self, platform, clk_period_ns, live_fifo_sizing): self.clk_period_ns = clk_period_ns self.live_fifo_sizing = live_fifo_sizing - def apply(self, model): + def apply(self, model: ModelWrapper): # TODO: support runtime-writable and external weights # TODO: support Alveo and Versal platforms @@ -359,6 +359,13 @@ def apply(self, model): node_inst = getCustomOp(node) fifo_widths[sdp_id] = node_inst.get_instream_width() settings["fifo_widths"] = fifo_widths + # export original 
folding config to settings file, + # so that the driver can generate a final cfg with live fifo sizes applied + folding_path = model.get_metadata_prop("folding_config_before_lfs") + if folding_path: + with open(folding_path, "r") as f: + folding_cfg = json.load(f) + settings["folding_config_before_lfs"] = folding_cfg settingsfile = pynq_driver_dir + "/settings.json" with open(settingsfile, "w") as f: From 7a2ff270f206ddfe4e86d4a122870d816a86f0e1 Mon Sep 17 00:00:00 2001 From: Felix Jentzsch Date: Mon, 24 Mar 2025 18:15:09 +0100 Subject: [PATCH 082/125] Generate follow-up bench cfg for lfs experiments --- benchmarking/bench_base.py | 7 +++++ benchmarking/collect.py | 34 ++++++++++++++++++++++++ src/finn/builder/build_dataflow_steps.py | 2 +- 3 files changed, 42 insertions(+), 1 deletion(-) diff --git a/benchmarking/bench_base.py b/benchmarking/bench_base.py index cc25fc7ff7..1aab18dd28 100644 --- a/benchmarking/bench_base.py +++ b/benchmarking/bench_base.py @@ -441,6 +441,7 @@ def steps_full_build_flow(self): cfg.auto_fifo_depths = False cfg.live_fifo_sizing = True cfg.enable_instrumentation = True + cfg.synth_clk_period_ns = 10 # force conservative 100 MHz clock else: cfg.auto_fifo_depths = True cfg.auto_fifo_strategy = self.params["fifo_method"] @@ -468,6 +469,12 @@ def steps_full_build_flow(self): if "floorplan_path" in self.build_inputs: cfg.floorplan_path = self.build_inputs["floorplan_path"] + if "target_fps" in self.params: + if self.params["target_fps"] == "None": + cfg.target_fps = None + else: + cfg.target_fps = self.params["target_fps"] + # Default of 1M cycles is insufficient for MetaFi (6M) and RN-50 (2.5M) # TODO: make configurable or set on pipeline level? 
os.environ["LIVENESS_THRESHOLD"] = "10000000" diff --git a/benchmarking/collect.py b/benchmarking/collect.py index 45f6073d1b..8a5bce3663 100644 --- a/benchmarking/collect.py +++ b/benchmarking/collect.py @@ -58,6 +58,12 @@ def log_nested_metrics_from_report(id, live, report_name, key_top, keys, prefix= run_ids.sort() print("Found %d runs" % len(run_ids)) + follow_up_bench_cfg = list() + # Prepare (local) output directory where follow-up bench configs will be stored + output_cfg_dir = os.path.join(os.environ.get("LOCAL_CFG_DIR_STORE"), "lfs", "CI_" + os.environ.get("CI_PIPELINE_ID")) + output_folding_dir = os.path.join(output_cfg_dir, "folding") + output_cfg_path = os.path.join(output_cfg_dir, "follow-up.json") + for id in run_ids: print("Processing run %d" % id) experiment_name = "CI_" + os.environ.get("CI_PIPELINE_ID") + "_" + str(id) @@ -212,4 +218,32 @@ def log_nested_metrics_from_report(id, live, report_name, key_top, keys, prefix= shutil.copytree(run_report_dir2, dvc_report_dir, dirs_exist_ok=True) live.log_artifact(dvc_report_dir) + # Prepare benchmarking config for follow-up runs after live FIFO-sizing + folding_config_lfs_path = os.path.join("measurement_artifacts", "runs_output", "run_%d" % (id), "reports", "folding_config_lfs.json") + if os.path.isfile(folding_config_lfs_path): + # Copy folding config produced by live FIFO-sizing + output_folding_path = os.path.join(output_folding_dir, experiment_name + ".json") + os.makedirs(output_folding_dir, exist_ok=True) + shutil.copy(folding_config_lfs_path, output_folding_path) + + # Create benchmarking config + metadata_bench = open_json_report(id, "metadata_bench.json") + configuration = dict() + for key in metadata_bench["params"]: + # wrap in list + configuration[key] = [metadata_bench["params"][key]] + # overwrite FIFO-related params + import_folding_path = os.path.join(os.environ.get("LOCAL_CFG_DIR"), "lfs", "CI_" + os.environ.get("CI_PIPELINE_ID"), "folding", experiment_name + ".json") + 
configuration["fifo_method"] = ["manual"] + configuration["target_fps"] = ["None"] + configuration["folding_path"] = [import_folding_path] + + follow_up_bench_cfg.append(configuration) + + # Save aggregated benchmarking config for follow-up job + if follow_up_bench_cfg: + print("Saving follow-up bench config for lfs: %s" % output_cfg_path) + with open(output_cfg_path, "w") as f: + json.dump(follow_up_bench_cfg, f, indent=2) + print("Done") diff --git a/src/finn/builder/build_dataflow_steps.py b/src/finn/builder/build_dataflow_steps.py index c508a2d505..7ff957af0a 100644 --- a/src/finn/builder/build_dataflow_steps.py +++ b/src/finn/builder/build_dataflow_steps.py @@ -423,7 +423,7 @@ def step_target_fps_parallelization(model: ModelWrapper, cfg: DataflowBuildConfi "depth_trigger_uram", "depth_trigger_bram", ] - extract_model_config_to_json(model, cfg.output_dir + "/auto_folding_config.json", hw_attrs) + extract_model_config_to_json(model, cfg.output_dir + "/report/auto_folding_config.json", hw_attrs) return model From ea808b25fdca1568ed3f1be6c65a49ebbbfd11ec Mon Sep 17 00:00:00 2001 From: Felix Jentzsch Date: Tue, 25 Mar 2025 13:58:05 +0100 Subject: [PATCH 083/125] Fix collection of lfs-generated folding config --- benchmarking/collect.py | 1 + benchmarking/measure.py | 1 + 2 files changed, 2 insertions(+) diff --git a/benchmarking/collect.py b/benchmarking/collect.py index 8a5bce3663..f59f3a3607 100644 --- a/benchmarking/collect.py +++ b/benchmarking/collect.py @@ -224,6 +224,7 @@ def log_nested_metrics_from_report(id, live, report_name, key_top, keys, prefix= # Copy folding config produced by live FIFO-sizing output_folding_path = os.path.join(output_folding_dir, experiment_name + ".json") os.makedirs(output_folding_dir, exist_ok=True) + print("Saving lfs-generated folding config of this run to use in a future follow-up run: %s" % output_folding_path) shutil.copy(folding_config_lfs_path, output_folding_path) # Create benchmarking config diff --git 
a/benchmarking/measure.py b/benchmarking/measure.py index d0e5a64aa8..a79632c168 100644 --- a/benchmarking/measure.py +++ b/benchmarking/measure.py @@ -42,6 +42,7 @@ "fifo_sizing_report.json", "fifo_depth_export.json", "fifo_sizing_graph.png", + "folding_config_lfs.json", ]: report_path = os.path.join(extract_dir, report) if os.path.isfile(report_path): From 15fef09eaf7c31f924c3474a960fa278f898c9fe Mon Sep 17 00:00:00 2001 From: Felix Jentzsch Date: Fri, 28 Mar 2025 15:22:58 +0100 Subject: [PATCH 084/125] Increase virtual FIFO depth offset to 8 --- src/finn/qnn-data/templates/driver/driver_fifosizing.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/finn/qnn-data/templates/driver/driver_fifosizing.py b/src/finn/qnn-data/templates/driver/driver_fifosizing.py index 1cbc5053cf..a87342f79e 100644 --- a/src/finn/qnn-data/templates/driver/driver_fifosizing.py +++ b/src/finn/qnn-data/templates/driver/driver_fifosizing.py @@ -32,8 +32,9 @@ def __init__( self.error = False self.fifo_widths = fifo_widths self.num_fifos = len(self.fifo_widths) - # Try to account for additional registers introduced by virtual FIFO HLS implementation - self.fifo_depth_offset = 4 + # Account for additional FIFO depth and implicit registers introduced by the virtual FIFO HLS implementation that are not present in real FIFOs + # This results in a minimum possible FIFO depth of 1 + 8 = 9, which should be improved in a future virtual FIFO implementation (TODO) + self.fifo_depth_offset = 8 # Sanity check # We expect 3 AXI-Lite peripherals next to the virtual FIFOs: instrumentation_wrap_0, axi_gpio_0 (for reset), zynq_ps From 5e8c888fcb562de22fffdc74ee55340393f36e30 Mon Sep 17 00:00:00 2001 From: Felix Jentzsch Date: Tue, 8 Apr 2025 08:47:40 +0200 Subject: [PATCH 085/125] Allow IODMA wrapper --- benchmarking/bench_base.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/benchmarking/bench_base.py b/benchmarking/bench_base.py index 
1aab18dd28..dc51f690ed 100644 --- a/benchmarking/bench_base.py +++ b/benchmarking/bench_base.py @@ -426,13 +426,18 @@ def steps_full_build_flow(self): #cfg.stitched_ip_gen_dcp = False # only needed for further manual integration cfg.force_python_rtlsim = False cfg.split_large_fifos = True - cfg.enable_instrumentation = True # no IODMA functional correctness/accuracy test yet cfg.save_intermediate_models = True # Save the intermediate model graphs cfg.verify_save_full_context = True, # Output full context dump for verification steps #rtlsim_use_vivado_comps # TODO ? #cfg.default_swg_exception #cfg.large_fifo_mem_style + # Switch between instrumentation or IODMA wrapper (TODO: combine both in one bitstream) + if "enable_instrumentation" in self.params: + cfg.enable_instrumentation = self.params["enable_instrumentation"] + else: + cfg.enable_instrumentation = True + # "manual or "characterize" or "largefifo_rtlsim" or "live" if "fifo_method" in self.params: if self.params["fifo_method"] == "manual": From 2687ae013d0d0fab1e7a4886934c049e18582c7d Mon Sep 17 00:00:00 2001 From: Felix Jentzsch Date: Tue, 8 Apr 2025 09:48:36 +0200 Subject: [PATCH 086/125] Parse DCP resource breakdown --- benchmarking/collect.py | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) diff --git a/benchmarking/collect.py b/benchmarking/collect.py index f59f3a3607..491c29d043 100644 --- a/benchmarking/collect.py +++ b/benchmarking/collect.py @@ -154,6 +154,31 @@ def log_nested_metrics_from_report(id, live, report_name, key_top, keys, prefix= # fifo_sizing.json log_metrics_from_report(id, live, "fifo_sizing.json", ["total_fifo_size_kB"], prefix="fifosizing/") + # stitched IP DCP synth resource report + log_nested_metrics_from_report(id, live, "post_synth_resources_dcp.json", "(top)", [ + "LUT", + "FF", + "SRL", + "DSP", + "BRAM_18K", + "BRAM_36K", + "URAM", + ], prefix="synth(dcp)/resources/") + + # stitched IP DCP synth resource breakdown + # TODO: generalize to all build flows and 
bitfile synth + layer_categories = ["MAC", "Eltwise", "Thresholding", "FIFO", "DWC", "SWG", "Other"] + for category in layer_categories: + log_nested_metrics_from_report(id, live, "res_breakdown_build_output.json", category, [ + "LUT", + "FF", + "SRL", + "DSP", + "BRAM_18K", + "BRAM_36K", + "URAM", + ], prefix="synth(dcp)/resources(breakdown)/" + category + "/") + # ooc_synth_and_timing.json (OOC synth / step_out_of_context_synthesis) log_metrics_from_report(id, live, "ooc_synth_and_timing.json", [ "LUT", From 00ec0f94ddb30a84b0005ec9fb47bdfba0479a5e Mon Sep 17 00:00:00 2001 From: Felix Jentzsch Date: Tue, 8 Apr 2025 09:52:15 +0200 Subject: [PATCH 087/125] Put pipeline and run IDs in DVC exp msg --- benchmarking/collect.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/benchmarking/collect.py b/benchmarking/collect.py index 491c29d043..6fcd3be948 100644 --- a/benchmarking/collect.py +++ b/benchmarking/collect.py @@ -67,7 +67,7 @@ def log_nested_metrics_from_report(id, live, report_name, key_top, keys, prefix= for id in run_ids: print("Processing run %d" % id) experiment_name = "CI_" + os.environ.get("CI_PIPELINE_ID") + "_" + str(id) - experiment_msg = "[CI] " + os.environ.get("CI_PIPELINE_NAME") + experiment_msg = "[CI] " + os.environ.get("CI_PIPELINE_NAME") + " (" + os.environ.get("CI_PIPELINE_ID") + "_" + str(id) + ")" #TODO: cache images once we switch to a cache provider that works with DVC Studio with Live(exp_name = experiment_name, exp_message=experiment_msg, cache_images=False) as live: ### PARAMS ### From c4f7437fd6be9354997e261fba8be51d3efd3af9 Mon Sep 17 00:00:00 2001 From: Felix Jentzsch Date: Thu, 24 Apr 2025 11:03:50 +0200 Subject: [PATCH 088/125] Validate accuracy when synthesized with IODMA wrapper --- benchmarking/bench_base.py | 3 + benchmarking/collect.py | 5 + benchmarking/measure.py | 21 ++- src/finn/builder/build_dataflow_config.py | 3 + src/finn/builder/build_dataflow_steps.py | 2 +- 
.../qnn-data/templates/driver/validate.py | 172 +++++++++++++++--- .../fpgadataflow/make_pynq_driver.py | 13 +- 7 files changed, 183 insertions(+), 36 deletions(-) diff --git a/benchmarking/bench_base.py b/benchmarking/bench_base.py index dc51f690ed..39a16dd7bc 100644 --- a/benchmarking/bench_base.py +++ b/benchmarking/bench_base.py @@ -480,6 +480,9 @@ def steps_full_build_flow(self): else: cfg.target_fps = self.params["target_fps"] + if "validation_dataset" in self.params: + cfg.validation_dataset = self.params["validation_dataset"] + # Default of 1M cycles is insufficient for MetaFi (6M) and RN-50 (2.5M) # TODO: make configurable or set on pipeline level? os.environ["LIVENESS_THRESHOLD"] = "10000000" diff --git a/benchmarking/collect.py b/benchmarking/collect.py index 6fcd3be948..81dfbe339f 100644 --- a/benchmarking/collect.py +++ b/benchmarking/collect.py @@ -213,6 +213,11 @@ def log_nested_metrics_from_report(id, live, report_name, key_top, keys, prefix= # instrumentation measurement log_all_metrics_from_report(id, live, "measured_performance.json", prefix="measurement/performance/") + # IODMA validation accuracy + log_metrics_from_report(id, live, "validation.json", [ + "top-1_accuracy", + ], prefix="measurement/validation/") + # power measurement # TODO diff --git a/benchmarking/measure.py b/benchmarking/measure.py index a79632c168..7231991bde 100644 --- a/benchmarking/measure.py +++ b/benchmarking/measure.py @@ -26,11 +26,21 @@ # Run driver print("Running driver..") - result = subprocess.run(["python", f"{extract_dir}/driver/driver.py", - "--bitfile", f"{extract_dir}/bitfile/finn-accel.bit", - "--settingsfile", f"{extract_dir}/driver/settings.json", - "--reportfile", f"{extract_dir}/measured_performance.json", - ]) + # run validate.py (from IODMA driver) if present, otherwise driver.py from instrumentation + # TODO: unify IODMA/instrumentation shell & driver + if os.path.isfile(f"{extract_dir}/driver/validate.py"): + result = subprocess.run(["python", 
f"{extract_dir}/driver/validate.py", + "--bitfile", f"{extract_dir}/bitfile/finn-accel.bit", + "--settingsfile", f"{extract_dir}/driver/settings.json", + "--reportfile", f"{extract_dir}/validation.json", + "--dataset_root", "/home/xilinx/datasets", #TODO: env var + ]) + else: + result = subprocess.run(["python", f"{extract_dir}/driver/driver.py", + "--bitfile", f"{extract_dir}/bitfile/finn-accel.bit", + "--settingsfile", f"{extract_dir}/driver/settings.json", + "--reportfile", f"{extract_dir}/measured_performance.json", + ]) if result.returncode != 0: print("Driver reported error!") exit_code = 1 @@ -43,6 +53,7 @@ "fifo_depth_export.json", "fifo_sizing_graph.png", "folding_config_lfs.json", + "validation.json", ]: report_path = os.path.join(extract_dir, report) if os.path.isfile(report_path): diff --git a/src/finn/builder/build_dataflow_config.py b/src/finn/builder/build_dataflow_config.py index a3db23a714..b2814f31ab 100644 --- a/src/finn/builder/build_dataflow_config.py +++ b/src/finn/builder/build_dataflow_config.py @@ -369,6 +369,9 @@ class DataflowBuildConfig: #: rtlsim, otherwise they will be replaced by RTL implementations. 
rtlsim_use_vivado_comps: Optional[bool] = True + #: Specify validation dataset to be used for deployment of the generated driver + validation_dataset: Optional[str] = None + def _resolve_hls_clk_period(self): if self.hls_clk_period_ns is None: # use same clk for synth and hls if not explicitly specified diff --git a/src/finn/builder/build_dataflow_steps.py b/src/finn/builder/build_dataflow_steps.py index 7ff957af0a..1bd78c7f0a 100644 --- a/src/finn/builder/build_dataflow_steps.py +++ b/src/finn/builder/build_dataflow_steps.py @@ -852,7 +852,7 @@ def step_make_pynq_driver(model: ModelWrapper, cfg: DataflowBuildConfig): ) ) else: - model = model.transform(MakePYNQDriverIODMA(cfg._resolve_driver_platform())) + model = model.transform(MakePYNQDriverIODMA(cfg._resolve_driver_platform(), cfg.validation_dataset)) shutil.copytree(model.get_metadata_prop("pynq_driver_dir"), driver_dir, dirs_exist_ok=True) print("PYNQ Python driver written into " + driver_dir) return model diff --git a/src/finn/qnn-data/templates/driver/validate.py b/src/finn/qnn-data/templates/driver/validate.py index c8bc1c009d..55e7603650 100644 --- a/src/finn/qnn-data/templates/driver/validate.py +++ b/src/finn/qnn-data/templates/driver/validate.py @@ -27,10 +27,65 @@ # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
import argparse +import os import numpy as np +from PIL import Image +from dataset_loading import FileQueue, ImgQueue +import json +from pynq import PL from driver import io_shape_dict from driver_base import FINNExampleOverlay +def img_resize(img, size): + w, h = img.size + if (w <= h and w == size) or (h <= w and h == size): + return img + if w < h: + ow = size + oh = int(size * h / w) + return img.resize((ow, oh), Image.BILINEAR) + else: + oh = size + ow = int(size * w / h) + return img.resize((ow, oh), Image.BILINEAR) + +def img_center_crop(img, size): + crop_height, crop_width = (size, size) + image_width, image_height = img.size + crop_top = int(round((image_height - crop_height) / 2.)) + crop_left = int(round((image_width - crop_width) / 2.)) + return img.crop((crop_left, crop_top, crop_left + crop_width, crop_top + crop_height)) + +def pre_process(img_np): + img = Image.fromarray(img_np.astype(np.uint8)) + img = img_resize(img, 256) + img = img_center_crop(img, 224) + img = np.array(img, dtype=np.uint8) + return img + +def setup_dataloader(val_path, label_file_path = None, batch_size=100, n_images = 50000): + if label_file_path is None: + val_folders = [ f.name for f in os.scandir(val_path) if f.is_dir() ] + val_folders = sorted(val_folders) + assert len(val_folders) == 1000, "Expected 1000 subfolders in ILSVRC2012 val" + files = [] + labels = [] + for idx, folder in enumerate(val_folders): + current_files = sorted(os.listdir(os.path.join(val_path, folder))) + current_files = [os.path.join(folder, file) for file in current_files] + files.extend(current_files) + labels.extend([idx]*len(current_files)) + files = files[:n_images] + else: + files = ['ILSVRC2012_val_{:08d}.JPEG'.format(i) for i in range(1,n_images+1)] + labels = np.loadtxt(label_file_path, dtype=int, usecols=1) + + file_queue = FileQueue() + file_queue.load_epochs(list(zip(files,labels)), shuffle=False) + img_queue = ImgQueue(maxsize=batch_size) + img_queue.start_loaders(file_queue, 
num_threads=1, img_dir=val_path, transform=pre_process) + return img_queue + if __name__ == "__main__": parser = argparse.ArgumentParser( description="Validate top-1 accuracy for FINN-generated accelerator" @@ -38,7 +93,7 @@ parser.add_argument( "--batchsize", help="number of samples for inference", type=int, default=100 ) - parser.add_argument("--dataset", help="dataset to use (mnist of cifar10)", required=True) + parser.add_argument("--dataset", help="dataset to use (mnist, cifar10, cifar100, imagenet)", default="") parser.add_argument( "--platform", help="Target platform: zynq-iodma alveo", default="zynq-iodma" ) @@ -48,14 +103,43 @@ parser.add_argument( "--dataset_root", help="dataset root dir for download/reuse", default="/tmp" ) + parser.add_argument( + "--reportfile", + help="Name of output .json report file", + type=str, + default="validation.json", + ) + parser.add_argument( + "--settingsfile", help="Name of optional input .json settings file", type=str, default="" + ) # parse arguments args = parser.parse_args() bsize = args.batchsize dataset = args.dataset bitfile = args.bitfile platform = args.platform + reportfile = args.reportfile + settingsfile = args.settingsfile dataset_root = args.dataset_root + # overwrite settings if specified in settings file + if settingsfile != "": + with open(settingsfile, "r") as f: + settings = json.load(f) + if "validation_dataset" in settings: + dataset = settings["validation_dataset"] + + # program FPGA and load driver + PL.reset() # reset PYNQ cache + driver = FINNExampleOverlay( + bitfile_name=bitfile, + platform=platform, + io_shape_dict=io_shape_dict, + batch_size=bsize, + runtime_weight_dir="runtime_weights/", + ) + + # prepare dataset if dataset == "mnist": from dataset_loading import mnist @@ -68,40 +152,72 @@ trainx, trainy, testx, testy, valx, valy = cifar.load_cifar_data( dataset_root, download=True, one_hot=False ) + elif dataset == "cifar100": + from dataset_loading import cifar + trainx, trainy, testx, 
testy, valx, valy = cifar.load_cifar_data( + dataset_root, download=True, one_hot=False, cifar10=False + ) + elif dataset == "imagenet": + val_dir = dataset_root + "/ImageNet/2012/val" + label_file = dataset_root + "/ImageNet/2012/val.txt" + img_queue = setup_dataloader(val_dir, label_file, bsize) + total = 50000 else: raise Exception("Unrecognized dataset") - test_imgs = testx - test_labels = testy - - ok = 0 - nok = 0 - total = test_imgs.shape[0] + # run accelerator on dataset + if dataset in ["mnist", "cifar10", "cifar100"]: + test_imgs = testx + test_labels = testy - driver = FINNExampleOverlay( - bitfile_name=bitfile, - platform=platform, - io_shape_dict=io_shape_dict, - batch_size=bsize, - runtime_weight_dir="runtime_weights/", - ) + ok = 0 + nok = 0 + total = test_imgs.shape[0] - n_batches = int(total / bsize) + n_batches = int(total / bsize) - test_imgs = test_imgs.reshape(n_batches, bsize, -1) - test_labels = test_labels.reshape(n_batches, bsize) + test_imgs = test_imgs.reshape(n_batches, bsize, -1) + test_labels = test_labels.reshape(n_batches, bsize) - for i in range(n_batches): - ibuf_normal = test_imgs[i].reshape(driver.ibuf_packed_device[0].shape) - exp = test_labels[i] - driver.copy_input_data_to_device(ibuf_normal) - driver.execute_on_buffers() - obuf_normal = np.empty_like(driver.obuf_packed_device[0]) - driver.copy_output_data_from_device(obuf_normal) - ret = np.bincount(obuf_normal.flatten() == exp.flatten()) - nok += ret[0] - ok += ret[1] - print("batch %d / %d : total OK %d NOK %d" % (i + 1, n_batches, ok, nok)) + print("Starting validation..") + for i in range(n_batches): + ibuf_normal = test_imgs[i].reshape(driver.ishape_normal()) + exp = test_labels[i] + obuf_normal = driver.execute(ibuf_normal) + #obuf_normal = obuf_normal.reshape(bsize, -1)[:,0] + #TODO: detect automatically if argmax is needed or output is already top-1 + obuf_normal = np.argmax(obuf_normal, axis=1) + ret = np.bincount(obuf_normal.flatten() == exp.flatten(), minlength=2) 
+ nok += ret[0] + ok += ret[1] + print("batch %d / %d : total OK %d NOK %d" % (i + 1, n_batches, ok, nok)) + elif dataset in ["imagenet"]: + ok = 0 + nok = 0 + i = 0 + print("Starting validation..") + while not img_queue.last_batch: + imgs, lbls = img_queue.get_batch(bsize, timeout=None) + imgs = np.array(imgs) + exp = np.array(lbls) + ibuf_normal = imgs.reshape(driver.ishape_normal()) + obuf_normal = driver.execute(ibuf_normal) + #obuf_normal = obuf_normal.reshape(bsize, -1)[:,0] + #TODO: detect automatically if argmax is needed or output is already top-1 + obuf_normal = np.argmax(obuf_normal, axis=1) + ret = np.bincount(obuf_normal.flatten() == exp.flatten(), minlength=2) + nok += ret[0] + ok += ret[1] + i += 1 + print("batch %d : total OK %d NOK %d" % (i, ok, nok)) + # calculate top-1 accuracy acc = 100.0 * ok / (total) print("Final accuracy: %f" % acc) + + # write report to file + report = { + "top-1_accuracy": acc, + } + with open(reportfile, "w") as f: + json.dump(report, f, indent=2) diff --git a/src/finn/transformation/fpgadataflow/make_pynq_driver.py b/src/finn/transformation/fpgadataflow/make_pynq_driver.py index 42cc017d30..c6ddfbd173 100644 --- a/src/finn/transformation/fpgadataflow/make_pynq_driver.py +++ b/src/finn/transformation/fpgadataflow/make_pynq_driver.py @@ -74,9 +74,10 @@ class MakePYNQDriverIODMA(Transformation): under the runtime_weights/ subfolder of the pynq_driver_dir. 
""" - def __init__(self, platform): + def __init__(self, platform, validation_datset): super().__init__() self.platform = platform + self.validation_datset = validation_datset def apply(self, model): # create a temporary folder for the generated driver @@ -270,8 +271,16 @@ def apply(self, model): ) shutil.copy(validate_template, validate_py) - # generate weight files for runtime-writable layers + # generate settings.json for generated driver + if self.validation_datset is not None: + settings = { + "validation_datset": self.validation_datset, + } + settingsfile = pynq_driver_dir + "/settings.json" + with open(settingsfile, "w") as f: + json.dump(settings, f, indent=2) + # generate weight files for runtime-writable layers for sdp_ind, sdp_node in enumerate(model.graph.node): assert sdp_node.op_type == "StreamingDataflowPartition" # get dataflow model From d0e33d005cb82225dfdfb98eda6b4a43210752c4 Mon Sep 17 00:00:00 2001 From: Felix Jentzsch Date: Thu, 24 Apr 2025 11:05:17 +0200 Subject: [PATCH 089/125] Update gitignore --- .gitignore | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/.gitignore b/.gitignore index f40370b443..dbac36d4f9 100644 --- a/.gitignore +++ b/.gitignore @@ -49,6 +49,9 @@ __pycache__/* .settings .idea tags +poetry.lock +*.code-workspace +.env # Package files *.egg @@ -97,6 +100,7 @@ MANIFEST # downloaded dep repos /deps/ +# local test directories for benchmarking infrastructure bench_input bench_output bench_save From 5b45cde002b350fcc919d7ada8c41931342ed0fc Mon Sep 17 00:00:00 2001 From: Felix Jentzsch Date: Fri, 25 Apr 2025 15:44:39 +0200 Subject: [PATCH 090/125] Fix typo --- src/finn/transformation/fpgadataflow/make_pynq_driver.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/finn/transformation/fpgadataflow/make_pynq_driver.py b/src/finn/transformation/fpgadataflow/make_pynq_driver.py index c6ddfbd173..6dad5dc1d8 100644 --- a/src/finn/transformation/fpgadataflow/make_pynq_driver.py +++ 
b/src/finn/transformation/fpgadataflow/make_pynq_driver.py @@ -274,7 +274,7 @@ def apply(self, model): # generate settings.json for generated driver if self.validation_datset is not None: settings = { - "validation_datset": self.validation_datset, + "validation_dataset": self.validation_datset, } settingsfile = pynq_driver_dir + "/settings.json" with open(settingsfile, "w") as f: From b5aee28630d9958d0572ac927cda8b8cb9c9e69a Mon Sep 17 00:00:00 2001 From: Felix Jentzsch Date: Mon, 28 Apr 2025 12:48:53 +0200 Subject: [PATCH 091/125] Update gitignore --- .dvc/.gitignore | 3 +++ .gitignore | 9 +++++++++ 2 files changed, 12 insertions(+) create mode 100644 .dvc/.gitignore diff --git a/.dvc/.gitignore b/.dvc/.gitignore new file mode 100644 index 0000000000..528f30c71c --- /dev/null +++ b/.dvc/.gitignore @@ -0,0 +1,3 @@ +/config.local +/tmp +/cache diff --git a/.gitignore b/.gitignore index be61378730..dbac36d4f9 100644 --- a/.gitignore +++ b/.gitignore @@ -49,6 +49,9 @@ __pycache__/* .settings .idea tags +poetry.lock +*.code-workspace +.env # Package files *.egg @@ -96,3 +99,9 @@ MANIFEST # downloaded dep repos /deps/ + +# local test directories for benchmarking infrastructure +bench_input +bench_output +bench_save +bench_work From 4f9dc7ee13006b004bc3700c354011ae38608add Mon Sep 17 00:00:00 2001 From: Felix Jentzsch Date: Mon, 28 Apr 2025 13:08:51 +0200 Subject: [PATCH 092/125] [Driver] Increase recursion limit --- src/finn/qnn-data/templates/driver/driver_fifosizing.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/src/finn/qnn-data/templates/driver/driver_fifosizing.py b/src/finn/qnn-data/templates/driver/driver_fifosizing.py index a87342f79e..e86b28772d 100644 --- a/src/finn/qnn-data/templates/driver/driver_fifosizing.py +++ b/src/finn/qnn-data/templates/driver/driver_fifosizing.py @@ -279,7 +279,12 @@ def determine_start_depth( folding_config_lfs = settings["folding_config_before_lfs"] print("Programming FPGA..") - PL.reset() # reset PYNQ 
cache + # Increase recursion limit because the default value (1000) caused pickle RecursionErrors + # during PYNQ cache handling for accelerators with many FIFOs (exact reason unknown) + sys.setrecursionlimit(10000) + # Reset PYNQ cache, without this we encountered issues where PYNQ would try to load + # an incorrect combination of .bit and .hwh file, see https://github.com/Xilinx/PYNQ/issues/1409 + PL.reset() accel = FINNLiveFIFOOverlay( bitfile_name=bitfile, device=device, fclk_mhz=frequency, seed=seed, fifo_widths=fifo_widths ) From 46995244766d2f629ae2354c3a20ea907ee958d7 Mon Sep 17 00:00:00 2001 From: Felix Jentzsch Date: Tue, 6 May 2025 20:47:20 +0200 Subject: [PATCH 093/125] [Driver] Support top1 output --- src/finn/qnn-data/templates/driver/validate.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/finn/qnn-data/templates/driver/validate.py b/src/finn/qnn-data/templates/driver/validate.py index 55e7603650..16f1e7a029 100644 --- a/src/finn/qnn-data/templates/driver/validate.py +++ b/src/finn/qnn-data/templates/driver/validate.py @@ -185,8 +185,8 @@ def setup_dataloader(val_path, label_file_path = None, batch_size=100, n_images exp = test_labels[i] obuf_normal = driver.execute(ibuf_normal) #obuf_normal = obuf_normal.reshape(bsize, -1)[:,0] - #TODO: detect automatically if argmax is needed or output is already top-1 - obuf_normal = np.argmax(obuf_normal, axis=1) + if obuf_normal.shape[1] > 1: + obuf_normal = np.argmax(obuf_normal, axis=1) ret = np.bincount(obuf_normal.flatten() == exp.flatten(), minlength=2) nok += ret[0] ok += ret[1] @@ -203,8 +203,8 @@ def setup_dataloader(val_path, label_file_path = None, batch_size=100, n_images ibuf_normal = imgs.reshape(driver.ishape_normal()) obuf_normal = driver.execute(ibuf_normal) #obuf_normal = obuf_normal.reshape(bsize, -1)[:,0] - #TODO: detect automatically if argmax is needed or output is already top-1 - obuf_normal = np.argmax(obuf_normal, axis=1) + if obuf_normal.shape[1] > 1: + 
obuf_normal = np.argmax(obuf_normal, axis=1) ret = np.bincount(obuf_normal.flatten() == exp.flatten(), minlength=2) nok += ret[0] ok += ret[1] From 215b6ca272118a7ac6fccde7d63223a0fe3b213a Mon Sep 17 00:00:00 2001 From: Felix Jentzsch Date: Mon, 19 May 2025 19:13:12 +0200 Subject: [PATCH 094/125] [CI] Fix artifact pull from parent pipeline --- benchmarking/bench-ci.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/benchmarking/bench-ci.yml b/benchmarking/bench-ci.yml index 7e9376f3cf..28b3e9d83b 100644 --- a/benchmarking/bench-ci.yml +++ b/benchmarking/bench-ci.yml @@ -19,7 +19,7 @@ FINN Build: aud: https://git.uni-paderborn.de stage: build needs: - - job: Fetch Repos + - job: Build pipeline: $PARENT_PIPELINE_ID variables: SCHEDULER_PARAMETERS: "-A $PROJECT_ACCOUNT -p $SLURM_PARTITION -t $SLURM_TIMEOUT $SLURM_QOS --nodes 1 --ntasks 1 --cpus-per-task $CPU_CORES_BENCH --exclusive --array 0-$( expr $PARALLEL_JOBS - 1 )" From ffc9fd9650155570d19d198e3be31cc5ade31ec9 Mon Sep 17 00:00:00 2001 From: Felix Jentzsch Date: Tue, 20 May 2025 13:41:59 +0200 Subject: [PATCH 095/125] Fix make driver step name --- benchmarking/dut/metafi.py | 2 +- benchmarking/dut/mobilenetv1.py | 2 +- benchmarking/dut/mvau.py | 2 +- benchmarking/dut/resnet50.py | 2 +- benchmarking/dut/transformer.py | 2 +- benchmarking/dut/vgg10.py | 2 +- 6 files changed, 6 insertions(+), 6 deletions(-) diff --git a/benchmarking/dut/metafi.py b/benchmarking/dut/metafi.py index 4c9dec2521..72912c45fc 100644 --- a/benchmarking/dut/metafi.py +++ b/benchmarking/dut/metafi.py @@ -39,7 +39,7 @@ def step_build_setup(self): "step_measure_rtlsim_performance", "step_out_of_context_synthesis", "step_synthesize_bitfile", - "step_make_pynq_driver", + "step_make_driver", "step_deployment_package", ] diff --git a/benchmarking/dut/mobilenetv1.py b/benchmarking/dut/mobilenetv1.py index 06042816cf..a3899b1382 100644 --- a/benchmarking/dut/mobilenetv1.py +++ b/benchmarking/dut/mobilenetv1.py @@ -136,7 +136,7 @@ 
def step_build_setup(self): "step_hw_ipgen", "step_create_stitched_ip", "step_synthesize_bitfile", - "step_make_pynq_driver", + "step_make_driver", "step_deployment_package", ] # mobilenet_build_steps_alveo = [ diff --git a/benchmarking/dut/mvau.py b/benchmarking/dut/mvau.py index f62c6b59a7..d67a926160 100644 --- a/benchmarking/dut/mvau.py +++ b/benchmarking/dut/mvau.py @@ -315,7 +315,7 @@ def step_build_setup(self): "step_measure_rtlsim_performance", "step_out_of_context_synthesis", "step_synthesize_bitfile", - "step_make_pynq_driver", + "step_make_driver", "step_deployment_package", ] ) diff --git a/benchmarking/dut/resnet50.py b/benchmarking/dut/resnet50.py index bf5aed8ab4..0535db7269 100644 --- a/benchmarking/dut/resnet50.py +++ b/benchmarking/dut/resnet50.py @@ -31,7 +31,7 @@ def step_build_setup(self): "step_measure_rtlsim_performance", # was not in finn-examples "step_out_of_context_synthesis", # was not in finn-examples "step_synthesize_bitfile", - "step_make_pynq_driver", + "step_make_driver", "step_deployment_package", ] diff --git a/benchmarking/dut/transformer.py b/benchmarking/dut/transformer.py index 1798ea1410..d1b14fca72 100644 --- a/benchmarking/dut/transformer.py +++ b/benchmarking/dut/transformer.py @@ -971,7 +971,7 @@ def step_build_setup(self): # "step_measure_rtlsim_performance", # not possible due to float components "step_out_of_context_synthesis", # for synthesis results (e.g. 
utilization) "step_synthesize_bitfile", - "step_make_pynq_driver", + "step_make_driver", "step_deployment_package", ] ) diff --git a/benchmarking/dut/vgg10.py b/benchmarking/dut/vgg10.py index e64a58fb2f..516d5c47de 100644 --- a/benchmarking/dut/vgg10.py +++ b/benchmarking/dut/vgg10.py @@ -41,7 +41,7 @@ def step_build_setup(self): "step_measure_rtlsim_performance", "step_out_of_context_synthesis", "step_synthesize_bitfile", - "step_make_pynq_driver", + "step_make_driver", "step_deployment_package", ] From d20b10d23366cb9eba3c856e6b0020fd4b1e2dfa Mon Sep 17 00:00:00 2001 From: Felix Jentzsch Date: Tue, 20 May 2025 14:05:33 +0200 Subject: [PATCH 096/125] Move benchmarking scripts to FINN package --- .gitlab-ci.yml | 2 +- .../finn/benchmarking}/bench-ci.yml | 9 +++--- .../finn/benchmarking}/bench.py | 22 ++++++--------- .../finn/benchmarking}/bench_base.py | 4 +-- .../finn/benchmarking}/bench_rtl_swg.py | 0 .../finn/benchmarking}/cfg/metafi_test.json | 0 .../benchmarking}/cfg/mobilenetv1_test.json | 0 .../finn/benchmarking}/cfg/mvau_test.json | 0 .../finn/benchmarking}/cfg/resnet50_test.json | 0 .../benchmarking}/cfg/synthetic_fifotest.json | 0 .../cfg/transformer_gpt_all.json | 0 .../cfg/transformer_radioml_all.json | 0 .../benchmarking}/cfg/transformer_sweep.json | 0 .../benchmarking}/cfg/transformer_test.json | 0 .../finn/benchmarking}/cfg/vgg10_test.json | 0 .../finn/benchmarking}/collect.py | 4 +-- .../finn/benchmarking}/dut/metafi.py | 2 +- .../finn/benchmarking}/dut/mobilenetv1.py | 2 +- .../finn/benchmarking}/dut/mvau.py | 2 +- .../finn/benchmarking}/dut/resnet50.py | 4 +-- .../dut/resnet50_custom_steps.py | 0 .../benchmarking}/dut/synthetic_nonlinear.py | 4 +-- .../finn/benchmarking}/dut/transformer.py | 4 +-- .../dut/transformer_custom_steps.py | 0 .../finn/benchmarking}/dut/vgg10.py | 2 +- .../finn/benchmarking}/measure.py | 2 +- .../finn/benchmarking}/templates.py | 0 .../finn/benchmarking}/util.py | 0 src/finn/interface/run_finn.py | 28 
+++++++++++++++++++ 29 files changed, 57 insertions(+), 34 deletions(-) rename {benchmarking => src/finn/benchmarking}/bench-ci.yml (88%) rename {benchmarking => src/finn/benchmarking}/bench.py (93%) rename {benchmarking => src/finn/benchmarking}/bench_base.py (98%) rename {benchmarking => src/finn/benchmarking}/bench_rtl_swg.py (100%) rename {benchmarking => src/finn/benchmarking}/cfg/metafi_test.json (100%) rename {benchmarking => src/finn/benchmarking}/cfg/mobilenetv1_test.json (100%) rename {benchmarking => src/finn/benchmarking}/cfg/mvau_test.json (100%) rename {benchmarking => src/finn/benchmarking}/cfg/resnet50_test.json (100%) rename {benchmarking => src/finn/benchmarking}/cfg/synthetic_fifotest.json (100%) rename {benchmarking => src/finn/benchmarking}/cfg/transformer_gpt_all.json (100%) rename {benchmarking => src/finn/benchmarking}/cfg/transformer_radioml_all.json (100%) rename {benchmarking => src/finn/benchmarking}/cfg/transformer_sweep.json (100%) rename {benchmarking => src/finn/benchmarking}/cfg/transformer_test.json (100%) rename {benchmarking => src/finn/benchmarking}/cfg/vgg10_test.json (100%) rename {benchmarking => src/finn/benchmarking}/collect.py (99%) rename {benchmarking => src/finn/benchmarking}/dut/metafi.py (97%) rename {benchmarking => src/finn/benchmarking}/dut/mobilenetv1.py (99%) rename {benchmarking => src/finn/benchmarking}/dut/mvau.py (99%) rename {benchmarking => src/finn/benchmarking}/dut/resnet50.py (92%) rename {benchmarking => src/finn/benchmarking}/dut/resnet50_custom_steps.py (100%) rename {benchmarking => src/finn/benchmarking}/dut/synthetic_nonlinear.py (98%) rename {benchmarking => src/finn/benchmarking}/dut/transformer.py (99%) rename {benchmarking => src/finn/benchmarking}/dut/transformer_custom_steps.py (100%) rename {benchmarking => src/finn/benchmarking}/dut/vgg10.py (97%) rename {benchmarking => src/finn/benchmarking}/measure.py (98%) rename {benchmarking => src/finn/benchmarking}/templates.py (100%) rename 
{benchmarking => src/finn/benchmarking}/util.py (100%) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 990f2758ff..4d89ef0853 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -180,7 +180,7 @@ Bench: when: never - if: $MANUAL_CFG_PATH == "" trigger: - include: benchmarking/bench-ci.yml + include: src/finn/benchmarking/bench-ci.yml strategy: depend forward: pipeline_variables: true diff --git a/benchmarking/bench-ci.yml b/src/finn/benchmarking/bench-ci.yml similarity index 88% rename from benchmarking/bench-ci.yml rename to src/finn/benchmarking/bench-ci.yml index 28b3e9d83b..9e960f8ecd 100644 --- a/benchmarking/bench-ci.yml +++ b/src/finn/benchmarking/bench-ci.yml @@ -28,11 +28,10 @@ FINN Build: script: # Launch additional monitoring - $JOB_MONITORING_DIR/monitor.sh $JOB_MONITORING_DIR/$CI_PIPELINE_ID/$HOSTNAME.log & - # Launch benchmarking script directly (TODO: deeper integration) + # Launch benchmarking script via FINN CLI, includes deps update and environment preparation - | source ./finn-plus-venv/bin/activate - finn deps update - python ./finn-plus/benchmarking/bench.py $BENCH_CFG + finn bench $BENCH_CFG cache: key: $CI_COMMIT_SHA policy: pull @@ -56,7 +55,7 @@ Measurement: - when: always script: # Run as root and activate the PYNQ venv manually to use PYNQ outside of the typical Jupyter environment - - sudo bash -c "source /etc/profile.d/pynq_venv.sh && export XILINX_XRT=/usr && python benchmarking/measure.py" + - sudo bash -c "source /etc/profile.d/pynq_venv.sh && export XILINX_XRT=/usr && python src/finn/benchmarking/measure.py" artifacts: name: "measurement_artifacts" when: always @@ -74,5 +73,5 @@ Result Collection: # Also run on failure of previous tasks to collect partial results - when: always script: - - python3.10 benchmarking/collect.py + - python3.10 src/finn/benchmarking/collect.py - dvc exp push -f -j 4 -r push git@github.com:eki-project/finn-plus.git diff --git a/benchmarking/bench.py b/src/finn/benchmarking/bench.py similarity index 93% 
rename from benchmarking/bench.py rename to src/finn/benchmarking/bench.py index 54788ac6a5..8d87036477 100644 --- a/benchmarking/bench.py +++ b/src/finn/benchmarking/bench.py @@ -7,15 +7,15 @@ import onnxruntime as ort import importlib -from util import delete_dir_contents +from finn.benchmarking.util import delete_dir_contents -from dut.mvau import bench_mvau -from dut.resnet50 import bench_resnet50 -from dut.metafi import bench_metafi -from dut.synthetic_nonlinear import bench_synthetic_nonlinear -from dut.transformer import bench_transformer -from dut.vgg10 import bench_vgg10 -from dut.mobilenetv1 import bench_mobilenetv1 +from finn.benchmarking.dut.mvau import bench_mvau +from finn.benchmarking.dut.resnet50 import bench_resnet50 +from finn.benchmarking.dut.metafi import bench_metafi +from finn.benchmarking.dut.synthetic_nonlinear import bench_synthetic_nonlinear +from finn.benchmarking.dut.transformer import bench_transformer +from finn.benchmarking.dut.vgg10 import bench_vgg10 +from finn.benchmarking.dut.mobilenetv1 import bench_mobilenetv1 dut = dict() dut["mvau"] = bench_mvau @@ -27,7 +27,7 @@ dut["mobilenetv1"] = bench_mobilenetv1 -def main(config_name): +def start_bench_run(config_name): exit_code = 0 # Attempt to work around onnxruntime issue on Slurm-managed clusters: # See https://github.com/microsoft/onnxruntime/issues/8313 @@ -193,7 +193,3 @@ def get_default_session_options_new(): print("Stopping job") return exit_code - -if __name__ == "__main__": - exit_code = main(sys.argv[1]) - sys.exit(exit_code) diff --git a/benchmarking/bench_base.py b/src/finn/benchmarking/bench_base.py similarity index 98% rename from benchmarking/bench_base.py rename to src/finn/benchmarking/bench_base.py index 39a16dd7bc..16ef757389 100644 --- a/benchmarking/bench_base.py +++ b/src/finn/benchmarking/bench_base.py @@ -27,8 +27,8 @@ from finn.transformation.fpgadataflow.make_zynq_proj import collect_ip_dirs import finn.builder.build_dataflow_config as build_cfg from 
finn.util.basic import make_build_dir, pynq_native_port_width, part_map, alveo_default_platform, alveo_part_map -from templates import template_open, template_single_test, template_sim_power, template_switching_simulation_tb, zynq_harness_template -from util import summarize_table, summarize_section, power_xml_to_dict, delete_dir_contents +from finn.benchmarking.templates import template_open, template_single_test, template_sim_power, template_switching_simulation_tb, zynq_harness_template +from finn.benchmarking.util import summarize_table, summarize_section, power_xml_to_dict, delete_dir_contents from finn.transformation.fpgadataflow.replace_verilog_relpaths import ( ReplaceVerilogRelPaths, ) diff --git a/benchmarking/bench_rtl_swg.py b/src/finn/benchmarking/bench_rtl_swg.py similarity index 100% rename from benchmarking/bench_rtl_swg.py rename to src/finn/benchmarking/bench_rtl_swg.py diff --git a/benchmarking/cfg/metafi_test.json b/src/finn/benchmarking/cfg/metafi_test.json similarity index 100% rename from benchmarking/cfg/metafi_test.json rename to src/finn/benchmarking/cfg/metafi_test.json diff --git a/benchmarking/cfg/mobilenetv1_test.json b/src/finn/benchmarking/cfg/mobilenetv1_test.json similarity index 100% rename from benchmarking/cfg/mobilenetv1_test.json rename to src/finn/benchmarking/cfg/mobilenetv1_test.json diff --git a/benchmarking/cfg/mvau_test.json b/src/finn/benchmarking/cfg/mvau_test.json similarity index 100% rename from benchmarking/cfg/mvau_test.json rename to src/finn/benchmarking/cfg/mvau_test.json diff --git a/benchmarking/cfg/resnet50_test.json b/src/finn/benchmarking/cfg/resnet50_test.json similarity index 100% rename from benchmarking/cfg/resnet50_test.json rename to src/finn/benchmarking/cfg/resnet50_test.json diff --git a/benchmarking/cfg/synthetic_fifotest.json b/src/finn/benchmarking/cfg/synthetic_fifotest.json similarity index 100% rename from benchmarking/cfg/synthetic_fifotest.json rename to 
src/finn/benchmarking/cfg/synthetic_fifotest.json diff --git a/benchmarking/cfg/transformer_gpt_all.json b/src/finn/benchmarking/cfg/transformer_gpt_all.json similarity index 100% rename from benchmarking/cfg/transformer_gpt_all.json rename to src/finn/benchmarking/cfg/transformer_gpt_all.json diff --git a/benchmarking/cfg/transformer_radioml_all.json b/src/finn/benchmarking/cfg/transformer_radioml_all.json similarity index 100% rename from benchmarking/cfg/transformer_radioml_all.json rename to src/finn/benchmarking/cfg/transformer_radioml_all.json diff --git a/benchmarking/cfg/transformer_sweep.json b/src/finn/benchmarking/cfg/transformer_sweep.json similarity index 100% rename from benchmarking/cfg/transformer_sweep.json rename to src/finn/benchmarking/cfg/transformer_sweep.json diff --git a/benchmarking/cfg/transformer_test.json b/src/finn/benchmarking/cfg/transformer_test.json similarity index 100% rename from benchmarking/cfg/transformer_test.json rename to src/finn/benchmarking/cfg/transformer_test.json diff --git a/benchmarking/cfg/vgg10_test.json b/src/finn/benchmarking/cfg/vgg10_test.json similarity index 100% rename from benchmarking/cfg/vgg10_test.json rename to src/finn/benchmarking/cfg/vgg10_test.json diff --git a/benchmarking/collect.py b/src/finn/benchmarking/collect.py similarity index 99% rename from benchmarking/collect.py rename to src/finn/benchmarking/collect.py index 81dfbe339f..fa71c2a2aa 100644 --- a/benchmarking/collect.py +++ b/src/finn/benchmarking/collect.py @@ -1,9 +1,9 @@ import json import os import shutil -from dvclive import Live +from dvclive.live import Live -from util import delete_dir_contents +from finn.benchmarking.util import delete_dir_contents def log_dvc_metric(live, prefix, name, value): diff --git a/benchmarking/dut/metafi.py b/src/finn/benchmarking/dut/metafi.py similarity index 97% rename from benchmarking/dut/metafi.py rename to src/finn/benchmarking/dut/metafi.py index 72912c45fc..05c75eee08 100644 --- 
a/benchmarking/dut/metafi.py +++ b/src/finn/benchmarking/dut/metafi.py @@ -1,6 +1,6 @@ import finn.builder.build_dataflow_config as build_cfg -from bench_base import bench +from finn.benchmarking.bench_base import bench # # custom steps # from custom_steps import ( diff --git a/benchmarking/dut/mobilenetv1.py b/src/finn/benchmarking/dut/mobilenetv1.py similarity index 99% rename from benchmarking/dut/mobilenetv1.py rename to src/finn/benchmarking/dut/mobilenetv1.py index a3899b1382..d3c0968d1a 100644 --- a/benchmarking/dut/mobilenetv1.py +++ b/src/finn/benchmarking/dut/mobilenetv1.py @@ -1,4 +1,4 @@ -from bench_base import bench +from finn.benchmarking.bench_base import bench from qonnx.core.modelwrapper import ModelWrapper from qonnx.transformation.change_datalayout import ChangeDataLayoutQuantAvgPool2d from qonnx.transformation.double_to_single_float import DoubleToSingleFloat diff --git a/benchmarking/dut/mvau.py b/src/finn/benchmarking/dut/mvau.py similarity index 99% rename from benchmarking/dut/mvau.py rename to src/finn/benchmarking/dut/mvau.py index d67a926160..8ce89fdccc 100644 --- a/benchmarking/dut/mvau.py +++ b/src/finn/benchmarking/dut/mvau.py @@ -22,7 +22,7 @@ ) import finn.builder.build_dataflow_config as build_cfg -from bench_base import bench +from finn.benchmarking.bench_base import bench class bench_mvau(bench): diff --git a/benchmarking/dut/resnet50.py b/src/finn/benchmarking/dut/resnet50.py similarity index 92% rename from benchmarking/dut/resnet50.py rename to src/finn/benchmarking/dut/resnet50.py index 0535db7269..efcd0de275 100644 --- a/benchmarking/dut/resnet50.py +++ b/src/finn/benchmarking/dut/resnet50.py @@ -1,14 +1,14 @@ import finn.builder.build_dataflow_config as build_cfg from finn.util.basic import alveo_default_platform -from dut.resnet50_custom_steps import ( +from finn.benchmarking.dut.resnet50_custom_steps import ( step_resnet50_tidy, step_resnet50_streamline, step_resnet50_convert_to_hw, step_resnet50_slr_floorplan, ) -from 
bench_base import bench +from finn.benchmarking.bench_base import bench class bench_resnet50(bench): def step_build_setup(self): diff --git a/benchmarking/dut/resnet50_custom_steps.py b/src/finn/benchmarking/dut/resnet50_custom_steps.py similarity index 100% rename from benchmarking/dut/resnet50_custom_steps.py rename to src/finn/benchmarking/dut/resnet50_custom_steps.py diff --git a/benchmarking/dut/synthetic_nonlinear.py b/src/finn/benchmarking/dut/synthetic_nonlinear.py similarity index 98% rename from benchmarking/dut/synthetic_nonlinear.py rename to src/finn/benchmarking/dut/synthetic_nonlinear.py index eb91999b2e..b912e8b319 100644 --- a/benchmarking/dut/synthetic_nonlinear.py +++ b/src/finn/benchmarking/dut/synthetic_nonlinear.py @@ -24,13 +24,13 @@ import finn.builder.build_dataflow as build import finn.builder.build_dataflow_config as build_cfg from finn.util.basic import make_build_dir -from util import summarize_table, summarize_section, power_xml_to_dict, delete_dir_contents +from finn.benchmarking.util import summarize_table, summarize_section, power_xml_to_dict, delete_dir_contents from finn.util.test import get_trained_network_and_ishape from finn.util.basic import alveo_default_platform -from bench_base import bench +from finn.benchmarking.bench_base import bench def generate_random_threshold_values( data_type, num_input_channels, num_steps, narrow=False, per_tensor=False diff --git a/benchmarking/dut/transformer.py b/src/finn/benchmarking/dut/transformer.py similarity index 99% rename from benchmarking/dut/transformer.py rename to src/finn/benchmarking/dut/transformer.py index d1b14fca72..27583ec5e1 100644 --- a/benchmarking/dut/transformer.py +++ b/src/finn/benchmarking/dut/transformer.py @@ -22,14 +22,14 @@ import finn.builder.build_dataflow_config as build_cfg from finn.builder.build_dataflow_config import AutoFIFOSizingMethod from qonnx.core.modelwrapper import ModelWrapper -from bench_base import bench +from finn.benchmarking.bench_base import 
bench # Range information structure for seeding the range analysis for converting # quantized activations to MultiThreshold from qonnx.util.range_analysis import RangeInfo # Custom build steps required to streamline and convert the attention operator -from dut.transformer_custom_steps import ( +from finn.benchmarking.dut.transformer_custom_steps import ( prepare_graph, step_streamline, step_convert_attention_to_hw, diff --git a/benchmarking/dut/transformer_custom_steps.py b/src/finn/benchmarking/dut/transformer_custom_steps.py similarity index 100% rename from benchmarking/dut/transformer_custom_steps.py rename to src/finn/benchmarking/dut/transformer_custom_steps.py diff --git a/benchmarking/dut/vgg10.py b/src/finn/benchmarking/dut/vgg10.py similarity index 97% rename from benchmarking/dut/vgg10.py rename to src/finn/benchmarking/dut/vgg10.py index 516d5c47de..f799759108 100644 --- a/benchmarking/dut/vgg10.py +++ b/src/finn/benchmarking/dut/vgg10.py @@ -5,7 +5,7 @@ import finn.transformation.fpgadataflow.convert_to_hw_layers as to_hw import finn.transformation.streamline.absorb as absorb -from bench_base import bench +from finn.benchmarking.bench_base import bench def step_pre_streamline(model: ModelWrapper, cfg: DataflowBuildConfig): diff --git a/benchmarking/measure.py b/src/finn/benchmarking/measure.py similarity index 98% rename from benchmarking/measure.py rename to src/finn/benchmarking/measure.py index 7231991bde..9a44ff3192 100644 --- a/benchmarking/measure.py +++ b/src/finn/benchmarking/measure.py @@ -3,7 +3,7 @@ import subprocess import shutil -from util import delete_dir_contents +from finn.benchmarking.util import delete_dir_contents if __name__ == "__main__": diff --git a/benchmarking/templates.py b/src/finn/benchmarking/templates.py similarity index 100% rename from benchmarking/templates.py rename to src/finn/benchmarking/templates.py diff --git a/benchmarking/util.py b/src/finn/benchmarking/util.py similarity index 100% rename from 
benchmarking/util.py rename to src/finn/benchmarking/util.py diff --git a/src/finn/interface/run_finn.py b/src/finn/interface/run_finn.py index ca5faef96d..82f71316e0 100644 --- a/src/finn/interface/run_finn.py +++ b/src/finn/interface/run_finn.py @@ -32,6 +32,7 @@ from finn.interface.manage_deps import install_pyxsi, update_dependencies from finn.interface.manage_tests import run_test +from finn.benchmarking.bench import start_bench_run # Resolves the path to modules which are not part of the FINN package hierarchy def _resolve_module_path(name: str) -> str: @@ -260,6 +261,32 @@ def run(dependency_path: str, build_path: str, num_workers: int, script: str) -> ) +@click.command(help="Run a given benchmark configuration.") +@click.option( + "--bench_config", + help="Name or path of experiment configuration file", + default="", +) +@click.option("--dependency-path", "-d", default="") +@click.option("--num-workers", "-n", default=-1, show_default=True) +@click.option( + "--build-path", + "-b", + help="Specify a build temp path of your choice", + default="", +) +def bench( + bench_config: str, dependency_path: str, num_workers: int, build_path: str +) -> None: + console = Console() + build_dir = Path(build_path).expanduser() if build_path != "" else None + dep_path = Path(dependency_path).expanduser() if dependency_path != "" else None + prepare_finn(dep_path, Path(), build_dir, num_workers, is_test_run=True) + console.rule("RUNNING BENCHMARK") + exit_code = start_bench_run(bench_config) + sys.exit(exit_code) + + @click.command(help="Run a given test. 
Uses /tmp/FINN_TMP as the temporary file location") @click.option( "--variant", @@ -385,6 +412,7 @@ def main() -> None: main_group.add_command(config) main_group.add_command(deps) main_group.add_command(build) + main_group.add_command(bench) main_group.add_command(test) main_group.add_command(run) main_group() From 33921b84df16e881db47d4ddc16b3d9615528f63 Mon Sep 17 00:00:00 2001 From: Felix Jentzsch Date: Tue, 20 May 2025 14:13:25 +0200 Subject: [PATCH 097/125] Fix early import --- src/finn/interface/run_finn.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/src/finn/interface/run_finn.py b/src/finn/interface/run_finn.py index 82f71316e0..3661b414ab 100644 --- a/src/finn/interface/run_finn.py +++ b/src/finn/interface/run_finn.py @@ -32,7 +32,6 @@ from finn.interface.manage_deps import install_pyxsi, update_dependencies from finn.interface.manage_tests import run_test -from finn.benchmarking.bench import start_bench_run # Resolves the path to modules which are not part of the FINN package hierarchy def _resolve_module_path(name: str) -> str: @@ -275,14 +274,16 @@ def run(dependency_path: str, build_path: str, num_workers: int, script: str) -> help="Specify a build temp path of your choice", default="", ) -def bench( - bench_config: str, dependency_path: str, num_workers: int, build_path: str -) -> None: +def bench(bench_config: str, dependency_path: str, num_workers: int, build_path: str) -> None: console = Console() build_dir = Path(build_path).expanduser() if build_path != "" else None dep_path = Path(dependency_path).expanduser() if dependency_path != "" else None prepare_finn(dep_path, Path(), build_dir, num_workers, is_test_run=True) console.rule("RUNNING BENCHMARK") + + # Late import because we need prepare_finn to setup remaining dependencies first + from finn.benchmarking.bench import start_bench_run + exit_code = start_bench_run(bench_config) sys.exit(exit_code) From a49d003cb67023e8370522e6315906dc63cd0201 Mon Sep 17 00:00:00 
2001 From: Felix Jentzsch Date: Tue, 20 May 2025 14:29:57 +0200 Subject: [PATCH 098/125] Introduce custom step library --- src/finn/benchmarking/dut/mobilenetv1.py | 119 +----------------- src/finn/benchmarking/dut/vgg10.py | 18 --- .../builder/custom_step_library/__init__.py | 0 .../builder/custom_step_library/conv1d.py | 18 +++ .../builder/custom_step_library/mobilenet.py | 119 ++++++++++++++++++ .../custom_step_library/resnet.py} | 0 .../custom_step_library/transformer.py} | 0 7 files changed, 138 insertions(+), 136 deletions(-) create mode 100644 src/finn/builder/custom_step_library/__init__.py create mode 100644 src/finn/builder/custom_step_library/conv1d.py create mode 100644 src/finn/builder/custom_step_library/mobilenet.py rename src/finn/{benchmarking/dut/resnet50_custom_steps.py => builder/custom_step_library/resnet.py} (100%) rename src/finn/{benchmarking/dut/transformer_custom_steps.py => builder/custom_step_library/transformer.py} (100%) diff --git a/src/finn/benchmarking/dut/mobilenetv1.py b/src/finn/benchmarking/dut/mobilenetv1.py index d3c0968d1a..efcfb7b521 100644 --- a/src/finn/benchmarking/dut/mobilenetv1.py +++ b/src/finn/benchmarking/dut/mobilenetv1.py @@ -1,122 +1,5 @@ from finn.benchmarking.bench_base import bench -from qonnx.core.modelwrapper import ModelWrapper -from qonnx.transformation.change_datalayout import ChangeDataLayoutQuantAvgPool2d -from qonnx.transformation.double_to_single_float import DoubleToSingleFloat -from qonnx.transformation.general import ( - ApplyConfig, - GiveReadableTensorNames, - GiveUniqueNodeNames, -) -from qonnx.transformation.infer_data_layouts import InferDataLayouts -from qonnx.transformation.infer_datatypes import InferDataTypes -from qonnx.transformation.infer_shapes import InferShapes -from qonnx.transformation.lower_convs_to_matmul import LowerConvsToMatMul -from qonnx.transformation.remove import RemoveIdentityOps - -import finn.transformation.fpgadataflow.convert_to_hw_layers as to_hw -import 
finn.transformation.streamline.absorb as absorb -import finn.transformation.streamline.reorder as reorder -from finn.builder.build_dataflow_config import ( - DataflowBuildConfig, - ShellFlowType, - VerificationStepType, -) -from finn.builder.build_dataflow_steps import verify_step -from finn.transformation.streamline import Streamline -from finn.transformation.streamline.collapse_repeated import CollapseRepeatedMul -from finn.transformation.streamline.round_thresholds import RoundAndClipThresholds - - -def step_mobilenet_streamline(model: ModelWrapper, cfg: DataflowBuildConfig): - model = model.transform(Streamline()) - additional_streamline_transformations = [ - DoubleToSingleFloat(), - reorder.MoveMulPastDWConv(), - absorb.AbsorbMulIntoMultiThreshold(), - ChangeDataLayoutQuantAvgPool2d(), - InferDataLayouts(), - reorder.MoveTransposePastScalarMul(), - absorb.AbsorbTransposeIntoFlatten(), - reorder.MoveFlattenPastAffine(), - reorder.MoveFlattenPastTopK(), - reorder.MoveScalarMulPastMatMul(), - CollapseRepeatedMul(), - RemoveIdentityOps(), - RoundAndClipThresholds(), - ] - for trn in additional_streamline_transformations: - model = model.transform(trn) - model = model.transform(GiveUniqueNodeNames()) - model = model.transform(GiveReadableTensorNames()) - model = model.transform(InferDataTypes()) - - if VerificationStepType.STREAMLINED_PYTHON in cfg._resolve_verification_steps(): - verify_step(model, cfg, "streamlined_python", need_parent=False) - - return model - - -def step_mobilenet_lower_convs(model: ModelWrapper, cfg: DataflowBuildConfig): - model = model.transform(LowerConvsToMatMul()) - model = model.transform(absorb.AbsorbTransposeIntoMultiThreshold()) - model = model.transform(absorb.AbsorbConsecutiveTransposes()) - model = model.transform(GiveUniqueNodeNames()) - model = model.transform(GiveReadableTensorNames()) - model = model.transform(InferDataTypes()) - model = model.transform(RoundAndClipThresholds()) - model = model.transform(InferDataLayouts()) - 
return model - - -def step_mobilenet_convert_to_hw_layers(model: ModelWrapper, cfg: DataflowBuildConfig): - model = model.transform(to_hw.InferPool()) - model = model.transform(to_hw.InferConvInpGen()) - model = model.transform(to_hw.InferVectorVectorActivation()) - model = model.transform(to_hw.InferQuantizedMatrixVectorActivation()) - model = model.transform(to_hw.InferChannelwiseLinearLayer()) - model = model.transform(to_hw.InferLabelSelectLayer()) - model = model.transform(InferShapes()) - model = model.transform(GiveUniqueNodeNames()) - model = model.transform(GiveReadableTensorNames()) - return model - - -def step_mobilenet_slr_floorplan(model: ModelWrapper, cfg: DataflowBuildConfig): - if cfg.shell_flow_type == ShellFlowType.VITIS_ALVEO: - try: - from finnexperimental.analysis.partitioning import partition - - # apply partitioning of the model, restricting the first and last layers - # to SLR0 - default_slr = 0 - abs_anchors = [(0, [default_slr]), (-1, [default_slr])] - floorplan = partition( - model, - cfg.synth_clk_period_ns, - cfg.board, - abs_anchors=abs_anchors, - multivariant=False, - )[0] - # apply floorplan to model - model = model.transform(ApplyConfig(floorplan)) - print("SLR floorplanning applied") - except Exception: - print("No SLR floorplanning applied") - return model - - -def step_mobilenet_convert_to_hw_layers_separate_th(model: ModelWrapper, cfg: DataflowBuildConfig): - model = model.transform(to_hw.InferPool()) - model = model.transform(to_hw.InferConvInpGen()) - model = model.transform(to_hw.InferThresholdingLayer()) - model = model.transform(to_hw.InferVectorVectorActivation()) - model = model.transform(to_hw.InferQuantizedMatrixVectorActivation()) - model = model.transform(to_hw.InferChannelwiseLinearLayer()) - model = model.transform(to_hw.InferLabelSelectLayer()) - model = model.transform(InferShapes()) - model = model.transform(GiveUniqueNodeNames()) - model = model.transform(GiveReadableTensorNames()) - return model +from 
finn.builder.build_dataflow_config import DataflowBuildConfig class bench_mobilenetv1(bench): diff --git a/src/finn/benchmarking/dut/vgg10.py b/src/finn/benchmarking/dut/vgg10.py index f799759108..d34c186387 100644 --- a/src/finn/benchmarking/dut/vgg10.py +++ b/src/finn/benchmarking/dut/vgg10.py @@ -1,24 +1,6 @@ -from qonnx.core.modelwrapper import ModelWrapper from finn.builder.build_dataflow_config import DataflowBuildConfig -from qonnx.transformation.change_3d_tensors_to_4d import Change3DTo4DTensors -from qonnx.transformation.general import GiveUniqueNodeNames -import finn.transformation.fpgadataflow.convert_to_hw_layers as to_hw -import finn.transformation.streamline.absorb as absorb - from finn.benchmarking.bench_base import bench - -def step_pre_streamline(model: ModelWrapper, cfg: DataflowBuildConfig): - model = model.transform(Change3DTo4DTensors()) - model = model.transform(absorb.AbsorbScalarMulAddIntoTopK()) - return model - -def step_convert_final_layers(model: ModelWrapper, cfg: DataflowBuildConfig): - model = model.transform(to_hw.InferChannelwiseLinearLayer()) - model = model.transform(to_hw.InferLabelSelectLayer()) - model = model.transform(GiveUniqueNodeNames()) - return model - class bench_vgg10(bench): def step_build_setup(self): # create build config for VGG-10 (based on finn-examples) diff --git a/src/finn/builder/custom_step_library/__init__.py b/src/finn/builder/custom_step_library/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/src/finn/builder/custom_step_library/conv1d.py b/src/finn/builder/custom_step_library/conv1d.py new file mode 100644 index 0000000000..5545f66536 --- /dev/null +++ b/src/finn/builder/custom_step_library/conv1d.py @@ -0,0 +1,18 @@ +from qonnx.core.modelwrapper import ModelWrapper +from finn.builder.build_dataflow_config import DataflowBuildConfig +from qonnx.transformation.change_3d_tensors_to_4d import Change3DTo4DTensors +from qonnx.transformation.general import GiveUniqueNodeNames +import 
finn.transformation.fpgadataflow.convert_to_hw_layers as to_hw +import finn.transformation.streamline.absorb as absorb + + +def step_pre_streamline(model: ModelWrapper, cfg: DataflowBuildConfig): + model = model.transform(Change3DTo4DTensors()) + model = model.transform(absorb.AbsorbScalarMulAddIntoTopK()) + return model + +def step_convert_final_layers(model: ModelWrapper, cfg: DataflowBuildConfig): + model = model.transform(to_hw.InferChannelwiseLinearLayer()) + model = model.transform(to_hw.InferLabelSelectLayer()) + model = model.transform(GiveUniqueNodeNames()) + return model diff --git a/src/finn/builder/custom_step_library/mobilenet.py b/src/finn/builder/custom_step_library/mobilenet.py new file mode 100644 index 0000000000..6a2d8053b2 --- /dev/null +++ b/src/finn/builder/custom_step_library/mobilenet.py @@ -0,0 +1,119 @@ +from finn.benchmarking.bench_base import bench +from qonnx.core.modelwrapper import ModelWrapper +from qonnx.transformation.change_datalayout import ChangeDataLayoutQuantAvgPool2d +from qonnx.transformation.double_to_single_float import DoubleToSingleFloat +from qonnx.transformation.general import ( + ApplyConfig, + GiveReadableTensorNames, + GiveUniqueNodeNames, +) +from qonnx.transformation.infer_data_layouts import InferDataLayouts +from qonnx.transformation.infer_datatypes import InferDataTypes +from qonnx.transformation.infer_shapes import InferShapes +from qonnx.transformation.lower_convs_to_matmul import LowerConvsToMatMul +from qonnx.transformation.remove import RemoveIdentityOps + +import finn.transformation.fpgadataflow.convert_to_hw_layers as to_hw +import finn.transformation.streamline.absorb as absorb +import finn.transformation.streamline.reorder as reorder +from finn.builder.build_dataflow_config import ( + DataflowBuildConfig, + ShellFlowType, + VerificationStepType, +) +from finn.builder.build_dataflow_steps import verify_step +from finn.transformation.streamline import Streamline +from 
finn.transformation.streamline.collapse_repeated import CollapseRepeatedMul +from finn.transformation.streamline.round_thresholds import RoundAndClipThresholds + + +def step_mobilenet_streamline(model: ModelWrapper, cfg: DataflowBuildConfig): + model = model.transform(Streamline()) + additional_streamline_transformations = [ + DoubleToSingleFloat(), + reorder.MoveMulPastDWConv(), + absorb.AbsorbMulIntoMultiThreshold(), + ChangeDataLayoutQuantAvgPool2d(), + InferDataLayouts(), + reorder.MoveTransposePastScalarMul(), + absorb.AbsorbTransposeIntoFlatten(), + reorder.MoveFlattenPastAffine(), + reorder.MoveFlattenPastTopK(), + reorder.MoveScalarMulPastMatMul(), + CollapseRepeatedMul(), + RemoveIdentityOps(), + RoundAndClipThresholds(), + ] + for trn in additional_streamline_transformations: + model = model.transform(trn) + model = model.transform(GiveUniqueNodeNames()) + model = model.transform(GiveReadableTensorNames()) + model = model.transform(InferDataTypes()) + + if VerificationStepType.STREAMLINED_PYTHON in cfg._resolve_verification_steps(): + verify_step(model, cfg, "streamlined_python", need_parent=False) + + return model + + +def step_mobilenet_lower_convs(model: ModelWrapper, cfg: DataflowBuildConfig): + model = model.transform(LowerConvsToMatMul()) + model = model.transform(absorb.AbsorbTransposeIntoMultiThreshold()) + model = model.transform(absorb.AbsorbConsecutiveTransposes()) + model = model.transform(GiveUniqueNodeNames()) + model = model.transform(GiveReadableTensorNames()) + model = model.transform(InferDataTypes()) + model = model.transform(RoundAndClipThresholds()) + model = model.transform(InferDataLayouts()) + return model + + +def step_mobilenet_convert_to_hw_layers(model: ModelWrapper, cfg: DataflowBuildConfig): + model = model.transform(to_hw.InferPool()) + model = model.transform(to_hw.InferConvInpGen()) + model = model.transform(to_hw.InferVectorVectorActivation()) + model = model.transform(to_hw.InferQuantizedMatrixVectorActivation()) + model 
= model.transform(to_hw.InferChannelwiseLinearLayer()) + model = model.transform(to_hw.InferLabelSelectLayer()) + model = model.transform(InferShapes()) + model = model.transform(GiveUniqueNodeNames()) + model = model.transform(GiveReadableTensorNames()) + return model + + +def step_mobilenet_slr_floorplan(model: ModelWrapper, cfg: DataflowBuildConfig): + if cfg.shell_flow_type == ShellFlowType.VITIS_ALVEO: + try: + from finnexperimental.analysis.partitioning import partition + + # apply partitioning of the model, restricting the first and last layers + # to SLR0 + default_slr = 0 + abs_anchors = [(0, [default_slr]), (-1, [default_slr])] + floorplan = partition( + model, + cfg.synth_clk_period_ns, + cfg.board, + abs_anchors=abs_anchors, + multivariant=False, + )[0] + # apply floorplan to model + model = model.transform(ApplyConfig(floorplan)) + print("SLR floorplanning applied") + except Exception: + print("No SLR floorplanning applied") + return model + + +def step_mobilenet_convert_to_hw_layers_separate_th(model: ModelWrapper, cfg: DataflowBuildConfig): + model = model.transform(to_hw.InferPool()) + model = model.transform(to_hw.InferConvInpGen()) + model = model.transform(to_hw.InferThresholdingLayer()) + model = model.transform(to_hw.InferVectorVectorActivation()) + model = model.transform(to_hw.InferQuantizedMatrixVectorActivation()) + model = model.transform(to_hw.InferChannelwiseLinearLayer()) + model = model.transform(to_hw.InferLabelSelectLayer()) + model = model.transform(InferShapes()) + model = model.transform(GiveUniqueNodeNames()) + model = model.transform(GiveReadableTensorNames()) + return model \ No newline at end of file diff --git a/src/finn/benchmarking/dut/resnet50_custom_steps.py b/src/finn/builder/custom_step_library/resnet.py similarity index 100% rename from src/finn/benchmarking/dut/resnet50_custom_steps.py rename to src/finn/builder/custom_step_library/resnet.py diff --git a/src/finn/benchmarking/dut/transformer_custom_steps.py 
b/src/finn/builder/custom_step_library/transformer.py similarity index 100% rename from src/finn/benchmarking/dut/transformer_custom_steps.py rename to src/finn/builder/custom_step_library/transformer.py From cfdb04239a53227aa284b5716226650f18e68e3b Mon Sep 17 00:00:00 2001 From: Felix Jentzsch Date: Tue, 20 May 2025 17:23:35 +0200 Subject: [PATCH 099/125] Switch to YAML-based build config --- src/finn/benchmarking/bench.py | 21 ++-- src/finn/benchmarking/bench_base.py | 119 +++++++----------- src/finn/benchmarking/cfg/metafi_test.json | 14 --- src/finn/benchmarking/cfg/metafi_test.yml | 14 +++ .../benchmarking/cfg/mobilenetv1_test.json | 32 ----- .../benchmarking/cfg/mobilenetv1_test.yml | 31 +++++ .../cfg/{mvau_test.json => mvau_test.yml} | 2 +- src/finn/benchmarking/cfg/resnet50_test.json | 33 ----- src/finn/benchmarking/cfg/resnet50_test.yml | 33 +++++ ...c_fifotest.json => synthetic_fifotest.yml} | 28 +++-- ...r_gpt_all.json => transformer_gpt_all.yml} | 4 +- ...l_all.json => transformer_radioml_all.yml} | 8 +- ...ormer_sweep.json => transformer_sweep.yml} | 20 +-- ...sformer_test.json => transformer_test.yml} | 4 +- src/finn/benchmarking/cfg/vgg10_test.json | 32 ----- src/finn/benchmarking/cfg/vgg10_test.yml | 33 +++++ src/finn/benchmarking/dut/metafi.py | 61 --------- src/finn/benchmarking/dut/metafi.yml | 28 +++++ src/finn/benchmarking/dut/mobilenetv1.py | 48 ------- src/finn/benchmarking/dut/mobilenetv1.yml | 16 +++ src/finn/benchmarking/dut/resnet50.py | 42 ------- src/finn/benchmarking/dut/resnet50.yml | 19 +++ src/finn/benchmarking/dut/transformer.py | 2 +- src/finn/benchmarking/dut/vgg10.py | 35 ------ src/finn/benchmarking/dut/vgg10.yml | 23 ++++ src/finn/interface/run_finn.py | 2 +- 26 files changed, 280 insertions(+), 424 deletions(-) delete mode 100644 src/finn/benchmarking/cfg/metafi_test.json create mode 100644 src/finn/benchmarking/cfg/metafi_test.yml delete mode 100644 src/finn/benchmarking/cfg/mobilenetv1_test.json create mode 100644 
src/finn/benchmarking/cfg/mobilenetv1_test.yml rename src/finn/benchmarking/cfg/{mvau_test.json => mvau_test.yml} (75%) delete mode 100644 src/finn/benchmarking/cfg/resnet50_test.json create mode 100644 src/finn/benchmarking/cfg/resnet50_test.yml rename src/finn/benchmarking/cfg/{synthetic_fifotest.json => synthetic_fifotest.yml} (57%) rename src/finn/benchmarking/cfg/{transformer_gpt_all.json => transformer_gpt_all.yml} (72%) rename src/finn/benchmarking/cfg/{transformer_radioml_all.json => transformer_radioml_all.yml} (57%) rename src/finn/benchmarking/cfg/{transformer_sweep.json => transformer_sweep.yml} (82%) rename src/finn/benchmarking/cfg/{transformer_test.json => transformer_test.yml} (77%) delete mode 100644 src/finn/benchmarking/cfg/vgg10_test.json create mode 100644 src/finn/benchmarking/cfg/vgg10_test.yml delete mode 100644 src/finn/benchmarking/dut/metafi.py create mode 100644 src/finn/benchmarking/dut/metafi.yml delete mode 100644 src/finn/benchmarking/dut/mobilenetv1.py create mode 100644 src/finn/benchmarking/dut/mobilenetv1.yml delete mode 100644 src/finn/benchmarking/dut/resnet50.py create mode 100644 src/finn/benchmarking/dut/resnet50.yml delete mode 100644 src/finn/benchmarking/dut/vgg10.py create mode 100644 src/finn/benchmarking/dut/vgg10.yml diff --git a/src/finn/benchmarking/bench.py b/src/finn/benchmarking/bench.py index 8d87036477..745d6c62b2 100644 --- a/src/finn/benchmarking/bench.py +++ b/src/finn/benchmarking/bench.py @@ -1,30 +1,24 @@ import itertools -import sys import os import json +import yaml import time import traceback import onnxruntime as ort -import importlib from finn.benchmarking.util import delete_dir_contents +from finn.benchmarking.bench_base import bench from finn.benchmarking.dut.mvau import bench_mvau -from finn.benchmarking.dut.resnet50 import bench_resnet50 -from finn.benchmarking.dut.metafi import bench_metafi from finn.benchmarking.dut.synthetic_nonlinear import bench_synthetic_nonlinear from 
finn.benchmarking.dut.transformer import bench_transformer -from finn.benchmarking.dut.vgg10 import bench_vgg10 -from finn.benchmarking.dut.mobilenetv1 import bench_mobilenetv1 + +# Register custom bench subclasses that offer more control than YAML-based flow dut = dict() dut["mvau"] = bench_mvau -dut["resnet50"] = bench_resnet50 -dut["metafi"] = bench_metafi dut["synthetic_nonlinear"] = bench_synthetic_nonlinear dut["transformer"] = bench_transformer -dut["vgg10"] = bench_vgg10 -dut["mobilenetv1"] = bench_mobilenetv1 def start_bench_run(config_name): @@ -96,7 +90,7 @@ def get_default_session_options_new(): print("Loading config %s" % (config_path)) if os.path.exists(config_path): with open(config_path, "r") as f: - config = json.load(f) + config = yaml.load(f, Loader=yaml.SafeLoader) else: print("ERROR: config file not found") return @@ -150,8 +144,9 @@ def get_default_session_options_new(): if params["dut"] in dut: bench_object = dut[params["dut"]](params, task_id, run_id, work_dir, artifacts_dir, save_dir) else: - print("ERROR: unknown DUT specified") - return 1 + # If no custom bench subclass is defined, fall back to base class, + # expect DUT-specific YAML definition instead + bench_object = bench(params, task_id, run_id, work_dir, artifacts_dir, save_dir) else: print("ERROR: no DUT specified") return 1 diff --git a/src/finn/benchmarking/bench_base.py b/src/finn/benchmarking/bench_base.py index 16ef757389..dc1b40cee2 100644 --- a/src/finn/benchmarking/bench_base.py +++ b/src/finn/benchmarking/bench_base.py @@ -3,6 +3,7 @@ import subprocess import copy import json +import yaml import time import traceback import glob @@ -130,7 +131,14 @@ def __init__(self, params, task_id, run_id, work_dir, artifacts_dir, save_dir, d #TODO: setup a logger so output can go to console (with task id prefix) and log simultaneously #TODO: coordinate with new builder loggin setup - # General configuration + # Setup some basic global default configuration + # TODO: are these class 
members even used anymore? + if "synth_clk_period_ns" in params: + self.clock_period_ns = params["synth_clk_period_ns"] + else: + self.clock_period_ns = 10 + self.params["synth_clk_period_ns"] = self.clock_period_ns + # TODO: do not allow multiple targets in a single bench job due to measurement? if "board" in params: self.board = params["board"] @@ -144,12 +152,12 @@ def __init__(self, params, task_id, run_id, work_dir, artifacts_dir, save_dir, d self.part = part_map[self.board] else: raise Exception("No part specified for board %s" % self.board) - - if "clock_period_ns" in params: - self.clock_period_ns = params["clock_period_ns"] + + if self.board in alveo_part_map: + self.params["shell_flow_type"] = build_cfg.ShellFlowType.VITIS_ALVEO + self.params["vitis_platform"] = alveo_default_platform[self.board] else: - self.clock_period_ns = 10 - self.params["clock_period_ns"] = self.clock_period_ns + self.params["shell_flow_type"] = build_cfg.ShellFlowType.VIVADO_ZYNQ # Clear FINN tmp build dir before every run (to avoid excessive ramdisk usage and duplicate debug artifacts) print("Clearing FINN BUILD DIR ahead of run") @@ -214,14 +222,20 @@ def save_local_artifacts_collection(self): for (name, source_path, archive) in self.local_artifacts_collection: target_path = os.path.join(self.save_dir, name, "run_%d" % (self.run_id)) self.save_artifact(target_path, source_path, archive) - + # must be defined by subclass def step_export_onnx(self): pass - # must be defined by subclass + # can be overwritten by subclass if setup is too complex for YAML definition def step_build_setup(self): - pass + dut_yaml_name = self.params["dut"] + ".yml" + dut_path = os.path.join(os.path.dirname(__file__), "dut", dut_yaml_name) + if os.path.isfile(dut_path): + with open(dut_path, "r") as f: + return DataflowBuildConfig.from_yaml(f) + else: + raise Exception("No DUT-specific YAML build definition found") # defaults to normal build flow, may be overwritten by subclass def run(self): @@ -381,6 
+395,13 @@ def step_parse_builder_output(self, build_dir): def steps_full_build_flow(self): # Default step sequence for benchmarking a full FINN builder flow + ### LIST OF ADDITIONAL YAML OPTIONS (beyond DataflowBuildConfig) + custom_params = [ + "model_dir", # used to setup onnx/npy input + "model_path", # used to setup onnx/npy input + # model-gen parameters, such as seed, simd, pe, etc. (TODO: separate from builder options) + ] + ### MODEL CREATION/IMPORT ### # TODO: track fixed input onnx models with DVC if "model_dir" in self.params: @@ -398,26 +419,12 @@ def steps_full_build_flow(self): # microbenchmarks might skip because no valid model can be generated for given params return "skipped" - if "folding_path" in self.params: - self.build_inputs["folding_path"] = self.params["folding_path"] - if "specialize_path" in self.params: - self.build_inputs["specialize_path"] = self.params["specialize_path"] - if "floorplan_path" in self.params: - self.build_inputs["floorplan_path"] = self.params["floorplan_path"] - ### BUILD SETUP ### - # TODO: convert to YAML-based builder config - # TODO: split up into default config, dut-specific config, and run-specific config + # Initialize from YAML (default) or custom script (if dedicated subclass is defined) cfg = self.step_build_setup() - cfg.generate_outputs = self.params["output_products"] + + # Set some global defaults (could still be overwritten by run-specific YAML) cfg.output_dir = self.build_inputs["build_dir"] - cfg.synth_clk_period_ns = self.clock_period_ns - cfg.board = self.board - if self.board in alveo_part_map: - cfg.shell_flow_type=build_cfg.ShellFlowType.VITIS_ALVEO - cfg.vitis_platform=alveo_default_platform[self.board] - else: - cfg.shell_flow_type=build_cfg.ShellFlowType.VIVADO_ZYNQ # enable extra performance optimizations (physopt) # TODO: check OMX synth strategy again! 
cfg.vitis_opt_strategy=build_cfg.VitisOptStrategy.PERFORMANCE_BEST @@ -427,61 +434,21 @@ def steps_full_build_flow(self): cfg.force_python_rtlsim = False cfg.split_large_fifos = True cfg.save_intermediate_models = True # Save the intermediate model graphs - cfg.verify_save_full_context = True, # Output full context dump for verification steps + cfg.verify_save_full_context = True # Output full context dump for verification steps + cfg.enable_instrumentation = True #rtlsim_use_vivado_comps # TODO ? #cfg.default_swg_exception #cfg.large_fifo_mem_style - # Switch between instrumentation or IODMA wrapper (TODO: combine both in one bitstream) - if "enable_instrumentation" in self.params: - cfg.enable_instrumentation = self.params["enable_instrumentation"] - else: - cfg.enable_instrumentation = True - - # "manual or "characterize" or "largefifo_rtlsim" or "live" - if "fifo_method" in self.params: - if self.params["fifo_method"] == "manual": - cfg.auto_fifo_depths = False - elif self.params["fifo_method"] == "live": - cfg.auto_fifo_depths = False - cfg.live_fifo_sizing = True - cfg.enable_instrumentation = True - cfg.synth_clk_period_ns = 10 # force conservative 100 MHz clock + # Overwrite build config settings with run-specific YAML build definition + for key in self.params: + if hasattr(cfg, key): + setattr(cfg, key, self.params[key]) else: - cfg.auto_fifo_depths = True - cfg.auto_fifo_strategy = self.params["fifo_method"] - # only relevant for "characterize" method: "rtlsim" or "analytical" - if "fifo_strategy" in self.params: - cfg.characteristic_function_strategy = self.params["fifo_strategy"] - - # Batch size used for RTLSim performance measurement (and in-depth FIFO test here) - # TODO: determine automatically or replace by exact instr wrapper sim - if "rtlsim_n" in self.params: - cfg.rtlsim_batch_size=self.params["rtlsim_n"] - - # Batch size used for FIFO sizing (largefifo_rtlsim only) - if "fifo_rtlsim_n" in self.params: - 
cfg.fifosim_n_inferences=self.params["fifo_rtlsim_n"] - - # Manual correction factor for FIFO-Sim input throttling - if "fifo_throttle_factor" in self.params: - cfg.fifo_throttle_factor = self.params["fifo_throttle_factor"] - - if "folding_path" in self.build_inputs: - cfg.folding_config_file = self.build_inputs["folding_path"] - if "specialize_path" in self.build_inputs: - cfg.specialize_layers_config_file = self.build_inputs["specialize_path"] - if "floorplan_path" in self.build_inputs: - cfg.floorplan_path = self.build_inputs["floorplan_path"] - - if "target_fps" in self.params: - if self.params["target_fps"] == "None": - cfg.target_fps = None - else: - cfg.target_fps = self.params["target_fps"] - - if "validation_dataset" in self.params: - cfg.validation_dataset = self.params["validation_dataset"] + if key not in custom_params: + pass + #TODO: be more strict? support custom extra options like MetaFi uses? + #raise Exception("Unrecognized builder config defined in YAML: %s" % key) # Default of 1M cycles is insufficient for MetaFi (6M) and RN-50 (2.5M) # TODO: make configurable or set on pipeline level? 
diff --git a/src/finn/benchmarking/cfg/metafi_test.json b/src/finn/benchmarking/cfg/metafi_test.json deleted file mode 100644 index bc10f857c3..0000000000 --- a/src/finn/benchmarking/cfg/metafi_test.json +++ /dev/null @@ -1,14 +0,0 @@ -[ - { - "dut": ["metafi"], - "model_path": ["/pc2/groups/hpc-prf-ekiapp/felix/ci_models/metafi/model.onnx"], - "folding_path": ["/pc2/groups/hpc-prf-ekiapp/felix/ci_models/metafi/auto_folding_config_metaFi_f25.json"], - - "board": ["RFSoC2x2"], - "clock_period_ns": [10], - - "fifo_method": ["live"], - - "output_products": [["bitfile", "pynq_driver", "deployment_package"]] - } - ] \ No newline at end of file diff --git a/src/finn/benchmarking/cfg/metafi_test.yml b/src/finn/benchmarking/cfg/metafi_test.yml new file mode 100644 index 0000000000..711250bbdb --- /dev/null +++ b/src/finn/benchmarking/cfg/metafi_test.yml @@ -0,0 +1,14 @@ +[ + { + "dut": ["metafi"], + "model_path": ["/pc2/groups/hpc-prf-ekiapp/felix/ci_models/metafi/model.onnx"], + "folding_config_file": ["/pc2/groups/hpc-prf-ekiapp/felix/ci_models/metafi/auto_folding_config_metaFi_f25.json"], + + "board": ["RFSoC2x2"], + "synth_clk_period_ns": [10], + + "live_fifo_sizing": [True], + + "generate_outputs": [["bitfile", "pynq_driver", "deployment_package"]] + } + ] \ No newline at end of file diff --git a/src/finn/benchmarking/cfg/mobilenetv1_test.json b/src/finn/benchmarking/cfg/mobilenetv1_test.json deleted file mode 100644 index d080638722..0000000000 --- a/src/finn/benchmarking/cfg/mobilenetv1_test.json +++ /dev/null @@ -1,32 +0,0 @@ -[ - { - "dut": ["mobilenetv1"], - - "model_path": ["/pc2/groups/hpc-prf-ekiapp/felix/ci_models/MobileNetV1/mobilenetv1-w4a4_pre_post_tidy_opset-11.onnx"], - "folding_path": ["/pc2/groups/hpc-prf-ekiapp/felix/ci_models/MobileNetV1/ZCU102_folding_config.json"], - "specialize_path": ["/pc2/groups/hpc-prf-ekiapp/felix/ci_models/MobileNetV1/ZCU102_specialize_layers.json"], - - "board": ["RFSoC2x2"], - "clock_period_ns": [10], - - "fifo_method": 
["manual"], - - "rtlsim_n": [5], - "output_products": [["estimate_reports", "rtlsim_performance", "stitched_ip", "out_of_context_synth", "bitfile", "pynq_driver", "deployment_package"]] - }, - { - "dut": ["mobilenetv1"], - - "model_path": ["/pc2/groups/hpc-prf-ekiapp/felix/ci_models/MobileNetV1/mobilenetv1-w4a4_pre_post_tidy_opset-11.onnx"], - "folding_path": ["/pc2/groups/hpc-prf-ekiapp/felix/ci_models/MobileNetV1/ZCU102_folding_config.json"], - "specialize_path": ["/pc2/groups/hpc-prf-ekiapp/felix/ci_models/MobileNetV1/ZCU102_specialize_layers.json"], - - "board": ["RFSoC2x2"], - "clock_period_ns": [10], - - "fifo_method": ["live"], - - "rtlsim_n": [5], - "output_products": [["bitfile", "pynq_driver", "deployment_package"]] - } -] \ No newline at end of file diff --git a/src/finn/benchmarking/cfg/mobilenetv1_test.yml b/src/finn/benchmarking/cfg/mobilenetv1_test.yml new file mode 100644 index 0000000000..040fa380e4 --- /dev/null +++ b/src/finn/benchmarking/cfg/mobilenetv1_test.yml @@ -0,0 +1,31 @@ +[ + { + "dut": ["mobilenetv1"], + + "model_path": ["/pc2/groups/hpc-prf-ekiapp/felix/ci_models/MobileNetV1/mobilenetv1-w4a4_pre_post_tidy_opset-11.onnx"], + "folding_config_file": ["/pc2/groups/hpc-prf-ekiapp/felix/ci_models/MobileNetV1/ZCU102_folding_config.json"], + "specialize_layers_config_file": ["/pc2/groups/hpc-prf-ekiapp/felix/ci_models/MobileNetV1/ZCU102_specialize_layers.json"], + + "board": ["RFSoC2x2"], + "synth_clk_period_ns": [10], + + "auto_fifo_depths": [False], + + "rtlsim_batch_sizauto_fifo_depths": [["estimate_reports", "rtlsim_performance", "stitched_ip", "out_of_context_synth", "bitfile", "pynq_driver", "deployment_package"]] + }, + { + "dut": ["mobilenetv1"], + + "model_path": ["/pc2/groups/hpc-prf-ekiapp/felix/ci_models/MobileNetV1/mobilenetv1-w4a4_pre_post_tidy_opset-11.onnx"], + "folding_config_file": ["/pc2/groups/hpc-prf-ekiapp/felix/ci_models/MobileNetV1/ZCU102_folding_config.json"], + "specialize_layers_config_file": 
["/pc2/groups/hpc-prf-ekiapp/felix/ci_models/MobileNetV1/ZCU102_specialize_layers.json"], + + "board": ["RFSoC2x2"], + "synth_clk_period_ns": [10], + + "live_fifo_sizing": [True], + + "rtlsim_batch_size": [5], + "generate_outputs": [["bitfile", "pynq_driver", "deployment_package"]] + } +] \ No newline at end of file diff --git a/src/finn/benchmarking/cfg/mvau_test.json b/src/finn/benchmarking/cfg/mvau_test.yml similarity index 75% rename from src/finn/benchmarking/cfg/mvau_test.json rename to src/finn/benchmarking/cfg/mvau_test.yml index c42b16782c..7e0b3d14d2 100644 --- a/src/finn/benchmarking/cfg/mvau_test.json +++ b/src/finn/benchmarking/cfg/mvau_test.yml @@ -21,6 +21,6 @@ "dut_duplication": [1], - "output_products": [["estimate_reports", "stitched_ip", "rtlsim_performance", "out_of_context_synth", "bitfile", "pynq_driver", "deployment_package"]] + "generate_outputs": [["estimate_reports", "stitched_ip", "rtlsim_performance", "out_of_context_synth", "bitfile", "pynq_driver", "deployment_package"]] } ] diff --git a/src/finn/benchmarking/cfg/resnet50_test.json b/src/finn/benchmarking/cfg/resnet50_test.json deleted file mode 100644 index 06a96729ab..0000000000 --- a/src/finn/benchmarking/cfg/resnet50_test.json +++ /dev/null @@ -1,33 +0,0 @@ -[ - { - "dut": ["resnet50"], - - "model_path": ["/pc2/groups/hpc-prf-ekiapp/felix/ci_models/ResNet-50/resnet50_w1a2_exported.onnx"], - "folding_path": ["/pc2/groups/hpc-prf-ekiapp/felix/ci_models/ResNet-50/U250_folding_config.json"], - "specialize_path": ["/pc2/groups/hpc-prf-ekiapp/felix/ci_models/ResNet-50/U250_specialize_layers.json"], - "floorplan_path": ["/pc2/groups/hpc-prf-ekiapp/felix/ci_models/ResNet-50/floorplan_resnet50.json"], - - "board": ["U250"], - "clock_period_ns": [4], - - "fifo_method": ["manual"], - - "rtlsim_n": [5], - "output_products": [["estimate_reports", "rtlsim_performance", "stitched_ip", "out_of_context_synth"]] - }, - { - "dut": ["resnet50"], - - "model_path": 
["/pc2/groups/hpc-prf-ekiapp/felix/ci_models/ResNet-50/resnet50_w1a2_exported.onnx"], - "folding_path": ["/pc2/groups/hpc-prf-ekiapp/felix/ci_models/ResNet-50/U250_folding_config.json"], - "specialize_path": ["/pc2/groups/hpc-prf-ekiapp/felix/ci_models/ResNet-50/U250_specialize_layers.json"], - "floorplan_path": ["/pc2/groups/hpc-prf-ekiapp/felix/ci_models/ResNet-50/floorplan_resnet50.json"], - - "board": ["RFSoC2x2"], - "clock_period_ns": [10], - - "fifo_method": ["live"], - - "output_products": [["bitfile", "pynq_driver", "deployment_package"]] - } - ] \ No newline at end of file diff --git a/src/finn/benchmarking/cfg/resnet50_test.yml b/src/finn/benchmarking/cfg/resnet50_test.yml new file mode 100644 index 0000000000..e3acf9fa7d --- /dev/null +++ b/src/finn/benchmarking/cfg/resnet50_test.yml @@ -0,0 +1,33 @@ +[ + { + "dut": ["resnet50"], + + "model_path": ["/pc2/groups/hpc-prf-ekiapp/felix/ci_models/ResNet-50/resnet50_w1a2_exported.onnx"], + "folding_config_file": ["/pc2/groups/hpc-prf-ekiapp/felix/ci_models/ResNet-50/U250_folding_config.json"], + "specialize_layers_config_file": ["/pc2/groups/hpc-prf-ekiapp/felix/ci_models/ResNet-50/U250_specialize_layers.json"], + "floorplan_path": ["/pc2/groups/hpc-prf-ekiapp/felix/ci_models/ResNet-50/floorplan_resnet50.json"], + + "board": ["U250"], + "synth_clk_period_ns": [4], + + "auto_fifo_depths": [False], + + "rtlsim_batch_size": [5], + "generate_outputs": [["estimate_reports", "rtlsim_performance", "stitched_ip", "out_of_context_synth"]] + }, + { + "dut": ["resnet50"], + + "model_path": ["/pc2/groups/hpc-prf-ekiapp/felix/ci_models/ResNet-50/resnet50_w1a2_exported.onnx"], + "folding_config_file": ["/pc2/groups/hpc-prf-ekiapp/felix/ci_models/ResNet-50/U250_folding_config.json"], + "specialize_layers_config_file": ["/pc2/groups/hpc-prf-ekiapp/felix/ci_models/ResNet-50/U250_specialize_layers.json"], + "floorplan_path": ["/pc2/groups/hpc-prf-ekiapp/felix/ci_models/ResNet-50/floorplan_resnet50.json"], + + "board": 
["RFSoC2x2"], + "synth_clk_period_ns": [10], + + "live_fifo_sizing": [True], + + "generate_outputs": [["bitfile", "pynq_driver", "deployment_package"]] + } + ] \ No newline at end of file diff --git a/src/finn/benchmarking/cfg/synthetic_fifotest.json b/src/finn/benchmarking/cfg/synthetic_fifotest.yml similarity index 57% rename from src/finn/benchmarking/cfg/synthetic_fifotest.json rename to src/finn/benchmarking/cfg/synthetic_fifotest.yml index 7e362200af..58a49d108d 100644 --- a/src/finn/benchmarking/cfg/synthetic_fifotest.json +++ b/src/finn/benchmarking/cfg/synthetic_fifotest.yml @@ -12,11 +12,11 @@ "rb_num_layers": [4], "board": ["RFSoC2x2"], - "clock_period_ns": [10], + "synth_clk_period_ns": [10], "rtlsim_n": [5], - "fifo_method": ["live"], + "live_fifo_sizing": [True], "output_products": [["bitfile", "pynq_driver", "deployment_package"]] }, { @@ -32,13 +32,15 @@ "rb_num_layers": [4], "board": ["RFSoC2x2"], - "clock_period_ns": [10], + "synth_clk_period_ns": [10], - "rtlsim_n": [5], + "rtlsim_batch_size": [5], + + "auto_fifo_depths": [True], + "auto_fifo_strategy": ["characterize"], + "characteristic_function_strategy": ["analytical", "rtlsim"], - "fifo_method": ["characterize"], - "fifo_strategy": ["analytical", "rtlsim"], - "output_products": [["stitched_ip", "rtlsim_performance", "bitfile", "pynq_driver", "deployment_package"]] + "generate_outputs": [["stitched_ip", "rtlsim_performance", "bitfile", "pynq_driver", "deployment_package"]] }, { "dut": ["synthetic_nonlinear"], @@ -53,12 +55,14 @@ "rb_num_layers": [4], "board": ["RFSoC2x2"], - "clock_period_ns": [10], + "synth_clk_period_ns": [10], - "rtlsim_n": [5], + "rtlsim_batch_size": [5], + + "auto_fifo_depths": [True], + "auto_fifo_strategy": ["largefifo_rtlsim"], - "fifo_method": ["largefifo_rtlsim"], - "fifo_rtlsim_n": [2], - "output_products": [["stitched_ip", "rtlsim_performance", "bitfile", "pynq_driver", "deployment_package"]] + "fifosim_n_inferences": [2], + "generate_outputs": [["stitched_ip", 
"rtlsim_performance", "bitfile", "pynq_driver", "deployment_package"]] } ] \ No newline at end of file diff --git a/src/finn/benchmarking/cfg/transformer_gpt_all.json b/src/finn/benchmarking/cfg/transformer_gpt_all.yml similarity index 72% rename from src/finn/benchmarking/cfg/transformer_gpt_all.json rename to src/finn/benchmarking/cfg/transformer_gpt_all.yml index b0b70fb0aa..e0610c3d7e 100644 --- a/src/finn/benchmarking/cfg/transformer_gpt_all.json +++ b/src/finn/benchmarking/cfg/transformer_gpt_all.yml @@ -5,8 +5,8 @@ "model_dir": ["/pc2/groups/hpc-prf-ekiapp/felix/ci_models/gpt_a", "/pc2/groups/hpc-prf-ekiapp/felix/ci_models/gpt_b", "/pc2/groups/hpc-prf-ekiapp/felix/ci_models/gpt_c", "/pc2/groups/hpc-prf-ekiapp/felix/ci_models/gpt_d"], "board": ["U280"], - "clock_period_ns": [10], + "synth_clk_period_ns": [10], - "output_products": [["estimate_reports", "stitched_ip", "out_of_context_synth"]] + "generate_outputs": [["estimate_reports", "stitched_ip", "out_of_context_synth"]] } ] diff --git a/src/finn/benchmarking/cfg/transformer_radioml_all.json b/src/finn/benchmarking/cfg/transformer_radioml_all.yml similarity index 57% rename from src/finn/benchmarking/cfg/transformer_radioml_all.json rename to src/finn/benchmarking/cfg/transformer_radioml_all.yml index 5eeea031b2..dede0988c8 100644 --- a/src/finn/benchmarking/cfg/transformer_radioml_all.json +++ b/src/finn/benchmarking/cfg/transformer_radioml_all.yml @@ -5,9 +5,9 @@ "model_dir": ["/pc2/groups/hpc-prf-ekiapp/felix/ci_models/radioml_0"], "board": ["RFSoC2x2"], - "clock_period_ns": [10], + "synth_clk_period_ns": [10], - "output_products": [["estimate_reports", "bitfile", "pynq_driver", "deployment_package"]] + "generate_outputs": [["estimate_reports", "bitfile", "pynq_driver", "deployment_package"]] }, { "dut": ["transformer"], @@ -15,8 +15,8 @@ "model_dir": ["/pc2/groups/hpc-prf-ekiapp/felix/ci_models/radioml_convformer"], "board": ["RFSoC2x2"], - "clock_period_ns": [10], + "synth_clk_period_ns": [10], - 
"output_products": [["estimate_reports", "bitfile", "pynq_driver", "deployment_package"]] + "generate_outputs": [["estimate_reports", "bitfile", "pynq_driver", "deployment_package"]] } ] \ No newline at end of file diff --git a/src/finn/benchmarking/cfg/transformer_sweep.json b/src/finn/benchmarking/cfg/transformer_sweep.yml similarity index 82% rename from src/finn/benchmarking/cfg/transformer_sweep.json rename to src/finn/benchmarking/cfg/transformer_sweep.yml index e1795ff3f8..7fa9420d01 100644 --- a/src/finn/benchmarking/cfg/transformer_sweep.json +++ b/src/finn/benchmarking/cfg/transformer_sweep.yml @@ -14,9 +14,7 @@ "model_bits": [2], "model_norm": ["none"], "model_mask": ["none"], - "model_positional_encoding": ["binary"], - - "dut_duplication": [1] + "model_positional_encoding": ["binary"] }, { "dut": ["transformer"], @@ -33,9 +31,7 @@ "model_bits": [2], "model_norm": ["none"], "model_mask": ["none"], - "model_positional_encoding": ["binary"], - - "dut_duplication": [1] + "model_positional_encoding": ["binary"] }, { "dut": ["transformer"], @@ -52,9 +48,7 @@ "model_bits": [2], "model_norm": ["none"], "model_mask": ["none"], - "model_positional_encoding": ["binary"], - - "dut_duplication": [1] + "model_positional_encoding": ["binary"] }, { "dut": ["transformer"], @@ -71,9 +65,7 @@ "model_bits": [2], "model_norm": ["none"], "model_mask": ["none"], - "model_positional_encoding": ["binary"], - - "dut_duplication": [1] + "model_positional_encoding": ["binary"] }, { "dut": ["transformer"], @@ -90,8 +82,6 @@ "model_bits": [2, 4, 6, 8], "model_norm": ["none"], "model_mask": ["none"], - "model_positional_encoding": ["binary"], - - "dut_duplication": [1] + "model_positional_encoding": ["binary"] } ] diff --git a/src/finn/benchmarking/cfg/transformer_test.json b/src/finn/benchmarking/cfg/transformer_test.yml similarity index 77% rename from src/finn/benchmarking/cfg/transformer_test.json rename to src/finn/benchmarking/cfg/transformer_test.yml index 
e0fcbc160d..a529981fdc 100644 --- a/src/finn/benchmarking/cfg/transformer_test.json +++ b/src/finn/benchmarking/cfg/transformer_test.yml @@ -17,8 +17,8 @@ "model_positional_encoding": ["binary"], "board": ["RFSoC2x2"], - "clock_period_ns": [10], + "synth_clk_period_ns": [10], - "output_products": [["estimate_reports", "bitfile", "pynq_driver", "deployment_package"]] + "generate_outputs": [["estimate_reports", "bitfile", "pynq_driver", "deployment_package"]] } ] diff --git a/src/finn/benchmarking/cfg/vgg10_test.json b/src/finn/benchmarking/cfg/vgg10_test.json deleted file mode 100644 index 7a6e1a5deb..0000000000 --- a/src/finn/benchmarking/cfg/vgg10_test.json +++ /dev/null @@ -1,32 +0,0 @@ -[ - { - "dut": ["vgg10"], - - "model_path": ["/pc2/groups/hpc-prf-ekiapp/felix/ci_models/VGG-10/radioml_w4a4_small_tidy.onnx"], - "folding_path": ["/pc2/groups/hpc-prf-ekiapp/felix/ci_models/VGG-10/ZCU104_folding_config.json"], - "specialize_path": ["/pc2/groups/hpc-prf-ekiapp/felix/ci_models/VGG-10/ZCU104_specialize_layers.json"], - - "board": ["RFSoC2x2"], - "clock_period_ns": [10], - - "fifo_method": ["largefifo_rtlsim"], - - "rtlsim_n": [5], - "output_products": [["estimate_reports", "rtlsim_performance", "stitched_ip", "out_of_context_synth", "bitfile", "pynq_driver", "deployment_package"]] - }, - { - "dut": ["vgg10"], - - "model_path": ["/pc2/groups/hpc-prf-ekiapp/felix/ci_models/VGG-10/radioml_w4a4_small_tidy.onnx"], - "folding_path": ["/pc2/groups/hpc-prf-ekiapp/felix/ci_models/VGG-10/ZCU104_folding_config.json"], - "specialize_path": ["/pc2/groups/hpc-prf-ekiapp/felix/ci_models/VGG-10/ZCU104_specialize_layers.json"], - - "board": ["RFSoC2x2"], - "clock_period_ns": [10], - - "fifo_method": ["live"], - - "rtlsim_n": [5], - "output_products": [["bitfile", "pynq_driver", "deployment_package"]] - } -] \ No newline at end of file diff --git a/src/finn/benchmarking/cfg/vgg10_test.yml b/src/finn/benchmarking/cfg/vgg10_test.yml new file mode 100644 index 0000000000..e16122b130 
--- /dev/null +++ b/src/finn/benchmarking/cfg/vgg10_test.yml @@ -0,0 +1,33 @@ +[ + { + "dut": ["vgg10"], + + "model_path": ["/pc2/groups/hpc-prf-ekiapp/felix/ci_models/VGG-10/radioml_w4a4_small_tidy.onnx"], + "folding_config_file": ["/pc2/groups/hpc-prf-ekiapp/felix/ci_models/VGG-10/ZCU104_folding_config.json"], + "specialize_layers_config_file": ["/pc2/groups/hpc-prf-ekiapp/felix/ci_models/VGG-10/ZCU104_specialize_layers.json"], + + "board": ["RFSoC2x2"], + "synth_clk_period_ns": [10], + + "auto_fifo_depths": [True], + "auto_fifo_strategy": ["largefifo_rtlsim"], + + "rtlsim_batch_size": [5], + "generate_outputs": [["estimate_reports", "rtlsim_performance", "stitched_ip", "out_of_context_synth", "bitfile", "pynq_driver", "deployment_package"]] + }, + { + "dut": ["vgg10"], + + "model_path": ["/pc2/groups/hpc-prf-ekiapp/felix/ci_models/VGG-10/radioml_w4a4_small_tidy.onnx"], + "folding_config_file": ["/pc2/groups/hpc-prf-ekiapp/felix/ci_models/VGG-10/ZCU104_folding_config.json"], + "specialize_layers_config_file": ["/pc2/groups/hpc-prf-ekiapp/felix/ci_models/VGG-10/ZCU104_specialize_layers.json"], + + "board": ["RFSoC2x2"], + "synth_clk_period_ns": [10], + + "live_fifo_sizing": [True], + + "rtlsim_batch_size": [5], + "generate_outputs": [["bitfile", "pynq_driver", "deployment_package"]] + } +] \ No newline at end of file diff --git a/src/finn/benchmarking/dut/metafi.py b/src/finn/benchmarking/dut/metafi.py deleted file mode 100644 index 05c75eee08..0000000000 --- a/src/finn/benchmarking/dut/metafi.py +++ /dev/null @@ -1,61 +0,0 @@ -import finn.builder.build_dataflow_config as build_cfg - -from finn.benchmarking.bench_base import bench - -# # custom steps -# from custom_steps import ( -# step_extract_absorb_bias, -# step_pre_streamline, -# step_residual_convert_to_hw, -# step_residual_streamline, -# step_residual_tidy, -# step_residual_topo, -# step_set_preferred_impl_style, -# step_convert_final_layers -# ) - -class bench_metafi(bench): - def step_build_setup(self): - 
# create build config for MetaFi models - - steps = [ - # step_residual_tidy, - # step_extract_absorb_bias, - # step_residual_topo, - # step_pre_streamline, - # step_residual_streamline, - # step_residual_convert_to_hw, - "step_create_dataflow_partition", - # step_set_preferred_impl_style, - "step_specialize_layers", - "step_target_fps_parallelization", - "step_apply_folding_config", - "step_minimize_bit_width", - "step_generate_estimate_reports", - "step_set_fifo_depths", - "step_hw_codegen", - "step_hw_ipgen", - "step_create_stitched_ip", - "step_measure_rtlsim_performance", - "step_out_of_context_synthesis", - "step_synthesize_bitfile", - "step_make_driver", - "step_deployment_package", - ] - - cfg = build_cfg.DataflowBuildConfig( - steps=steps, - target_fps=None, #23 - # folding_config_file=folding_config_file, - # folding_config_file="/home/rz/project/finn-examples/build/vgg10-radioml/folding_config/auto_folding_config.json", - # specialize_layers_config_file = "output_%s_%s" % (model_name, release_platform_name) + "/template_specialize_layers_config.json", - # specialize_layers_config_file = "/home/rz/project/finn-examples/build/vgg10-radioml/specialize_layers_config/template_specialize_layers_config.json", - - #large_fifo_mem_style=build_cfg.LargeFIFOMemStyle.AUTO, - # standalone_thresholds=True, - ) - - # where is this used and why? 
- cfg.use_conv_rtl = True, # use rtl for conv layers (MVAU cannot use rtl in our model) - - return cfg \ No newline at end of file diff --git a/src/finn/benchmarking/dut/metafi.yml b/src/finn/benchmarking/dut/metafi.yml new file mode 100644 index 0000000000..d3ea2c69ff --- /dev/null +++ b/src/finn/benchmarking/dut/metafi.yml @@ -0,0 +1,28 @@ +steps: + - # step_residual_tidy + - # step_extract_absorb_bias + - # step_residual_topo + - # step_pre_streamline + - # step_residual_streamline + - # step_residual_convert_to_hw + - step_create_dataflow_partition + - # step_set_preferred_impl_style + - step_specialize_layers + - step_target_fps_parallelization + - step_apply_folding_config + - step_minimize_bit_width + - step_generate_estimate_reports + - step_set_fifo_depths + - step_hw_codegen + - step_hw_ipgen + - step_create_stitched_ip + - step_measure_rtlsim_performance + - step_out_of_context_synthesis + - step_synthesize_bitfile + - step_make_driver + - step_deployment_package + +target_fps: null # 23 + +#TODO: where is this used and why? 
+use_conv_rtl: True # use rtl for conv layers (MVAU cannot use rtl in our model) diff --git a/src/finn/benchmarking/dut/mobilenetv1.py b/src/finn/benchmarking/dut/mobilenetv1.py deleted file mode 100644 index efcfb7b521..0000000000 --- a/src/finn/benchmarking/dut/mobilenetv1.py +++ /dev/null @@ -1,48 +0,0 @@ -from finn.benchmarking.bench_base import bench -from finn.builder.build_dataflow_config import DataflowBuildConfig - - -class bench_mobilenetv1(bench): - def step_build_setup(self): - # create build config for MobileNetV1 (based on finn-examples) - mobilenet_build_steps = [ - step_mobilenet_streamline, - step_mobilenet_lower_convs, - step_mobilenet_convert_to_hw_layers_separate_th, - "step_create_dataflow_partition", - "step_specialize_layers", - "step_apply_folding_config", - "step_minimize_bit_width", - "step_generate_estimate_reports", - "step_set_fifo_depths", - "step_hw_codegen", - "step_hw_ipgen", - "step_create_stitched_ip", - "step_synthesize_bitfile", - "step_make_driver", - "step_deployment_package", - ] - # mobilenet_build_steps_alveo = [ - # step_mobilenet_streamline, - # step_mobilenet_lower_convs, - # step_mobilenet_convert_to_hw_layers, - # "step_create_dataflow_partition", - # "step_specialize_layers", - # "step_apply_folding_config", - # "step_minimize_bit_width", - # "step_generate_estimate_reports", - # "step_hw_codegen", - # "step_hw_ipgen", - # "step_set_fifo_depths", - # "step_create_stitched_ip", - # step_mobilenet_slr_floorplan, - # "step_synthesize_bitfile", - # "step_make_pynq_driver", - # "step_deployment_package", - # ] - - cfg = DataflowBuildConfig( - steps=mobilenet_build_steps, - ) - - return cfg diff --git a/src/finn/benchmarking/dut/mobilenetv1.yml b/src/finn/benchmarking/dut/mobilenetv1.yml new file mode 100644 index 0000000000..71a80c4f2a --- /dev/null +++ b/src/finn/benchmarking/dut/mobilenetv1.yml @@ -0,0 +1,16 @@ +steps: + - finn.builder.custom_step_library.mobilenet.step_mobilenet_streamline # Custom step + - 
finn.builder.custom_step_library.mobilenet.step_mobilenet_lower_convs # Custom step + - finn.builder.custom_step_library.mobilenet.step_mobilenet_convert_to_hw_layers_separate_th # Custom step + - step_create_dataflow_partition + - step_specialize_layers + - step_apply_folding_config + - step_minimize_bit_width + - step_generate_estimate_reports + - step_set_fifo_depths + - step_hw_codegen + - step_hw_ipgen + - step_create_stitched_ip + - step_synthesize_bitfile + - step_make_driver + - step_deployment_package diff --git a/src/finn/benchmarking/dut/resnet50.py b/src/finn/benchmarking/dut/resnet50.py deleted file mode 100644 index efcd0de275..0000000000 --- a/src/finn/benchmarking/dut/resnet50.py +++ /dev/null @@ -1,42 +0,0 @@ -import finn.builder.build_dataflow_config as build_cfg -from finn.util.basic import alveo_default_platform - -from finn.benchmarking.dut.resnet50_custom_steps import ( - step_resnet50_tidy, - step_resnet50_streamline, - step_resnet50_convert_to_hw, - step_resnet50_slr_floorplan, - ) - -from finn.benchmarking.bench_base import bench - -class bench_resnet50(bench): - def step_build_setup(self): - # create build config for ResNet-50 (based on finn-examples) - - resnet50_build_steps = [ - step_resnet50_tidy, - step_resnet50_streamline, - step_resnet50_convert_to_hw, - "step_create_dataflow_partition", - "step_specialize_layers", - "step_apply_folding_config", - "step_minimize_bit_width", - "step_generate_estimate_reports", - "step_set_fifo_depths", - "step_hw_codegen", - "step_hw_ipgen", - step_resnet50_slr_floorplan, - "step_create_stitched_ip", # was not in finn-examples - "step_measure_rtlsim_performance", # was not in finn-examples - "step_out_of_context_synthesis", # was not in finn-examples - "step_synthesize_bitfile", - "step_make_driver", - "step_deployment_package", - ] - - cfg = build_cfg.DataflowBuildConfig( - steps=resnet50_build_steps, - ) - - return cfg \ No newline at end of file diff --git a/src/finn/benchmarking/dut/resnet50.yml 
b/src/finn/benchmarking/dut/resnet50.yml new file mode 100644 index 0000000000..6d6d4bcc31 --- /dev/null +++ b/src/finn/benchmarking/dut/resnet50.yml @@ -0,0 +1,19 @@ +steps: + - finn.builder.custom_step_library.resnet.step_resnet50_tidy # Custom step + - finn.builder.custom_step_library.resnet.step_resnet50_streamline # Custom step + - finn.builder.custom_step_library.resnet.step_resnet50_convert_to_hw # Custom step + - step_create_dataflow_partition + - step_specialize_layers + - step_apply_folding_config + - step_minimize_bit_width + - step_generate_estimate_reports + - step_set_fifo_depths + - step_hw_codegen + - step_hw_ipgen + - finn.builder.custom_step_library.resnet.step_resnet50_slr_floorplan # Custom step + - step_create_stitched_ip + - step_measure_rtlsim_performance + - step_out_of_context_synthesis + - step_synthesize_bitfile + - step_make_driver + - step_deployment_package diff --git a/src/finn/benchmarking/dut/transformer.py b/src/finn/benchmarking/dut/transformer.py index 27583ec5e1..48152ce9d5 100644 --- a/src/finn/benchmarking/dut/transformer.py +++ b/src/finn/benchmarking/dut/transformer.py @@ -29,7 +29,7 @@ from qonnx.util.range_analysis import RangeInfo # Custom build steps required to streamline and convert the attention operator -from finn.benchmarking.dut.transformer_custom_steps import ( +from finn.builder.custom_step_library.transformer import ( prepare_graph, step_streamline, step_convert_attention_to_hw, diff --git a/src/finn/benchmarking/dut/vgg10.py b/src/finn/benchmarking/dut/vgg10.py deleted file mode 100644 index d34c186387..0000000000 --- a/src/finn/benchmarking/dut/vgg10.py +++ /dev/null @@ -1,35 +0,0 @@ -from finn.builder.build_dataflow_config import DataflowBuildConfig -from finn.benchmarking.bench_base import bench - -class bench_vgg10(bench): - def step_build_setup(self): - # create build config for VGG-10 (based on finn-examples) - vgg10_build_steps = [ - "step_tidy_up", - step_pre_streamline, - "step_streamline", - 
"step_convert_to_hw", - step_convert_final_layers, - "step_create_dataflow_partition", - "step_specialize_layers", - "step_target_fps_parallelization", - "step_apply_folding_config", - "step_minimize_bit_width", - "step_generate_estimate_reports", - "step_set_fifo_depths", - "step_hw_codegen", - "step_hw_ipgen", - "step_create_stitched_ip", - "step_measure_rtlsim_performance", - "step_out_of_context_synthesis", - "step_synthesize_bitfile", - "step_make_driver", - "step_deployment_package", - ] - - cfg = DataflowBuildConfig( - steps=vgg10_build_steps, - standalone_thresholds=True, - ) - - return cfg diff --git a/src/finn/benchmarking/dut/vgg10.yml b/src/finn/benchmarking/dut/vgg10.yml new file mode 100644 index 0000000000..9e271a6921 --- /dev/null +++ b/src/finn/benchmarking/dut/vgg10.yml @@ -0,0 +1,23 @@ +steps: + - step_tidy_up + - finn.builder.custom_step_library.conv1d.step_pre_streamline # Custom step + - step_streamline + - step_convert_to_hw + - finn.builder.custom_step_library.conv1d.step_convert_final_layers # Custom step + - step_create_dataflow_partition + - step_specialize_layers + - step_target_fps_parallelization + - step_apply_folding_config + - step_minimize_bit_width + - step_generate_estimate_reports + - step_set_fifo_depths + - step_hw_codegen + - step_hw_ipgen + - step_create_stitched_ip + - step_measure_rtlsim_performance + - step_out_of_context_synthesis + - step_synthesize_bitfile + - step_make_driver + - step_deployment_package + +standalone_thresholds: True diff --git a/src/finn/interface/run_finn.py b/src/finn/interface/run_finn.py index 3661b414ab..40c186a434 100644 --- a/src/finn/interface/run_finn.py +++ b/src/finn/interface/run_finn.py @@ -264,7 +264,7 @@ def run(dependency_path: str, build_path: str, num_workers: int, script: str) -> @click.option( "--bench_config", help="Name or path of experiment configuration file", - default="", + required=True ) @click.option("--dependency-path", "-d", default="") @click.option("--num-workers", 
"-n", default=-1, show_default=True) From 7a3f928dc83ea8b98fe4464d6b8a9217a8d879b4 Mon Sep 17 00:00:00 2001 From: Felix Jentzsch Date: Tue, 20 May 2025 17:50:02 +0200 Subject: [PATCH 100/125] Adapt to FINN_ROOT refactoring --- src/finn/transformation/fpgadataflow/instrumentation.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/finn/transformation/fpgadataflow/instrumentation.py b/src/finn/transformation/fpgadataflow/instrumentation.py index 7f37c5ed14..a22d770307 100644 --- a/src/finn/transformation/fpgadataflow/instrumentation.py +++ b/src/finn/transformation/fpgadataflow/instrumentation.py @@ -28,7 +28,7 @@ def collect_ip_dirs(model, ipstitch_path): ip_dirs += [ipstitch_path + "/ip"] if need_memstreamer: # add RTL streamer IP - ip_dirs.append("$::env(FINN_ROOT)/finn-rtllib/memstream") + ip_dirs.append("$::env(FINN_RTLLIB)/memstream") return ip_dirs @@ -71,7 +71,7 @@ def apply(self, model): ko = out_shape_folded[-1] # fill out instrumentation wrapper template with open( - os.path.join(os.environ["FINN_ROOT"], "custom_hls", "instrumentation.template.cpp"), "r" + os.path.join(os.environ["FINN_CUSTOM_HLS"], "instrumentation.template.cpp"), "r" ) as f: instrwrp_cpp = f.read() instrwrp_cpp = instrwrp_cpp.replace("@PENDING@", str(pending)) @@ -150,7 +150,7 @@ def apply(self, model): # TODO: Support simulation with AXI-lite control interfaces (e.g., for dynamic pipelines) # fill in testbench template with open( - os.path.join(os.environ["FINN_ROOT"], "custom_hls", "instrumentation_tb.template.sv"), + os.path.join(os.environ["FINN_CUSTOM_HLS"], "instrumentation_tb.template.sv"), "r", ) as f: testbench_sv = f.read() @@ -158,7 +158,7 @@ def apply(self, model): f.write(testbench_sv) # fill in testbench project creator template with open( - os.path.join(os.environ["FINN_ROOT"], "custom_hls", "instrumentation_sim.template.tcl"), + os.path.join(os.environ["FINN_CUSTOM_HLS"], "instrumentation_sim.template.tcl"), "r", ) as f: testbench_tcl = 
f.read() From ccebbdca2b6eb88dffded9b1e794ce9912b7af89 Mon Sep 17 00:00:00 2001 From: Felix Jentzsch Date: Tue, 20 May 2025 17:59:48 +0200 Subject: [PATCH 101/125] Fix use of deprecated FINN_ROOT --- src/finn/transformation/fpgadataflow/make_driver.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/finn/transformation/fpgadataflow/make_driver.py b/src/finn/transformation/fpgadataflow/make_driver.py index b17cb9c8e8..1cea95f9c5 100644 --- a/src/finn/transformation/fpgadataflow/make_driver.py +++ b/src/finn/transformation/fpgadataflow/make_driver.py @@ -477,8 +477,7 @@ def apply(self, model): # create (copy) the static instrumentation driver driver_template = ( - os.environ["FINN_ROOT"] - + "/src/finn/qnn-data/templates/driver/driver_instrumentation.py" + os.environ["FINN_QNN_DATA"] + "/templates/driver/driver_instrumentation.py" ) driver_py = pynq_driver_dir + "/driver.py" shutil.copy(driver_template, driver_py) From 6511559f8038e2551ec01dfca966251a5c120e01 Mon Sep 17 00:00:00 2001 From: Felix Jentzsch Date: Tue, 20 May 2025 21:17:32 +0200 Subject: [PATCH 102/125] Fix bench cmd --- src/finn/benchmarking/bench-ci.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/finn/benchmarking/bench-ci.yml b/src/finn/benchmarking/bench-ci.yml index 9e960f8ecd..2738ad3d56 100644 --- a/src/finn/benchmarking/bench-ci.yml +++ b/src/finn/benchmarking/bench-ci.yml @@ -31,7 +31,7 @@ FINN Build: # Launch benchmarking script via FINN CLI, includes deps update and environment preparation - | source ./finn-plus-venv/bin/activate - finn bench $BENCH_CFG + finn bench --bench_config $BENCH_CFG cache: key: $CI_COMMIT_SHA policy: pull From cf6254dcfb3f4ed372e1fb4bb0a03fa7bb157d5e Mon Sep 17 00:00:00 2001 From: Felix Jentzsch Date: Wed, 21 May 2025 13:56:39 +0200 Subject: [PATCH 103/125] Fix CLI call --- src/finn/benchmarking/bench-ci.yml | 2 +- src/finn/benchmarking/bench.py | 2 +- src/finn/interface/run_finn.py | 8 ++------ 3 files changed, 4 
insertions(+), 8 deletions(-) diff --git a/src/finn/benchmarking/bench-ci.yml b/src/finn/benchmarking/bench-ci.yml index 2738ad3d56..8a1269ff9e 100644 --- a/src/finn/benchmarking/bench-ci.yml +++ b/src/finn/benchmarking/bench-ci.yml @@ -31,7 +31,7 @@ FINN Build: # Launch benchmarking script via FINN CLI, includes deps update and environment preparation - | source ./finn-plus-venv/bin/activate - finn bench --bench_config $BENCH_CFG + finn bench --dependency-path ./finn-plus/deps --build-path $FINN_BUILD_DIR --num-workers $CPU_CORES_BENCH --bench_config $BENCH_CFG cache: key: $CI_COMMIT_SHA policy: pull diff --git a/src/finn/benchmarking/bench.py b/src/finn/benchmarking/bench.py index 745d6c62b2..7a9b0877e6 100644 --- a/src/finn/benchmarking/bench.py +++ b/src/finn/benchmarking/bench.py @@ -47,7 +47,7 @@ def get_default_session_options_new(): config_path = os.path.join(os.environ.get("LOCAL_CFG_DIR"), os.environ.get("MANUAL_CFG_PATH")) else: configs_path = os.path.join(os.path.dirname(__file__), "cfg") - config_select = config_name + ".json" + config_select = config_name + ".yml" config_path = os.path.join(configs_path, config_select) print("Job launched with SLURM ID: %d" % (job_id)) except KeyError: diff --git a/src/finn/interface/run_finn.py b/src/finn/interface/run_finn.py index 40c186a434..a01b70bfb4 100644 --- a/src/finn/interface/run_finn.py +++ b/src/finn/interface/run_finn.py @@ -261,11 +261,7 @@ def run(dependency_path: str, build_path: str, num_workers: int, script: str) -> @click.command(help="Run a given benchmark configuration.") -@click.option( - "--bench_config", - help="Name or path of experiment configuration file", - required=True -) +@click.option("--bench_config", help="Name or path of experiment configuration file", required=True) @click.option("--dependency-path", "-d", default="") @click.option("--num-workers", "-n", default=-1, show_default=True) @click.option( @@ -278,7 +274,7 @@ def bench(bench_config: str, dependency_path: str, 
num_workers: int, build_path: console = Console() build_dir = Path(build_path).expanduser() if build_path != "" else None dep_path = Path(dependency_path).expanduser() if dependency_path != "" else None - prepare_finn(dep_path, Path(), build_dir, num_workers, is_test_run=True) + prepare_finn(dep_path, Path(), build_dir, num_workers) console.rule("RUNNING BENCHMARK") # Late import because we need prepare_finn to setup remaining dependencies first From cc0be94bb0ae15e8721ad6c9c5a525602ae9de81 Mon Sep 17 00:00:00 2001 From: Felix Jentzsch Date: Wed, 21 May 2025 16:16:05 +0200 Subject: [PATCH 104/125] [CI] Adapt to recent runner version change --- .gitlab-ci.yml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index ebdad54bee..a2f9527976 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -93,11 +93,11 @@ Sync finn-dev: .setup_venv_from_whl: &setup_venv_from_whl # Move everything to working directory (e.g., RAMdisk) - - cp -dfR .. $PATH_WORKDIR + - cp -dfR . 
$PATH_WORKDIR - cd $PATH_WORKDIR # Create fresh virtual environment and install finn-plus from .whl (artifact) - python3 -m venv finn-plus-venv - - finn-plus-venv/bin/pip install ./finn-plus/dist/*.whl + - finn-plus-venv/bin/pip install dist/*.whl Build: id_tokens: @@ -171,8 +171,8 @@ FINN Test Suite 2022.2: - $JOB_MONITORING_DIR/monitor.sh $JOB_MONITORING_DIR/$CI_PIPELINE_ID/$HOSTNAME.log & # Launch FINN via test command, includes preparation of (cached) dependencies - | - source ./finn-plus-venv/bin/activate - finn test --variant $TEST_SUITE --dependency-path ./finn-plus/deps --build-path $FINN_BUILD_DIR --num-workers 1 --num-test-workers $PYTEST_PARALLEL + source finn-plus-venv/bin/activate + finn test --variant $TEST_SUITE --dependency-path ./deps --build-path $FINN_BUILD_DIR --num-workers 1 --num-test-workers $PYTEST_PARALLEL artifacts: name: "test_reports" when: always From d1708971c55285fabaa6fbdf5e24fe284ceedbfb Mon Sep 17 00:00:00 2001 From: Felix Jentzsch Date: Wed, 21 May 2025 17:39:57 +0200 Subject: [PATCH 105/125] Minor fixes --- src/finn/benchmarking/dut/metafi.yml | 14 +++++++------- src/finn/builder/build_dataflow.py | 4 ++-- 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/src/finn/benchmarking/dut/metafi.yml b/src/finn/benchmarking/dut/metafi.yml index d3ea2c69ff..fba5a68fe5 100644 --- a/src/finn/benchmarking/dut/metafi.yml +++ b/src/finn/benchmarking/dut/metafi.yml @@ -1,12 +1,12 @@ steps: - - # step_residual_tidy - - # step_extract_absorb_bias - - # step_residual_topo - - # step_pre_streamline - - # step_residual_streamline - - # step_residual_convert_to_hw + #- step_residual_tidy + #- step_extract_absorb_bias + #- step_residual_topo + #- step_pre_streamline + #- step_residual_streamline + #- step_residual_convert_to_hw - step_create_dataflow_partition - - # step_set_preferred_impl_style + #- step_set_preferred_impl_style - step_specialize_layers - step_target_fps_parallelization - step_apply_folding_config diff --git 
a/src/finn/builder/build_dataflow.py b/src/finn/builder/build_dataflow.py index b14d69a1f9..f6f3f6127d 100644 --- a/src/finn/builder/build_dataflow.py +++ b/src/finn/builder/build_dataflow.py @@ -253,7 +253,7 @@ def build_dataflow_cfg(model_filename, cfg: DataflowBuildConfig): print("Build failed") metadata = { "status": "failed", - "tool_version": os.path.basename(os.environ.get("VIVADO_PATH")), + "tool_version": os.path.basename(os.environ.get("$XILINX_VIVADO")), } with open(cfg.output_dir + "/report/metadata_builder.json", "w") as f: json.dump(metadata, f, indent=2) @@ -264,7 +264,7 @@ def build_dataflow_cfg(model_filename, cfg: DataflowBuildConfig): json.dump(time_per_step, f, indent=2) metadata = { "status": "ok", - "tool_version": os.path.basename(os.environ.get("VIVADO_PATH")), + "tool_version": os.path.basename(os.environ.get("$XILINX_VIVADO")), } with open(cfg.output_dir + "/report/metadata_builder.json", "w") as f: json.dump(metadata, f, indent=2) From 9718a30442e99a6525431fbb6070c459ad3473e8 Mon Sep 17 00:00:00 2001 From: Felix Jentzsch Date: Wed, 21 May 2025 17:48:26 +0200 Subject: [PATCH 106/125] [CI] Use empty git strategy for benchmarking as well --- src/finn/benchmarking/bench-ci.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/src/finn/benchmarking/bench-ci.yml b/src/finn/benchmarking/bench-ci.yml index 691ddeb5fe..0212aee122 100644 --- a/src/finn/benchmarking/bench-ci.yml +++ b/src/finn/benchmarking/bench-ci.yml @@ -22,6 +22,7 @@ FINN Build: - job: Build pipeline: $PARENT_PIPELINE_ID variables: + GIT_STRATEGY: empty # Do not pull repository, use PyPI installation instead SCHEDULER_PARAMETERS: "-A $PROJECT_ACCOUNT -p $SLURM_PARTITION -t $SLURM_TIMEOUT $SLURM_QOS --nodes 1 --ntasks 1 --cpus-per-task $CPU_CORES_BENCH --exclusive --array 0-$( expr $PARALLEL_JOBS - 1 )" NUM_DEFAULT_WORKERS: "$CPU_CORES_BENCH" extends: .setup_full_2022_2 From bd36b8fbccd8b10a9677f88dce5d5775eea4a760 Mon Sep 17 00:00:00 2001 From: Felix Jentzsch Date: Wed, 21 May 
2025 21:06:35 +0200 Subject: [PATCH 107/125] Fix typo --- .gitlab-ci.yml | 2 +- src/finn/benchmarking/bench-ci.yml | 2 +- src/finn/builder/build_dataflow.py | 4 ++-- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 0a7aaab37e..09fa9e0930 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -132,7 +132,7 @@ FINN Test Suite 2022.2: paths: - deps variables: - GIT_STRATEGY: empty # Do not pull repository, use PyPI installation instead + GIT_STRATEGY: empty # Do not pull repository, install from wheel (artifact) instead SCHEDULER_PARAMETERS: "-A $PROJECT_ACCOUNT -p $SLURM_PARTITION -t $SLURM_TIMEOUT $SLURM_QOS --nodes 1 --ntasks 1 --cpus-per-task $CPU_CORES --exclusive" PYTEST_PARALLEL: "$CPU_CORES" extends: .setup_full_2022_2 diff --git a/src/finn/benchmarking/bench-ci.yml b/src/finn/benchmarking/bench-ci.yml index 0212aee122..0f039180d1 100644 --- a/src/finn/benchmarking/bench-ci.yml +++ b/src/finn/benchmarking/bench-ci.yml @@ -22,7 +22,7 @@ FINN Build: - job: Build pipeline: $PARENT_PIPELINE_ID variables: - GIT_STRATEGY: empty # Do not pull repository, use PyPI installation instead + GIT_STRATEGY: empty # Do not pull repository, install from wheel (artifact) instead SCHEDULER_PARAMETERS: "-A $PROJECT_ACCOUNT -p $SLURM_PARTITION -t $SLURM_TIMEOUT $SLURM_QOS --nodes 1 --ntasks 1 --cpus-per-task $CPU_CORES_BENCH --exclusive --array 0-$( expr $PARALLEL_JOBS - 1 )" NUM_DEFAULT_WORKERS: "$CPU_CORES_BENCH" extends: .setup_full_2022_2 diff --git a/src/finn/builder/build_dataflow.py b/src/finn/builder/build_dataflow.py index f6f3f6127d..b29e36ab56 100644 --- a/src/finn/builder/build_dataflow.py +++ b/src/finn/builder/build_dataflow.py @@ -253,7 +253,7 @@ def build_dataflow_cfg(model_filename, cfg: DataflowBuildConfig): print("Build failed") metadata = { "status": "failed", - "tool_version": os.path.basename(os.environ.get("$XILINX_VIVADO")), + "tool_version": os.path.basename(os.environ.get("XILINX_VIVADO")), } with 
open(cfg.output_dir + "/report/metadata_builder.json", "w") as f: json.dump(metadata, f, indent=2) @@ -264,7 +264,7 @@ def build_dataflow_cfg(model_filename, cfg: DataflowBuildConfig): json.dump(time_per_step, f, indent=2) metadata = { "status": "ok", - "tool_version": os.path.basename(os.environ.get("$XILINX_VIVADO")), + "tool_version": os.path.basename(os.environ.get("XILINX_VIVADO")), } with open(cfg.output_dir + "/report/metadata_builder.json", "w") as f: json.dump(metadata, f, indent=2) From a942390d20d27e8d2c9a1ea70e95bea523b91442 Mon Sep 17 00:00:00 2001 From: Felix Jentzsch Date: Wed, 21 May 2025 21:09:59 +0200 Subject: [PATCH 108/125] Refactor remaining MakePYNQDriver calls --- notebooks/end2end_example/bnn-pynq/cnv_end2end_example.ipynb | 4 ++-- notebooks/end2end_example/bnn-pynq/tfc_end2end_example.ipynb | 4 ++-- src/finn/qnn-data/templates/driver/driver_base.py | 2 +- tests/end2end/test_end2end_bnn_pynq.py | 4 ++-- 4 files changed, 7 insertions(+), 7 deletions(-) diff --git a/notebooks/end2end_example/bnn-pynq/cnv_end2end_example.ipynb b/notebooks/end2end_example/bnn-pynq/cnv_end2end_example.ipynb index 2b01f24557..014a13db27 100644 --- a/notebooks/end2end_example/bnn-pynq/cnv_end2end_example.ipynb +++ b/notebooks/end2end_example/bnn-pynq/cnv_end2end_example.ipynb @@ -456,8 +456,8 @@ "metadata": {}, "outputs": [], "source": [ - "from finn.transformation.fpgadataflow.make_driver import MakePYNQDriver\n", - "model = model.transform(MakePYNQDriver(\"zynq-iodma\"))" + "from finn.transformation.fpgadataflow.make_driver import MakePYNQDriverIODMA\n", + "model = model.transform(MakePYNQDriverIODMA(\"zynq-iodma\"))" ] }, { diff --git a/notebooks/end2end_example/bnn-pynq/tfc_end2end_example.ipynb b/notebooks/end2end_example/bnn-pynq/tfc_end2end_example.ipynb index b0510b0fdb..de6de23d3f 100644 --- a/notebooks/end2end_example/bnn-pynq/tfc_end2end_example.ipynb +++ b/notebooks/end2end_example/bnn-pynq/tfc_end2end_example.ipynb @@ -751,8 +751,8 @@ "metadata": {}, 
"outputs": [], "source": [ - "from finn.transformation.fpgadataflow.make_driver import MakePYNQDriver\n", - "model = model.transform(MakePYNQDriver(\"zynq-iodma\"))" + "from finn.transformation.fpgadataflow.make_driver import MakePYNQDriverIODMA\n", + "model = model.transform(MakePYNQDriverIODMA(\"zynq-iodma\"))" ] }, { diff --git a/src/finn/qnn-data/templates/driver/driver_base.py b/src/finn/qnn-data/templates/driver/driver_base.py index a6ff29d608..af55ee13df 100644 --- a/src/finn/qnn-data/templates/driver/driver_base.py +++ b/src/finn/qnn-data/templates/driver/driver_base.py @@ -38,7 +38,7 @@ # Driver base class for FINN-generated dataflow accelerators. # The particulars of the generated accelerator are specified via the -# io_shape_dict (generated by the MakePYNQDriver transformation). +# io_shape_dict (generated by the MakePYNQDriverIODMA transformation). class FINNExampleOverlay(Overlay): diff --git a/tests/end2end/test_end2end_bnn_pynq.py b/tests/end2end/test_end2end_bnn_pynq.py index 9a2da7a45e..9d40b3ba93 100644 --- a/tests/end2end/test_end2end_bnn_pynq.py +++ b/tests/end2end/test_end2end_bnn_pynq.py @@ -73,7 +73,7 @@ from finn.transformation.fpgadataflow.create_stitched_ip import CreateStitchedIP from finn.transformation.fpgadataflow.hlssynth_ip import HLSSynthIP from finn.transformation.fpgadataflow.insert_dwc import InsertDWC -from finn.transformation.fpgadataflow.make_driver import MakePYNQDriver +from finn.transformation.fpgadataflow.make_driver import MakePYNQDriverIODMA from finn.transformation.fpgadataflow.minimize_accumulator_width import MinimizeAccumulatorWidth from finn.transformation.fpgadataflow.minimize_weight_bit_width import MinimizeWeightBitWidth from finn.transformation.fpgadataflow.prepare_cppsim import PrepareCppSim @@ -812,7 +812,7 @@ def test_make_pynq_driver(self, topology, wbits, abits, board): prev_chkpt_name = get_checkpoint_name(board, topology, wbits, abits, "build") model = load_test_checkpoint_or_skip(prev_chkpt_name) 
board_to_driver_platform = "alveo" if build_data["kind"] == "alveo" else "zynq-iodma" - model = model.transform(MakePYNQDriver(board_to_driver_platform)) + model = model.transform(MakePYNQDriverIODMA(board_to_driver_platform)) model.save(get_checkpoint_name(board, topology, wbits, abits, "driver")) def test_deploy(self, topology, wbits, abits, board): From 4ee4da19f8ec46c1b701f4218ea7041f8bbbf840 Mon Sep 17 00:00:00 2001 From: Felix Jentzsch Date: Thu, 22 May 2025 19:34:15 +0200 Subject: [PATCH 109/125] Adapt virtual FIFO output stream naming --- src/finn/custom_op/fpgadataflow/hls/streamingfifo_hls.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/finn/custom_op/fpgadataflow/hls/streamingfifo_hls.py b/src/finn/custom_op/fpgadataflow/hls/streamingfifo_hls.py index f17bc48fc6..e7d02a4915 100644 --- a/src/finn/custom_op/fpgadataflow/hls/streamingfifo_hls.py +++ b/src/finn/custom_op/fpgadataflow/hls/streamingfifo_hls.py @@ -66,7 +66,7 @@ def strm_decl(self): ) ) self.code_gen_dict["$STREAMDECLARATIONS$"].append( - 'hls::stream> out_{} ("out_{}");'.format( + 'hls::stream> out0_{} ("out0_{}");'.format( self.get_outstream_width(), self.hls_sname(), self.hls_sname() ) ) @@ -88,7 +88,7 @@ def docompute(self): VirtualFIFO(in_fifo, out_fifo, mode, depth, occupancy, max_occupancy); // FIFO -> AXI-Stream - move(out_fifo, out_%s); + move(out_fifo, out0_%s); """ % (self.hls_sname(), self.hls_sname()) ] @@ -99,7 +99,7 @@ def blackboxfunction(self): out_packed_bits = self.get_outstream_width() out_packed_hls_type = "ap_uint<%d>" % out_packed_bits self.code_gen_dict["$BLACKBOXFUNCTION$"] = [ - """void %s(hls::stream<%s > &in0_%s, hls::stream<%s > &out_%s, ap_uint<32> mode, + """void %s(hls::stream<%s > &in0_%s, hls::stream<%s > &out0_%s, ap_uint<32> mode, ap_uint<32> depth, ap_uint<32> &occupancy, ap_uint<32> &max_occupancy)""" % ( self.onnx_node.name, @@ -115,7 +115,7 @@ def pragmas(self): "#pragma HLS INTERFACE axis port=in0_" + self.hls_sname() ] 
self.code_gen_dict["$PRAGMAS$"].append( - "#pragma HLS INTERFACE axis port=out_" + self.hls_sname() + "#pragma HLS INTERFACE axis port=out0_" + self.hls_sname() ) self.code_gen_dict["$PRAGMAS$"].append("#pragma HLS INTERFACE s_axilite port=mode") self.code_gen_dict["$PRAGMAS$"].append("#pragma HLS INTERFACE s_axilite port=depth") From fb1853751c84d5b89299bcfad6a1e81c6dbac877 Mon Sep 17 00:00:00 2001 From: Felix Jentzsch Date: Thu, 22 May 2025 21:40:58 +0200 Subject: [PATCH 110/125] Move CI-specific scripts --- .gitlab-ci.yml | 6 +- .../bench-ci.yml => ci/.gitlab-bench.yml | 6 +- .gitlab-ci-base.yml => ci/.gitlab-setup.yml | 0 ci/collect.py | 412 +++++++++ {src/finn/benchmarking => ci}/measure.py | 70 +- driver/iterative_live_fifosizing_driver.ipynb | 833 ------------------ src/finn/benchmarking/bench_rtl_swg.py | 403 --------- src/finn/benchmarking/collect.py | 280 ------ 8 files changed, 466 insertions(+), 1544 deletions(-) rename src/finn/benchmarking/bench-ci.yml => ci/.gitlab-bench.yml (93%) rename .gitlab-ci-base.yml => ci/.gitlab-setup.yml (100%) create mode 100644 ci/collect.py rename {src/finn/benchmarking => ci}/measure.py (51%) delete mode 100644 driver/iterative_live_fifosizing_driver.ipynb delete mode 100644 src/finn/benchmarking/bench_rtl_swg.py delete mode 100644 src/finn/benchmarking/collect.py diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 09fa9e0930..ad524d0fd7 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -1,4 +1,4 @@ -include: .gitlab-ci-base.yml +include: ci/.gitlab-setup.yml stages: - sync @@ -164,7 +164,7 @@ Bench (Manual): when: never - if: $MANUAL_CFG_PATH != "" trigger: - include: benchmarking/bench-ci.yml + include: ci/.gitlab-bench.yml strategy: depend forward: pipeline_variables: true @@ -180,7 +180,7 @@ Bench: when: never - if: $MANUAL_CFG_PATH == "" trigger: - include: src/finn/benchmarking/bench-ci.yml + include: ci/.gitlab-bench.yml strategy: depend forward: pipeline_variables: true diff --git 
a/src/finn/benchmarking/bench-ci.yml b/ci/.gitlab-bench.yml similarity index 93% rename from src/finn/benchmarking/bench-ci.yml rename to ci/.gitlab-bench.yml index 0f039180d1..f3139c0fbd 100644 --- a/src/finn/benchmarking/bench-ci.yml +++ b/ci/.gitlab-bench.yml @@ -1,4 +1,4 @@ -include: .gitlab-ci-base.yml +include: ci/.gitlab-setup.yml stages: - build @@ -56,7 +56,7 @@ Measurement: - when: always script: # Run as root and activate the PYNQ venv manually to use PYNQ outside of the typical Jupyter environment - - sudo bash -c "source /etc/profile.d/pynq_venv.sh && export XILINX_XRT=/usr && python src/finn/benchmarking/measure.py" + - sudo bash -c "source /etc/profile.d/pynq_venv.sh && export XILINX_XRT=/usr && python ci/measure.py" artifacts: name: "measurement_artifacts" when: always @@ -74,5 +74,5 @@ Result Collection: # Also run on failure of previous tasks to collect partial results - when: always script: - - python3.10 src/finn/benchmarking/collect.py + - python3.10 ci/collect.py - dvc exp push -f -j 4 -r push git@github.com:eki-project/finn-plus.git diff --git a/.gitlab-ci-base.yml b/ci/.gitlab-setup.yml similarity index 100% rename from .gitlab-ci-base.yml rename to ci/.gitlab-setup.yml diff --git a/ci/collect.py b/ci/collect.py new file mode 100644 index 0000000000..b833278fe9 --- /dev/null +++ b/ci/collect.py @@ -0,0 +1,412 @@ +import json +import os +import shutil +from dvclive.live import Live + + +def delete_dir_contents(dir): + for filename in os.listdir(dir): + file_path = os.path.join(dir, filename) + try: + if os.path.isfile(file_path) or os.path.islink(file_path): + os.unlink(file_path) + elif os.path.isdir(file_path): + shutil.rmtree(file_path) + except Exception as e: + print("Failed to delete %s. 
Reason: %s" % (file_path, e)) + + +def log_dvc_metric(live, prefix, name, value): + # sanitize '/' in name because DVC uses it to nest metrics (which we do via prefix) + live.log_metric(prefix + name.replace("/", "-"), value, plot=False) + + +def open_json_report(id, report_name): + # look in both, build & measurement, artifacts + path1 = os.path.join("build_artifacts", "runs_output", "run_%d" % (id), "reports", report_name) + path2 = os.path.join( + "measurement_artifacts", "runs_output", "run_%d" % (id), "reports", report_name + ) + if os.path.isfile(path1): + with open(path1, "r") as f: + report = json.load(f) + return report + elif os.path.isfile(path2): + with open(path2, "r") as f: + report = json.load(f) + return report + else: + return None + + +def log_all_metrics_from_report(id, live, report_name, prefix=""): + report = open_json_report(id, report_name) + if report: + for key in report: + log_dvc_metric(live, prefix, key, report[key]) + + +def log_metrics_from_report(id, live, report_name, keys, prefix=""): + report = open_json_report(id, report_name) + if report: + for key in keys: + if key in report: + log_dvc_metric(live, prefix, key, report[key]) + + +def log_nested_metrics_from_report(id, live, report_name, key_top, keys, prefix=""): + report = open_json_report(id, report_name) + if report: + if key_top in report: + for key in keys: + if key in report[key_top]: + log_dvc_metric(live, prefix, key, report[key_top][key]) + + +if __name__ == "__main__": + # Go through all runs found in the artifacts and log their results to DVC + run_dir_list = os.listdir(os.path.join("build_artifacts", "runs_output")) + print("Looking for runs in build artifacts") + run_ids = [] + for run_dir in run_dir_list: + if run_dir.startswith("run_"): + run_id = int(run_dir[4:]) + run_ids.append(run_id) + run_ids.sort() + print("Found %d runs" % len(run_ids)) + + follow_up_bench_cfg = list() + # Prepare (local) output directory where follow-up bench configs will be stored + 
output_cfg_dir = os.path.join( + os.environ.get("LOCAL_CFG_DIR_STORE"), "lfs", "CI_" + os.environ.get("CI_PIPELINE_ID") + ) + output_folding_dir = os.path.join(output_cfg_dir, "folding") + output_cfg_path = os.path.join(output_cfg_dir, "follow-up.json") + + for id in run_ids: + print("Processing run %d" % id) + experiment_name = "CI_" + os.environ.get("CI_PIPELINE_ID") + "_" + str(id) + experiment_msg = ( + "[CI] " + + os.environ.get("CI_PIPELINE_NAME") + + " (" + + os.environ.get("CI_PIPELINE_ID") + + "_" + + str(id) + + ")" + ) + # TODO: cache images once we switch to a cache provider that works with DVC Studio + with Live(exp_name=experiment_name, exp_message=experiment_msg, cache_images=False) as live: + # PARAMS + # input parameters logged by benchmarking infrastructure + metadata_bench = open_json_report(id, "metadata_bench.json") + params = {"params": metadata_bench["params"]} + live.log_params(params) + + # optional metadata logged by builder + metadata_builder = open_json_report(id, "metadata_builder.json") + if metadata_builder: + metadata = { + "metadata": { + "tool_version": metadata_builder["tool_version"], + } + } + live.log_params(metadata) + + # optional dut_info.json (additional information generated during model generation) + dut_info_report = open_json_report(id, "dut_info.json") + if dut_info_report: + dut_info = {"dut_info": dut_info_report} + live.log_params(dut_info) + + # METRICS + # TODO: for microbenchmarks, only summarize results for target node (surrounding SDP?) 
+ # TODO: make all logs consistent (at generation), e.g., BRAM vs BRAM18 vs BRAM36) + + # status + status = metadata_bench["status"] + if status == "ok": + # mark as failed if either bench or builder indicates failure + if metadata_builder: + status_builder = metadata_builder["status"] + if status_builder == "failed": + status = "failed" + log_dvc_metric(live, "", "status", status) + + # verification steps + if "output" in metadata_bench: + if "builder_verification" in metadata_bench["output"]: + log_dvc_metric( + live, + "", + "verification", + metadata_bench["output"]["builder_verification"]["verification"], + ) + + # estimate_layer_resources.json + log_nested_metrics_from_report( + id, + live, + "estimate_layer_resources.json", + "total", + [ + "LUT", + "DSP", + "BRAM_18K", + "URAM", + ], + prefix="estimate/resources/", + ) + + # estimate_layer_resources_hls.json + log_nested_metrics_from_report( + id, + live, + "estimate_layer_resources_hls.json", + "total", + [ + "LUT", + "FF", + "DSP", + "DSP48E", + "DSP58E", # TODO: aggregate/unify DSP reporting + "BRAM_18K", + "URAM", + ], + prefix="hls_estimate/resources/", + ) + + # estimate_network_performance.json + log_metrics_from_report( + id, + live, + "estimate_network_performance.json", + [ + "critical_path_cycles", + "max_cycles", + "max_cycles_node_name", + "estimated_throughput_fps", + "estimated_latency_ns", + ], + prefix="estimate/performance/", + ) + + # rtlsim_performance.json + log_metrics_from_report( + id, + live, + "rtlsim_performance.json", + [ + "N", + "TIMEOUT", + "latency_cycles", + "cycles", + "fclk[mhz]", + "throughput[images/s]", + "stable_throughput[images/s]", + # add INPUT_DONE, OUTPUT_DONE, number transactions? 
+ ], + prefix="rtlsim/performance/", + ) + + # fifo_sizing.json + log_metrics_from_report( + id, live, "fifo_sizing.json", ["total_fifo_size_kB"], prefix="fifosizing/" + ) + + # stitched IP DCP synth resource report + log_nested_metrics_from_report( + id, + live, + "post_synth_resources_dcp.json", + "(top)", + [ + "LUT", + "FF", + "SRL", + "DSP", + "BRAM_18K", + "BRAM_36K", + "URAM", + ], + prefix="synth(dcp)/resources/", + ) + + # stitched IP DCP synth resource breakdown + # TODO: generalize to all build flows and bitfile synth + layer_categories = ["MAC", "Eltwise", "Thresholding", "FIFO", "DWC", "SWG", "Other"] + for category in layer_categories: + log_nested_metrics_from_report( + id, + live, + "res_breakdown_build_output.json", + category, + [ + "LUT", + "FF", + "SRL", + "DSP", + "BRAM_18K", + "BRAM_36K", + "URAM", + ], + prefix="synth(dcp)/resources(breakdown)/" + category + "/", + ) + + # ooc_synth_and_timing.json (OOC synth / step_out_of_context_synthesis) + log_metrics_from_report( + id, + live, + "ooc_synth_and_timing.json", + [ + "LUT", + "LUTRAM", + "FF", + "DSP", + "BRAM", + "BRAM_18K", + "BRAM_36K", + "URAM", + ], + prefix="synth(ooc)/resources/", + ) + log_metrics_from_report( + id, + live, + "ooc_synth_and_timing.json", + [ + "WNS", + "fmax_mhz", + # add TNS? what is "delay"? 
+ ], + prefix="synth(ooc)/timing/", + ) + + # post_synth_resources.json (shell synth / step_synthesize_bitfile) + log_nested_metrics_from_report( + id, + live, + "post_synth_resources.json", + "(top)", + [ + "LUT", + "FF", + "SRL", + "DSP", + "BRAM_18K", + "BRAM_36K", + "URAM", + ], + prefix="synth/resources/", + ) + + # post synth timing report + # TODO: only exported as post_route_timing.rpt, not .json + + # instrumentation measurement + log_all_metrics_from_report( + id, live, "measured_performance.json", prefix="measurement/performance/" + ) + + # IODMA validation accuracy + log_metrics_from_report( + id, + live, + "validation.json", + [ + "top-1_accuracy", + ], + prefix="measurement/validation/", + ) + + # power measurement + # TODO + + # live fifosizing report + graph png + log_metrics_from_report( + id, + live, + "fifo_sizing_report.json", + [ + "error", + "fifo_size_total_kB", + ], + prefix="fifosizing/live/", + ) + + image = os.path.join( + "measurement_artifacts", + "runs_output", + "run_%d" % (id), + "reports", + "fifo_sizing_graph.png", + ) + if os.path.isfile(image): + live.log_image("fifosizing_pass_1", image) + + # time_per_step.json + log_metrics_from_report(id, live, "time_per_step.json", ["total_build_time"]) + + # ARTIFACTS + # Log build reports as they come from GitLab artifacts, + # but copy them to a central dir first so all runs share the same path + run_report_dir1 = os.path.join( + "build_artifacts", "runs_output", "run_%d" % (id), "reports" + ) + run_report_dir2 = os.path.join( + "measurement_artifacts", "runs_output", "run_%d" % (id), "reports" + ) + dvc_report_dir = "reports" + os.makedirs(dvc_report_dir, exist_ok=True) + delete_dir_contents(dvc_report_dir) + if os.path.isdir(run_report_dir1): + shutil.copytree(run_report_dir1, dvc_report_dir, dirs_exist_ok=True) + if os.path.isdir(run_report_dir2): + shutil.copytree(run_report_dir2, dvc_report_dir, dirs_exist_ok=True) + live.log_artifact(dvc_report_dir) + + # Prepare benchmarking config 
for follow-up runs after live FIFO-sizing + folding_config_lfs_path = os.path.join( + "measurement_artifacts", + "runs_output", + "run_%d" % (id), + "reports", + "folding_config_lfs.json", + ) + if os.path.isfile(folding_config_lfs_path): + # Copy folding config produced by live FIFO-sizing + output_folding_path = os.path.join(output_folding_dir, experiment_name + ".json") + os.makedirs(output_folding_dir, exist_ok=True) + print( + "Saving lfs-generated folding config of this run to use in future builds: %s" + % output_folding_path + ) + shutil.copy(folding_config_lfs_path, output_folding_path) + + # Create benchmarking config + metadata_bench = open_json_report(id, "metadata_bench.json") + configuration = dict() + for key in metadata_bench["params"]: + # wrap in list + configuration[key] = [metadata_bench["params"][key]] + # overwrite FIFO-related params + import_folding_path = os.path.join( + os.environ.get("LOCAL_CFG_DIR"), + "lfs", + "CI_" + os.environ.get("CI_PIPELINE_ID"), + "folding", + experiment_name + ".json", + ) + configuration["fifo_method"] = ["manual"] + configuration["target_fps"] = ["None"] + configuration["folding_path"] = [import_folding_path] + + follow_up_bench_cfg.append(configuration) + + # Save aggregated benchmarking config for follow-up job + if follow_up_bench_cfg: + print("Saving follow-up bench config for lfs: %s" % output_cfg_path) + with open(output_cfg_path, "w") as f: + json.dump(follow_up_bench_cfg, f, indent=2) + + print("Done") diff --git a/src/finn/benchmarking/measure.py b/ci/measure.py similarity index 51% rename from src/finn/benchmarking/measure.py rename to ci/measure.py index 9a44ff3192..42db938d33 100644 --- a/src/finn/benchmarking/measure.py +++ b/ci/measure.py @@ -1,9 +1,19 @@ import os -import sys -import subprocess import shutil +import subprocess +import sys + -from finn.benchmarking.util import delete_dir_contents +def delete_dir_contents(dir): + for filename in os.listdir(dir): + file_path = os.path.join(dir, 
filename) + try: + if os.path.isfile(file_path) or os.path.islink(file_path): + os.unlink(file_path) + elif os.path.isdir(file_path): + shutil.rmtree(file_path) + except Exception as e: + print("Failed to delete %s. Reason: %s" % (file_path, e)) if __name__ == "__main__": @@ -26,21 +36,36 @@ # Run driver print("Running driver..") - # run validate.py (from IODMA driver) if present, otherwise driver.py from instrumentation + # run validate.py (from IODMA driver) if present, otherwise driver.py (instrumentation) # TODO: unify IODMA/instrumentation shell & driver if os.path.isfile(f"{extract_dir}/driver/validate.py"): - result = subprocess.run(["python", f"{extract_dir}/driver/validate.py", - "--bitfile", f"{extract_dir}/bitfile/finn-accel.bit", - "--settingsfile", f"{extract_dir}/driver/settings.json", - "--reportfile", f"{extract_dir}/validation.json", - "--dataset_root", "/home/xilinx/datasets", #TODO: env var - ]) + result = subprocess.run( + [ + "python", + f"{extract_dir}/driver/validate.py", + "--bitfile", + f"{extract_dir}/bitfile/finn-accel.bit", + "--settingsfile", + f"{extract_dir}/driver/settings.json", + "--reportfile", + f"{extract_dir}/validation.json", + "--dataset_root", + "/home/xilinx/datasets", # TODO: env var + ] + ) else: - result = subprocess.run(["python", f"{extract_dir}/driver/driver.py", - "--bitfile", f"{extract_dir}/bitfile/finn-accel.bit", - "--settingsfile", f"{extract_dir}/driver/settings.json", - "--reportfile", f"{extract_dir}/measured_performance.json", - ]) + result = subprocess.run( + [ + "python", + f"{extract_dir}/driver/driver.py", + "--bitfile", + f"{extract_dir}/bitfile/finn-accel.bit", + "--settingsfile", + f"{extract_dir}/driver/settings.json", + "--reportfile", + f"{extract_dir}/measured_performance.json", + ] + ) if result.returncode != 0: print("Driver reported error!") exit_code = 1 @@ -48,13 +73,14 @@ print("Driver finished successfully.") # Copy results back to artifact directory - for report in 
["measured_performance.json", - "fifo_sizing_report.json", - "fifo_depth_export.json", - "fifo_sizing_graph.png", - "folding_config_lfs.json", - "validation.json", - ]: + for report in [ + "measured_performance.json", + "fifo_sizing_report.json", + "fifo_depth_export.json", + "fifo_sizing_graph.png", + "folding_config_lfs.json", + "validation.json", + ]: report_path = os.path.join(extract_dir, report) if os.path.isfile(report_path): print("Copying %s to %s" % (report_path, reports_dir)) diff --git a/driver/iterative_live_fifosizing_driver.ipynb b/driver/iterative_live_fifosizing_driver.ipynb deleted file mode 100644 index 83a329d263..0000000000 --- a/driver/iterative_live_fifosizing_driver.ipynb +++ /dev/null @@ -1,833 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": 1, - "id": "0ee21ecb", - "metadata": {}, - "outputs": [ - { - "data": { - "application/javascript": [ - "\n", - "try {\n", - "require(['notebook/js/codecell'], function(codecell) {\n", - " codecell.CodeCell.options_default.highlight_modes[\n", - " 'magic_text/x-csrc'] = {'reg':[/^%%microblaze/]};\n", - " Jupyter.notebook.events.one('kernel_ready.Kernel', function(){\n", - " Jupyter.notebook.get_cells().map(function(cell){\n", - " if (cell.cell_type == 'code'){ cell.auto_highlight(); } }) ;\n", - " });\n", - "});\n", - "} catch (e) {};\n" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "application/javascript": [ - "\n", - "try {\n", - "require(['notebook/js/codecell'], function(codecell) {\n", - " codecell.CodeCell.options_default.highlight_modes[\n", - " 'magic_text/x-csrc'] = {'reg':[/^%%pybind11/]};\n", - " Jupyter.notebook.events.one('kernel_ready.Kernel', function(){\n", - " Jupyter.notebook.get_cells().map(function(cell){\n", - " if (cell.cell_type == 'code'){ cell.auto_highlight(); } }) ;\n", - " });\n", - "});\n", - "} catch (e) {};\n" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "import time\n", - 
"import json\n", - "import matplotlib as mpl\n", - "import matplotlib.pyplot as plt\n", - "from IPython.display import clear_output\n", - "import numpy as np\n", - "from pynq import Overlay\n", - "\n", - "path = \"bitstreams/resnet50/live_instrumentation\"\n", - "bitstream = path + \"/finn-accel.bit\"\n", - "\n", - "# Program FPGA\n", - "ol = Overlay(bitstream, download=True, device=None)" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "id": "f476fd87", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "#FIFO IP detected: 266\n", - "#FIFO width information found: 266\n" - ] - } - ], - "source": [ - "### Sanity checks\n", - "# We expect 3 AXI-Lite peripherals next to the virtual FIFOs: instrumentation_wrap_0, axi_gpio_0 (for reset), zynq_ps\n", - "# We don't expect any additional FINN SDPs with AXI-Lite interface, such as runtime-writable weights\n", - "print(\"#FIFO IP detected: %d\" % (len(ol.ip_dict.keys()) - 3))\n", - "\n", - "# We expect a fifo_widths.json file exported by FINN listing the width of each FIFO, e.g.,\n", - "# {'fifo_widths': {'StreamingFIFO_hls_0': 8, 'StreamingFIFO_hls_1': 32, 'StreamingFIFO_hls_2': 24}}\n", - "with open(path + \"/fifo_widths.json\", \"r\") as f:\n", - " fifo_info = json.load(f)\n", - "print(\"#FIFO width information found: %d\" % len(fifo_info[\"fifo_widths\"]))" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "id": "e419656f", - "metadata": {}, - "outputs": [], - "source": [ - "### Instrumentation driver\n", - "# Register map\n", - "#ap_uint<32> cfg, \t// [0] - 0:hold, 1:lfsr; [31:16] - LFSR seed\n", - "#ap_uint<32> &status,\t// [0] - timestamp overflow; [1] - timestamp underflow\n", - "#ap_uint<32> &latency,\n", - "#ap_uint<32> &interval,\n", - "#ap_uint<32> &checksum,\n", - "#ap_uint<32> &min_latency\n", - "\n", - "def read_register(ol, name):\n", - " return 
ol.instrumentation_wrap_0.read(offset=ol.ip_dict[\"instrumentation_wrap_0\"][\"registers\"][name][\"address_offset\"])\n", - "\n", - "def write_register(ol, name, value):\n", - " return ol.instrumentation_wrap_0.write(offset=ol.ip_dict[\"instrumentation_wrap_0\"][\"registers\"][name][\"address_offset\"], value=value)\n", - "\n", - "def observe_instrumentation(debug_print=True):\n", - " status_reg = read_register(ol, \"status\")\n", - " chksum_reg = read_register(ol, \"checksum\")\n", - " min_latency = read_register(ol, \"min_latency\")\n", - " latency = read_register(ol, \"latency\")\n", - " interval = read_register(ol, \"interval\")\n", - "\n", - " frame = (chksum_reg >> 24) & 0x000000ff\n", - " checksum = chksum_reg & 0x00ffffff\n", - " overflow_err = (status_reg & 0x00000001) != 0\n", - " underflow_err = (status_reg & 0x00000002) != 0\n", - "\n", - " if debug_print:\n", - " print(\"---INSTRUMENTATION_REPORT---\")\n", - " if overflow_err or underflow_err:\n", - " print(\"Status ERROR\")\n", - " print(\"Overflow error: %s\" % overflow_err)\n", - " print(\"Underflow error: %s\" % underflow_err)\n", - " else:\n", - " print(\"Status OK\")\n", - " print(\"Frame number (8-bit): %d\" % frame)\n", - " print(\"Checksum: 0x%06x\" % checksum)\n", - " print(\"Min Latency (cycles): %d\" % min_latency)\n", - " print(\"Latency (cycles): %d\" % latency)\n", - " print(\"Interval (cycles): %d\" % interval)\n", - " print(\"----------------------------\")\n", - "\n", - " return (overflow_err, underflow_err, frame, checksum, min_latency, latency, interval)\n", - "\n", - "def start_accelerator():\n", - " lfsr_seed = 0x00010000 # upper 16 bits\n", - " write_register(ol, \"cfg\", lfsr_seed + 1) # start operation\n", - "\n", - "### Virtual FIFO driver\n", - "# Register map\n", - "mode_offset = 0x10\n", - "depth_offset = 0x18\n", - "occupancy_offset = 0x20\n", - "occupancy_ctrl_offset = 0x24\n", - "max_occupancy_offset = 0x30\n", - "max_occupancy_ctrl_offset = 0x34\n", - "\n", - "def 
configure_fifo(ol, i, mode, depth = 2):\n", - " ip_name = \"StreamingDataflowPartition_%d\" % i\n", - " getattr(ol, ip_name).write(offset=mode_offset, value = mode)\n", - " getattr(ol, ip_name).write(offset=depth_offset, value = depth)\n", - "\n", - "def total_fifo_size(depths):\n", - " # Assuming FIFO SDP/AXI-Lite interfaces are ordered consistently with FIFO IDs\n", - " total_size_bits = 0\n", - " for i, depth in enumerate(depths):\n", - " total_size_bits += depth * fifo_info[\"fifo_widths\"][\"StreamingFIFO_hls_%d\" % i]\n", - " total_size_kB = total_size_bits / 8.0 / 1000.0\n", - " return total_size_kB\n", - "\n", - "### GPIO Reset Driver\n", - "def reset_accelerator():\n", - " ol.axi_gpio_0.write(offset=ol.ip_dict[\"axi_gpio_0\"][\"registers\"][\"GPIO_DATA\"][\"address_offset\"], value=0)" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "id": "2e2a4b88", - "metadata": {}, - "outputs": [], - "source": [ - "### Iterative FIFO-sizing function\n", - "def size_iteratively(start_depth, iteration_runtime, reduction_factor = 0.5):\n", - " num_fifos = len(fifo_info[\"fifo_widths\"])\n", - " fifo_minimum_reached = [False] * num_fifos\n", - " \n", - " if isinstance(start_depth, list):\n", - " # Individual start depth for each FIFO has been supplied\n", - " fifo_depths = start_depth\n", - " else:\n", - " # Initialize all depths to the same start depth\n", - " fifo_depths = [start_depth] * num_fifos\n", - " \n", - " # Reset accelerator and configure FIFOs\n", - " reset_accelerator()\n", - " for i in range(0, num_fifos):\n", - " configure_fifo(ol, i, mode = 1, depth = fifo_depths[i])\n", - "\n", - " # Run once to determine target interval\n", - " start_accelerator()\n", - " time.sleep(1)\n", - " (overflow_err, underflow_err, frame, checksum, min_latency, latency, interval) = observe_instrumentation(False)\n", - " log_total_fifo_size = [int(total_fifo_size(fifo_depths))]\n", - " log_interval = [interval]\n", - " log_min_latency = [min_latency]\n", - " 
log_latency = [latency]\n", - " target_interval = interval\n", - " \n", - " # Iteratively reduce FIFO depth until all FIFOs are minimized\n", - " iteration = 0\n", - " start_time = time.time()\n", - " while not all(fifo_minimum_reached):\n", - " for fifo_id in range(0, num_fifos):\n", - " if not fifo_minimum_reached[fifo_id]:\n", - " fifo_depth_before = fifo_depths[fifo_id]\n", - " fifo_depths[fifo_id] = int(fifo_depths[fifo_id] * reduction_factor)\n", - "\n", - " # Reset accelerator\n", - " reset_accelerator()\n", - "\n", - " # Configure all FIFOs\n", - " for i in range(0, num_fifos):\n", - " configure_fifo(ol, i, mode = 1, depth = fifo_depths[i])\n", - "\n", - " # Start accelerator\n", - " start_accelerator()\n", - "\n", - " # Let it run\n", - " time.sleep(iteration_runtime)\n", - "\n", - " # Check if throughput dropped or deadlock occured \n", - " (overflow_err, underflow_err, frame, checksum, min_latency, latency, interval) = observe_instrumentation(False)\n", - "\n", - " if interval > target_interval or interval == 0 or overflow_err or underflow_err:\n", - " # Revert depth reduction and mark FIFO as minimized\n", - " fifo_depths[fifo_id] = fifo_depth_before\n", - " fifo_minimum_reached[fifo_id] = True\n", - " else:\n", - " log_total_fifo_size.append(int(total_fifo_size(fifo_depths)))\n", - " log_interval.append(interval)\n", - " log_min_latency.append(min_latency)\n", - " log_latency.append(latency) \n", - "\n", - " if fifo_depths[fifo_id] == 1:\n", - " fifo_minimum_reached[fifo_id] = True\n", - "\n", - " # Report status\n", - " clear_output(wait=True)\n", - " print(\"Iteration: %d\" % iteration)\n", - " print(\"Reducing depth of FIFO: %d/%d\" % (fifo_id, num_fifos))\n", - " print(\"Numer of minimized FIFOs: %d/%d\" % (sum(fifo_minimum_reached), num_fifos))\n", - " print(\"Interval: %d\" % log_interval[-1])\n", - " print(\"Min. 
latency / latency: %d/%d\" % (log_min_latency[-1], log_latency[-1]))\n", - " print(\"Total FIFO Size (kB): %d\" % log_total_fifo_size[-1])\n", - "\n", - " iteration += 1\n", - "\n", - " end_time = time.time()\n", - " print(\"Done (%d seconds)\" % int(end_time - start_time))\n", - " \n", - " return fifo_depths, log_total_fifo_size, log_interval, log_min_latency, log_latency" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "id": "2ebb2aa3", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Testing start depth of 64\n", - "---INSTRUMENTATION_REPORT---\n", - "Status OK\n", - "Frame number (8-bit): 0\n", - "Checksum: 0x000000\n", - "Min Latency (cycles): 4294967295\n", - "Latency (cycles): 0\n", - "Interval (cycles): 0\n", - "----------------------------\n", - "Testing start depth of 128\n", - "---INSTRUMENTATION_REPORT---\n", - "Status OK\n", - "Frame number (8-bit): 0\n", - "Checksum: 0x000000\n", - "Min Latency (cycles): 4294967295\n", - "Latency (cycles): 0\n", - "Interval (cycles): 0\n", - "----------------------------\n", - "Testing start depth of 256\n", - "---INSTRUMENTATION_REPORT---\n", - "Status OK\n", - "Frame number (8-bit): 0\n", - "Checksum: 0x000000\n", - "Min Latency (cycles): 4294967295\n", - "Latency (cycles): 0\n", - "Interval (cycles): 0\n", - "----------------------------\n", - "Testing start depth of 512\n", - "---INSTRUMENTATION_REPORT---\n", - "Status OK\n", - "Frame number (8-bit): 0\n", - "Checksum: 0x000000\n", - "Min Latency (cycles): 4294967295\n", - "Latency (cycles): 0\n", - "Interval (cycles): 0\n", - "----------------------------\n", - "Testing start depth of 1024\n", - "---INSTRUMENTATION_REPORT---\n", - "Status OK\n", - "Frame number (8-bit): 0\n", - "Checksum: 0x000000\n", - "Min Latency (cycles): 4294967295\n", - "Latency (cycles): 0\n", - "Interval (cycles): 0\n", - "----------------------------\n", - "Testing start depth of 2048\n", - "---INSTRUMENTATION_REPORT---\n", 
- "Status OK\n", - "Frame number (8-bit): 0\n", - "Checksum: 0x000000\n", - "Min Latency (cycles): 4294967295\n", - "Latency (cycles): 0\n", - "Interval (cycles): 0\n", - "----------------------------\n", - "Testing start depth of 4096\n", - "---INSTRUMENTATION_REPORT---\n", - "Status OK\n", - "Frame number (8-bit): 0\n", - "Checksum: 0x000000\n", - "Min Latency (cycles): 4294967295\n", - "Latency (cycles): 0\n", - "Interval (cycles): 0\n", - "----------------------------\n", - "Testing start depth of 8192\n", - "---INSTRUMENTATION_REPORT---\n", - "Status OK\n", - "Frame number (8-bit): 108\n", - "Checksum: 0x000000\n", - "Min Latency (cycles): 2548522\n", - "Latency (cycles): 5030984\n", - "Interval (cycles): 903174\n", - "----------------------------\n", - "Testing start depth of 16384\n", - "---INSTRUMENTATION_REPORT---\n", - "Status OK\n", - "Frame number (8-bit): 108\n", - "Checksum: 0x000000\n", - "Min Latency (cycles): 2548522\n", - "Latency (cycles): 7496520\n", - "Interval (cycles): 903174\n", - "----------------------------\n", - "Determined start depth for all FIFOs: 8192\n", - "Determined iteration runtime based on performance: 0.127426 s\n" - ] - } - ], - "source": [ - "### Attempt to determine start depth for all FIFOs automatically\n", - "# If it doesn't find a working setting, start depth must be set manually, potentially on per-FIFO basis\n", - "start_depth = 64\n", - "last_interval = 0\n", - "start_depth_found = False\n", - "\n", - "while not start_depth_found:\n", - " print(\"Testing start depth of %d\" % start_depth)\n", - " reset_accelerator()\n", - "\n", - " # Configure FIFOs\n", - " num_fifos = len(fifo_info[\"fifo_widths\"])\n", - " for i in range(0, num_fifos):\n", - " configure_fifo(ol, i, mode = 1, depth = start_depth)\n", - " \n", - " # Start accelerator and let it run for a long time\n", - " start_accelerator()\n", - " time.sleep(1)\n", - " \n", - " # Examine performance\n", - " (overflow_err, underflow_err, frame, checksum, 
min_latency, latency, interval) = observe_instrumentation()\n", - " if interval > 0 and interval == last_interval and not overflow_err and not underflow_err:\n", - " # Accelerator runs with stable interval, reset to previous start depth\n", - " start_depth_found = True\n", - " start_depth = last_start_depth\n", - " else:\n", - " # Start depth is still too small, increase for next try\n", - " last_start_depth = start_depth\n", - " start_depth = start_depth * 2\n", - " \n", - " last_interval = interval\n", - " \n", - "# Determine runtime per iteration based on performance, so that stable-state is guaranteed\n", - "# Use a simple overestimation for now to be safe\n", - "iteration_runtime = max(0.01, (min_latency * 5) * 10 / 1000 / 1000 / 1000)\n", - "\n", - "print(\"Determined start depth for all FIFOs: %d\" % start_depth)\n", - "print(\"Determined iteration runtime based on performance: %f s\" % iteration_runtime)" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "id": "4ba40f96", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Iteration: 12\n", - "Reducing depth of FIFO: 265/266\n", - "Numer of minimized FIFOs: 266/266\n", - "Interval: 903174\n", - "Min. 
latency / latency: 2549314/2580777\n", - "Total FIFO Size (kB): 244\n", - "Done (389 seconds)\n" - ] - } - ], - "source": [ - "### First pass\n", - "(fifo_depths,\n", - " log_total_fifo_size,\n", - " log_interval,\n", - " log_min_latency,\n", - " log_latency) = size_iteratively(start_depth, iteration_runtime)" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "id": "ebf027a4", - "metadata": {}, - "outputs": [ - { - "data": { - "image/png": "iVBORw0KGgoAAAANSUhEUgAAAdgAAAE3CAYAAAAJy1DOAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjUuMSwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy/YYfK9AAAACXBIWXMAAAxOAAAMTgF/d4wjAABNoElEQVR4nO3dd5wU5f3A8c+ze527oyPlhKHpDjZEUEFRMRjLGjTRoCZijMZIJImKbWPys0XjGiOaWGLFCnZAdAELitgQVCAis1KXKkXKHe3a7vz+mNljOa7M7u3eXvm+X6993e48U76znnzveeYpyjRNhBBCCJFcrnQHIIQQQrREkmCFEEKIFJAEK4QQQqSAJFghhBAiBSTBCiGEECkgCVYIIYRIAUmwQgghRApkpDsAIYQQLYPmC/wHGAX0Ao4K+b1LHByTDTwAnAmUAwtDfu+lKQ20kUgNVgghRLK8AZwMrInjGD8QAQ4L+b1HADelIrB0UDKTkxBCiGTSfIEQcG60Bqv5Av2Bh4AuQBbwRMjvfUzzBdoAG4CikN+7O03hpow0EQshhEgZzRdwA5OBMSG/N6j5AnnAPM0XmAdUAtuAv2m+wEhgH3BHyO+dnb6Ik0eaiIUQQqTS4cARwCuaL7AI+BwoAAYAmUAfYGnI7x0M/NHer3OaYk0qqcEKIYRIJQX8GPJ7B1Yv0HyBTljPXycBhPzexZovsBorIc9pxBhTQmqwQgghUul7YK/mC1wW3aD5Av00X6BDyO/9EZiN1YMYzRfoBfS2j2n2pJOTEEKIpNB8gUeB84CuwI/A7pDf28/u5PQg0BNwA1uBX4f83g2aL9AHmAh0BMLAnSG/d2pabiDJJMEKIYQQKSBNxEIIIUQKtKpOTkop0+VK8G8K07ReiR4vhBAiLpFIBNM0VbrjSFSrSrAul4twOJzQsfv+9z9Coy+i0zV/oPOf/5zkyIQQQlSnlErsH+wmQqpjDmX17AlA5dYf0xyJEEKI5kASrEOuNm0ACO/cmd5AhBBCNAuSYB1SmZlk9evLrg8+YNfsFjGLlxBCiBRqVcN03G63megzWIDydetYecZPyR8xgkP/+1gSIxMtkWmaVS8hxMGUUtTV8VQpFTZNs9n2FWq2gadD1qGHkqVp7J0/n/DOnbjbtUt3SKIJikQibNmyhZ07d0pyFaIemZmZ9OzZk6ysrHSHknSSYOPU7sIL2PKvB9j+wgvSm1jUaM2aNbhcLjRNIzMzM93hCNFkmabJtm3bWLt2Lf369Ut3OEknCTZO7UaPZsu/HqBy69Z0hyKaoEgkQmlpKf379ycjQ/73EqI+HTt2ZPv27UQikTqbi5ujlnU3jcCVlwdAxcYf0hyJaIqiTcJKNdux8UI0quj/Ky3xcYokWIciEZPSijCmy01G926Ur1mT7pCEEEI0YZJgHZrw/jI8/zeLVT/uIW/gQCrWr2fntGnpDksIRzRNo0uXLlRUVFRt+/DDD1FKceONNwIwffp0brrppnrPtXHjRkaMGJGyWBMxfvx4XnnlFQAWLFjAsGHDyMv
L48ILL3R0/LRp05g/f36t5StXrmTQoEEce+yxPPvss0mJOV433HADL7/8cq3lp512Gu+8805c57zjjjsoLy9vaGiiFvKQyKEMt9WMURmJ0OWWW9i74Cu2PvgQ7c4/P72BCeFQz549mT59OhdccAEAEydOZPDgwVXlo0aNYtSoUfWep3v37nz00UcpizNeGzZsYObMmTzwwAMAdOvWjYceeoiFCxfy/vvvOzrHtGnTGDx4MMcff3yN5W+88QZDhw7l0UcfPaissrKyUZ6333LLLQwfPpyLLrooac8q77zzTm688cZm34NX8wVCQKn9Arg35Pe+WsN+VwI+rMrlbOCakN9bmaq4JME6lOm2fqErwyaZPQ4hb8hgSmbMZO/CheQde2yaoxNN2aiXR7Fyx8qUnLtv+75Mv2S6o32vuOIKJk6cyAUXXEBxcTHz5s3jkksuYd++fQA899xzvPPOO7zxxhvMmTOH6667jmHDhvHZZ59RWVnJ888/z+DBgwmFQgwePJgff7SmDVVK8Y9//IOpU6fy448/8uSTTzJ79mxmzZpFeXk5r732GkcccQRz5szhxhtv5KuvvgJgyZIlnHvuuYRCoapzjh07lkAgwL59+3jppZd48sknmTdvHjk5OUybNo3u3bsfdF8TJ07kwgsvrHqWV1RURFFREUuXLj1o33nz5jFu3DjC4TCVlZWMGzeOXr16MX36dD744AOefvpp/vjHP/K73/2u6pgXXniBBx98kEgkwmeffcbkyZO55pprOOmkk5g3bx4A7777Ll6vl23btrFv3z4GDhzIU089RV5eHs899xyTJ0+mQ4cOLFq0iO7du/Pwww9z8803s3z5cgYNGsTkyZNxuVzs2rWL8ePHs3jxYkpLSxk2bBgPP/wwmZmZdOnShd69ezN79mzOOOMMx78jEyZM4OWXX6ayspLMzEwefvhhTjjhBMaOHQvAsGHDcLlcvPfee+Tm5tZ6/dNOO40TTjiBzz//nI0bN3LGGWfw+OOPA1BcXMwNN9zAl19+icvl4rjjjuOxxx5D0zQWLFjAoYceCsBf/vIXIpEI9913n+P443BhyO9dUluh5gv0Bv4OHAtsAd4CrgSeSEUwIE3EjmW4rP95K8IRANpfeikAJTNmpi0mIeJxyimnsGrVKjZs2MDLL7/ML3/5S9xud637f/fdd1xxxRUsXryYP/3pT/z1r3+tdd/CwkLmz5/Pfffdx3nnncfJJ5/MwoUL+c1vfsM999zjKL5t27YxdOhQFi5cyJVXXsnIkSO55ppr+N///sfgwYN55JFHajxuzpw5DBs2zNE17r33Xm644QYWLVrEkiVLuPjiiznnnHMYNWoUPp+PRYsWHZBcAS677DLGjh3LZZddxqJFixgwYAAAixYtYtasWcyePRu3283kyZP56quvWLJkCYWFhTz22P7JaBYsWMC//vUvgsEgeXl5/OpXv2Ly5MksXbqUpUuX8sEHHwBWM/App5zC/PnzWbx4MZWVlQfc97Bhw5gd50xyY8aMYcGCBSxcuJD//Oc/XHnllQBVyfHzzz9n0aJFdOnSpd7rr1y5kjlz5rBkyRLeffddvvjiCwCuu+46cnNzWbx4MYsXL+a+++4jJyeHK6+8kieesPJXWVkZzz77LH/4wx/iij+JLgSmhvzezSG/1wQeBy5J5QWlButQRrQGG7F6uuUcfjgAkV270haTaB6c1jAbw5gxY3j++eeZNm0akyZNYtKkSbXue/jhh1c1IQ8dOpR//etfte570UUXATBo0CBcLhderxeA4447jilTpjiKLT8/v+q4QYMGUVRUxMCBA6vOU1tz7/r16+natauja4wYMYK7776bFStWcPrpp3PyySc7Oq4mY8aMqRrnbJomDz74IIFAgMrKSoqLiznllFOq9j3ppJMoKioC4Nhjj0XTNNq2bQvAMcccw6pVqwCrqXrevHlVzd379u07oPm2a9euzJ07N644Fy5cyD333MO2bdvIyMh
g6dKllJeX19gsXN/1L774YtxuN7m5uQwcOJCVK1cydOhQ3nnnHb7++uuqpuvOnTsDcM0113DCCSdw22238corr3DCCSegaVo84buUUutjPk8wTXNCLftO0nwBF/Al8JeQ31t9LGVPILZ3asjeljKSYB3KdB9Yg1V5eZCZSdmKFekMS4i4XH755QwaNIjDDjuM/v3717lvTk5O1Xu3201lZe2PqqL7ut1usrOzazwuIyPjgOUiS0tLDzhH9eOcXj8vL6+qmbs+1113HaNGjWL27NnceuutHHnkkQfUNOORn59f9X7y5Ml8/PHHzJ07l4KCAv7zn/8ckAir30tt92aaJtOmTaNPnz41XrO0tJTc3FzHMZaXl3PBBRcwZ84cjjvuOEpKSmjbtm2tCba+68fzOwHQo0cPhg8fzhtvvMGjjz7quDUjRsQ0zSIH+50S8nvXar5AJnA38DxwTg37xY4FSvlYOmkidijDtf8ZLFjPnbL79aNi3Toi1f6hEKKp6t69O/fee2+qnoHVqXfv3qxevZpt27YB8OKLLyblvEcffTTBYNDRvt9//z19+vThqquu4tZbb616hlpYWEhxcXHCMezYsYOOHTtSUFDArl27eO655xI6z6hRo/D7/VWJa8eOHayI+SPeMAyOOeYYx+crLS2loqKi6hnoww8/fEB5QUHBAfdd3/Xrivv+++8nErEqIFtjJuK59tprueWWWygpKWHkyJGOY49HyO9da/+sAB4Chtew21pAi/ncy96WMpJgHYrtRRxVeOaZhIuL2XTHnekKS4i4/fa3v2Xo0KGNft0ePXpw4403MnjwYEaMGEG7JM3lfeGFFzJz5v6+ECtXrqSoqIjx48czY8YMioqKqmqpDz/8MEcccQTHHnssf/vb36qaQseMGcPkyZMZOHAgTz/9dNwxXHbZZezevZsBAwbwi1/8guHDa/r3vX4PPfQQGRkZDBw4kKOPPpqRI0cSCoUAq3Y5e/ZszjvvvFqPv/zyy6s6eRUVFfHdd99x1113cfzxx3PKKacc0EoA1jPf008/nYEDB7Jly5Y6r1+XBx98kL1793LkkUcycOBAbr311qqyE088kXbt2jFu3LiUTMCi+QJtNF+gXcymS4CFNez6JvBzzRc4RPMFFDAWeCXpAcWQ1XQcmrpwPde/upgnxhzHmUdYz3vMSITQL0dTtmwZhy9aiKqjw4hoHcLhMMuWLeOwww6rswORSJ5IJMKQIUN46623qp5ztkSzZs1i0qRJSav5N5Z169Zx/PHHs2zZMgoKCg4qr+v/GSer6Wi+QB+s5OnGavZdBVwb8ntDmi/wNDA95PdOt/e9CrgFq3L5IfAHu9abEvIM1qHqTcQAyuUi74QTKP3uO3a88godfv3rdIUnRKvlcrl44oknCIVCLTrBFhcXp6VpvyFuu+02Jk6ciN/vrzG5JkPI712FNfSmprLfVfv8FPBUSgKpgSRYhzJraCIG6HDZGLZPnCidnYRIo9gJM1qqaE/t5uSuu+7irrvuSncYaSPPYB2K1mArwgc2qbvbtwcgvG17o8ckhBCi6Up5Ddbw6NnAA8CZQDmwUA8alxoevQvwAtAXKAPG6kHjU/uYPOAZYAgQAXx60Jhil7mAf2N1wTaBCXrQSKyffRyqOjmFD6zBqqwsXHl5lK9NaWc00Uy05JVBhEiFlrwCVWM0EfuxkuRhetAwDY/eLWb7PD1onGV49CHAG4ZH76sHjUrgRqBMDxr9DI/eG/jC8Ogf6UFjB3ApMAA4DGgLfGN49A/1oOGsn36ColMlVkQO/IdTKUWWphHeuTOVlxfNhMvlIicnhw0bNnDIIYfIgutC1CG64HpmZmaLWwsWUpxgDY/eBvgtUKQHDRNADxrRhVRHA73tbQsMj74ZOBmYA1wEXG6XrTY8+lzgPOA5u+xxPWiEge2GR38NuBi4I5X3kpNp9W7bU3bwwGpXQQFlq1dTsWkTmQ5nlBEtV69evdiyZQuhUEhqskL
UIzMzk549UzqhUtqkugbbF9gG/M3w6COBfViJcBHg0oNG7FRWIfZPW1XXlFY1ldXYw0EpNR4YH/M5kXsAoGtbawaTLSVlB5W1u+AX7P3ySzaMvwFtcu1Tz4nWweVy0bVrVw455BBM05QkK0QtlFItsuYaleoEmwn0AZbqQcNnePRjgA+AIzlwyio4eNqquqa0cjTdlT1nZdW8lW63O+F/6dzRZ2sHhQ1tR41i+wsvUr56daKnFy2QUqpFPlcSQjiT6j8d1mA9f50EoAeNxcBqQAcwPHrnmH1jp62qa0qrRp/uCsBeTIfaKiMZnToR3rEDsyJlY5aFEEI0IylNsHrQ+BFrUdszAQyP3gvruev3wOvAOHv7EKAr8Kl9aGxZb+BUYHpM2dWGR3cbHr0D1jPZgxbWTbqqBFtzhs3qa02OXTJrVspDEUII0fQ1RuP3WOBmw6N/i7XA7e/tjk63AMMMj74cq/PSGLsHMcD9QK7h0VcA7wLj9KARHWj6IlaCXgYsAO7Xg4aR6ptQRJuIa1ZoL4BcuXlzqkMRQgjRDKR8mI4eNFYBp9WwfTPw01qO2YNVM62pLIxdu21Mqp4mYldhIQAVm7c0UkRCCCGaspbbfSvJol1VaurkBJDZvTuuwkJKZs2ssVwIIUTrIgnWIVfVDD21lOfk0OaE4wn/uE2GZQghhJAE61S0iThSR+505ReAacqsTkIIISTBOqViGolrk9XLmgujLJjSWRuFEEI0A5JgnaqnkxNA7rGDAPjx0ccwy8sbISghhBBNlSRYh+rrRQyQd/wQ2l10EXu/+oo9X85vnMCEEEI0SZJgHXLVMVVilFKKwjOtkUfFU6c0SlxCCCGaJkmwDkWfwNbVyQkg74QTyOrbl10fzUl1SEIIIZowSbAOOWkiBlBuN9mH9cfctw+z8uCl7YQQQrQOkmAd2j9VYv1jXN1t2wJQsWFDSmMSQgjRdEmCdUjVP0qnSnb//gDsW7QoZfEIIYRo2iTBxsnJHE0Fp52Gq6CATX+/mwqZ/F8IIVolSbAORXsRRxxMg5jZowddbryRyO7d7Pns81SHJoQQogmSBOuQ005OUXlDBgOw/dlnUxSREEKIpkwSrENxPIIFILtPH9qcdBLloVCKIhJCCNGUSYJ1SFWtpuN8pRx3+/aYFRVEyspSFZYQQogmShKsQ/HWYAEyunQBoMwwkh6PEEKIpk0SrEPxDNOJyj36aABKZs5KfkBCCCGaNEmwDqk4ehFH5Z8+gpwBA9j+/POULl2aqtCEEEI0QZJg46CU817EAK6sLDr89nIASmbOTE1QQgghmiRJsHFQOJsqMVb+iBG4O3Zk5xtvpiYoIYQQTZIk2DgopeKqwQK48/PJG3Qs4eLiuHogCyGEaN4kwcZBUf9ydTVxtcmHSITw9u1Jj0kIIUTTJAk2Dm2yM9hdVhH3cVl9+gBQulSG6wghRGshCTYO3drmsKm4NO7j2px4AgA/3H4b4ZKSZIclhBCiCZIEG4eubXP4obg07mepuUcfTYcrrqBy4w+yhJ0QQrQSkmDj0LUwh7LKCDv3xt9MnHvUkQBE9uxJdlhCCCGaoIx0B9CctMm2vq59FWHax3msu2NHAHbN/pDCs89OcmRCCNG6ab7A7cAdwFEhv3dJtbLTgBnAspjNQ0N+775UxiQJNg5ZGVaFv7wyEvexeYMH4+7UibJly+rfWQghhGOaLzAIOBFYW8duS0N+7+BGCglohARrePQQUGq/AO7Vg8arhkfvArwA9AXKgLF60PjUPiYPeAYYAkQAnx40pthlLuDfwDlYMwNP0IPGY6m+D4Ast5VgyxJIsMrlIvOQQ6jcIUN1hBAiWTRfIBt4FPgV8FGawzlAYz2DvVAPGgPt16v2Nj8wTw8a/YHfApMMjx5N+DcCZXrQ6AecCTxmePRoq+ylwADgMOB44GbDo3sa4yYaUoMFyOjalcqNP1D6vdRihRDCAZdSan3Ma3wN+9wFvBTye1fXc67DNV/
gG80XWKD5AtekINaDpLOT02isvzrQg8YCYDNwsl12UUzZamAucF5M2eN60AjrQWM78BpwcWMEnB1NsOFwQscXnnUWAGXfB5MWkxBCtGAR0zSLYl4TYgs1X2AoVktnfa2Y3wBFIb93EPBzYKzmC4xOTcj7NVaCnWR49G8Nj/604dE7Gx69I+DSg8bWmH1CQE/7fU9gTQJlB1BKjY/966ehUxVGE2wiTcQAGZ07AVC5TZqJhRAiCU4FPMBqzRcIAUXAu5ovcEBP0pDfWxLye4vt9+uBl4HhqQ6uMTo5naIHjbWGR88E7gaeB8Zw8MqqqtpnM8Gy/TtZf+1U/cXjdrsblGEb2kSc2a2bdfyqVQ0JQwghBBDye/1YjxsBsJPsuTX0Iu4GbA75vRHNFygAzsXq55NSKa/B6kFjrf2zAngIGK4HjW0AhkfvHLNrL/b3AFsLaAmUpVSDE2zPnqjsbPZ+9RXh3TIeVgghUkXzBZ7WfIFR9scLgG81X2AxMA94H3g21TGktAZrePQ2QKYeNHbamy4BFtrvXwfGAXcYHn0I0BX4tFrZ5YZH743VDDA2puxqw6NPAdpiPZM9K5X3EZXldgNQHk4swSql6PSHsWx96N/seOlFOo0dW/9BQgghHAn5vVrM+9/FvH8EeKSx40l1DfYQ4CPDo//P8OjfYiXKy+yyW4BhhkdfDjwHjNGDRqVddj+Qa3j0FcC7wDi7QxPAi8D3WAOGFwD360GjUWbRz8m0vq7SisQSLED7S8cAsOvDjzAjiZ9HCCFE05bSGqweNFYBx9ZSthn4aS1le7BqpjWVhbFqt40uJ9Oqwe6rSKwXMYA7vw35I3/C7g9mU7Z8OTmHH56s8IQQQjQhMhdxHKIJtmRf/HMRx2pz4lAAwjuLGxyTEEKIpkkSbBzy7bmIF4QaNszGXZAPQPmaUENDEkII0URJgo1DUftcADq2yW7QebLtZuHy0Jp69hRCCNFcSYKNg7JH3JoHDeGNT1afPmR07syOSZMoW748CZEJIYRoaiTBxkFRlWEbxJWVRdc778QsK2PXh01qbmohhBBJUm8vYntlm/pE9KBRWv9uzZuqdc6o+OUeOxCVnc32l16k45VXoDJk5UAhhGhJnNRgdwO77J/VX9HtK1MVYFPUwAosABnt21Po9RLe+iPhYulNLIQQLY2TatNiPWjUOJY1yvDoC+sqFzXL6NgBgHBxMRkdO6Y5GiGEEMnkpAb7pyTt02I0dFWeqIzO1lTMZd9/n5TzCSGEaDrqTbB60Pg0Gfu0BCo5fZyq5Bx1FAAl776XtKQthBCiaXDSySkXuBzYgbW4+T+BM7HmA75WDxobUhlgU6JqXxkvIbnHHEP+iBHsmjWLfZddRt6gOlvihRBCNCNOmoifAs4Bfg+8B7QDbgZWA4+nLLImLFmVTeVy0e7CCwDY9e67yTmpEEKIJsFJgh2kB42fYSXZwcDv9aAxUw8aNwG9UxpdE5PsJmKANiedREbXrhS//XYSzyqEECLdnCTYMgB7nOtqPWjErrFWnpKomqjkNhBbXDk55AwYQHjXLnkOK4QQLYiTYTrZhkfXsfJL7HuAnJRF1oQlOxG6CwqgooLwtm1kdOqU1HMLIYRwTvMFtjjYbVPI7z26vp2cJNg8YEbM5xm17djSqWRO5RQju38/AEq/+478U09NyTWEEEI4shXrkWhtFDDdyYnqTbB60NCcxdR6JLsht83w4fDQv9l05130mTUTV1ZWkq8ghBDCoTtDfm+dS51pvsDdTk7keLJ/w6OfWcO2sU6Pbwmq6q9JzrA5hx9Oh8suo2LjRsqWLk3uyYUQQjgW8ntfS8Y+EN9qOvcbHv2o6AfDo48Brojj+GYvRS3EALQZeiIA2yY+m7qLCCGEcETzBe7SfIF2mi+gNF8goPkCP2q+wAXxnCOeBHsxMNnw6N0Nj/4L4Ebg7Hgu1lI0dD3YmuQPH05G926Ur5FF2IUQogk4L+T37gRGApXAScBf4zm
B4wSrB42lwJ+xJpv4O3CmHjS2xXOx5i5VnZyiMtq1p3KLkw5sQgghUiw6JPVU4PWQ3xv3pPFOpkr8Z7VNlcByYLzh0dGDxs3xXrS5S9Vw1YyuXSldupRwcTHutm1TcxEhhBBO7NF8AR9W6+1Jmi/gAuLqgeqkBrun2msqsCTmc6uTqgSbe7Q1rGrnlKmpuYAQQginLge6AjeH/N7NQB9gUjwnUK1p9iC3222Gw+EGnaP3XwKcOaArj485LklR7RfeuZPVoy+icssWDv/ma5QrnkfkQgjRsiilwqZpOpmvISU0X8ANHBrye0OJHF/vv+CGR6+3p7CTfVqSVHRyAnC3a0f+Kadglpay+6OPUnINIYQQ9dN8geHAGmCu/XmI5gu8GM85nPxlcKPh0b+g7ql4rwMmxnPh5kqRuiZigA6X/podL71ESWAGBT/5SeouJIQQoi7/xOrg9AZAyO9doPkCg+I5QSJTJdZkazwXbc5S3ZM4S9NQOTmEd+9K6XWEEELUKSPk967UfIHYbXEtcCNTJSYg1U+t3W3bUr5iZYqvIoQQog6lmi+Qj/1PvuYLHAGUxnMC6UUTp9TWXy3Z/fpRsWULlTt2NMLVhBBC1ODvwLtAd80XeA6YDfxfPCdotN5Zhke/HbgDOEoPGksMj94FeAHoi7Xm7Fg9aHxq75sHPAMMwRrs69ODxhS7zAX8G2u1AxOYoAeNxxrrPiC1z2ABCs85hz2ffca6q36P9uorKLc7tRcUQghxgJDf+57mCywHzsKqW90d8ntXxHOORkmwhkcfBJwIrI3Z7Afm6UHjLMOjDwHeMDx6Xz1oVGJNw1imB41+hkfvDXxhePSP9KCxA7gUGAAcBrQFvjE8+od60Ag2xr1Yj2BTm2HbXfAL9s7/kuK3plPxww9kFRWl9HpCCCEOFvJ7VwP/TfT4uBOs4dEz7CTodP9s4FHgV0Ds2JPRQG8APWgsMDz6ZuBkYA5wEdYgX/Sgsdrw6HOB84Dn7LLH9aARBrYbHv01rJk27oj3XhKhGqWRGDJ79gQgsnt3o1xPCCEEaL7AAuqoRYX83uOdnstxgjU8+hFYs1h0BA41PPpxwGg9aNxSz6F3AS/ZiTJ6ro6ASw8asb2PQ0BP+31PrPFHTssG13RhpdR4YHzM53pCdaYx5ubI6NQZgF0fzCbH40n9BYUQQoDVgpoU8XRyegT4I/Cj/fkbwFvXAYZHH4r1HLWmZ6TV01T17GcmWLZ/J9OcYJpmUfSVlATbOBVYCr3WV1u2Iq4mfyGEEA0Q8ns/Dvm9HwNfAnNjPn9ib3MsngRbEO2EBKAHDROoqOeYUwEPsNrw6CGgCKtX1vEAhkfvHLNvL/Y/o10LaAmUNYrGmFzSnd8GlZdHZJeMhxVCiDT4ECiM+VwAfBDPCeJ5BltpePRM7PxiePQi9i/nUyM9aPixOjNhHxMCzrV7Eb8OjAPusDs5dQWiCTxadrndyelUYGxM2dWGR5+C1cnpIqxeXo3CmsmpceZvzuzShb1ffSWr6wghRD00X6BqpErI711SQ/mVgA+rYjkbuCbk99bVnygv5PcWRz+E/N5izRdoE09M8TYRTwU6GR79Dqz5Ge+P52LV3AIMMzz6cqzOS2NiOk/dD+QaHn0FVo13nB40tttlLwLfA8uABcD9etAwGhBHXFI8kdMBCs4+C7OsjPJ16xvvokII0czYUxhWH6kSW94ba1zryUA/rArdlfWc1hWbUDVfoADIjCcuxzVYPWi8ZHj0VVi9efOA3+hB45N4LhY7K5QeNDYDP61lvz1YNdOaysJYtdu0aaz1hzI6dQIgLBNOCCFEjTRfoLaRKrEuBKbay86h+QKPAzcDT9Rx6knAe5ovEB2m8wfg+Xhic1yDNTz6cKxxq7foQeNmPWh8Yo9vbVUUqlF6EQNkdLYeUZevkmkThRCtkksptT7mNb6Gfe4CXrLHrNamrpEpNQr5vfcBTwKj7Nd/Q35vXK228TyD/Qh4z/DoF+p
BY6+97WmgVSXZxmwizj3iCAB2f/wx7X/9a1RG2pZFFEKIdIiYplnrTDuaLxAdqeJzcC5Ho09izt0u5Pc+T5y11ljxPIP9FqsT0lzDox9ib2vEdNN0NFYTcWaPHhSO+hl7Pv+CXR/MbqSrCiFEs1E1UkXzBULYI1U0X+DsavslMvpkueYLPKX5AkcnGlw8VSJTDxr/MDz6Wqwkez6Nl2uajMb+i6LDmDGUTH+bXe+9S+FZZzby1YUQoukK+b0HjFSxk+y5NfQifhP4VPMF7gK2YI1KeaWe0/fD6gj1puYLbAIeBt4M+b1hp/HFU4NVYHV2wnrYOwPoEcfxLUZjDdMByBkwgOz+/SmZOYtIaVwrJQkhRKul+QJPa77AKICQ37sKuB34DFiJlWSfqev4kN9bHPJ7J4T83v5YSfxfwFrNF/ir0+E68dRgH4m+0YPGh4ZH/xkx0xC2FqlecP2g67nd5A0ZQtny5UR278aVk9Oo1xdCiOYi5PdqMe9/V63sKeCpeM5nD825HLgG+M4+/ifALGB4fcfHM0znmWqflwBXxBFri5COh86uggIAKjZurBq6I4QQInXsoTznYTUvnx/ye7+3i6ZovoCjuRfqTbCGR39RDxpjDI9e4woDetBwvLJAS9GILcQAZPftA0DZ8uXkHp3w83YhhBDOrQA8sbM5xTjdyQmc1GAfsn8mbYWBZi0NVdi8445DZWWx5YEJ5J9+Ohnt2zd+EEII0bp8TMx0wJovUAgcFvJ7vwr5vT84OUG9CVYPGl/bPz+ObjM8ejs9aOyMO9wWwmzkztOZPXrQ+dpr2XL//ez9cr70JhZCiNR7AmuMbdRee9txTk9Qby9iw6NfZ3h03X7vMjz621gLnW+1l6NrVazJ/hv/unnHWy3x2597rvEvLoQQrY8rdkiOvTBAXLP9OBmm8zusbs0Av8QaG9QNq2fVffFcrCVo7F7EUblHHUnuscdStrqu2cCEEEIkSbnmC/SNftB8gX7Uv0TrAZxk40o9aJTb738CvGhP1B8wPPrd8VyspUhHDRbA3aEDkYULMU0zbYleCCFaiTuxJqcI2J/Ppv4VeA7gpAabYXj06L/mQ4HPY8riWrqnJVCq8Z/BRmV0tobolK9alZbrCyFEaxHyewPAKcA39uuUkN87K55zOKnBzgZeNjz6JqwFzj8FMDx6V6AsrohbgHTWG3OPOoqdr7zKrtkfkt23b/0HCCGESFjI710OLE/0eCc12BuA+fb7s2IWRe8PTEj0ws1ZupqIC848k8yiIrY9/TSRslb3t40QQqSc5gtMS8Y+4GyYTiU1JNJ4F1tvKdL57NOdn0+HMZey+V4/u97/gLbnetMWixBCtFBDNV/gn/Xsc4STE8Uz2b+wpasGC1A4ahRkZrJj0qT0BSGEEC3XY8Ceel6POzmRrOAdJ5eC+aHtLN+8C5crHbXZTHaOPJfIzGlUbt9ORocOaYhBCCFappDfe2eyzqUac+m1dHO73WY47Hgpvxqd+/AnLNlQkqSIEnfR97O5as939Jk2FVdeXrrDEUKIpFNKhU3TbLYVQccJ1vDoI4BB9sdv9KDxUcqiSpFkJNhNxaW8umAdlZFI/TungGnCIx+t4Az3Dsa/eQ+F55xDjwkPpCUWIYRIpRafYA2PXggEAA34GmukyiBgDXCOHjTSX51zKBkJtinod+sMTuvXgVteupXKjT/Q//PPpKlYCNHiNPcE66ST0z+BhUAfPWicrweN84C+9rZ/pTI4UbOcTDdlpqLtz0YBsO2JJ9IckRBCtCyaL3C15gs06PmbkwQ7ErhODxpVczDaUydejzV1omhkOZlu9pWH6TDmUlyFhWx//gV2zZ6d7rCEEKIlORVYrfkCD9rzEMfNSYKt0IPGQQ8c7fGx5TXsL1IsN8vFvoowGZ060XPiRAC2PPhgmqMSQoiWI+T3/go4BtgJfKT5AjM0X+CceM7hJMHuMjz60dU3Gh79GKzxQKKR5WS4Ka2wniXnHnk
EOUccQfmKlZQtT3hGLyGEENWE/N5N9rCdXwNHAi9pvkBQ8wUctd46eXh8F/tXzpkHmMAw4G/AHxILWzREXnYGq7furlpVp91Fo9l02+2sueIK+r37rgzbEUKIBtJ8gRzgV8A4oBS4CXgDa8H117A6/tbJyVSJ7xgevRL4K/unTPwauEoPGjMTilw0SI92OSxet5OKsElWhqL96NHsW7iI4qlT2f7CC3QaOzbdIQohRHMXAt4Hxob83gUx2+drvsD7Tk6Q8okmDI/+HtAViAC7gD/pQWOR4dG7AC9g9UguA8bqQSO6Uk8e8AwwxD7OpweNKXaZC/g3cA5WbXqCHjQecxJLSxmmc/2ri5i6cAPBv59FTqYbgModO1g+dBi5AweivfJymiMUQoiGS+cwHc0X6Bbye39oyDnqfQZrePTHYt6fl8A1RutB42g9aAwEHgAm2tv9wDw9aPQHfgtMMjx69Iu8ESjTg0Y/4EzgMcOjt7fLLgUGAIcBxwM3Gx7dk0BczVZ0vYHYv40y2rfHlZ9PeSjEvv/9Lz2BCSFEyzFW8wU6Rj9ovkAnzRe4PZ4TOOnkdGLM+7hODqAHjZ0xH9ti1UgBRgOP2vssADYDJ9tlF8WUrQbmAufFlD2uB42wHjS2Y7WFXxxvXM2Zy86w4WqtD11uuonwnj388H+3pSMsIYRoSc4L+b3boh9Cfu+PwPnxnMBJ1VvV8t4xw6O/AIywP55lePSOgEsPGltjdgsBPe33PbFminJaNrim6yqlxgPjYz4nEn6T47bvI1Itwba/aDS73nuPPZ99Run335Nz+OHpCE8IIVqCmhJGZjwncJJgsw2PrtsXi30PgB40ltZ3Aj1oXAZgePTfAPcDY7Cen8aqfjNmgmX7dzLNCcSsZet2u1vEygYuu93BrGE65EKvlz2ffUbxlKnk/MXXuIEJIUTLsUzzBcYDD2LlmeuBYDwncNJEnAfMwJqPODfmfQB4J56L6UHjefbXZDE8eueY4l7AWvv9Wg7sAu20rFVQtdRgAQrOGAmZmeycMqWxwxJCiJbkWuBcYB/WnA9nAX+K5wROhuloiUQGVQsF5OtBY6P9+efANmA78DrW+KI7DI8+BKun8af2odGyyw2P3htryqqxMWVXGx59CtYz3YuwbrzViC5DW/0ZLIC7oICC005j1/vvs+eLL2gzdGgjRyeEEM1fyO/dCJyu+QJt7M9xT6yU6u7PbYE3DY+ei9W5aStwrh40TMOj3wK8aHj05VhTLo6xp18Eqxl5ouHRV9jHjbM7NAG8iDV8Z1l0Xz1oGCm+jyaltmewUW1/fj673n+fdWP/QL8P3iejc+ca9xNCCFE7zRfoBvQGMjRfAICQ3zvX6fFOlqvbysHPS8Fqkzb1oNHFcbRp1lLGwd4x/Tue+zzEl7f+hEMKc2rc58ennmLrAxNoN3o03e66s5EjFEKIhkvzONi/Ys3etAqIJg4z5Pce7/QcTgKvsYeuSB9XPTVYgI6/+x0/PvIoJTNmcMjf/oorK6uxwhNCiJbgCqCfPTwnIU4S7B49aCR8AZF8Vc9gI7UnWKUUhed6KX5zChtvupmifz/UOMEJIUTLsKkhyRWcJdj3gEEAhkd/Rg8aVzbkgqLhXHaGrW+Wy663386ez79g17vvsmfel7Q58YRGiE4IIVqEdzVf4AFgEtZk/wCE/N56h6ZGORmmEzvO9FjnsYlUcdJEDODKyqLr3/4KQPHUqSmPSwghWpDfAr8A3iTBoalOarAtYnKGliTaRFxHC3GVNsOGAVD81lsUnnsu+cNPrucIIYQQIb+3d0PP4STB9jA8+j9reA+AHjRubmgQIj6FudZsXT/uLqN3pzZ17uvKzeXQp55i3VVXsfHmm+k35yNc2dmNEaYQQjRrmi9wHuAJ+b33ab5Ad6BjyO/91unxTpqIH8OaxWJPtffRl2hknq4FACxet9PR/vnDT6bQ6yW8Ywf7Fi1OYWRCCNEyaL7AHVgTHEX7HZnA4/G
cw8lMTjKIsokZ0L0QgNA253/f5I8YQUkgwIZrr6X31ClkduuWqvCEEKJRab7AQeuOh/zeRdX2OQ1rqt9lMZuHhvzefbWc9nzgOOArgJDf+4PmCxTEE1daBvCKhinMsZqId5dW1rNnzDHecyhbtoxtTz7Jjskv0+WG8fUfJIQQzcPokN+7E0DzBc7HWnd8UA37LQ35vU7ndigN+b3h6AxOiXDSRCyamJxMN1luF7viSLBKKTpc/hsAiqdPT1VoQgjR6KLJ1Ra77nhDrNF8gZMBU/MFXJov8DfA8fNXkBpss1WQkxFXggXI6NCB/JE/YfcHs9m3eDG5xxyTouiEECIpXEqp9TGfJ9jLkB5E8wUOWHe8lvMdrvkC32BNffhsyO99rI5r/xl4HjgS2At8AlwaV/Dx7Cyajra5mezYWx73ce0uvBCAHZMnJzskIYRItohpmkUxrxqTK0DI770s5PceCvwNa8GY6r4BikJ+7yDg58BYzRcYXcf5Nof83rOAdkCnkN97Rsjv3RxP8E4m+19AHWNh9aDheOLjdGspk/0DXPncAuYu38rSu84i0+3876TIvn18f+wgXHl5aK++Qnb//imMUgghEpfoZP+aL7APK5luq2OfvwDdQ35vjWu8ar7A/OoT+9e0rS5OAr/R6clE4+nWLoeKsMnOvRV0LnA+rtWVm0vX229j0513sfbK39F31kxceXkpjFQIIVJH8wUKgXx7/VY0XyB23fHY/boBm0N+b8TuDXwu8Ewdpz4gP2q+gBvIjyc2J8N0Po7nhKJxRKdLNBOYaKv9JZew95uFlLz9NjunTaPDr36V7PCEEKKxtAXe1HyBA9YdD/m9puYLPA1MD/m904ELgD9ovkAlVu57HXi2+sk0X+Am4GagreYLbIkpysOal9ixepuIowyP3gm4HTgGqFqEVJqI0+P2t5bw/BdrmH/rT+hSy5qwdSk1DFb//Bdk6zraKy/L7E5CiCYnHevBar5AW6A98F+siSaiSkJ+7454zhVP4BOBz4AzgRuAq4GF8VxMJI+qqsEmJkfXyR6gU7bUYNNtt9P9Pn/yghNCiGYq5PcWA8XA2Q09VzwJtqceNEYZHv3XetB42/Do7wIzGxqASIydX+tdsq4u2ssvs3z4KRS/9RYdf38V2X37Jic4IYRo5jRfoC/wENVabUN+bxen54hnmE50TEiZ4dE7AJVAURzHiyRSOFuyri6u7Gy6jLdmdNr55pSkxCWEEC3E08BLWFMv/gSYhpVwHYsnwX5vJ9aXgHnAl0gTcdpU1WAbeJ78U4aDUmyfOJGS999vcFxCCNFCtA35va8CEXsFnauBM+I5geMEqweNMXrQ2K4HjX9jLUR7JyDdT9PEzq847aRWm8zu3en5rNWRbtP/3UakrKyBkQkhRItQYf/cpfkCvYBsoFc8J3CcYA2PXjWllB40PtODxjvAI/FcTCSPy151vYH5FYA2J55A4aifEd65U5azE0IIy8eaL9ABK899BawA4prIPZ5OTifWsG1oPBcTybO/Bpuc8+Wfciol099mw4030Gf6dDLat0/OiYUQohkK+b03228na77AJ1jjbbfXcchB6k2whkf/JTAa0AyP/lpMUVtkwfX0qXoGm5wMW+g9h73z57PztdfYePMt9HzqyaScVwghmruQ37sOWKf5AmuBnk6Pc1KDXQYEgOPtn1ElwOx4ghTJE+1FnKwarFKKrrffxu5PP2HPJ59glpejsrKSc3IhhGgZVP277OdkqsTFwGLDowf0oLE14bBEUrmS1Is4lnK7yT/lFHa+8io/3HEn3e65u2pCCyGEEPH9kxvPM9gMw6O/w/719mYDV+tB44d4LiiSI5r3GjIOtiadr72WPV98QfGUKeQceYTMUyyEaFU0X2BAHcVxTdsYzzjYJ4HPgR7263N7m0iDZDcRR2W0b0/PZyYC8ON//5vckwshRNMXqONVGs+J4snGh+pB42cxn/2GR18Uz8VE8uxvuU1yhgW
yinqQO3Ag+xYtomTWuxSedWbSryGEEE1RyO/tnaxzxZNgXYZH76oHjU0AhkfvQj0PfA2PngO8AgwA9gKbgLF60AjZx78A9AXK7O2f2sflYa3TNwRr+SGfHjSm2GUu4N/AOVjZZYIeNB6jlama7D/5+RWATn8Yy/o/X8uG668nu+9bsjC7EELEqd4mYsOjv2y/vR9YaHj0Jw2P/gTwtb2tPk8Ch+tBYyDwDvublf3APD1o9MeaGWqS4dGjCf9GoEwPGv2wVu95zPDo0YGZl2Il7MOwejbfbHh0j4M4WpToXzaRFCXY/FNPpdvdfwfTZMerr9V/gBBCiAM4eQbrAdCDxotY8zD+D1gCnKkHjZfqOlAPGqV60JihB41oGpgH9LHfjwYetfdbAGwGTrbLLoopWw3MBc6LKXtcDxphPWhsB14DLnZwHy2KSvI42JoUjBwJQMnMmQ2eklEIIVobJ03EVf+y6kFjCVZyTdSfgbcNj94RcFUb9hNi/wDensCaOMoG13QxpdR4YHzM58Qjb2JS1ckplis3l/yRP2H3B7PZ++WXtDmxpsm8hBBC1MRJgj3K8OhbatiuAFMPGo7WxjM8+q1Af6wV4nM5uHdO9exnJli2fyfTnABMiH52u90tphpWNQ42xXfU/pe/ZPcHs9n52uuSYIUQIg5OmoiXYXU2qv4abP+sl+HRbwR+AZytB429etDYZm/vHLNbL2Ct/X4toCVQ1mqkahxsdXlDhkBmJiUzZlDy7nspvZYQQrQkTmqwZXrQWFP/bjUzPPp44BJgpB40dsYUvQ6MA+4wPPoQoCvwabWyyw2P3hs4FavmGy272vDoU7DmQ74IOCvR+JqrxmruduXl0WviM6wZcxmbbr+dghGnyRSKQgjhgJMabML/khsevQh4AGgHfGR49EWGR//SLr4FGGZ49OXAc8AYPWhU2mX3A7mGR18BvAuMszs0AbwIfI9Vs14A3K8HDSPRGJu7xuh7lDdkCIU/s5az27toUeovKIQQLYBqTb1D3W63GQ6H0x1GUvx3zkrumxXkrXEnccyh7VJ+vZKZM9lw/XgyunSh76yZuPLyUn5NIUTrppQKm6YZ1/SETUk8UyWKJkSlYLL/uhSefTZtL/gFlVu2sOmeexrpqkII0Xw1278MWrtou/0bX69LeUenKPOKa1n76f8w53xD//U7GdC9LW5Xyxn6JIQQySRNxM3Uu99t4uoXv05rDHeddwSXDdXSGoMQouVq7k3EzTbw1u7MI7ry+KXHsW773ka9bnjXLta/OY2Xugxm5aIgSIIVQogaSYJtxs46smtarrtNb8dLTy7hh7mfU6KZFJ59dlriEEKIpkw6OYm4tdd6ojDZlZnHhuvHs/7P16Y7JCGEaHIkwYq4uVyKdnlZbDliMK7CQna99x4bb7mFcElJukMTQogmQxKsSEhR+zx2VULv114lW9cpfms66676PWZ5ebpDE0KIJkESrEhIXpabyohJlqbR+43XyezZk32LF7P81NMoX78h3eEJIUTaSYIVCcl0u6iojACg3G56T5lCm2HDCO/YwborryS8e0+aIxRCiPSSBCsSkuFWVEQiVZ/d+W049KknAShfs4ZVo36GWVlZ2+FCCNHiSYIVCcl0u6gMHzhJiXK76f/pJ6isLCo3/sDme/1pik4IIdJPEqxISKZbURkxqT4TWEanTvSdOQOAHZMmsW3is5gtZPYsIYSIhyRYkZCC7EwAtu4uO6gss0cPDn3qKVz5+Wz55z/ZLIsDCCFaIUmwIiHHae0BmBPcWmN5/vCT6fPO26AUOya/zJrLfkOk7OBkLIQQLZUkWJGQY+01aNftqH0u5MyuXekzI4C7Uyf2zp/P+mvGyWQUQohWQ+YiFgk5pG0OAJuKS+vcL7t3b/q8PZ1VZ5/Dns8+Y8WI0zls/pcot7sxwhRCtAKaL/Ae0BWIALuAP4X83kU17Hcl4MOqXM4Grgn5vSkb7iA1WJGQguwM8rMzmLlkU73
7ZrRvT/9P5pIzYACRPXtYNuwkSr9f1ghRCiFaidEhv/fokN87EHgAmFh9B80X6A38HTgZ6IeVkK9MZVCSYEVClFKUVoSpjBkLW+f+mZn0evEF8k87jUhxMWt+/WsipXXXfoUQwomQ37sz5mNbrJpsdRcCU0N+7+aQ32sCjwOXpDIuSbAiYcP7d4prf1ebNhQ9+giZvXoS2b2bjTfdhFlRkaLohBAtgEsptT7mNb62HTVf4AXNF1gH3A38poZdegJrYj6H7G0pIwlWJMylFA4rsFWU24328suonBx2vf8B3x9/gnR8EkLUJmKaZlHMa0JtO4b83stCfu+hwN+A+2vZLXbgvkpmoDWRBCsS5nIpItUmmnAio0MH+n88h5yjj8bct481v76Ufd99l4IIhRCtTcjvfR4YofkCHasVrQW0mM+97G0pIwlWJMylIJxAggVwt21Lr+efI//00ylbvpzQBRdStmpVkiMUQrR0mi9QqPkC3WM+/xzYBmyvtuubwM81X+AQzRdQwFjglVTGJglWJMztUpgmB02X6JQrN5dDH3uUTtf8AYDV5/+c7ZMmJTNEIUTL1xaYpvkC32q+wGJgHHBuyO81NV/gac0XGAUQ8ntXAbcDnwErgS3AM6kMTCX6j2Nz5Ha7zbDMi5s04yZ/Q+B/P7DyH+fgdjXscUbx22+z8aabAejmv5e2552HUil/RCKEaMKUUmHTNJvtfA1SgxUJc9sJMJHnsNW1/dnPKHr8v+By8YPvL6y94ooGn1MIIdJJEqxIWLTSGo4kpxWk4LTT6PfB+7jy89n7xTzW/OZyKjZvTsq5hRCisUmCFQlz2Rk2mU8ZMrt3p9ekSWT378/eL78kdNHFVGyqf7YoIYRoaiTBioS57CbiRHsS1ybn8MPoPf0tsj0eKjdtYsVpI9j37bdJvYYQQqRayh8eGx79P8AorDFHR+lBY4m9vQvwAtAXKAPG6kHjU7ssD6t31xCsKa98etCYYpe5gH8D52ANGp6gB43HUn0f4mDJfAZbnVKK3q+9ykbfXyiZMYN1v7uKfp/MxZWVlfRrCSFEKjRGDfYNrMmV11Tb7gfm6UGjP/BbYJLh0aMJ/0agTA8a/YAzgccMj97eLrsUGAAcBhwP3Gx4dE+K70HUwGX/9phxzubklMrKoseEB8ju359wcTEbrrue8M6dqbmYEEIkWcprsHrQmAtgePTqRaOB3vY+CwyPvhkrEc8BLgIut8tWGx59LnAe8Jxd9rgeNMLAdsOjvwZcDNyR2jsR1UWH5pz177l0KcxJ3YVG3kBZn1WY+0pRf32dbF0HpcjNdPHPC46hZ8e81F1bCCESlJbxRYZH7wi49KCxNWZziP0TL9c1KXNNZYNruo49MfT4mM8NiFpU9/NjezB/9XZ2l1aytSTFK+N07Unltm2Y5eWwch1bswsBmLpwA9eO7J/aawshRALSOYC3+oO76tmvrkmZHU3YbE8MXTU5tNvtbj2zajSC43p14L3rT22060X27GHT3fdQPHUquws78MvTb+WzlT9KghVCNElp6UWsB41tAIZH7xyzOXbi5bomZW70CZtF0+Bq04bu9/6DtuefT37JdrLCFexZt57ydevSHZoQQhwkncN0XseaMxLDow/BWl3+0xrKegOnAtNjyq42PLrb8OgdsJ7JvtqIcYs063bvPyj672N0qdjFnm07WXnGT1n1s1Hs/uRTTJkKUwjRRKQ8wRoe/VHDo68HioAPDI++wi66BRhmePTlWJ2XxuhBo9Iuux/Itfd9FxinB43oyggvAt8Dy4AFwP160DBSfR+i6VBKUTBiBG21nhR37Ia7a1fKli9n3VVXsWrUeRS/9VbCCxAIIUSyyGT/otn627RveWneWmb8+WS0jcvZdM8/KLP/1soZMIBOfxxHwemnpzlKIUSiZLJ/IdLknCO7AfDMpyHyBg+mz9Qp9PtwNrnHHUfp0qWsv2YcG/9yK2WrVqc5UiFEayQJVjRbQ3p3AMD
4oaRqW2b37miTXqLnC8/jatuW4qlTWXXOOay/7nr2fv11ukIVQrRCkmBFs5XpdtG7UxvKwwdPJdXm+OPpP/djekx4gMxDD2XXrFms+fWlrLnsN5TMmiXPaIUQKScJVjRr2RkuKmtIsACu7GwKzzmHvu+9S89nJ5J34onsnT+fDdddz7LBQyiZMaORoxVCtCaSYEWzlul2URGuuzaqlKLN0KH0eu5Zek95k/yRPyGyZw8bxt/A+j9fy+65cxspWiFEayIJVjRrGW5FZcT5agM5AwZw6COPoL3yMrnHHMOu995j3e+vZvUvR7Pj9dcx4ziXEELURRKsaNYyXS4q66nB1iR34EC0V1+hT+AdCs46i9Jvv2XT/93GsqHD2P7884SLi1MQrRCiNZFxsKJZ+9VT85i/ejunHd4FAKWsyamtn8r6ab+nqsyavlrF7B/ZvZuKUIjyVatQmCgTcjyHkd2nL67cnAPOhf0+w6W4bGgv+nUpSM/NC9HCNfdxsJJgRbM24b3vefzjVZiYmKa1CoRpmvbP1F//mKK2vPXHk1N/ISFaIUmwzYgk2NbJNM2qZFs9AUcTM9U+h8vK2Pna62yfPJnKLVsxFbS/4grannceGYd0xQRO8n9IRTjC8nvOlqUQhUgBSbDNiCRYkYidU6byw//9H9i/O+1Gj6bDby7jtoV7ePWrdTx+6SDOsmeVEkIkjyTYZkQSrEhUePduiqdOY9tTT1G5ZQsAG0aez+/yT6Z9tosF44eR0bZtmqMUomWRBNuMSIIVDWWaJiXvvMOOyS+zb/Firjn1Ola37c606T7yCvPJ7t+fzEMPJfeoI2kzfDiZPXpI87EQCZIE24xIghXJVLljB3+ZNJ/X11cy0/yC3NAKylasILJnT9U+GV27kjfoWLIPO4wsTSOrVy8yu3fHLbVdIeolCbYZkQQrku3v7yzlmU9Xc/7A7uRmuQGo3LWbyu07qNy6lYptPxIu2YU1GAhMBSYKd2Eh7q5dcbdvj6ugAJWTCy57H7s3tPXerHqP3QkLoF+XAq4f2V9qx6JFa+4JttkGLkRTcNgh+QBMW7SxhtLO0L4ztK/l4FLghzD8sBPYGeeVN+E9qhuHd5UxuEI0VVKDFaKBSkoriERMazILsCa02P/2oIktAKgMU75yBfu+XULFmjWUr1pF+Zo1VG7YgIqEq/bHNMk95hjyTzqJ7N69ye6jsUi145IXFjGsb0ce/dUg2rfJasS7FaLxNPcarCRYIZqQSHk5FevWUb5mLeWrVrJnwQL2fPIpxMyRHEHx91OuZl6Hfly1L8hv22wno2NHMjp3IuOQrmT26EFG506427bFlZOTxrsRomEkwTYjkmBFcxTevYeKdWspW7GS8lCI8jVr2LB2E6P7XMwhe7dTtGtL7Qe7XKjMTFwZGaisLFR2Nioz03plZaEyMlCZmZCRgSszA+XOiD4urtFPB3TlVyf0TP5NClEDSbDNiCRY0VJUhiP84r+fs3LLbsDqDIUZgYhp95IyqxaVN83922pjxmZVZTdrx75QlJrWPocXunG5XCiXwuV2oVwulNtt/VQqZj5odcC80Pvngj54nuiqJvUDjgGXUrTLyyIn8+B1SWq7m9pv8+CC2vatdXstV619f+fnr+3ccW6mtn/Ta9oaf9zOzw1w7U/6c9ghifcTkATbjEiCFa2ZGYkQ3r6d8M6dhHfupGLzZiK7dhEuLrG2FRcT2VVCeMdOyteupfLHHw9oml7YuT+v9R/BpjYdwFoSAdNe/MAE+7MCl/1Tueyfan+Zsn/an62zq6re1Sax57NeFbKCYLM1+aoTGNa3U8LHS4JtRiTBCuGcGYkQLi62knJxCeGSYsLbthMpK8UsL8fct4/I3r1E9uwlsncvZnkZkX2l1vuyMiKlMe/37iVSWgoVFXHHsTszh7CqeWVNFf3nK1qLzs6ymrzdLpQrA+VyQYbbaibPykS53Ci329rf5YKMDFR2JsqdiXIpcLlRbqsmjtu
FUi5UZhYq0z6X/QcEygUul33p/duVy4XKyrb3jWkJsHu+7X974PbYZaCiZSp6TIYblZlt3W9sT7mYRgdX9H3ssC0V7XZ34LYDNsXs79rfjLB/9/0njjmH/SnaeS+mEx/VzpFz5JFkduhQ4387J5p7gm22gScikhfhiMeOSHcYQrQcufbLKdNV1ZRd1XQdidhN2hH7s9XcbUbsOqxZbG0jegzW9mhzuDVAuFpTeMx+sWVV22PLIvv3jSmi0n4f/98EwpY5vweuvHh+QVqWVpVghRBpprBrfwf3pWoyU2bEJtuqZZdin5CaHPjQsXrSrvb5oHMf/KHm3avFUPuJ6lfn7g7OleDxKqt1DyGTJmIhhBBNkjQRCyGEEGmi+QI5wCvAAGAvsAkYG/J7Q9X2Ow2YASyL2Tw05PfuS1VskmCFEEI0d08CM0N+r6n5An+0P/+0hv2WhvzewY0VlCRYIYQQzVbI7y3FqplGzQOuS080B5IEK4QQoqlyKaXWx3yeYJrmhHqO+TPwdi1lh2u+wDdAGHg25Pc+lowga9NsE6zh0fsDzwOdsJYiuVwPGkvTGpQQQohkipimWeR0Z80XuBXoD4ytofgboCjk9xZrvkARMEPzBX4M+b2vJSnWg9Q8ert5eAJ4Ug8ahwH/BJ5JczxCCCHSRPMFbgR+AZwd8nv3Vi8P+b0lIb+32H6/HngZGJ7KmJplgjU8ehdgEPCSvelNoLfh0bW0BSWEECItNF9gPHAJcEbI791Zyz7dNF/AZb8vAM4FFqYyrubaRHwosFEPGpUAetAwDY++FugJhKI7KaXGA+NjD1RKNWQgrAuQmVEt8l1Y5HvYT74Li3wP+zX0u3DXt4Pd3PsAsAr4SPMFAMpCfu8Jmi/wNDA95PdOBy4A/qD5ApVYue914NkGxFavZjnRhOHRjwNe0IPGETHbFgA36EFjbqquq5RaH8/zgJZMvguLfA/7yXdhke9hv9b+XTTLJmJgHVBkePQMAMOjK6xa7dq0RiWEEELYmmWC1YPGFqy280vtTRcAIT1ohNIWlBBCCBGjuT6DBbgaeM7w6LcCJcBvGuGa9Y2/ak3ku7DI97CffBcW+R72a9XfRbN8BiuEEEI0dc2yiVgIIYRo6iTBCiGEECkgCVYIIYRIAUmwDiml+iulPldKLVNKzVdKDUh3TKmilAoppYJKqUX26yJ7exel1Cyl1HKl1BKl1Mkxx+QppV5WSq2wv6NfpO8OEqOU+o9976ZS6siY7Qndt1LKpZR6WCm10i6/prHvKVF1fBdzlFKrYn43ro8pa3HfhVIqRyk1zb6fRfbvgWaXtarfi3q+i1b1e+GYaZrycvACPgQut99fCHyR7phSeK8h4Mgatk8E7rDfDwHWABn259uA5+z3vbEWPW6f7nuJ875PAYqq33+i9w1cBszGmo2mg31eT7rvs4HfxRzg3FqOaXHfBZADnMP+DqF/BN5rjb8X9XwXrer3wulLarAOKKVqnPs4+tdbKzIaeBTANM0FwGYg+lf7RTFlq4G5wHlpiDFhpmnONU1zfQ1Fid73RcDjpmmGTdPcDrwGXJy6O0ieOr6LurS478I0zVLTNGeYdjbAWmu0j/2+Vf1e1PNd1KXFfRdOSYJ15lBgo2malQD2L1h07uOWapJS6lul1NNKqc5KqY6AyzTNrTH7hNj/HfTE+gu+prJmq4H33SK/E+B++3fjVaVU7D+wreG7+DPwtvxeAAevu9qafy9qJAnWueoDhlVaomgcp5imeQxWrX0b1rq7UP93YNZR1pw15L5b2ncyxjRNHTga+AR4p1p5i/0ulFLRtUb/am9qtb8XNXwXrfb3oi6SYJ1ZBxQppTIAlFIteu5j0zTX2j8rgIeA4aZpbgNQSnWO2bUX+7+DtYBWS1mz1cD7bnHfiWma6+yfpmmajwB97NoctODvQilVtdaoaZp7W/PvRfXvAlrv70V9JME6YJpmjXMfm6YZSltQKaKUaqOUahez6RL2r5n
4OjDO3m8I0BX4tIay3sCpwPRGCLkxJHrfrwNXK6XcSqkOWM+bXm3EuJNKKZWhlDok5vMFwOZosqGFfhfKWvbyEuAM0zR3xhS1ut+Lmr6L1vp74Ui6e1k1lxdwOPAFsAz4Cjgi3TGl6D77YCXU/wHfAm8Bml12CPAesBz4Djg15rg2WP9jrLC/owvTfS8J3PujwHqgEqun44qG3DdWz8hHgZX264/pvseGfBf2vX5l/14sxur9eUxL/i6welKbdsyL7NeXrfH3orbvojX+Xjh9yVzEQgghRApIE7EQQgiRApJghRBCiBSQBCuEEEKkgCRYIYQQIgUkwQohhBApIAlWiEZir05zpFLqcqXUYSk4fzul1M3Vtj2tlBqe7GsJIeonCVaIxnc5EHeCtZf2quv/2XbAAQnWNM3fmab5SbzXEkI0nCRYIRrXacBg4D/2upnngDX9nLLWGf5GKTVDKXWovf0OpdSLSqkpWAP7uyml7ldKLbCP/1gp1d8+9+NAO3v7V/bxc5RS59rvD1FKTbUnZF+ilPp9NCi7dn27stY8Xq2U+ltjfSFCtFQZ6Q5AiFZmDtasN/8yTfMdAKXUr7BqtENN0wwrpcYAj7B/Sa8RwCDTmrITpdR9pmneZL+/GHgQOBcYC3xlmubAWq79HyBomubP7SUYv1ZKLTJNc75d3s40zWH2/LorlFLPmqa5Ial3L0QrIglWiPQ7H6tW+7W1jgRuIBxT/k40udp+qpT6E1CA1QpV6PA6I4FjwJpf264V/wSIJthJdtlWpdQqrMWxJcEKkSBJsEKknwLuNk1zYi3lu6t2VKonVk30eNM0VymljgY+jONa1edGjf1cGvM+jPz7IESDyDNYIRpfCdA25vN04Bp7NRGUUplKqWNrObYtUA5sspdN/GO18+ZFl1WswQfA7+1rdAZ+TnzJWQgRB0mwQjS+J4Hbop2cTNN8EXgJmKOUWozVmWlETQeapvkt1hJf32E9z10bU7Ydq5n322gnp2r+DBytlPof8BFwT8zzVyFEkslqOkIIIUQKSA1WCCGESAFJsEIIIUQKSIIVQgghUkASrBBCCJECkmCFEEKIFJAEK4QQQqSAJFghhBAiBSTBCiGEECnw/9u1/1xcs4TpAAAAAElFTkSuQmCC\n", - "text/plain": [ - "
" - ] - }, - "metadata": { - "needs_background": "light" - }, - "output_type": "display_data" - } - ], - "source": [ - "### Visualize results\n", - "mpl.rcParams['figure.dpi'] = 80\n", - "fig, ax1 = plt.subplots()\n", - "\n", - "color = 'tab:red'\n", - "ax1.set_xlabel('Iteration')\n", - "ax1.set_ylabel('Total FIFO Size [kB]', color=color)\n", - "ax1.plot(range(len(log_total_fifo_size)), log_total_fifo_size, color=color)\n", - "ax1.tick_params(axis='y', labelcolor=color)\n", - "ax1.set_ylim(0, max(log_total_fifo_size))\n", - " \n", - "ax2 = ax1.twinx() # instantiate a second axes that shares the same x-axis\n", - "\n", - "color = 'tab:blue'\n", - "ax2.set_ylabel('Latency [cycles]', color=color)\n", - "ax2.plot(range(len(log_total_fifo_size)), log_latency, color=color)\n", - "ax2.tick_params(axis='y', labelcolor=color)\n", - "#ax2.set_ylim(0, max(log_latency))\n", - "\n", - "ax2.axhline(log_min_latency[0], color=\"green\", label=\"Minimum (1st frame) Latency\")\n", - "ax2.legend()\n", - "\n", - "plt.tight_layout()\n", - "plt.savefig('fifo_iterative_graph.png', dpi = 300)\n", - "plt.show()" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "id": "466f818f", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Iteration: 11\n", - "Reducing depth of FIFO: 48/266\n", - "Numer of minimized FIFOs: 266/266\n", - "Interval: 903174\n", - "Min. 
latency / latency: 2549314/2580781\n", - "Total FIFO Size (kB): 226\n", - "Done (49 seconds)\n" - ] - } - ], - "source": [ - "### Optional second pass for fine-tuning\n", - "(fifo_depths,\n", - " log_total_fifo_size,\n", - " log_interval,\n", - " log_min_latency,\n", - " log_latency) = size_iteratively(fifo_depths, iteration_runtime, reduction_factor = 0.95)" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "id": "2c707459", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "FIFO DEPTH | SIZE\n", - "FIFO 000: 1 | 24\n", - "FIFO 001: 2 | 48\n", - "FIFO 002: 2 | 48\n", - "FIFO 003: 16 | 2048\n", - "FIFO 004: 8 | 64\n", - "FIFO 005: 2 | 16\n", - "FIFO 006: 8 | 64\n", - "FIFO 007: 32 | 256\n", - "FIFO 008: 32 | 128\n", - "FIFO 009: 32 | 128\n", - "FIFO 010: 2 | 8\n", - "FIFO 011: 128 | 8192\n", - "FIFO 012: 1 | 32\n", - "FIFO 013: 1 | 2\n", - "FIFO 014: 16 | 128\n", - "FIFO 015: 256 | 2048\n", - "FIFO 016: 2 | 16\n", - "FIFO 017: 2 | 16\n", - "FIFO 018: 355 | 45440\n", - "FIFO 019: 1 | 4\n", - "FIFO 020: 4 | 256\n", - "FIFO 021: 1 | 8\n", - "FIFO 022: 1 | 10\n", - "FIFO 023: 1 | 8\n", - "FIFO 024: 4096 | 32768\n", - "FIFO 025: 1 | 8\n", - "FIFO 026: 1 | 4\n", - "FIFO 027: 4096 | 32768\n", - "FIFO 028: 1 | 64\n", - "FIFO 029: 256 | 1024\n", - "FIFO 030: 256 | 2048\n", - "FIFO 031: 2 | 16\n", - "FIFO 032: 2 | 16\n", - "FIFO 033: 288 | 36864\n", - "FIFO 034: 1 | 4\n", - "FIFO 035: 1 | 64\n", - "FIFO 036: 1 | 8\n", - "FIFO 037: 1 | 10\n", - "FIFO 038: 4 | 32\n", - "FIFO 039: 4 | 32\n", - "FIFO 040: 4096 | 32768\n", - "FIFO 041: 4096 | 32768\n", - "FIFO 042: 8 | 32\n", - "FIFO 043: 16 | 1024\n", - "FIFO 044: 256 | 1024\n", - "FIFO 045: 256 | 2048\n", - "FIFO 046: 2 | 16\n", - "FIFO 047: 2 | 16\n", - "FIFO 048: 288 | 36864\n", - "FIFO 049: 1 | 4\n", - "FIFO 050: 1 | 128\n", - "FIFO 051: 1 | 8\n", - "FIFO 052: 1 | 10\n", - "FIFO 053: 1 | 8\n", - "FIFO 054: 1 | 4\n", - "FIFO 055: 1 | 4\n", - "FIFO 056: 1 | 4\n", - 
"FIFO 057: 1 | 8\n", - "FIFO 058: 28 | 3584\n", - "FIFO 059: 1 | 4\n", - "FIFO 060: 1 | 8\n", - "FIFO 061: 1 | 8\n", - "FIFO 062: 114 | 14592\n", - "FIFO 063: 1 | 8\n", - "FIFO 064: 2 | 16\n", - "FIFO 065: 1 | 8\n", - "FIFO 066: 243 | 31104\n", - "FIFO 067: 1 | 4\n", - "FIFO 068: 2 | 128\n", - "FIFO 069: 1 | 8\n", - "FIFO 070: 1 | 10\n", - "FIFO 071: 1 | 8\n", - "FIFO 072: 1 | 8\n", - "FIFO 073: 4096 | 32768\n", - "FIFO 074: 4096 | 32768\n", - "FIFO 075: 1 | 4\n", - "FIFO 076: 6 | 384\n", - "FIFO 077: 60 | 240\n", - "FIFO 078: 128 | 1024\n", - "FIFO 079: 2 | 16\n", - "FIFO 080: 2 | 16\n", - "FIFO 081: 394 | 50432\n", - "FIFO 082: 1 | 4\n", - "FIFO 083: 1 | 64\n", - "FIFO 084: 15 | 120\n", - "FIFO 085: 15 | 150\n", - "FIFO 086: 16 | 128\n", - "FIFO 087: 16 | 128\n", - "FIFO 088: 4096 | 32768\n", - "FIFO 089: 4096 | 32768\n", - "FIFO 090: 16 | 64\n", - "FIFO 091: 32 | 2048\n", - "FIFO 092: 64 | 256\n", - "FIFO 093: 128 | 1024\n", - "FIFO 094: 32 | 256\n", - "FIFO 095: 2 | 16\n", - "FIFO 096: 394 | 50432\n", - "FIFO 097: 1 | 4\n", - "FIFO 098: 1 | 64\n", - "FIFO 099: 15 | 120\n", - "FIFO 100: 15 | 150\n", - "FIFO 101: 16 | 128\n", - "FIFO 102: 16 | 128\n", - "FIFO 103: 4096 | 32768\n", - "FIFO 104: 4096 | 32768\n", - "FIFO 105: 16 | 64\n", - "FIFO 106: 32 | 2048\n", - "FIFO 107: 64 | 256\n", - "FIFO 108: 128 | 1024\n", - "FIFO 109: 32 | 256\n", - "FIFO 110: 2 | 16\n", - "FIFO 111: 394 | 50432\n", - "FIFO 112: 1 | 4\n", - "FIFO 113: 1 | 64\n", - "FIFO 114: 1 | 8\n", - "FIFO 115: 8 | 80\n", - "FIFO 116: 8 | 64\n", - "FIFO 117: 8 | 32\n", - "FIFO 118: 1 | 4\n", - "FIFO 119: 8 | 32\n", - "FIFO 120: 1 | 8\n", - "FIFO 121: 16 | 2048\n", - "FIFO 122: 8 | 32\n", - "FIFO 123: 1 | 8\n", - "FIFO 124: 8 | 64\n", - "FIFO 125: 121 | 15488\n", - "FIFO 126: 1 | 8\n", - "FIFO 127: 2 | 16\n", - "FIFO 128: 1 | 8\n", - "FIFO 129: 243 | 31104\n", - "FIFO 130: 2 | 8\n", - "FIFO 131: 8 | 512\n", - "FIFO 132: 1 | 8\n", - "FIFO 133: 8 | 80\n", - "FIFO 134: 8 | 64\n", - "FIFO 135: 8 | 64\n", - 
"FIFO 136: 1024 | 8192\n", - "FIFO 137: 8192 | 65536\n", - "FIFO 138: 8 | 32\n", - "FIFO 139: 16 | 1024\n", - "FIFO 140: 4 | 16\n", - "FIFO 141: 8 | 64\n", - "FIFO 142: 2 | 16\n", - "FIFO 143: 2 | 16\n", - "FIFO 144: 512 | 65536\n", - "FIFO 145: 1 | 4\n", - "FIFO 146: 1 | 64\n", - "FIFO 147: 30 | 240\n", - "FIFO 148: 32 | 320\n", - "FIFO 149: 32 | 256\n", - "FIFO 150: 32 | 256\n", - "FIFO 151: 1024 | 8192\n", - "FIFO 152: 8192 | 65536\n", - "FIFO 153: 32 | 128\n", - "FIFO 154: 32 | 2048\n", - "FIFO 155: 32 | 128\n", - "FIFO 156: 32 | 256\n", - "FIFO 157: 2 | 16\n", - "FIFO 158: 2 | 16\n", - "FIFO 159: 512 | 65536\n", - "FIFO 160: 1 | 4\n", - "FIFO 161: 1 | 64\n", - "FIFO 162: 30 | 240\n", - "FIFO 163: 32 | 320\n", - "FIFO 164: 32 | 256\n", - "FIFO 165: 32 | 256\n", - "FIFO 166: 1024 | 8192\n", - "FIFO 167: 8192 | 65536\n", - "FIFO 168: 32 | 128\n", - "FIFO 169: 32 | 2048\n", - "FIFO 170: 32 | 128\n", - "FIFO 171: 32 | 256\n", - "FIFO 172: 2 | 16\n", - "FIFO 173: 2 | 16\n", - "FIFO 174: 512 | 65536\n", - "FIFO 175: 1 | 4\n", - "FIFO 176: 1 | 64\n", - "FIFO 177: 30 | 240\n", - "FIFO 178: 32 | 320\n", - "FIFO 179: 32 | 256\n", - "FIFO 180: 32 | 256\n", - "FIFO 181: 1024 | 8192\n", - "FIFO 182: 8192 | 65536\n", - "FIFO 183: 32 | 128\n", - "FIFO 184: 32 | 2048\n", - "FIFO 185: 32 | 128\n", - "FIFO 186: 32 | 256\n", - "FIFO 187: 2 | 16\n", - "FIFO 188: 2 | 16\n", - "FIFO 189: 512 | 65536\n", - "FIFO 190: 1 | 4\n", - "FIFO 191: 1 | 64\n", - "FIFO 192: 30 | 240\n", - "FIFO 193: 32 | 320\n", - "FIFO 194: 32 | 256\n", - "FIFO 195: 1024 | 8192\n", - "FIFO 196: 32 | 256\n", - "FIFO 197: 32 | 128\n", - "FIFO 198: 8192 | 65536\n", - "FIFO 199: 32 | 2048\n", - "FIFO 200: 32 | 128\n", - "FIFO 201: 32 | 256\n", - "FIFO 202: 2 | 16\n", - "FIFO 203: 2 | 16\n", - "FIFO 204: 512 | 65536\n", - "FIFO 205: 1 | 4\n", - "FIFO 206: 1 | 64\n", - "FIFO 207: 1 | 8\n", - "FIFO 208: 1 | 10\n", - "FIFO 209: 1 | 8\n", - "FIFO 210: 1 | 10\n", - "FIFO 211: 1 | 4\n", - "FIFO 212: 1 | 4\n", - "FIFO 
213: 1 | 4\n", - "FIFO 214: 1 | 8\n", - "FIFO 215: 8 | 1024\n", - "FIFO 216: 1 | 4\n", - "FIFO 217: 1 | 8\n", - "FIFO 218: 2 | 16\n", - "FIFO 219: 121 | 15488\n", - "FIFO 220: 1 | 8\n", - "FIFO 221: 2 | 16\n", - "FIFO 222: 1 | 8\n", - "FIFO 223: 218 | 27904\n", - "FIFO 224: 4 | 16\n", - "FIFO 225: 8 | 512\n", - "FIFO 226: 3 | 24\n", - "FIFO 227: 4 | 40\n", - "FIFO 228: 8 | 64\n", - "FIFO 229: 8 | 64\n", - "FIFO 230: 3696 | 29568\n", - "FIFO 231: 7782 | 62256\n", - "FIFO 232: 8 | 32\n", - "FIFO 233: 64 | 4096\n", - "FIFO 234: 16 | 64\n", - "FIFO 235: 16 | 128\n", - "FIFO 236: 2 | 16\n", - "FIFO 237: 2 | 16\n", - "FIFO 238: 512 | 65536\n", - "FIFO 239: 4 | 16\n", - "FIFO 240: 8 | 512\n", - "FIFO 241: 3 | 24\n", - "FIFO 242: 4 | 40\n", - "FIFO 243: 8 | 64\n", - "FIFO 244: 8 | 64\n", - "FIFO 245: 3696 | 29568\n", - "FIFO 246: 7782 | 62256\n", - "FIFO 247: 8 | 32\n", - "FIFO 248: 64 | 4096\n", - "FIFO 249: 16 | 64\n", - "FIFO 250: 16 | 128\n", - "FIFO 251: 2 | 16\n", - "FIFO 252: 2 | 16\n", - "FIFO 253: 512 | 65536\n", - "FIFO 254: 4 | 16\n", - "FIFO 255: 8 | 512\n", - "FIFO 256: 2 | 16\n", - "FIFO 257: 2 | 20\n", - "FIFO 258: 2 | 16\n", - "FIFO 259: 2 | 20\n", - "FIFO 260: 4 | 80\n", - "FIFO 261: 2 | 40\n", - "FIFO 262: 1 | 16\n", - "FIFO 263: 1 | 20\n", - "FIFO 264: 1 | 21\n", - "FIFO 265: 1 | 16\n" - ] - } - ], - "source": [ - "### Display resulting FIFO depths\n", - "print(\"FIFO DEPTH | SIZE\")\n", - "for fifo, depth in enumerate(fifo_depths):\n", - " size = depth * fifo_info[\"fifo_widths\"][\"StreamingFIFO_hls_%d\" % fifo]\n", - " print(\"FIFO %03d: \"%(fifo) + (\"%d\"%(depth)).rjust(7) + \" | %d\"%(size))" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "id": "64c444f5", - "metadata": {}, - "outputs": [], - "source": [ - "### Export for use in FINN\n", - "fifo_depth_export = {}\n", - "for fifo, depth in enumerate(fifo_depths):\n", - " fifo_depth_export[\"StreamingFIFO_rtl_%d\" % fifo] = {}\n", - " # Try to account for additional registers 
introduced by virtual FIFO HLS implementation\n", - " fifo_depth_export[\"StreamingFIFO_rtl_%d\" % fifo][\"depth\"] = depth + 4\n", - "\n", - "with open(\"fifo_depth_export.json\", \"w\") as f:\n", - " json.dump(fifo_depth_export, f, indent=2)" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3 (ipykernel)", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.10.4" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} diff --git a/src/finn/benchmarking/bench_rtl_swg.py b/src/finn/benchmarking/bench_rtl_swg.py deleted file mode 100644 index 37995be10e..0000000000 --- a/src/finn/benchmarking/bench_rtl_swg.py +++ /dev/null @@ -1,403 +0,0 @@ -import numpy as np -from onnx import TensorProto, helper -from qonnx.core.datatype import DataType -from qonnx.core.modelwrapper import ModelWrapper -from qonnx.custom_op.general.im2col import compute_conv_output_dim -from qonnx.custom_op.registry import getCustomOp -from qonnx.transformation.general import GiveUniqueNodeNames -from qonnx.util.basic import gen_finn_dt_tensor - -import finn.core.onnx_exec as oxe -from finn.analysis.fpgadataflow.exp_cycles_per_layer import exp_cycles_per_layer -from finn.analysis.fpgadataflow.hls_synth_res_estimation import hls_synth_res_estimation -from finn.analysis.fpgadataflow.res_estimation import res_estimation -from finn.transformation.fpgadataflow.create_stitched_ip import CreateStitchedIP -from finn.transformation.fpgadataflow.hlssynth_ip import HLSSynthIP -from finn.transformation.fpgadataflow.prepare_ip import PrepareIP -from finn.transformation.fpgadataflow.prepare_rtlsim import PrepareRTLSim -from finn.transformation.fpgadataflow.replace_verilog_relpaths import ( - ReplaceVerilogRelPaths, -) -from 
finn.transformation.fpgadataflow.set_exec_mode import SetExecMode -from finn.transformation.fpgadataflow.synth_ooc import SynthOutOfContext - - -def make_single_im2col_modelwrapper(k, ifm_ch, ifm_dim, ofm_dim, stride, dilation, idt): - k_h, k_w = k - ifm_dim_h, ifm_dim_w = ifm_dim - stride_h, stride_w = stride - dilation_h, dilation_w = dilation - ofm_dim_h, ofm_dim_w = ofm_dim - - odt = idt - inp = helper.make_tensor_value_info("inp", TensorProto.FLOAT, [1, ifm_dim_h, ifm_dim_w, ifm_ch]) - outp = helper.make_tensor_value_info( - "outp", TensorProto.FLOAT, [1, ofm_dim_h, ofm_dim_w, k_h * k_w * ifm_ch] - ) - - im2col_node = helper.make_node( - "Im2Col", - ["inp"], - ["outp"], - domain="finn.custom_op.general", - stride=[stride_h, stride_w], - kernel_size=[k_h, k_w], - input_shape=str((1, ifm_dim_h, ifm_dim_w, ifm_ch)), - dilations=[dilation_h, dilation_w], - pad_amount=[0, 0, 0, 0], - pad_value=0, - ) - graph = helper.make_graph( - nodes=[im2col_node], name="im2col_graph", inputs=[inp], outputs=[outp] - ) - - model = helper.make_model(graph, producer_name="im2col-model") - model = ModelWrapper(model) - - model.set_tensor_datatype("inp", idt) - model.set_tensor_datatype("outp", odt) - - return model - - -def make_single_slidingwindow_modelwrapper( - type, - k, - ifm_ch, - ifm_dim, - ofm_dim, - simd, - m, - parallel_window, - stride, - dilation, - idt, - dw=0, - ram_style="auto", -): - k_h, k_w = k - ifm_dim_h, ifm_dim_w = ifm_dim - stride_h, stride_w = stride - dilation_h, dilation_w = dilation - ofm_dim_h, ofm_dim_w = ofm_dim - - odt = idt - inp = helper.make_tensor_value_info("inp", TensorProto.FLOAT, [1, ifm_dim_h, ifm_dim_w, ifm_ch]) - outp = helper.make_tensor_value_info( - "outp", TensorProto.FLOAT, [1, ofm_dim_h, ofm_dim_w, k_h * k_w * ifm_ch] - ) - - SlidingWindow_node = helper.make_node( - type, - ["inp"], - ["outp"], - domain="finn.custom_op.fpgadataflow", - backend="fpgadataflow", - ConvKernelDim=[k_h, k_w], - IFMChannels=ifm_ch, - IFMDim=[ifm_dim_h, 
ifm_dim_w], - OFMDim=[ofm_dim_h, ofm_dim_w], - SIMD=simd, - M=m, - parallel_window=parallel_window, - Stride=[stride_h, stride_w], - Dilation=[dilation_h, dilation_w], - inputDataType=idt.name, - outputDataType=odt.name, - depthwise=dw, - ram_style=ram_style, - ) - graph = helper.make_graph( - nodes=[SlidingWindow_node], - name="slidingwindow_graph", - inputs=[inp], - outputs=[outp], - ) - - model = helper.make_model(graph, producer_name="slidingwindow-model") - model = ModelWrapper(model) - - model.set_tensor_datatype("inp", idt) - model.set_tensor_datatype("outp", odt) - - # DEBUG - # swg_node = model.get_nodes_by_op_type("ConvolutionInputGenerator_rtl")[0] - # swg_inst = getCustomOp(swg_node) - # swg_inst.set_nodeattr("rtlsim_trace", "/workspace/finn/finn-rtllib/swg/swg_test_trace.vcd") - - return model - - -def prepare_inputs(input_tensor): - return {"inp": input_tensor} - - -def bench_rtl_swg(params, task_id, run_id, results_dir): - # Read params - idt = params["idt"] - k = params["k"] - ifm_dim = params["ifm_dim"] - ifm_ch = params["ifm_ch"] - stride = params["stride"] - dilation = params["dilation"] - dw = params["dw"] - simd = params["simd"] - m = params["m"] - parallel_window = params["parallel_window"] - flip = params["flip"] - ram_style = params["ram_style"] - - only_estimates = params["only_estimates"] - skip_rtlsim = params["skip_rtlsim"] - skip_synth = params["skip_synth"] - synthesize_hls_comparison = params["synthesize_hls_comparison"] - - output_dict = {} - - # convert string to FINN DataType - idt = DataType[idt] - - if flip: - if ( - ifm_dim[0] == ifm_dim[1] - and k[0] == k[1] - and stride[0] == stride[1] - and dilation[0] == dilation[1] - ): - return - k = k[::-1] - ifm_dim = ifm_dim[::-1] - stride = stride[::-1] - dilation = dilation[::-1] - - k_h, k_w = k - ifm_dim_h, ifm_dim_w = ifm_dim - stride_h, stride_w = stride - dilation_h, dilation_w = dilation - - kernel_width = (k_w - 1) * dilation_w + 1 # incl. 
dilation - kernel_height = (k_h - 1) * dilation_h + 1 # incl. dilation - - # inter-dependent test parameters - if simd == "ifm_ch": - simd = ifm_ch - - # skip conditions - if simd > ifm_ch: - return - if ifm_ch % simd != 0: - return - if kernel_width > ifm_dim_w or stride_w > ifm_dim_w: - return - if kernel_height > ifm_dim_h or stride_h > ifm_dim_h: - return - if (k_h == 1 and (stride_h != 1 or dilation_h != 1)) or ( - k_w == 1 and (stride_w != 1 or dilation_w != 1) - ): - return - if k_h == 1 and k_w == 1 and simd != ifm_ch: - return - if parallel_window and simd != ifm_ch: - return - if not parallel_window and m > 1: - return - - ofm_dim_h = compute_conv_output_dim(ifm_dim_h, k_h, stride_h, 0, dilation_h) - ofm_dim_w = compute_conv_output_dim(ifm_dim_w, k_w, stride_w, 0, dilation_w) - ofm_dim = [ofm_dim_h, ofm_dim_w] - - x = gen_finn_dt_tensor(idt, (1, ifm_dim_h, ifm_dim_w, ifm_ch)) - model = make_single_slidingwindow_modelwrapper( - type="ConvolutionInputGenerator_rtl", - k=k, - ifm_ch=ifm_ch, - ifm_dim=ifm_dim, - ofm_dim=ofm_dim, - simd=simd, - m=m, - parallel_window=parallel_window, - stride=stride, - dilation=dilation, - idt=idt, - dw=dw, - ram_style=ram_style, - ) - - model = model.transform(SetExecMode("rtlsim")) - model = model.transform(GiveUniqueNodeNames()) - if not only_estimates: - model = model.transform(PrepareIP("xczu7ev-ffvc1156-2-e", 5)) - model = model.transform(PrepareRTLSim()) - - node = model.get_nodes_by_op_type("ConvolutionInputGenerator_rtl")[0] - inst = getCustomOp(node) - - exp_cycles_dict = model.analysis(exp_cycles_per_layer) - exp_cycles = exp_cycles_dict[node.name] - exp_res_dict = model.analysis(res_estimation) - exp_res = exp_res_dict[node.name] - - output_dict["est_Cycles"] = exp_cycles - output_dict["est_LUT"] = exp_res["LUT"] - output_dict["est_BRAM"] = exp_res["BRAM_18K"] * 0.5 - output_dict["est_URAM"] = exp_res["URAM"] - - if only_estimates: - return output_dict - - if not skip_rtlsim: - # prepare input data - input_dict = 
prepare_inputs(x) - # execute model - oxe.execute_onnx(model, input_dict)["outp"] - - cycles_rtlsim = inst.get_nodeattr("cycles_rtlsim") - output_dict["Cycles"] = cycles_rtlsim - print("RTLSIM cycles: %d" % cycles_rtlsim) - - if not skip_synth: - model = model.transform(ReplaceVerilogRelPaths()) - model = model.transform(CreateStitchedIP("xczu7ev-ffvc1156-2-e", 5)) - model = model.transform(SynthOutOfContext(part="xczu7ev-ffvc1156-2-e", clk_period_ns=5)) - ooc_res_dict = eval(model.get_metadata_prop("res_total_ooc_synth")) - output_dict["LUT"] = ooc_res_dict["LUT"] - output_dict["BRAM"] = ooc_res_dict["BRAM_18K"] * 0.5 + ooc_res_dict["BRAM_36K"] - output_dict["URAM"] = ooc_res_dict["URAM"] - output_dict["WNS"] = ooc_res_dict["WNS"] - output_dict["Fmax"] = ooc_res_dict["fmax_mhz"] - - ############################################################### - # HLS COMPARISON: - if synthesize_hls_comparison: - output_dict["HLS_compatible"] = "yes" - - is_square = True - props_to_check = [k, ifm_dim, ofm_dim, stride, dilation] - for prop in props_to_check: - is_square = prop[0] == prop[1] - if not is_square: - is_square = False - - if not is_square or dilation[0] != 1 or dilation[1] != 1: - # try 1D HLS ConvInpGen - - # rectangular case not supported - if ifm_dim[0] == 1: - if ofm_dim[0] != 1 or k[0] != 1 or stride[0] != 1 or dilation[0] != 1: - output_dict["HLS_compatible"] = "no" - elif ifm_dim[1] == 1: - if ofm_dim[1] != 1 or k[1] != 1 or stride[1] != 1 or dilation[1] != 1: - output_dict["HLS_compatible"] = "no" - else: - output_dict["HLS_compatible"] = "no" - - # unsupported parallelization - if m > 1: - output_dict["HLS_compatible"] = "no" - if parallel_window > 0: - fully_unfolded = simd == ifm_ch - non_dws = dw == 0 - no_stride = stride_h == 1 and stride_w == 1 - no_dilation = dilation_h == 1 and dilation_w == 1 - supported_ram_style = ram_style in ["auto", "distributed"] - if not ( - fully_unfolded and non_dws and no_stride and no_dilation and supported_ram_style - ): 
- output_dict["HLS_compatible"] = "no" - - # unsupported hyperparams - if (dilation_h > 1 or dilation_w > 1) and (stride_h > 1 or stride_w > 1): - output_dict["HLS_compatible"] = "no" - if (dilation_h > 1 or dilation_w > 1) and dw == 0: - output_dict["HLS_compatible"] = "no" - - model = make_single_slidingwindow_modelwrapper( - type="ConvolutionInputGenerator1D", - k=k, - ifm_ch=ifm_ch, - ifm_dim=ifm_dim, - ofm_dim=ofm_dim, - simd=simd, - m=m, - parallel_window=parallel_window, - stride=stride, - dilation=dilation, - idt=idt, - dw=dw, - ram_style=ram_style, - ) - else: - # try 2D HLS ConvInpGen - - # unsupported parallelization - if m > 1 or parallel_window > 0: - output_dict["HLS_compatible"] = "no" - - model = make_single_slidingwindow_modelwrapper( - type="ConvolutionInputGenerator", - k=k, - ifm_ch=ifm_ch, - ifm_dim=ifm_dim, - ofm_dim=ofm_dim, - simd=simd, - m=m, - parallel_window=parallel_window, - stride=stride, - dilation=dilation, - idt=idt, - dw=dw, - ram_style=ram_style, - ) - - if output_dict["HLS_compatible"] == "no": - return output_dict - - # perform usual RTLSIM steps - model = model.transform(SetExecMode("rtlsim")) - model = model.transform(GiveUniqueNodeNames()) - model = model.transform(PrepareIP("xczu7ev-ffvc1156-2-e", 5)) - model = model.transform(HLSSynthIP()) - model = model.transform(PrepareRTLSim()) - - # extract first results (estimates) - node_ = model.get_nodes_by_op_type("ConvolutionInputGenerator") - if len(node_) == 0: - node_ = model.get_nodes_by_op_type("ConvolutionInputGenerator1D") - node = node_[0] - inst = getCustomOp(node) - - exp_cycles_dict = model.analysis(exp_cycles_per_layer) - exp_cycles = exp_cycles_dict[node.name] - output_dict["HLS_FINN_est_Cycles"] = exp_cycles - - exp_res_dict = model.analysis(res_estimation) - exp_res = exp_res_dict[node.name] - output_dict["HLS_FINN_est_LUT"] = exp_res["LUT"] - output_dict["HLS_FINN_est_BRAM"] = exp_res["BRAM_18K"] * 0.5 - output_dict["HLS_FINN_est_URAM"] = exp_res["URAM"] - - 
exp_res_dict_hls = model.analysis(hls_synth_res_estimation) - exp_res_hls = exp_res_dict_hls[node.name] - output_dict["HLS_HLS_est_LUT"] = int(exp_res_hls["LUT"]) - output_dict["HLS_HLS_est_BRAM"] = int(exp_res_hls["BRAM_18K"]) * 0.5 - output_dict["HLS_HLS_est_URAM"] = int(exp_res_hls["URAM"]) - - # perform rtlsim (for cycle measurement) - if not skip_rtlsim: - input_dict = prepare_inputs(x) - oxe.execute_onnx(model, input_dict)["outp"] - cycles_rtlsim = inst.get_nodeattr("cycles_rtlsim") - output_dict["HLS_Cycles"] = cycles_rtlsim - - # perform ooc synthesis (for resource/slack measurement) - model = model.transform(ReplaceVerilogRelPaths()) - model = model.transform(CreateStitchedIP("xczu7ev-ffvc1156-2-e", 5)) - model = model.transform(SynthOutOfContext(part="xczu7ev-ffvc1156-2-e", clk_period_ns=5)) - ooc_res_dict = eval(model.get_metadata_prop("res_total_ooc_synth")) - output_dict["HLS_LUT"] = ooc_res_dict["LUT"] - output_dict["HLS_BRAM"] = ooc_res_dict["BRAM_18K"] * 0.5 + ooc_res_dict["BRAM_36K"] - output_dict["HLS_URAM"] = ooc_res_dict["URAM"] - output_dict["HLS_WNS"] = ooc_res_dict["WNS"] - output_dict["HLS_Fmax"] = ooc_res_dict["fmax_mhz"] - - return output_dict diff --git a/src/finn/benchmarking/collect.py b/src/finn/benchmarking/collect.py deleted file mode 100644 index fa71c2a2aa..0000000000 --- a/src/finn/benchmarking/collect.py +++ /dev/null @@ -1,280 +0,0 @@ -import json -import os -import shutil -from dvclive.live import Live - -from finn.benchmarking.util import delete_dir_contents - - -def log_dvc_metric(live, prefix, name, value): - # sanitize '/' in name because DVC uses it to nest metrics (which we do via prefix) - live.log_metric(prefix + name.replace("/", "-"), value, plot=False) - -def open_json_report(id, report_name): - # look in both, build & measurement, artifacts - path1 = os.path.join("build_artifacts", "runs_output", "run_%d" % (id), "reports", report_name) - path2 = os.path.join("measurement_artifacts", "runs_output", "run_%d" % (id), 
"reports", report_name) - if os.path.isfile(path1): - with open(path1, "r") as f: - report = json.load(f) - return report - elif os.path.isfile(path2): - with open(path2, "r") as f: - report = json.load(f) - return report - else: - return None - -def log_all_metrics_from_report(id, live, report_name, prefix=""): - report = open_json_report(id, report_name) - if report: - for key in report: - log_dvc_metric(live, prefix, key, report[key]) - -def log_metrics_from_report(id, live, report_name, keys, prefix=""): - report = open_json_report(id, report_name) - if report: - for key in keys: - if key in report: - log_dvc_metric(live, prefix, key, report[key]) - -def log_nested_metrics_from_report(id, live, report_name, key_top, keys, prefix=""): - report = open_json_report(id, report_name) - if report: - if key_top in report: - for key in keys: - if key in report[key_top]: - log_dvc_metric(live, prefix, key, report[key_top][key]) - -if __name__ == "__main__": - # Go through all runs found in the artifacts and log their results to DVC - run_dir_list = os.listdir(os.path.join("build_artifacts", "runs_output")) - print("Looking for runs in build artifacts") - run_ids = [] - for run_dir in run_dir_list: - if run_dir.startswith("run_"): - run_id = int(run_dir[4:]) - run_ids.append(run_id) - run_ids.sort() - print("Found %d runs" % len(run_ids)) - - follow_up_bench_cfg = list() - # Prepare (local) output directory where follow-up bench configs will be stored - output_cfg_dir = os.path.join(os.environ.get("LOCAL_CFG_DIR_STORE"), "lfs", "CI_" + os.environ.get("CI_PIPELINE_ID")) - output_folding_dir = os.path.join(output_cfg_dir, "folding") - output_cfg_path = os.path.join(output_cfg_dir, "follow-up.json") - - for id in run_ids: - print("Processing run %d" % id) - experiment_name = "CI_" + os.environ.get("CI_PIPELINE_ID") + "_" + str(id) - experiment_msg = "[CI] " + os.environ.get("CI_PIPELINE_NAME") + " (" + os.environ.get("CI_PIPELINE_ID") + "_" + str(id) + ")" - #TODO: cache 
images once we switch to a cache provider that works with DVC Studio - with Live(exp_name = experiment_name, exp_message=experiment_msg, cache_images=False) as live: - ### PARAMS ### - # input parameters logged by benchmarking infrastructure - metadata_bench = open_json_report(id, "metadata_bench.json") - params = {"params": metadata_bench["params"]} - live.log_params(params) - - # optional metadata logged by builder - metadata_builder = open_json_report(id, "metadata_builder.json") - if metadata_builder: - metadata = { - "metadata": { - "tool_version": metadata_builder["tool_version"], - } - } - live.log_params(metadata) - - # optional dut_info.json (additional information about DUT generated during model generation) - dut_info_report = open_json_report(id, "dut_info.json") - if dut_info_report: - dut_info = {"dut_info": dut_info_report} - live.log_params(dut_info) - - ### METRICS ### - # TODO: for microbenchmarks, only summarize results for target node (or surrounding SDP?) (see old step_finn_estimate etc.) - # TODO: make all logs consistent at the point of generation (e.g. 
BRAM vs BRAM18 vs BRAM36) - - # status - status = metadata_bench["status"] - if status == "ok": - # mark as failed if either bench or builder indicates failure - if metadata_builder: - status_builder = metadata_builder["status"] - if status_builder == "failed": - status = "failed" - log_dvc_metric(live, "", "status", status) - - # verification steps - if "output" in metadata_bench: - if "builder_verification" in metadata_bench["output"]: - log_dvc_metric(live, "", "verification", metadata_bench["output"]["builder_verification"]["verification"]) - - # estimate_layer_resources.json - log_nested_metrics_from_report(id, live, "estimate_layer_resources.json", "total", [ - "LUT", - "DSP", - "BRAM_18K", - "URAM", - ], prefix="estimate/resources/") - - # estimate_layer_resources_hls.json - log_nested_metrics_from_report(id, live, "estimate_layer_resources_hls.json", "total", [ - "LUT", - "FF", - "DSP", - "DSP48E", - "DSP58E", # TODO: aggregate/unify DSP reporting - "BRAM_18K", - "URAM", - ], prefix="hls_estimate/resources/") - - # estimate_network_performance.json - log_metrics_from_report(id, live, "estimate_network_performance.json", [ - "critical_path_cycles", - "max_cycles", - "max_cycles_node_name", - "estimated_throughput_fps", - "estimated_latency_ns", - ], prefix="estimate/performance/") - - # rtlsim_performance.json - log_metrics_from_report(id, live, "rtlsim_performance.json", [ - "N", - "TIMEOUT", - "latency_cycles", - "cycles", - "fclk[mhz]", - "throughput[images/s]", - "stable_throughput[images/s]", - # add INPUT_DONE, OUTPUT_DONE, number transactions? 
- ], prefix="rtlsim/performance/") - - # fifo_sizing.json - log_metrics_from_report(id, live, "fifo_sizing.json", ["total_fifo_size_kB"], prefix="fifosizing/") - - # stitched IP DCP synth resource report - log_nested_metrics_from_report(id, live, "post_synth_resources_dcp.json", "(top)", [ - "LUT", - "FF", - "SRL", - "DSP", - "BRAM_18K", - "BRAM_36K", - "URAM", - ], prefix="synth(dcp)/resources/") - - # stitched IP DCP synth resource breakdown - # TODO: generalize to all build flows and bitfile synth - layer_categories = ["MAC", "Eltwise", "Thresholding", "FIFO", "DWC", "SWG", "Other"] - for category in layer_categories: - log_nested_metrics_from_report(id, live, "res_breakdown_build_output.json", category, [ - "LUT", - "FF", - "SRL", - "DSP", - "BRAM_18K", - "BRAM_36K", - "URAM", - ], prefix="synth(dcp)/resources(breakdown)/" + category + "/") - - # ooc_synth_and_timing.json (OOC synth / step_out_of_context_synthesis) - log_metrics_from_report(id, live, "ooc_synth_and_timing.json", [ - "LUT", - "LUTRAM", - "FF", - "DSP", - "BRAM", - "BRAM_18K", - "BRAM_36K", - "URAM", - ], prefix="synth(ooc)/resources/") - log_metrics_from_report(id, live, "ooc_synth_and_timing.json", [ - "WNS", - "fmax_mhz", - # add TNS? what is "delay"? 
- ], prefix="synth(ooc)/timing/") - - # post_synth_resources.json (shell synth / step_synthesize_bitfile) - log_nested_metrics_from_report(id, live, "post_synth_resources.json", "(top)", [ - "LUT", - "FF", - "SRL", - "DSP", - "BRAM_18K", - "BRAM_36K", - "URAM", - ], prefix="synth/resources/") - - # post synth timing report - # TODO: only exported as post_route_timing.rpt, not .json - - # instrumentation measurement - log_all_metrics_from_report(id, live, "measured_performance.json", prefix="measurement/performance/") - - # IODMA validation accuracy - log_metrics_from_report(id, live, "validation.json", [ - "top-1_accuracy", - ], prefix="measurement/validation/") - - # power measurement - # TODO - - # live fifosizing report + graph png - log_metrics_from_report(id, live, "fifo_sizing_report.json", [ - "error", - "fifo_size_total_kB", - ], prefix="fifosizing/live/") - - image = os.path.join("measurement_artifacts", "runs_output", "run_%d" % (id), "reports", "fifo_sizing_graph.png") - if os.path.isfile(image): - live.log_image("fifosizing_pass_1", image) - - # time_per_step.json - log_metrics_from_report(id, live, "time_per_step.json", ["total_build_time"]) - - ### ARTIFACTS ### - # Log build reports as they come from GitLab artifacts, - # but copy them to a central dir first so all runs share the same path - run_report_dir1 = os.path.join("build_artifacts", "runs_output", "run_%d" % (id), "reports") - run_report_dir2 = os.path.join("measurement_artifacts", "runs_output", "run_%d" % (id), "reports") - dvc_report_dir = "reports" - os.makedirs(dvc_report_dir, exist_ok=True) - delete_dir_contents(dvc_report_dir) - if os.path.isdir(run_report_dir1): - shutil.copytree(run_report_dir1, dvc_report_dir, dirs_exist_ok=True) - if os.path.isdir(run_report_dir2): - shutil.copytree(run_report_dir2, dvc_report_dir, dirs_exist_ok=True) - live.log_artifact(dvc_report_dir) - - # Prepare benchmarking config for follow-up runs after live FIFO-sizing - folding_config_lfs_path = 
os.path.join("measurement_artifacts", "runs_output", "run_%d" % (id), "reports", "folding_config_lfs.json") - if os.path.isfile(folding_config_lfs_path): - # Copy folding config produced by live FIFO-sizing - output_folding_path = os.path.join(output_folding_dir, experiment_name + ".json") - os.makedirs(output_folding_dir, exist_ok=True) - print("Saving lfs-generated folding config of this run to use in a future follow-up run: %s" % output_folding_path) - shutil.copy(folding_config_lfs_path, output_folding_path) - - # Create benchmarking config - metadata_bench = open_json_report(id, "metadata_bench.json") - configuration = dict() - for key in metadata_bench["params"]: - # wrap in list - configuration[key] = [metadata_bench["params"][key]] - # overwrite FIFO-related params - import_folding_path = os.path.join(os.environ.get("LOCAL_CFG_DIR"), "lfs", "CI_" + os.environ.get("CI_PIPELINE_ID"), "folding", experiment_name + ".json") - configuration["fifo_method"] = ["manual"] - configuration["target_fps"] = ["None"] - configuration["folding_path"] = [import_folding_path] - - follow_up_bench_cfg.append(configuration) - - # Save aggregated benchmarking config for follow-up job - if follow_up_bench_cfg: - print("Saving follow-up bench config for lfs: %s" % output_cfg_path) - with open(output_cfg_path, "w") as f: - json.dump(follow_up_bench_cfg, f, indent=2) - - print("Done") From 9a1682e79b5cfc6b0896e5b2b7329eaf0982ee25 Mon Sep 17 00:00:00 2001 From: Felix Jentzsch Date: Fri, 23 May 2025 10:22:14 +0200 Subject: [PATCH 111/125] Move VGG10 files to dvc --- .gitignore | 1 + ci/.gitlab-bench.yml | 2 +- .../benchmarking => ci}/cfg/metafi_test.yml | 0 .../cfg/mobilenetv1_test.yml | 5 ++- .../benchmarking => ci}/cfg/mvau_test.yml | 0 .../benchmarking => ci}/cfg/resnet50_test.yml | 4 +- .../cfg/synthetic_fifotest.yml | 2 +- .../cfg/transformer_gpt_all.yml | 0 .../cfg/transformer_radioml_all.yml | 0 .../cfg/transformer_sweep.yml | 0 .../cfg/transformer_test.yml | 0 
ci/cfg/vgg10_test.yml | 33 ++++++++++++++ models.dvc | 6 +++ src/finn/benchmarking/bench.py | 2 +- src/finn/benchmarking/cfg/vgg10_test.yml | 33 -------------- src/finn/benchmarking/dut/resnet50.yml | 2 +- .../builder/custom_step_library/resnet.py | 44 ------------------- 17 files changed, 49 insertions(+), 85 deletions(-) rename {src/finn/benchmarking => ci}/cfg/metafi_test.yml (100%) rename {src/finn/benchmarking => ci}/cfg/mobilenetv1_test.yml (83%) rename {src/finn/benchmarking => ci}/cfg/mvau_test.yml (100%) rename {src/finn/benchmarking => ci}/cfg/resnet50_test.yml (84%) rename {src/finn/benchmarking => ci}/cfg/synthetic_fifotest.yml (95%) rename {src/finn/benchmarking => ci}/cfg/transformer_gpt_all.yml (100%) rename {src/finn/benchmarking => ci}/cfg/transformer_radioml_all.yml (100%) rename {src/finn/benchmarking => ci}/cfg/transformer_sweep.yml (100%) rename {src/finn/benchmarking => ci}/cfg/transformer_test.yml (100%) create mode 100644 ci/cfg/vgg10_test.yml create mode 100644 models.dvc delete mode 100644 src/finn/benchmarking/cfg/vgg10_test.yml diff --git a/.gitignore b/.gitignore index 7ddc2c6d67..2d48ddac55 100644 --- a/.gitignore +++ b/.gitignore @@ -106,3 +106,4 @@ bench_input bench_output bench_save bench_work +/models diff --git a/ci/.gitlab-bench.yml b/ci/.gitlab-bench.yml index f3139c0fbd..ca98a4b115 100644 --- a/ci/.gitlab-bench.yml +++ b/ci/.gitlab-bench.yml @@ -22,7 +22,6 @@ FINN Build: - job: Build pipeline: $PARENT_PIPELINE_ID variables: - GIT_STRATEGY: empty # Do not pull repository, install from wheel (artifact) instead SCHEDULER_PARAMETERS: "-A $PROJECT_ACCOUNT -p $SLURM_PARTITION -t $SLURM_TIMEOUT $SLURM_QOS --nodes 1 --ntasks 1 --cpus-per-task $CPU_CORES_BENCH --exclusive --array 0-$( expr $PARALLEL_JOBS - 1 )" NUM_DEFAULT_WORKERS: "$CPU_CORES_BENCH" extends: .setup_full_2022_2 @@ -32,6 +31,7 @@ FINN Build: # Launch benchmarking script via FINN CLI, includes deps update and environment preparation - | source finn-plus-venv/bin/activate 
+ dvc pull finn bench --dependency-path ./deps --build-path $FINN_BUILD_DIR --num-workers $CPU_CORES_BENCH --bench_config $BENCH_CFG cache: key: $CI_COMMIT_SHA diff --git a/src/finn/benchmarking/cfg/metafi_test.yml b/ci/cfg/metafi_test.yml similarity index 100% rename from src/finn/benchmarking/cfg/metafi_test.yml rename to ci/cfg/metafi_test.yml diff --git a/src/finn/benchmarking/cfg/mobilenetv1_test.yml b/ci/cfg/mobilenetv1_test.yml similarity index 83% rename from src/finn/benchmarking/cfg/mobilenetv1_test.yml rename to ci/cfg/mobilenetv1_test.yml index 040fa380e4..e43fc5d081 100644 --- a/src/finn/benchmarking/cfg/mobilenetv1_test.yml +++ b/ci/cfg/mobilenetv1_test.yml @@ -11,7 +11,8 @@ "auto_fifo_depths": [False], - "rtlsim_batch_sizauto_fifo_depths": [["estimate_reports", "rtlsim_performance", "stitched_ip", "out_of_context_synth", "bitfile", "pynq_driver", "deployment_package"]] + "rtlsim_batch_size": [2], + "generate_outputs": [["estimate_reports", "rtlsim_performance", "stitched_ip", "out_of_context_synth", "bitfile", "pynq_driver", "deployment_package"]] }, { "dut": ["mobilenetv1"], @@ -25,7 +26,7 @@ "live_fifo_sizing": [True], - "rtlsim_batch_size": [5], + "rtlsim_batch_size": [2], "generate_outputs": [["bitfile", "pynq_driver", "deployment_package"]] } ] \ No newline at end of file diff --git a/src/finn/benchmarking/cfg/mvau_test.yml b/ci/cfg/mvau_test.yml similarity index 100% rename from src/finn/benchmarking/cfg/mvau_test.yml rename to ci/cfg/mvau_test.yml diff --git a/src/finn/benchmarking/cfg/resnet50_test.yml b/ci/cfg/resnet50_test.yml similarity index 84% rename from src/finn/benchmarking/cfg/resnet50_test.yml rename to ci/cfg/resnet50_test.yml index e3acf9fa7d..937d106474 100644 --- a/src/finn/benchmarking/cfg/resnet50_test.yml +++ b/ci/cfg/resnet50_test.yml @@ -5,7 +5,7 @@ "model_path": ["/pc2/groups/hpc-prf-ekiapp/felix/ci_models/ResNet-50/resnet50_w1a2_exported.onnx"], "folding_config_file": 
["/pc2/groups/hpc-prf-ekiapp/felix/ci_models/ResNet-50/U250_folding_config.json"], "specialize_layers_config_file": ["/pc2/groups/hpc-prf-ekiapp/felix/ci_models/ResNet-50/U250_specialize_layers.json"], - "floorplan_path": ["/pc2/groups/hpc-prf-ekiapp/felix/ci_models/ResNet-50/floorplan_resnet50.json"], + "vitis_floorplan_file": ["/pc2/groups/hpc-prf-ekiapp/felix/ci_models/ResNet-50/floorplan_resnet50.json"], "board": ["U250"], "synth_clk_period_ns": [4], @@ -21,7 +21,7 @@ "model_path": ["/pc2/groups/hpc-prf-ekiapp/felix/ci_models/ResNet-50/resnet50_w1a2_exported.onnx"], "folding_config_file": ["/pc2/groups/hpc-prf-ekiapp/felix/ci_models/ResNet-50/U250_folding_config.json"], "specialize_layers_config_file": ["/pc2/groups/hpc-prf-ekiapp/felix/ci_models/ResNet-50/U250_specialize_layers.json"], - "floorplan_path": ["/pc2/groups/hpc-prf-ekiapp/felix/ci_models/ResNet-50/floorplan_resnet50.json"], + "vitis_floorplan_file": ["/pc2/groups/hpc-prf-ekiapp/felix/ci_models/ResNet-50/floorplan_resnet50.json"], "board": ["RFSoC2x2"], "synth_clk_period_ns": [10], diff --git a/src/finn/benchmarking/cfg/synthetic_fifotest.yml b/ci/cfg/synthetic_fifotest.yml similarity index 95% rename from src/finn/benchmarking/cfg/synthetic_fifotest.yml rename to ci/cfg/synthetic_fifotest.yml index 58a49d108d..d0daa12d6a 100644 --- a/src/finn/benchmarking/cfg/synthetic_fifotest.yml +++ b/ci/cfg/synthetic_fifotest.yml @@ -17,7 +17,7 @@ "rtlsim_n": [5], "live_fifo_sizing": [True], - "output_products": [["bitfile", "pynq_driver", "deployment_package"]] + "generate_outputs": [["bitfile", "pynq_driver", "deployment_package"]] }, { "dut": ["synthetic_nonlinear"], diff --git a/src/finn/benchmarking/cfg/transformer_gpt_all.yml b/ci/cfg/transformer_gpt_all.yml similarity index 100% rename from src/finn/benchmarking/cfg/transformer_gpt_all.yml rename to ci/cfg/transformer_gpt_all.yml diff --git a/src/finn/benchmarking/cfg/transformer_radioml_all.yml b/ci/cfg/transformer_radioml_all.yml similarity index 100% 
rename from src/finn/benchmarking/cfg/transformer_radioml_all.yml rename to ci/cfg/transformer_radioml_all.yml diff --git a/src/finn/benchmarking/cfg/transformer_sweep.yml b/ci/cfg/transformer_sweep.yml similarity index 100% rename from src/finn/benchmarking/cfg/transformer_sweep.yml rename to ci/cfg/transformer_sweep.yml diff --git a/src/finn/benchmarking/cfg/transformer_test.yml b/ci/cfg/transformer_test.yml similarity index 100% rename from src/finn/benchmarking/cfg/transformer_test.yml rename to ci/cfg/transformer_test.yml diff --git a/ci/cfg/vgg10_test.yml b/ci/cfg/vgg10_test.yml new file mode 100644 index 0000000000..33b5e7ba5f --- /dev/null +++ b/ci/cfg/vgg10_test.yml @@ -0,0 +1,33 @@ +[ + { + "dut": ["vgg10"], + + "model_path": ["models/vgg10/radioml_w4a4_small_tidy.onnx"], + "folding_config_file": ["models/vgg10/ZCU104_folding_config.json"], + "specialize_layers_config_file": ["models/vgg10/ZCU104_specialize_layers.json"], + + "board": ["RFSoC2x2"], + "synth_clk_period_ns": [10], + + "auto_fifo_depths": [True], + "auto_fifo_strategy": ["largefifo_rtlsim"], + + "rtlsim_batch_size": [5], + "generate_outputs": [["estimate_reports", "rtlsim_performance", "stitched_ip", "out_of_context_synth", "bitfile", "pynq_driver", "deployment_package"]] + }, + { + "dut": ["vgg10"], + + "model_path": ["models/vgg10/radioml_w4a4_small_tidy.onnx"], + "folding_config_file": ["models/vgg10/ZCU104_folding_config.json"], + "specialize_layers_config_file": ["models/vgg10/ZCU104_specialize_layers.json"], + + "board": ["RFSoC2x2"], + "synth_clk_period_ns": [10], + + "live_fifo_sizing": [True], + + "rtlsim_batch_size": [5], + "generate_outputs": [["bitfile", "pynq_driver", "deployment_package"]] + } +] \ No newline at end of file diff --git a/models.dvc b/models.dvc new file mode 100644 index 0000000000..75a6adb5e4 --- /dev/null +++ b/models.dvc @@ -0,0 +1,6 @@ +outs: +- md5: 888f3cd73800cf97d94d78e71456370f.dir + size: 348910 + nfiles: 3 + hash: md5 + path: models diff --git 
a/src/finn/benchmarking/bench.py b/src/finn/benchmarking/bench.py index 7a9b0877e6..d47a98bd44 100644 --- a/src/finn/benchmarking/bench.py +++ b/src/finn/benchmarking/bench.py @@ -46,7 +46,7 @@ def get_default_session_options_new(): if config_name == "manual": config_path = os.path.join(os.environ.get("LOCAL_CFG_DIR"), os.environ.get("MANUAL_CFG_PATH")) else: - configs_path = os.path.join(os.path.dirname(__file__), "cfg") + configs_path = os.path.join(os.path.dirname(__file__), "../../..", "ci/cfg") config_select = config_name + ".yml" config_path = os.path.join(configs_path, config_select) print("Job launched with SLURM ID: %d" % (job_id)) diff --git a/src/finn/benchmarking/cfg/vgg10_test.yml b/src/finn/benchmarking/cfg/vgg10_test.yml deleted file mode 100644 index e16122b130..0000000000 --- a/src/finn/benchmarking/cfg/vgg10_test.yml +++ /dev/null @@ -1,33 +0,0 @@ -[ - { - "dut": ["vgg10"], - - "model_path": ["/pc2/groups/hpc-prf-ekiapp/felix/ci_models/VGG-10/radioml_w4a4_small_tidy.onnx"], - "folding_config_file": ["/pc2/groups/hpc-prf-ekiapp/felix/ci_models/VGG-10/ZCU104_folding_config.json"], - "specialize_layers_config_file": ["/pc2/groups/hpc-prf-ekiapp/felix/ci_models/VGG-10/ZCU104_specialize_layers.json"], - - "board": ["RFSoC2x2"], - "synth_clk_period_ns": [10], - - "auto_fifo_depths": [True], - "auto_fifo_strategy": ["largefifo_rtlsim"], - - "rtlsim_batch_size": [5], - "generate_outputs": [["estimate_reports", "rtlsim_performance", "stitched_ip", "out_of_context_synth", "bitfile", "pynq_driver", "deployment_package"]] - }, - { - "dut": ["vgg10"], - - "model_path": ["/pc2/groups/hpc-prf-ekiapp/felix/ci_models/VGG-10/radioml_w4a4_small_tidy.onnx"], - "folding_config_file": ["/pc2/groups/hpc-prf-ekiapp/felix/ci_models/VGG-10/ZCU104_folding_config.json"], - "specialize_layers_config_file": ["/pc2/groups/hpc-prf-ekiapp/felix/ci_models/VGG-10/ZCU104_specialize_layers.json"], - - "board": ["RFSoC2x2"], - "synth_clk_period_ns": [10], - - "live_fifo_sizing": 
[True], - - "rtlsim_batch_size": [5], - "generate_outputs": [["bitfile", "pynq_driver", "deployment_package"]] - } -] \ No newline at end of file diff --git a/src/finn/benchmarking/dut/resnet50.yml b/src/finn/benchmarking/dut/resnet50.yml index 6d6d4bcc31..7452ef5df9 100644 --- a/src/finn/benchmarking/dut/resnet50.yml +++ b/src/finn/benchmarking/dut/resnet50.yml @@ -10,7 +10,7 @@ steps: - step_set_fifo_depths - step_hw_codegen - step_hw_ipgen - - finn.builder.custom_step_library.resnet.step_resnet50_slr_floorplan # Custom step + #- finn.builder.custom_step_library.resnet.step_resnet50_slr_floorplan # Custom step - step_create_stitched_ip - step_measure_rtlsim_performance - step_out_of_context_synthesis diff --git a/src/finn/builder/custom_step_library/resnet.py b/src/finn/builder/custom_step_library/resnet.py index 90deae5721..a4082b1adf 100644 --- a/src/finn/builder/custom_step_library/resnet.py +++ b/src/finn/builder/custom_step_library/resnet.py @@ -207,47 +207,3 @@ def step_resnet50_convert_to_hw(model: ModelWrapper, cfg: DataflowBuildConfig): model = model.transform(SortGraph()) return model - - -def step_resnet50_slr_floorplan(model: ModelWrapper, cfg: DataflowBuildConfig): - if cfg.shell_flow_type == ShellFlowType.VITIS_ALVEO: - # previously, we would always ran the finn experimental partitioner on ResNet-50 - # this is now changed and a fixed floorplan is applied - model = model.transform(GiveUniqueNodeNames()) - model = model.transform(ApplyConfig(cfg.floorplan_path)) - print("Fixed SLR floorplanning applied") - - # if you would like to try out the experimental partitioner - # please uncomment the lines (that are not marked as comment) below. 
- - # import numpy as np - # from finnexperimental.analysis.partitioning import partition - - # comment: apply partitioning of the model, restricting the first and last layer to SLR0 - # default_slr = 0 - # abs_anchors = [(0, [default_slr]), (-1, [default_slr])] - - # comment: increase resource limits to make partitioning feasible, except for SLR0 - # comment: which also has DDR subsystem - # limits = np.array( - # [ - # [0.75, 0.5, 0.7, 0.6, 0.6], - # [1, 0.7, 0.9, 0.8, 0.8], - # [1, 0.7, 0.9, 0.8, 0.8], - # [1, 0.7, 0.9, 0.8, 0.8], - # ] - # ) - # floorplan = partition( - # model, - # cfg.synth_clk_period_ns, - # cfg.board, - # abs_anchors=abs_anchors, - # multivariant=False, - # linear_cuts=True, - # limits=limits, - # )[0] - - # comment: apply floorplan to model - # model = model.transform(ApplyConfig(floorplan)) - # print("SLR floorplanning applied from partitioner") - return model From 881432fa713b616d427fe24458cd6e7834dc5868 Mon Sep 17 00:00:00 2001 From: Felix Jentzsch Date: Fri, 23 May 2025 10:35:30 +0200 Subject: [PATCH 112/125] Fix cfg path --- src/finn/benchmarking/bench.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/finn/benchmarking/bench.py b/src/finn/benchmarking/bench.py index d47a98bd44..738d8a9c85 100644 --- a/src/finn/benchmarking/bench.py +++ b/src/finn/benchmarking/bench.py @@ -46,7 +46,7 @@ def get_default_session_options_new(): if config_name == "manual": config_path = os.path.join(os.environ.get("LOCAL_CFG_DIR"), os.environ.get("MANUAL_CFG_PATH")) else: - configs_path = os.path.join(os.path.dirname(__file__), "../../..", "ci/cfg") + configs_path = os.path.join("ci", "cfg") config_select = config_name + ".yml" config_path = os.path.join(configs_path, config_select) print("Job launched with SLURM ID: %d" % (job_id)) From 1568af68163507f7172636f6fb2c3b9d6306cce4 Mon Sep 17 00:00:00 2001 From: Felix Jentzsch Date: Fri, 23 May 2025 13:37:17 +0200 Subject: [PATCH 113/125] Move all models to dvc, refactor configs --- 
.gitlab-ci.yml | 45 +++++- ci/.gitlab-bench.yml | 1 + ci/cfg/live_fifosizing.yml | 50 +++++++ ci/cfg/metafi_test.yml | 14 -- ...mvau_test.yml => microbenchmark_basic.yml} | 24 +++- ci/cfg/mobilenetv1_test.yml | 32 ----- ci/cfg/regression_basic.yml | 10 ++ ci/cfg/regression_extended.yml | 48 +++++++ ci/cfg/resnet50_test.yml | 33 ----- ci/cfg/synthetic_fifotest.yml | 68 --------- ci/cfg/transformer_gpt_all.yml | 12 -- ci/cfg/transformer_radioml_all.yml | 22 --- ci/cfg/transformer_sweep.yml | 87 ----------- ci/cfg/transformer_test.yml | 24 ---- ci/cfg/vgg10_test.yml | 33 ----- ci/collect.py | 5 +- models.dvc | 6 +- src/finn/benchmarking/bench.py | 10 +- src/finn/benchmarking/bench_base.py | 135 ------------------ src/finn/benchmarking/dut/metafi.yml | 28 ---- src/finn/benchmarking/dut/mobilenetv1.yml | 7 + src/finn/benchmarking/dut/resnet50.yml | 9 +- src/finn/benchmarking/dut/transformer.py | 2 +- src/finn/benchmarking/dut/vgg10.yml | 8 ++ 24 files changed, 207 insertions(+), 506 deletions(-) create mode 100644 ci/cfg/live_fifosizing.yml delete mode 100644 ci/cfg/metafi_test.yml rename ci/cfg/{mvau_test.yml => microbenchmark_basic.yml} (52%) delete mode 100644 ci/cfg/mobilenetv1_test.yml create mode 100644 ci/cfg/regression_basic.yml create mode 100644 ci/cfg/regression_extended.yml delete mode 100644 ci/cfg/resnet50_test.yml delete mode 100644 ci/cfg/synthetic_fifotest.yml delete mode 100644 ci/cfg/transformer_gpt_all.yml delete mode 100644 ci/cfg/transformer_radioml_all.yml delete mode 100644 ci/cfg/transformer_sweep.yml delete mode 100644 ci/cfg/transformer_test.yml delete mode 100644 ci/cfg/vgg10_test.yml delete mode 100644 src/finn/benchmarking/dut/metafi.yml diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index ad524d0fd7..23eb8c39fe 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -11,7 +11,7 @@ variables: value: "" TEST_SUITE: description: "Select test suite to run" - value: "quicktest_ci" # DEBUG + value: "full_ci" options: - "none" - "quicktest_ci" @@ 
-35,15 +35,14 @@ variables: description: "Optional QoS option (include --qos, e.g., --qos express)" value: "" MANUAL_CFG_PATH: - description: "Use this config file instead of configs stored in the repo. Path must be accessible to runner" + description: "Name (in ci/cfg/) or path (relative to LOCAL_CFG_DIR) of benchmarking config to run" value: "" workflow: name: '$PIPELINE_NAME' rules: - # Run pipeline for GitHub PRs to dev or main (does not support PRs from forks) + # Run pipeline for GitHub PRs to dev (does not support PRs from forks) - if: $CI_PIPELINE_SOURCE == "external_pull_request_event" && $CI_EXTERNAL_PULL_REQUEST_TARGET_BRANCH_NAME == "dev" - - if: $CI_PIPELINE_SOURCE == "external_pull_request_event" && $CI_EXTERNAL_PULL_REQUEST_TARGET_BRANCH_NAME == "main" # Run pipeline for pushes to dev or main - if: $CI_COMMIT_BRANCH == "dev" || $CI_COMMIT_BRANCH == "main" # Run pipeline if manually triggered via API or web GUI @@ -124,6 +123,9 @@ FINN Test Suite 2022.2: # Do not run if test suite has been deselected - if: $TEST_SUITE == "none" when: never + # Do not run for PRs to dev (run only for pushes) + - if: $CI_PIPELINE_SOURCE == "external_pull_request_event" && $CI_EXTERNAL_PULL_REQUEST_TARGET_BRANCH_NAME == "dev" + when: never # Always run, as long as there was no prior failure - when: on_success cache: @@ -155,6 +157,15 @@ FINN Test Suite 2024.2: extends: - FINN Test Suite 2022.2 - .setup_full_2024_2 + rules: + # Do not run on a schedule + - if: $CI_PIPELINE_SOURCE == "schedule" + when: never + # Do not run if test suite has been deselected + - if: $TEST_SUITE == "none" + when: never + # Always run, as long as there was no prior failure + - when: on_success Bench (Manual): stage: test @@ -172,7 +183,7 @@ Bench (Manual): PARENT_PIPELINE_ID: $CI_PIPELINE_ID BENCH_CFG: "manual" -Bench: +Bench (Basic): stage: test rules: # Do not run on a schedule @@ -188,4 +199,26 @@ Bench: PARENT_PIPELINE_ID: $CI_PIPELINE_ID parallel: matrix: - - BENCH_CFG: [mvau_test, 
resnet50_test, metafi_test, transformer_test, transformer_radioml_all, synthetic_fifotest, vgg10_test, mobilenetv1_test] + - BENCH_CFG: [regression_basic] + +Bench (Extended): + stage: test + rules: + # Do not run on a schedule + - if: $CI_PIPELINE_SOURCE == "schedule" + when: never + # Do not run for PRs to dev (run only for pushes) + - if: $CI_PIPELINE_SOURCE == "external_pull_request_event" && $CI_EXTERNAL_PULL_REQUEST_TARGET_BRANCH_NAME == "dev" + when: never + - if: $MANUAL_CFG_PATH == "" + trigger: + include: ci/.gitlab-bench.yml + strategy: depend + forward: + pipeline_variables: true + variables: + PARENT_PIPELINE_ID: $CI_PIPELINE_ID + PARALLEL_JOBS: "4" + parallel: + matrix: + - BENCH_CFG: [regression_extended, microbenchmark_basic] diff --git a/ci/.gitlab-bench.yml b/ci/.gitlab-bench.yml index ca98a4b115..b5d17d7fdc 100644 --- a/ci/.gitlab-bench.yml +++ b/ci/.gitlab-bench.yml @@ -29,6 +29,7 @@ FINN Build: # Launch additional monitoring - $JOB_MONITORING_DIR/monitor.sh $JOB_MONITORING_DIR/$CI_PIPELINE_ID/$HOSTNAME.log & # Launch benchmarking script via FINN CLI, includes deps update and environment preparation + # TODO: cache dvc pull - | source finn-plus-venv/bin/activate dvc pull diff --git a/ci/cfg/live_fifosizing.yml b/ci/cfg/live_fifosizing.yml new file mode 100644 index 0000000000..f121bacf6d --- /dev/null +++ b/ci/cfg/live_fifosizing.yml @@ -0,0 +1,50 @@ +[ + # Real models + { + "dut": ["vgg10"], + "live_fifo_sizing": [True], + "generate_outputs": [["bitfile", "pynq_driver", "deployment_package"]] + }, + { + "dut": ["mobilenetv1"], + "live_fifo_sizing": [True], + "generate_outputs": [["bitfile", "pynq_driver", "deployment_package"]] + }, + { + "dut": ["resnet50"], + "live_fifo_sizing": [True], + "generate_outputs": [["bitfile", "pynq_driver", "deployment_package"]] + }, + + # Synthetic non-linear models + { + "dut": ["synthetic_nonlinear"], + "dim": [64], + "kernel_size": [5], + "ch": [8], + "simd": [8], + "pe": [8], + "parallel_window": [1], + + 
"lb_num_layers": [1], + "rb_num_layers": [4, 8, 16], + + "live_fifo_sizing": [True], + "generate_outputs": [["bitfile", "pynq_driver", "deployment_package"]] + }, + { + "dut": ["synthetic_nonlinear"], + "dim": [64], + "kernel_size": [5], + "ch": [8], + "simd": [1], + "pe": [1], + "parallel_window": [0], + + "lb_num_layers": [1], + "rb_num_layers": [4, 8, 16], + + "live_fifo_sizing": [True], + "generate_outputs": [["bitfile", "pynq_driver", "deployment_package"]] + } +] diff --git a/ci/cfg/metafi_test.yml b/ci/cfg/metafi_test.yml deleted file mode 100644 index 711250bbdb..0000000000 --- a/ci/cfg/metafi_test.yml +++ /dev/null @@ -1,14 +0,0 @@ -[ - { - "dut": ["metafi"], - "model_path": ["/pc2/groups/hpc-prf-ekiapp/felix/ci_models/metafi/model.onnx"], - "folding_config_file": ["/pc2/groups/hpc-prf-ekiapp/felix/ci_models/metafi/auto_folding_config_metaFi_f25.json"], - - "board": ["RFSoC2x2"], - "synth_clk_period_ns": [10], - - "live_fifo_sizing": [True], - - "generate_outputs": [["bitfile", "pynq_driver", "deployment_package"]] - } - ] \ No newline at end of file diff --git a/ci/cfg/mvau_test.yml b/ci/cfg/microbenchmark_basic.yml similarity index 52% rename from ci/cfg/mvau_test.yml rename to ci/cfg/microbenchmark_basic.yml index 7e0b3d14d2..e9a102e51c 100644 --- a/ci/cfg/mvau_test.yml +++ b/ci/cfg/microbenchmark_basic.yml @@ -1,4 +1,5 @@ [ + # MVAU Test { "dut": ["mvau"], "idt": ["INT4","INT2"], @@ -22,5 +23,26 @@ "dut_duplication": [1], "generate_outputs": [["estimate_reports", "stitched_ip", "rtlsim_performance", "out_of_context_synth", "bitfile", "pynq_driver", "deployment_package"]] + }, + + # Transformer Dummy + { + "dut": ["transformer"], + "seed": [12], + + "calibration_passes": [32], + + "model_num_heads": [1], + "model_num_layers": [1], + "model_bias":[true], + "model_emb_dim": [32], + "model_mlp_dim": [192], + "model_seq_len": [64], + "model_bits": [2], + "model_norm": ["none"], + "model_mask": ["none"], + "model_positional_encoding": ["binary"], + + 
"generate_outputs": [["estimate_reports", "bitfile", "pynq_driver", "deployment_package"]] } - ] +] diff --git a/ci/cfg/mobilenetv1_test.yml b/ci/cfg/mobilenetv1_test.yml deleted file mode 100644 index e43fc5d081..0000000000 --- a/ci/cfg/mobilenetv1_test.yml +++ /dev/null @@ -1,32 +0,0 @@ -[ - { - "dut": ["mobilenetv1"], - - "model_path": ["/pc2/groups/hpc-prf-ekiapp/felix/ci_models/MobileNetV1/mobilenetv1-w4a4_pre_post_tidy_opset-11.onnx"], - "folding_config_file": ["/pc2/groups/hpc-prf-ekiapp/felix/ci_models/MobileNetV1/ZCU102_folding_config.json"], - "specialize_layers_config_file": ["/pc2/groups/hpc-prf-ekiapp/felix/ci_models/MobileNetV1/ZCU102_specialize_layers.json"], - - "board": ["RFSoC2x2"], - "synth_clk_period_ns": [10], - - "auto_fifo_depths": [False], - - "rtlsim_batch_size": [2], - "generate_outputs": [["estimate_reports", "rtlsim_performance", "stitched_ip", "out_of_context_synth", "bitfile", "pynq_driver", "deployment_package"]] - }, - { - "dut": ["mobilenetv1"], - - "model_path": ["/pc2/groups/hpc-prf-ekiapp/felix/ci_models/MobileNetV1/mobilenetv1-w4a4_pre_post_tidy_opset-11.onnx"], - "folding_config_file": ["/pc2/groups/hpc-prf-ekiapp/felix/ci_models/MobileNetV1/ZCU102_folding_config.json"], - "specialize_layers_config_file": ["/pc2/groups/hpc-prf-ekiapp/felix/ci_models/MobileNetV1/ZCU102_specialize_layers.json"], - - "board": ["RFSoC2x2"], - "synth_clk_period_ns": [10], - - "live_fifo_sizing": [True], - - "rtlsim_batch_size": [2], - "generate_outputs": [["bitfile", "pynq_driver", "deployment_package"]] - } -] \ No newline at end of file diff --git a/ci/cfg/regression_basic.yml b/ci/cfg/regression_basic.yml new file mode 100644 index 0000000000..9a7604fe19 --- /dev/null +++ b/ci/cfg/regression_basic.yml @@ -0,0 +1,10 @@ +[ + { + "dut": ["vgg10"], + "generate_outputs": [["estimate_reports", "bitfile", "pynq_driver", "deployment_package"]] + }, + { + "dut": ["mobilenetv1"], + "generate_outputs": [["estimate_reports", "bitfile", "pynq_driver", 
"deployment_package"]] + } +] diff --git a/ci/cfg/regression_extended.yml b/ci/cfg/regression_extended.yml new file mode 100644 index 0000000000..d4c2d127a2 --- /dev/null +++ b/ci/cfg/regression_extended.yml @@ -0,0 +1,48 @@ +[ + # ResNet-50 + { + "dut": ["resnet50"], + "board": ["U250"], + "synth_clk_period_ns": [4], + "rtlsim_batch_size": [3], + # no deployment package because Alveo deployment is not yet supported by CI + "generate_outputs": [["estimate_reports", "rtlsim_performance", "stitched_ip", "out_of_context_synth", "bitfile"]] + }, + + # 4x GPT Transformer models (currently disabled due to streamlining issues!) + # { + # "dut": ["transformer"], + # "seed": [12], + # "model_dir": ["models/gpt_a_6b_gpt2-s256-t2048-l2-h4-e256", + # "models/gpt_b_4b_gpt2-s256-t2048-l2-h4-e256", + # "models/gpt_c_gpt2-s512-t2048-l2-h4-e512", + # "models/gpt_d_gpt2-s256-t2048-l1-h2-e256"], + # "board": ["U280"], + # "synth_clk_period_ns": [10], + # "generate_outputs": [["estimate_reports", "stitched_ip", "out_of_context_synth"]] + # } + + # 5x RadioML Transformer models + { + "dut": ["transformer"], + "seed": [12], + "model_dir": ["models/rml_transformer_0", + "models/rml_transformer_a", + "models/rml_transformer_b", + "models/rml_transformer_c", + "models/rml_transformer_d",], + "board": ["RFSoC2x2"], + "synth_clk_period_ns": [10], + "generate_outputs": [["estimate_reports", "bitfile", "pynq_driver", "deployment_package"]] + }, + + # 1x RadioML Conformer model + { + "dut": ["transformer"], + "seed": [12], + "model_dir": ["models/rml_conformer"], + "board": ["RFSoC2x2"], + "synth_clk_period_ns": [10], + "generate_outputs": [["estimate_reports", "bitfile", "pynq_driver", "deployment_package"]] + } +] diff --git a/ci/cfg/resnet50_test.yml b/ci/cfg/resnet50_test.yml deleted file mode 100644 index 937d106474..0000000000 --- a/ci/cfg/resnet50_test.yml +++ /dev/null @@ -1,33 +0,0 @@ -[ - { - "dut": ["resnet50"], - - "model_path": 
["/pc2/groups/hpc-prf-ekiapp/felix/ci_models/ResNet-50/resnet50_w1a2_exported.onnx"], - "folding_config_file": ["/pc2/groups/hpc-prf-ekiapp/felix/ci_models/ResNet-50/U250_folding_config.json"], - "specialize_layers_config_file": ["/pc2/groups/hpc-prf-ekiapp/felix/ci_models/ResNet-50/U250_specialize_layers.json"], - "vitis_floorplan_file": ["/pc2/groups/hpc-prf-ekiapp/felix/ci_models/ResNet-50/floorplan_resnet50.json"], - - "board": ["U250"], - "synth_clk_period_ns": [4], - - "auto_fifo_depths": [False], - - "rtlsim_batch_size": [5], - "generate_outputs": [["estimate_reports", "rtlsim_performance", "stitched_ip", "out_of_context_synth"]] - }, - { - "dut": ["resnet50"], - - "model_path": ["/pc2/groups/hpc-prf-ekiapp/felix/ci_models/ResNet-50/resnet50_w1a2_exported.onnx"], - "folding_config_file": ["/pc2/groups/hpc-prf-ekiapp/felix/ci_models/ResNet-50/U250_folding_config.json"], - "specialize_layers_config_file": ["/pc2/groups/hpc-prf-ekiapp/felix/ci_models/ResNet-50/U250_specialize_layers.json"], - "vitis_floorplan_file": ["/pc2/groups/hpc-prf-ekiapp/felix/ci_models/ResNet-50/floorplan_resnet50.json"], - - "board": ["RFSoC2x2"], - "synth_clk_period_ns": [10], - - "live_fifo_sizing": [True], - - "generate_outputs": [["bitfile", "pynq_driver", "deployment_package"]] - } - ] \ No newline at end of file diff --git a/ci/cfg/synthetic_fifotest.yml b/ci/cfg/synthetic_fifotest.yml deleted file mode 100644 index d0daa12d6a..0000000000 --- a/ci/cfg/synthetic_fifotest.yml +++ /dev/null @@ -1,68 +0,0 @@ -[ - { - "dut": ["synthetic_nonlinear"], - "dim": [64], - "kernel_size": [5], - "ch": [8], - "simd": [8], - "pe": [8], - "parallel_window": [1], - - "lb_num_layers": [1], - "rb_num_layers": [4], - - "board": ["RFSoC2x2"], - "synth_clk_period_ns": [10], - - "rtlsim_n": [5], - - "live_fifo_sizing": [True], - "generate_outputs": [["bitfile", "pynq_driver", "deployment_package"]] - }, - { - "dut": ["synthetic_nonlinear"], - "dim": [64], - "kernel_size": [5], - "ch": [8], - "simd": 
[8], - "pe": [8], - "parallel_window": [1], - - "lb_num_layers": [1], - "rb_num_layers": [4], - - "board": ["RFSoC2x2"], - "synth_clk_period_ns": [10], - - "rtlsim_batch_size": [5], - - "auto_fifo_depths": [True], - "auto_fifo_strategy": ["characterize"], - "characteristic_function_strategy": ["analytical", "rtlsim"], - - "generate_outputs": [["stitched_ip", "rtlsim_performance", "bitfile", "pynq_driver", "deployment_package"]] - }, - { - "dut": ["synthetic_nonlinear"], - "dim": [64], - "kernel_size": [5], - "ch": [8], - "simd": [8], - "pe": [8], - "parallel_window": [1], - - "lb_num_layers": [1], - "rb_num_layers": [4], - - "board": ["RFSoC2x2"], - "synth_clk_period_ns": [10], - - "rtlsim_batch_size": [5], - - "auto_fifo_depths": [True], - "auto_fifo_strategy": ["largefifo_rtlsim"], - - "fifosim_n_inferences": [2], - "generate_outputs": [["stitched_ip", "rtlsim_performance", "bitfile", "pynq_driver", "deployment_package"]] - } -] \ No newline at end of file diff --git a/ci/cfg/transformer_gpt_all.yml b/ci/cfg/transformer_gpt_all.yml deleted file mode 100644 index e0610c3d7e..0000000000 --- a/ci/cfg/transformer_gpt_all.yml +++ /dev/null @@ -1,12 +0,0 @@ -[ - { - "dut": ["transformer"], - "seed": [12], - "model_dir": ["/pc2/groups/hpc-prf-ekiapp/felix/ci_models/gpt_a", "/pc2/groups/hpc-prf-ekiapp/felix/ci_models/gpt_b", "/pc2/groups/hpc-prf-ekiapp/felix/ci_models/gpt_c", "/pc2/groups/hpc-prf-ekiapp/felix/ci_models/gpt_d"], - - "board": ["U280"], - "synth_clk_period_ns": [10], - - "generate_outputs": [["estimate_reports", "stitched_ip", "out_of_context_synth"]] - } -] diff --git a/ci/cfg/transformer_radioml_all.yml b/ci/cfg/transformer_radioml_all.yml deleted file mode 100644 index dede0988c8..0000000000 --- a/ci/cfg/transformer_radioml_all.yml +++ /dev/null @@ -1,22 +0,0 @@ -[ - { - "dut": ["transformer"], - "seed": [12], - "model_dir": ["/pc2/groups/hpc-prf-ekiapp/felix/ci_models/radioml_0"], - - "board": ["RFSoC2x2"], - "synth_clk_period_ns": [10], - - 
"generate_outputs": [["estimate_reports", "bitfile", "pynq_driver", "deployment_package"]] - }, - { - "dut": ["transformer"], - "seed": [12], - "model_dir": ["/pc2/groups/hpc-prf-ekiapp/felix/ci_models/radioml_convformer"], - - "board": ["RFSoC2x2"], - "synth_clk_period_ns": [10], - - "generate_outputs": [["estimate_reports", "bitfile", "pynq_driver", "deployment_package"]] - } -] \ No newline at end of file diff --git a/ci/cfg/transformer_sweep.yml b/ci/cfg/transformer_sweep.yml deleted file mode 100644 index 7fa9420d01..0000000000 --- a/ci/cfg/transformer_sweep.yml +++ /dev/null @@ -1,87 +0,0 @@ -[ - { - "dut": ["transformer"], - "seed": [12], - - "calibration_passes": [32], - - "model_num_heads": [1], - "model_num_layers": [1], - "model_bias":[true], - "model_emb_dim": [32], - "model_mlp_dim": [1536], - "model_seq_len": [512], - "model_bits": [2], - "model_norm": ["none"], - "model_mask": ["none"], - "model_positional_encoding": ["binary"] - }, - { - "dut": ["transformer"], - "seed": [12], - - "calibration_passes": [32], - - "model_num_heads": [8], - "model_num_layers": [1], - "model_bias":[true], - "model_emb_dim": [256], - "model_mlp_dim": [1536], - "model_seq_len": [512], - "model_bits": [2], - "model_norm": ["none"], - "model_mask": ["none"], - "model_positional_encoding": ["binary"] - }, - { - "dut": ["transformer"], - "seed": [12], - - "calibration_passes": [32], - - "model_num_heads": [12], - "model_num_layers": [1], - "model_bias":[true], - "model_emb_dim": [384], - "model_mlp_dim": [1536], - "model_seq_len": [512], - "model_bits": [2], - "model_norm": ["none"], - "model_mask": ["none"], - "model_positional_encoding": ["binary"] - }, - { - "dut": ["transformer"], - "seed": [12], - - "calibration_passes": [32], - - "model_num_heads": [12], - "model_num_layers": [1], - "model_bias":[true], - "model_emb_dim": [96], - "model_mlp_dim": [1536], - "model_seq_len": [512], - "model_bits": [2], - "model_norm": ["none"], - "model_mask": ["none"], - 
"model_positional_encoding": ["binary"] - }, - { - "dut": ["transformer"], - "seed": [12], - - "calibration_passes": [32], - - "model_num_heads": [1], - "model_num_layers": [1], - "model_bias":[true], - "model_emb_dim": [32], - "model_mlp_dim": [1536], - "model_seq_len": [512], - "model_bits": [2, 4, 6, 8], - "model_norm": ["none"], - "model_mask": ["none"], - "model_positional_encoding": ["binary"] - } -] diff --git a/ci/cfg/transformer_test.yml b/ci/cfg/transformer_test.yml deleted file mode 100644 index a529981fdc..0000000000 --- a/ci/cfg/transformer_test.yml +++ /dev/null @@ -1,24 +0,0 @@ -[ - { - "dut": ["transformer"], - "seed": [12], - - "calibration_passes": [32], - - "model_num_heads": [1], - "model_num_layers": [1], - "model_bias":[true], - "model_emb_dim": [32], - "model_mlp_dim": [192], - "model_seq_len": [64], - "model_bits": [2], - "model_norm": ["none"], - "model_mask": ["none"], - "model_positional_encoding": ["binary"], - - "board": ["RFSoC2x2"], - "synth_clk_period_ns": [10], - - "generate_outputs": [["estimate_reports", "bitfile", "pynq_driver", "deployment_package"]] - } -] diff --git a/ci/cfg/vgg10_test.yml b/ci/cfg/vgg10_test.yml deleted file mode 100644 index 33b5e7ba5f..0000000000 --- a/ci/cfg/vgg10_test.yml +++ /dev/null @@ -1,33 +0,0 @@ -[ - { - "dut": ["vgg10"], - - "model_path": ["models/vgg10/radioml_w4a4_small_tidy.onnx"], - "folding_config_file": ["models/vgg10/ZCU104_folding_config.json"], - "specialize_layers_config_file": ["models/vgg10/ZCU104_specialize_layers.json"], - - "board": ["RFSoC2x2"], - "synth_clk_period_ns": [10], - - "auto_fifo_depths": [True], - "auto_fifo_strategy": ["largefifo_rtlsim"], - - "rtlsim_batch_size": [5], - "generate_outputs": [["estimate_reports", "rtlsim_performance", "stitched_ip", "out_of_context_synth", "bitfile", "pynq_driver", "deployment_package"]] - }, - { - "dut": ["vgg10"], - - "model_path": ["models/vgg10/radioml_w4a4_small_tidy.onnx"], - "folding_config_file": 
["models/vgg10/ZCU104_folding_config.json"], - "specialize_layers_config_file": ["models/vgg10/ZCU104_specialize_layers.json"], - - "board": ["RFSoC2x2"], - "synth_clk_period_ns": [10], - - "live_fifo_sizing": [True], - - "rtlsim_batch_size": [5], - "generate_outputs": [["bitfile", "pynq_driver", "deployment_package"]] - } -] \ No newline at end of file diff --git a/ci/collect.py b/ci/collect.py index b833278fe9..c7042abf25 100644 --- a/ci/collect.py +++ b/ci/collect.py @@ -397,9 +397,10 @@ def log_nested_metrics_from_report(id, live, report_name, key_top, keys, prefix= "folding", experiment_name + ".json", ) - configuration["fifo_method"] = ["manual"] + configuration["live_fifo_sizing"] = [False] + configuration["auto_fifo_depths"] = [False] configuration["target_fps"] = ["None"] - configuration["folding_path"] = [import_folding_path] + configuration["folding_config_file"] = [import_folding_path] follow_up_bench_cfg.append(configuration) diff --git a/models.dvc b/models.dvc index 75a6adb5e4..784500a21f 100644 --- a/models.dvc +++ b/models.dvc @@ -1,6 +1,6 @@ outs: -- md5: 888f3cd73800cf97d94d78e71456370f.dir - size: 348910 - nfiles: 3 +- md5: 5db49af689e7827c32280837e0c80470.dir + size: 202993533 + nfiles: 40 hash: md5 path: models diff --git a/src/finn/benchmarking/bench.py b/src/finn/benchmarking/bench.py index 738d8a9c85..8233707260 100644 --- a/src/finn/benchmarking/bench.py +++ b/src/finn/benchmarking/bench.py @@ -44,11 +44,13 @@ def get_default_session_options_new(): # Gather benchmarking configs if config_name == "manual": - config_path = os.path.join(os.environ.get("LOCAL_CFG_DIR"), os.environ.get("MANUAL_CFG_PATH")) + # First check if the repo contains a config with this name (in ci/cfg/*) + config_path = os.path.join("ci", "cfg", os.environ.get("MANUAL_CFG_PATH") + ".yml") + if not os.path.exists(config_path): + # Otherwise look in LOCAL_CFG_DIR for the filename + config_path = os.path.join(os.environ.get("LOCAL_CFG_DIR"), 
os.environ.get("MANUAL_CFG_PATH")) else: - configs_path = os.path.join("ci", "cfg") - config_select = config_name + ".yml" - config_path = os.path.join(configs_path, config_select) + config_path = os.path.join("ci", "cfg", config_name + ".yml") print("Job launched with SLURM ID: %d" % (job_id)) except KeyError: # Launched without SLURM, assume test run on local machine diff --git a/src/finn/benchmarking/bench_base.py b/src/finn/benchmarking/bench_base.py index dc1b40cee2..4fe8e77168 100644 --- a/src/finn/benchmarking/bench_base.py +++ b/src/finn/benchmarking/bench_base.py @@ -241,141 +241,6 @@ def step_build_setup(self): def run(self): return self.steps_full_build_flow() - # def step_finn_estimate(self): - # # Gather FINN estimates - # print("Gathering FINN estimates") - - # model = self.model_initial - # finn_resources_model = res_estimation(model, fpgapart=self.part) - # finn_cycles_model = model.analysis(exp_cycles_per_layer) - # if self.target_node: - # node = model.get_nodes_by_op_type(self.target_node)[0] - # finn_resources = finn_resources_model[node.name] - # finn_cycles = finn_cycles_model[node.name] - # else: - # finn_resources = finn_resources_model # TODO: aggregate? - # finn_cycles = 0 # TODO: aggregate or drop - # finn_estimates = finn_resources - # finn_estimates["CYCLES"] = finn_cycles - # self.output_dict["finn_estimates"] = finn_estimates - - # def step_hls(self): - # # Perform Vitis HLS synthesis for HLS resource/performance reports - # start_time = time.time() - # print("Performing Vitis HLS synthesis") - # model = self.model_initial - # model = model.transform(PrepareIP(self.part, self.clock_period_ns)) - # model = model.transform(HLSSynthIP()) - - # hls_resources_model = model.analysis(hls_synth_res_estimation) - # if self.target_node: - # node = model.get_nodes_by_op_type(self.target_node)[0] - # hls_resources = hls_resources_model[node.name] - # else: - # hls_resources = hls_resources_model # TODO: aggregate? 
- # self.output_dict["hls_estimates"] = hls_resources - # self.output_dict["hls_time"] = int(time.time() - start_time) - - # self.model_step_hls = copy.deepcopy(model) - - # def step_rtlsim(self): - # # Perform RTL simulation for performance measurement - # start_time = time.time() - # print("Performing Verilator RTL simulation (n=1)") - # # Prepare - # model = self.model_step_hls - # model = model.transform(SetExecMode("rtlsim")) - # model = model.transform(PrepareRTLSim()) - # # Generate input data - # input_tensor = model.graph.input[0] - # input_shape = model.get_tensor_shape(input_tensor.name) - # input_dtype = model.get_tensor_datatype(input_tensor.name) - # x = gen_finn_dt_tensor(input_dtype, input_shape) - # input_dict = prepare_inputs(x, input_dtype, None) # TODO: fix Bipolar conversion case - # # Run - # oxe.execute_onnx(model, input_dict)["outp"] # do not check output for correctness TODO: add functional verification throughout benchmarking steps - # # Log result - # node = model.get_nodes_by_op_type("MVAU_hls")[0] - # inst = getCustomOp(node) - # rtlsim_cycles = inst.get_nodeattr("cycles_rtlsim") - # self.output_dict["rtlsim_cycles"] = rtlsim_cycles - # self.output_dict["rtlsim_time"] = int(time.time() - start_time) - -# TODO: re-introduce simple Vivado power estimation as new builder step - # def step_synthesis(self): - # # Perform Vivado synthesis for accurate resource/timing and inaccurate power reports - # start_time = time.time() - # print("Performing Vivado (stitched-ip, out-of-context) synthesis") - # model = self.model_step_hls - # model = model.transform(ReplaceVerilogRelPaths()) - # model = model.transform(CreateStitchedIP(self.part, self.clock_period_ns)) - # model = model.transform(SynthOutOfContext(part=self.part, clk_period_ns=self.clock_period_ns)) - # ooc_synth_results = eval(model.get_metadata_prop("res_total_ooc_synth")) - - # start_test_batch_fast( - # results_path=self.artifacts_dir_power, - # project_path=os.path.join( - # 
ooc_synth_results["vivado_proj_folder"], "vivadocompile", "vivadocompile.xpr" - # ), - # run_target="impl_1", - # pairs=[(25, 0.5), (50, 0.5), (75, 0.5)], - # ) - - # # Log most important power results directly (refer to detailed logs for more) - # for reportname in ["25_0.5", "50_0.5", "75_0.5"]: - # with open(os.path.join(self.artifacts_dir_power, "%s.json" % reportname), "r") as f: - # report = json.load(f) - # power = float(report["Summary"]["tables"][0]["Total On-Chip Power (W)"][0]) - # power_dyn = float(report["Summary"]["tables"][0]["Dynamic (W)"][0]) - # ooc_synth_results["power_%s" % reportname] = power - # ooc_synth_results["power_dyn_%s" % reportname] = power_dyn - - # self.output_dict["ooc_synth"] = ooc_synth_results - # self.output_dict["ooc_synth_time"] = int(time.time() - start_time) - - # # Save model for logging purposes - # model.save(os.path.join(self.artifacts_dir_models, "model_%d_synthesis.onnx" % (self.run_id))) - # self.model_step_synthesis = copy.deepcopy(model) - -# TODO: re-introduce sim-based Vivado power estimation as new builder step - # def step_sim_power(self): - # # Perform Vivado simulation for accurate power report - # start_time = time.time() - # if "ooc_synth" not in self.output_dict: - # print("ERROR: step_sim_power requires step_synthesis") - # print("Performing Vivado simulation for power report") - # if "rtlsim_cycles" in self.output_dict: - # sim_duration_ns = self.output_dict["rtlsim_cycles"] * 3 * self.clock_period_ns - # else: - # sim_duration_ns = self.output_dict["finn_estimates"]["CYCLES"] * 3 * self.clock_period_ns - - # model = self.model_step_synthesis - # input_tensor = model.graph.input[0] - # output_tensor = model.graph.output[0] - # input_node_inst = getCustomOp(model.find_consumer(input_tensor.name)) - # output_node_inst = getCustomOp(model.find_producer(output_tensor.name)) - # sim_power_report( - # results_path=self.artifacts_dir_power, - # project_path=os.path.join( - # 
self.output_dict["ooc_synth"]["vivado_proj_folder"], "vivadocompile", "vivadocompile.xpr" - # ), - # in_width=input_node_inst.get_instream_width(), - # out_width=output_node_inst.get_outstream_width(), - # dtype_width=model.get_tensor_datatype(input_tensor.name).bitwidth(), - # sim_duration_ns=sim_duration_ns, - # ) - - # # Log most important power results directly (refer to detailed logs for more) - # for reportname in ["sim"]: - # with open(os.path.join(self.artifacts_dir_power, "%s.json" % reportname), "r") as f: - # report = json.load(f) - # power = float(report["Summary"]["tables"][0]["Total On-Chip Power (W)"][0]) - # power_dyn = float(report["Summary"]["tables"][0]["Dynamic (W)"][0]) - # self.output_dict["power_%s" % reportname] = power - # self.output_dict["power_dyn%s" % reportname] = power_dyn - - # self.output_dict["sim_power_time"] = int(time.time() - start_time) - def step_parse_builder_output(self, build_dir): # TODO: output as .json or even add as new build step ### CHECK FOR VERIFICATION STEP SUCCESS ### diff --git a/src/finn/benchmarking/dut/metafi.yml b/src/finn/benchmarking/dut/metafi.yml deleted file mode 100644 index fba5a68fe5..0000000000 --- a/src/finn/benchmarking/dut/metafi.yml +++ /dev/null @@ -1,28 +0,0 @@ -steps: - #- step_residual_tidy - #- step_extract_absorb_bias - #- step_residual_topo - #- step_pre_streamline - #- step_residual_streamline - #- step_residual_convert_to_hw - - step_create_dataflow_partition - #- step_set_preferred_impl_style - - step_specialize_layers - - step_target_fps_parallelization - - step_apply_folding_config - - step_minimize_bit_width - - step_generate_estimate_reports - - step_set_fifo_depths - - step_hw_codegen - - step_hw_ipgen - - step_create_stitched_ip - - step_measure_rtlsim_performance - - step_out_of_context_synthesis - - step_synthesize_bitfile - - step_make_driver - - step_deployment_package - -target_fps: null # 23 - -#TODO: where is this used and why? 
-use_conv_rtl: True # use rtl for conv layers (MVAU cannot use rtl in our model) diff --git a/src/finn/benchmarking/dut/mobilenetv1.yml b/src/finn/benchmarking/dut/mobilenetv1.yml index 71a80c4f2a..bb3b26f436 100644 --- a/src/finn/benchmarking/dut/mobilenetv1.yml +++ b/src/finn/benchmarking/dut/mobilenetv1.yml @@ -1,3 +1,7 @@ +model_path: models/mobilenetv1/mobilenetv1-w4a4_pre_post_tidy_opset-11.onnx +folding_config_file: models/mobilenetv1/ZCU102_folding_config.json +specialize_layers_config_file: models/mobilenetv1/ZCU102_specialize_layers.json + steps: - finn.builder.custom_step_library.mobilenet.step_mobilenet_streamline # Custom step - finn.builder.custom_step_library.mobilenet.step_mobilenet_lower_convs # Custom step @@ -14,3 +18,6 @@ steps: - step_synthesize_bitfile - step_make_driver - step_deployment_package + +# folding config comes with FIFO sizes +auto_fifo_depths: False diff --git a/src/finn/benchmarking/dut/resnet50.yml b/src/finn/benchmarking/dut/resnet50.yml index 7452ef5df9..3a3211aad1 100644 --- a/src/finn/benchmarking/dut/resnet50.yml +++ b/src/finn/benchmarking/dut/resnet50.yml @@ -1,3 +1,8 @@ +model_path: models/resnet50/resnet50_w1a2_exported.onnx +folding_config_file: models/resnet50/U250_folding_config.json +specialize_layers_config_file: models/resnet50/U250_specialize_layers.json +vitis_floorplan_file: models/resnet50/floorplan_resnet50.json + steps: - finn.builder.custom_step_library.resnet.step_resnet50_tidy # Custom step - finn.builder.custom_step_library.resnet.step_resnet50_streamline # Custom step @@ -10,10 +15,12 @@ steps: - step_set_fifo_depths - step_hw_codegen - step_hw_ipgen - #- finn.builder.custom_step_library.resnet.step_resnet50_slr_floorplan # Custom step - step_create_stitched_ip - step_measure_rtlsim_performance - step_out_of_context_synthesis - step_synthesize_bitfile - step_make_driver - step_deployment_package + +# folding config comes with FIFO sizes +auto_fifo_depths: False diff --git 
a/src/finn/benchmarking/dut/transformer.py b/src/finn/benchmarking/dut/transformer.py index 48152ce9d5..9023c94aff 100644 --- a/src/finn/benchmarking/dut/transformer.py +++ b/src/finn/benchmarking/dut/transformer.py @@ -977,7 +977,7 @@ def step_build_setup(self): ) # TESTING custom vs live FIFO-sizing - if self.params.get("fifo_method") == "live": + if self.params.get("live_fifo_sizing"): # insert default FIFO-sizing step (behind step_generate_estimate_reports) for i in range(len(cfg.steps)): if cfg.steps[i] == "step_generate_estimate_reports": diff --git a/src/finn/benchmarking/dut/vgg10.yml b/src/finn/benchmarking/dut/vgg10.yml index 9e271a6921..99a9ab333d 100644 --- a/src/finn/benchmarking/dut/vgg10.yml +++ b/src/finn/benchmarking/dut/vgg10.yml @@ -1,3 +1,7 @@ +model_path: models/vgg10/radioml_w4a4_small_tidy.onnx +folding_config_file: models/vgg10/ZCU104_folding_config.json +specialize_layers_config_file: models/vgg10/ZCU104_specialize_layers.json + steps: - step_tidy_up - finn.builder.custom_step_library.conv1d.step_pre_streamline # Custom step @@ -20,4 +24,8 @@ steps: - step_make_driver - step_deployment_package +# folding config doesn't come with FIFO sizes +auto_fifo_depths: True +auto_fifo_strategy: largefifo_rtlsim + standalone_thresholds: True From 9710dffe8493ab4366c2c8cdce1866c19df03d46 Mon Sep 17 00:00:00 2001 From: Felix Jentzsch Date: Fri, 23 May 2025 14:09:14 +0200 Subject: [PATCH 114/125] Fix linting --- .pre-commit-config.yaml | 1 + ci/cfg/regression_extended.yml | 2 +- src/finn/benchmarking/bench.py | 36 ++- src/finn/benchmarking/bench_base.py | 160 +++++----- src/finn/benchmarking/dut/mvau.py | 74 +++-- .../benchmarking/dut/synthetic_nonlinear.py | 31 +- src/finn/benchmarking/dut/transformer.py | 298 ++++++++---------- src/finn/benchmarking/templates.py | 1 + src/finn/benchmarking/util.py | 16 +- .../builder/custom_step_library/conv1d.py | 4 +- .../builder/custom_step_library/mobilenet.py | 9 +- .../builder/custom_step_library/resnet.py | 3 +- 
.../custom_step_library/transformer.py | 19 +- .../qnn-data/templates/driver/validate.py | 36 ++- 14 files changed, 340 insertions(+), 350 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 048a3becda..10ff4d4415 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -43,6 +43,7 @@ repos: - id: check-merge-conflict - id: check-xml - id: check-yaml + args: ['--unsafe'] - id: debug-statements exclude: '^src/finn/builder/build_dataflow.py$' - id: end-of-file-fixer diff --git a/ci/cfg/regression_extended.yml b/ci/cfg/regression_extended.yml index d4c2d127a2..f40c11ab11 100644 --- a/ci/cfg/regression_extended.yml +++ b/ci/cfg/regression_extended.yml @@ -13,7 +13,7 @@ # { # "dut": ["transformer"], # "seed": [12], - # "model_dir": ["models/gpt_a_6b_gpt2-s256-t2048-l2-h4-e256", + # "model_dir": ["models/gpt_a_6b_gpt2-s256-t2048-l2-h4-e256", # "models/gpt_b_4b_gpt2-s256-t2048-l2-h4-e256", # "models/gpt_c_gpt2-s512-t2048-l2-h4-e512", # "models/gpt_d_gpt2-s256-t2048-l1-h2-e256"], diff --git a/src/finn/benchmarking/bench.py b/src/finn/benchmarking/bench.py index 8233707260..995b3b565c 100644 --- a/src/finn/benchmarking/bench.py +++ b/src/finn/benchmarking/bench.py @@ -1,18 +1,16 @@ import itertools -import os import json -import yaml +import onnxruntime as ort +import os import time import traceback -import onnxruntime as ort +import yaml -from finn.benchmarking.util import delete_dir_contents from finn.benchmarking.bench_base import bench - from finn.benchmarking.dut.mvau import bench_mvau from finn.benchmarking.dut.synthetic_nonlinear import bench_synthetic_nonlinear from finn.benchmarking.dut.transformer import bench_transformer - +from finn.benchmarking.util import delete_dir_contents # Register custom bench subclasses that offer more control than YAML-based flow dut = dict() @@ -27,19 +25,24 @@ def start_bench_run(config_name): # See https://github.com/microsoft/onnxruntime/issues/8313 # This seems to happen only when assigned 
CPU cores are not contiguous _default_session_options = ort.capi._pybind_state.get_default_session_options() + def get_default_session_options_new(): _default_session_options.inter_op_num_threads = 1 _default_session_options.intra_op_num_threads = 1 return _default_session_options + ort.capi._pybind_state.get_default_session_options = get_default_session_options_new try: # Launched via SLURM, expect additional CI env vars job_id = int(os.environ["SLURM_JOB_ID"]) - # experiment_dir = os.environ.get("EXPERIMENT_DIR") # original experiment dir (before potential copy to ramdisk) + # original experiment dir (before potential copy to ramdisk): + # experiment_dir = os.environ.get("EXPERIMENT_DIR") experiment_dir = os.environ.get("CI_PROJECT_DIR") - save_dir = os.path.join(os.environ.get("LOCAL_ARTIFACT_DIR"), - "CI_" + os.environ.get("CI_PIPELINE_ID") + "_" + os.environ.get("CI_PIPELINE_NAME")) + save_dir = os.path.join( + os.environ.get("LOCAL_ARTIFACT_DIR"), + "CI_" + os.environ.get("CI_PIPELINE_ID") + "_" + os.environ.get("CI_PIPELINE_NAME"), + ) work_dir = os.environ["PATH_WORKDIR"] # Gather benchmarking configs @@ -48,7 +51,9 @@ def get_default_session_options_new(): config_path = os.path.join("ci", "cfg", os.environ.get("MANUAL_CFG_PATH") + ".yml") if not os.path.exists(config_path): # Otherwise look in LOCAL_CFG_DIR for the filename - config_path = os.path.join(os.environ.get("LOCAL_CFG_DIR"), os.environ.get("MANUAL_CFG_PATH")) + config_path = os.path.join( + os.environ.get("LOCAL_CFG_DIR"), os.environ.get("MANUAL_CFG_PATH") + ) else: config_path = os.path.join("ci", "cfg", config_name + ".yml") print("Job launched with SLURM ID: %d" % (job_id)) @@ -60,7 +65,7 @@ def get_default_session_options_new(): work_dir = "bench_work" os.makedirs(work_dir, exist_ok=True) delete_dir_contents(work_dir) - config_path = config_name # expect caller to provide direct path to a single config file + config_path = config_name # expect caller to provide direct path to a single config 
file print("Local test job launched without SLURM") try: @@ -129,7 +134,8 @@ def get_default_session_options_new(): # Run benchmark # TODO: integrate this loop (especially status logging) into the bench class - # TODO: log stdout of individual tasks of the job array into seperate files as artifacts (GitLab web interface is not readable), coordinate with new logging + # TODO: log stdout of individual tasks of the job array into seperate files as artifacts + # (GitLab web interface is not readable), coordinate with new logging for run, run_id in enumerate(selected_runs): print( "Starting run %d/%d (id %d of %d total runs)" @@ -144,7 +150,9 @@ def get_default_session_options_new(): # Create bench object for respective DUT if "dut" in params: if params["dut"] in dut: - bench_object = dut[params["dut"]](params, task_id, run_id, work_dir, artifacts_dir, save_dir) + bench_object = dut[params["dut"]]( + params, task_id, run_id, work_dir, artifacts_dir, save_dir + ) else: # If no custom bench subclass is defined, fall back to base class, # expect DUT-specific YAML definition instead @@ -168,7 +176,7 @@ def get_default_session_options_new(): log_dict["output"] = bench_object.output_dict - # examine status reported by builder (which catches all exceptions before they reach us here) + # examine status reported by builder (which catches all exceptions before they reach us) # we could also fail the pipeline if functional verification fails (TODO) builder_log_path = os.path.join(bench_object.report_dir, "metadata_builder.json") if os.path.isfile(builder_log_path): diff --git a/src/finn/benchmarking/bench_base.py b/src/finn/benchmarking/bench_base.py index 4fe8e77168..5cebe09878 100644 --- a/src/finn/benchmarking/bench_base.py +++ b/src/finn/benchmarking/bench_base.py @@ -1,49 +1,25 @@ -import itertools -import os -import subprocess -import copy -import json -import yaml -import time -import traceback import glob +import json +import os import shutil -import numpy as np +import 
subprocess from shutil import copy as shcopy from shutil import copytree -import finn.core.onnx_exec as oxe -from qonnx.custom_op.registry import getCustomOp -from qonnx.transformation.base import Transformation -from finn.transformation.fpgadataflow.compile_cppsim import CompileCppSim -from finn.transformation.fpgadataflow.create_stitched_ip import CreateStitchedIP -from finn.transformation.fpgadataflow.hlssynth_ip import HLSSynthIP -from finn.transformation.fpgadataflow.prepare_cppsim import PrepareCppSim -from finn.transformation.fpgadataflow.prepare_ip import PrepareIP -from finn.transformation.fpgadataflow.prepare_rtlsim import PrepareRTLSim -from finn.transformation.fpgadataflow.set_exec_mode import SetExecMode -from finn.transformation.fpgadataflow.synth_ooc import SynthOutOfContext -from finn.analysis.fpgadataflow.exp_cycles_per_layer import exp_cycles_per_layer -from finn.analysis.fpgadataflow.hls_synth_res_estimation import hls_synth_res_estimation -from finn.analysis.fpgadataflow.res_estimation import res_estimation -from finn.transformation.fpgadataflow.make_zynq_proj import collect_ip_dirs + +import finn.builder.build_dataflow as build import finn.builder.build_dataflow_config as build_cfg -from finn.util.basic import make_build_dir, pynq_native_port_width, part_map, alveo_default_platform, alveo_part_map -from finn.benchmarking.templates import template_open, template_single_test, template_sim_power, template_switching_simulation_tb, zynq_harness_template -from finn.benchmarking.util import summarize_table, summarize_section, power_xml_to_dict, delete_dir_contents -from finn.transformation.fpgadataflow.replace_verilog_relpaths import ( - ReplaceVerilogRelPaths, +from finn.benchmarking.templates import ( + template_open, + template_sim_power, + template_single_test, + template_switching_simulation_tb, ) -from qonnx.util.basic import ( - gen_finn_dt_tensor, - roundup_to_integer_multiple, -) -import finn.builder.build_dataflow as build -from 
finn.analysis.fpgadataflow.post_synth_res import post_synth_res -from qonnx.core.modelwrapper import ModelWrapper +from finn.benchmarking.util import delete_dir_contents, power_xml_to_dict from finn.builder.build_dataflow_config import DataflowBuildConfig -import pandas as pd -import onnxruntime as ort -#TODO: merge this file into bench.py once most functionality has been moved to builder +from finn.util.basic import alveo_default_platform, alveo_part_map, part_map + +# TODO: merge this file into bench.py once most functionality has been moved to builder + def start_test_batch_fast(results_path, project_path, run_target, pairs): # Prepare tcl script @@ -87,7 +63,7 @@ def sim_power_report(results_path, project_path, in_width, out_width, dtype_widt script = script.replace("$SAIF_FILE_PATH$", os.getcwd() + "/switching.saif") script = script.replace("$SIM_DURATION_NS$", str(int(sim_duration_ns))) script = script.replace("$REPORT_PATH$", results_path) - script = script.replace("$REPORT_NAME$", f"sim") + script = script.replace("$REPORT_NAME$", "sim") with open(os.getcwd() + "/power_report.tcl", "w") as tcl_file: tcl_file.write(script) @@ -117,7 +93,8 @@ def sim_power_report(results_path, project_path, in_width, out_width, dtype_widt with open(power_report_json, "w") as json_file: json_file.write(json.dumps(power_report_dict, indent=2)) -class bench(): + +class bench: def __init__(self, params, task_id, run_id, work_dir, artifacts_dir, save_dir, debug=True): super().__init__() self.params = params @@ -128,8 +105,8 @@ def __init__(self, params, task_id, run_id, work_dir, artifacts_dir, save_dir, d self.save_dir = save_dir self.debug = debug - #TODO: setup a logger so output can go to console (with task id prefix) and log simultaneously - #TODO: coordinate with new builder loggin setup + # TODO: setup a logger so output can go to console (with task id prefix) + # TODO: coordinate with new builder loggin setup # Setup some basic global default configuration # TODO: are 
these class members even used anymore? @@ -152,42 +129,46 @@ def __init__(self, params, task_id, run_id, work_dir, artifacts_dir, save_dir, d self.part = part_map[self.board] else: raise Exception("No part specified for board %s" % self.board) - + if self.board in alveo_part_map: self.params["shell_flow_type"] = build_cfg.ShellFlowType.VITIS_ALVEO self.params["vitis_platform"] = alveo_default_platform[self.board] else: self.params["shell_flow_type"] = build_cfg.ShellFlowType.VIVADO_ZYNQ - # Clear FINN tmp build dir before every run (to avoid excessive ramdisk usage and duplicate debug artifacts) + # Clear FINN tmp build dir before every run print("Clearing FINN BUILD DIR ahead of run") delete_dir_contents(os.environ["FINN_BUILD_DIR"]) # Initialize dictionary to collect all benchmark results - # TODO: remove completely or only use for meta data, actual results go into run-specific .json files within /report + # TODO: remove completely or only use for meta data, + # actual results go into run-specific .json files within /report self.output_dict = {} - # Inputs (e.g., ONNX model, golden I/O pair, folding config, etc.) for custom FINN build flow + # Inputs (e.g., ONNX model, golden I/O pair, folding config, etc.) self.build_inputs = {} - # Collect tuples of (name, source path, archive?) to save as pipeline artifacts upon run completion or fail by exception + # Collect tuples of (name, source path, archive?) to save as pipeline artifacts self.artifacts_collection = [] - # Collect tuples of (name, source path, archive?) to save as local artifacts upon run completion or fail by exception + # Collect tuples of (name, source path, archive?) 
to save as local artifacts self.local_artifacts_collection = [] if self.debug: - # Save entire FINN build dir and working dir - # TODO: add option to only save upon exception (in FINN builder or benchmarking infrastructure) - self.local_artifacts_collection.append(("debug_finn_tmp", os.environ["FINN_BUILD_DIR"], True)) - #self.local_artifacts_collection.append(("debug_finn_cwd", os.environ["FINN_ROOT"], False)) + # Save entire FINN_BUILD_DIR + # TODO: add option to only save upon error/exception + self.local_artifacts_collection.append( + ("debug_finn_tmp", os.environ["FINN_BUILD_DIR"], True) + ) - ### SETUP ### + # SETUP # Use a temporary dir for buildflow-related files (next to FINN_BUILD_DIR) # Ensure it exists but is empty (clear potential artifacts from previous runs) tmp_buildflow_dir = os.path.join(self.work_dir, "buildflow") os.makedirs(tmp_buildflow_dir, exist_ok=True) delete_dir_contents(tmp_buildflow_dir) - self.build_inputs["build_dir"] = os.path.join(tmp_buildflow_dir, "build_output") # TODO remove in favor of self.build_dir + self.build_inputs["build_dir"] = os.path.join( + tmp_buildflow_dir, "build_output" + ) # TODO remove in favor of self.build_dir self.build_dir = os.path.join(tmp_buildflow_dir, "build_output") self.report_dir = os.path.join(self.build_dir, "report") os.makedirs(self.report_dir, exist_ok=True) @@ -196,7 +177,9 @@ def __init__(self, params, task_id, run_id, work_dir, artifacts_dir, save_dir, d self.local_artifacts_collection.append(("build_output", self.build_dir, False)) # Save reports and deployment package as pipeline artifacts self.artifacts_collection.append(("reports", self.report_dir, False)) - self.artifacts_collection.append(("reports", os.path.join(self.build_dir, "build_dataflow.log"), False)) + self.artifacts_collection.append( + ("reports", os.path.join(self.build_dir, "build_dataflow.log"), False) + ) self.artifacts_collection.append(("deploy", os.path.join(self.build_dir, "deploy"), True)) def save_artifact(self, 
target_path, source_path, archive=False): @@ -213,13 +196,15 @@ def save_artifact(self, target_path, source_path, archive=False): def save_artifacts_collection(self): # this should be called upon successful or failed completion of a run - for (name, source_path, archive) in self.artifacts_collection: - target_path = os.path.join(self.artifacts_dir, "runs_output", "run_%d" % (self.run_id), name) + for name, source_path, archive in self.artifacts_collection: + target_path = os.path.join( + self.artifacts_dir, "runs_output", "run_%d" % (self.run_id), name + ) self.save_artifact(target_path, source_path, archive) def save_local_artifacts_collection(self): # this should be called upon successful or failed completion of a run - for (name, source_path, archive) in self.local_artifacts_collection: + for name, source_path, archive in self.local_artifacts_collection: target_path = os.path.join(self.save_dir, name, "run_%d" % (self.run_id)) self.save_artifact(target_path, source_path, archive) @@ -235,7 +220,7 @@ def step_build_setup(self): with open(dut_path, "r") as f: return DataflowBuildConfig.from_yaml(f) else: - raise Exception("No DUT-specific YAML build definition found") + raise Exception("No DUT-specific YAML build definition found") # defaults to normal build flow, may be overwritten by subclass def run(self): @@ -243,31 +228,32 @@ def run(self): def step_parse_builder_output(self, build_dir): # TODO: output as .json or even add as new build step - ### CHECK FOR VERIFICATION STEP SUCCESS ### - if (os.path.exists(os.path.join(build_dir, "verification_output"))): + # CHECK FOR VERIFICATION STEP SUCCESS + if os.path.exists(os.path.join(build_dir, "verification_output")): # Collect all verification output filenames outputs = glob.glob(os.path.join(build_dir, "verification_output/*.npy")) # Extract the verification status for each verification output by matching # to the SUCCESS string contained in the filename - status = all([ - out.split("_")[-1].split(".")[0] == 
"SUCCESS" for out in outputs - ]) - + status = all([out.split("_")[-1].split(".")[0] == "SUCCESS" for out in outputs]) + # Construct a dictionary reporting the verification status as string - self.output_dict["builder_verification"] = {"verification": {True: "success", False: "fail"}[status]} + self.output_dict["builder_verification"] = { + "verification": {True: "success", False: "fail"}[status] + } # TODO: mark job as failed if verification fails? def steps_full_build_flow(self): # Default step sequence for benchmarking a full FINN builder flow - ### LIST OF ADDITIONAL YAML OPTIONS (beyond DataflowBuildConfig) + # LIST OF ADDITIONAL YAML OPTIONS (beyond DataflowBuildConfig) custom_params = [ - "model_dir", # used to setup onnx/npy input - "model_path", # used to setup onnx/npy input - # model-gen parameters, such as seed, simd, pe, etc. (TODO: separate from builder options) + "model_dir", # used to setup onnx/npy input + "model_path", # used to setup onnx/npy input + # model-gen parameters, such as seed, simd, pe, etc. 
+ # TODO: separate these from builder options ] - ### MODEL CREATION/IMPORT ### + # MODEL CREATION/IMPORT # TODO: track fixed input onnx models with DVC if "model_dir" in self.params: # input ONNX model and verification input/output pairs are provided @@ -279,12 +265,14 @@ def steps_full_build_flow(self): self.build_inputs["onnx_path"] = self.params["model_path"] else: # input ONNX model (+ optional I/O pair for verification) will be generated - self.build_inputs["onnx_path"] = os.path.join(self.build_inputs["build_dir"], "model_export.onnx") + self.build_inputs["onnx_path"] = os.path.join( + self.build_inputs["build_dir"], "model_export.onnx" + ) if self.step_export_onnx(self.build_inputs["onnx_path"]) == "skipped": - # microbenchmarks might skip because no valid model can be generated for given params + # microbenchmarks might skip because no model can be generated for given params return "skipped" - ### BUILD SETUP ### + # BUILD SETUP # Initialize from YAML (default) or custom script (if dedicated subclass is defined) cfg = self.step_build_setup() @@ -292,18 +280,18 @@ def steps_full_build_flow(self): cfg.output_dir = self.build_inputs["build_dir"] # enable extra performance optimizations (physopt) # TODO: check OMX synth strategy again! 
- cfg.vitis_opt_strategy=build_cfg.VitisOptStrategy.PERFORMANCE_BEST + cfg.vitis_opt_strategy = build_cfg.VitisOptStrategy.PERFORMANCE_BEST cfg.verbose = False cfg.enable_build_pdb_debug = False - #cfg.stitched_ip_gen_dcp = False # only needed for further manual integration + # cfg.stitched_ip_gen_dcp = False # only needed for further manual integration cfg.force_python_rtlsim = False cfg.split_large_fifos = True - cfg.save_intermediate_models = True # Save the intermediate model graphs - cfg.verify_save_full_context = True # Output full context dump for verification steps + cfg.save_intermediate_models = True # Save the intermediate model graphs + cfg.verify_save_full_context = True # Output full context dump for verification steps cfg.enable_instrumentation = True - #rtlsim_use_vivado_comps # TODO ? - #cfg.default_swg_exception - #cfg.large_fifo_mem_style + # rtlsim_use_vivado_comps # TODO ? + # cfg.default_swg_exception + # cfg.large_fifo_mem_style # Overwrite build config settings with run-specific YAML build definition for key in self.params: @@ -312,15 +300,15 @@ def steps_full_build_flow(self): else: if key not in custom_params: pass - #TODO: be more strict? support custom extra options like MetaFi uses? - #raise Exception("Unrecognized builder config defined in YAML: %s" % key) + # TODO: be more strict? support custom extra options like MetaFi uses? + # raise Exception("Unrecognized builder config defined in YAML: %s" % key) # Default of 1M cycles is insufficient for MetaFi (6M) and RN-50 (2.5M) # TODO: make configurable or set on pipeline level? 
os.environ["LIVENESS_THRESHOLD"] = "10000000" - ### BUILD ### + # BUILD build.build_dataflow_cfg(self.build_inputs["onnx_path"], cfg) - ### ANALYSIS ### + # ANALYSIS self.step_parse_builder_output(self.build_inputs["build_dir"]) diff --git a/src/finn/benchmarking/dut/mvau.py b/src/finn/benchmarking/dut/mvau.py index 8ce89fdccc..2c4a6b730a 100644 --- a/src/finn/benchmarking/dut/mvau.py +++ b/src/finn/benchmarking/dut/mvau.py @@ -1,31 +1,24 @@ - +import json import math import numpy as np -import json from onnx import TensorProto, helper from qonnx.core.datatype import DataType from qonnx.core.modelwrapper import ModelWrapper -from qonnx.custom_op.registry import getCustomOp from qonnx.transformation.general import GiveUniqueNodeNames from qonnx.transformation.infer_datatypes import InferDataTypes -from qonnx.transformation.infer_shapes import InferShapes from qonnx.util.basic import ( calculate_matvec_accumulator_range, gen_finn_dt_tensor, - qonnx_make_model + qonnx_make_model, ) -from finn.transformation.fpgadataflow.minimize_accumulator_width import ( - MinimizeAccumulatorWidth, -) -from finn.transformation.fpgadataflow.minimize_weight_bit_width import ( - MinimizeWeightBitWidth, -) -import finn.builder.build_dataflow_config as build_cfg +import finn.builder.build_dataflow_config as build_cfg from finn.benchmarking.bench_base import bench +from finn.transformation.fpgadataflow.minimize_accumulator_width import MinimizeAccumulatorWidth +from finn.transformation.fpgadataflow.minimize_weight_bit_width import MinimizeWeightBitWidth -class bench_mvau(bench): +class bench_mvau(bench): def _make_single_mvau_model( self, W, @@ -77,7 +70,7 @@ def _make_single_mvau_model( actval = 0 no_act = 1 mvau_node = helper.make_node( - "MVAU_hls", #TODO: add rtl support (configurable as param) + "MVAU_hls", # TODO: add rtl support (configurable as param) node_inp_list, ["outp"], domain="finn.custom_op.fpgadataflow.hls", @@ -101,7 +94,9 @@ def _make_single_mvau_model( 
runtime_writeable_weights=0, ) - graph = helper.make_graph(nodes=[mvau_node], name="mvau_graph", inputs=[inp], outputs=[outp]) + graph = helper.make_graph( + nodes=[mvau_node], name="mvau_graph", inputs=[inp], outputs=[outp] + ) model = qonnx_make_model(graph, producer_name="mvau-model") model = ModelWrapper(model) @@ -194,10 +189,14 @@ def step_export_onnx(self, onnx_export_path): W[idx] = 0.0 W = np.reshape(W, (mw, mh)) elif sparsity_type == "rows_random": - idx_mw = np.random.choice(mw, size=int(self.params["sparsity_amount"] * mw), replace=False) + idx_mw = np.random.choice( + mw, size=int(self.params["sparsity_amount"] * mw), replace=False + ) W[idx_mw, :] = 0.0 elif sparsity_type == "cols_random": - idx_mh = np.random.choice(mh, size=int(self.params["sparsity_amount"] * mh), replace=False) + idx_mh = np.random.choice( + mh, size=int(self.params["sparsity_amount"] * mh), replace=False + ) W[:, idx_mh] = 0.0 elif sparsity_type == "rows_regular": if self.params["sparsity_amount"] == 0.25: @@ -206,7 +205,11 @@ def step_export_onnx(self, onnx_export_path): idx_mw = np.arange(0, mw, step=2) elif self.params["sparsity_amount"] == 0.75: idx_mw = np.concatenate( - (np.arange(0, mw, step=4), np.arange(1, mw, step=4), np.arange(2, mw, step=4)) + ( + np.arange(0, mw, step=4), + np.arange(1, mw, step=4), + np.arange(2, mw, step=4), + ) ) else: print("regular sparsity only applicable for amount 0.25/0.5/0.75, skipping") @@ -219,7 +222,11 @@ def step_export_onnx(self, onnx_export_path): idx_mh = np.arange(0, mh, step=2) elif self.params["sparsity_amount"] == 0.75: idx_mh = np.concatenate( - (np.arange(0, mh, step=4), np.arange(1, mh, step=4), np.arange(2, mh, step=4)) + ( + np.arange(0, mh, step=4), + np.arange(1, mh, step=4), + np.arange(2, mh, step=4), + ) ) else: print("regular sparsity only applicable for amount 0.25/0.5/0.75, skipping") @@ -262,13 +269,15 @@ def step_export_onnx(self, onnx_export_path): odt = DataType["INT32"] else: odt = act - # set range for 
threshold values according to worst-case accumulator range (not weight value specific) + # set range for threshold values according to worst-case accumulator range + # (not weight value specific) # this could result in some thresholds being clipped by MinimizeAccumulatorWidth # lower_range = calculate_matvec_accumulator_range(wdt.min() * np.ones_like(W), idt) # upper_range = calculate_matvec_accumulator_range(wdt.max() * np.ones_like(W), idt) # acc_min = min(min(lower_range), min(upper_range)) # acc_max = max(max(lower_range), max(upper_range)) - # set range for threshold values according to actual accumulator range for the generated weights + # set range for threshold values according to actual accumulator range + # for the generated weights (acc_min, acc_max) = calculate_matvec_accumulator_range(W, idt) n_steps = act.get_num_possible_values() - 1 T = np.random.randint(acc_min, acc_max - 1, (mh, n_steps)).astype(np.float32) @@ -285,13 +294,26 @@ def step_export_onnx(self, onnx_export_path): # Create model model = self._make_single_mvau_model( - W, numInputVectors, pe, simd, m, wdt, idt, odt, T, tdt, mem_mode, ram_style, ram_style_thr + W, + numInputVectors, + pe, + simd, + m, + wdt, + idt, + odt, + T, + tdt, + mem_mode, + ram_style, + ram_style_thr, ) model = model.transform(GiveUniqueNodeNames()) - node = model.get_nodes_by_op_type("MVAU_hls")[0] - inst = getCustomOp(node) + # node = model.get_nodes_by_op_type("MVAU_hls")[0] + # inst = getCustomOp(node) - self.target_node = "MVAU_hls" # display results of analysis passes only for the first occurence of this op type + # display results of analysis passes only for the first occurence of this op type + self.target_node = "MVAU_hls" # log additional info about the generated model (e.g. 
SIMD/PE or sparsity) with open(self.build_inputs["build_dir"] + "/report/dut_info.json", "w") as f: @@ -317,6 +339,6 @@ def step_build_setup(self): "step_synthesize_bitfile", "step_make_driver", "step_deployment_package", - ] + ], ) return cfg diff --git a/src/finn/benchmarking/dut/synthetic_nonlinear.py b/src/finn/benchmarking/dut/synthetic_nonlinear.py index b912e8b319..ff33436976 100644 --- a/src/finn/benchmarking/dut/synthetic_nonlinear.py +++ b/src/finn/benchmarking/dut/synthetic_nonlinear.py @@ -1,15 +1,8 @@ -import json import numpy as np -import os -import shutil -import torch -import copy -from brevitas.export import export_qonnx from onnx import TensorProto, helper from qonnx.core.datatype import DataType from qonnx.core.modelwrapper import ModelWrapper from qonnx.custom_op.general.im2col import compute_conv_output_dim -from qonnx.custom_op.registry import getCustomOp from qonnx.transformation.general import ( GiveRandomTensorNames, GiveReadableTensorNames, @@ -21,16 +14,12 @@ from qonnx.transformation.infer_shapes import InferShapes from qonnx.transformation.merge_onnx_models import MergeONNXModels from qonnx.util.basic import gen_finn_dt_tensor, qonnx_make_model -import finn.builder.build_dataflow as build -import finn.builder.build_dataflow_config as build_cfg -from finn.util.basic import make_build_dir -from finn.benchmarking.util import summarize_table, summarize_section, power_xml_to_dict, delete_dir_contents -from finn.util.test import get_trained_network_and_ishape -from finn.util.basic import alveo_default_platform +import finn.builder.build_dataflow_config as build_cfg +from finn.benchmarking.bench_base import bench +from finn.util.basic import make_build_dir -from finn.benchmarking.bench_base import bench def generate_random_threshold_values( data_type, num_input_channels, num_steps, narrow=False, per_tensor=False @@ -50,6 +39,7 @@ def generate_random_threshold_values( def sort_thresholds_increasing(thresholds): return np.sort(thresholds, 
axis=1) + def make_conv_building_block(ifm_dim, ch, kernel_size, simd, pe, parallel_window=0): # hardcoded parameters idt = DataType["UINT4"] @@ -164,7 +154,9 @@ def make_conv_building_block(ifm_dim, ch, kernel_size, simd, pe, parallel_window def combine_blocks(lb, rb, ifm_dim, ch, pe): - # assumes left branch (lb) and right branch (rb) each have a single (dynamic) input/output with the same shape + # assumes left branch (lb) and right branch (rb) each have a + # single (dynamic) input/output with the same shape + # to avoid mix-ups, start by giving all tensors random names lb = lb.transform(GiveRandomTensorNames()) rb = rb.transform(GiveRandomTensorNames()) @@ -249,17 +241,14 @@ def combine_blocks(lb, rb, ifm_dim, ch, pe): model = model.transform(GiveReadableTensorNames()) return model + class bench_synthetic_nonlinear(bench): def step_export_onnx(self, onnx_export_path): np.random.seed(0) tmp_output_dir = make_build_dir("test_fifosizing") - #TODO: allow manual folding/fifo config as input - - #TODO: is a scenario possible where reducing depth of a single FIFO at a time is not sufficient for testing tightness? - # e.g. reducing > 1 FIFOs simultaneously does not cause a throughput drop while reducing a single FIFO does? - - #TODO: how to determine rtlsim_n automatically? + # TODO: allow manual folding/fifo config as input + # TODO: how to determine rtlsim_n automatically? 
# conv parameters dim = self.params["dim"] diff --git a/src/finn/benchmarking/dut/transformer.py b/src/finn/benchmarking/dut/transformer.py index 9023c94aff..83002ef418 100644 --- a/src/finn/benchmarking/dut/transformer.py +++ b/src/finn/benchmarking/dut/transformer.py @@ -1,52 +1,54 @@ # Adapted from Christoph's attention-dummy repository # PyTorch base package: Math and Tensor Stuff +import json +import numpy as np +import random import torch -# Brevitas wrapper around PyTorch tensors adding quantization information -from brevitas.quant_tensor import QuantTensor +from brevitas.export import export_qonnx + # Brevitas: Quantized versions of PyTorch layers from brevitas.nn import ( - QuantMultiheadAttention, QuantEltwiseAdd, QuantIdentity, QuantLinear, - QuantReLU + QuantMultiheadAttention, + QuantReLU, ) -# Progressbar -from tqdm import trange -import numpy as np -from brevitas.export import export_qonnx -import random -import json -# FINN dataflow builder -import finn.builder.build_dataflow_config as build_cfg -from finn.builder.build_dataflow_config import AutoFIFOSizingMethod + +# Brevitas wrapper around PyTorch tensors adding quantization information +from brevitas.quant_tensor import QuantTensor from qonnx.core.modelwrapper import ModelWrapper -from finn.benchmarking.bench_base import bench # Range information structure for seeding the range analysis for converting # quantized activations to MultiThreshold from qonnx.util.range_analysis import RangeInfo +# Progressbar +from tqdm import trange + +# FINN dataflow builder +import finn.builder.build_dataflow_config as build_cfg +from finn.benchmarking.bench_base import bench + # Custom build steps required to streamline and convert the attention operator from finn.builder.custom_step_library.transformer import ( + node_by_node_cppsim, prepare_graph, - step_streamline, + set_fifo_depths, + set_target_parallelization, + step_apply_folding_config, step_convert_attention_to_hw, + step_convert_depth_wise_to_hw, 
step_convert_elementwise_binary_to_hw, step_convert_lookup_to_hw, step_convert_split_concat_to_hw, - step_convert_depth_wise_to_hw, step_replicate_streams, - set_target_parallelization, - set_fifo_depths, - step_apply_folding_config, - node_by_node_rtlsim, # noqa: Maybe unused, only for debugging - node_by_node_cppsim, + step_streamline, ) -### ADAPTED FROM utils.py +# ADAPTED FROM utils.py # Seeds all relevant random number generators to the same seed for # reproducibility def seed(s): @@ -54,14 +56,15 @@ def seed(s): np.random.seed(s) torch.manual_seed(s) -### ADAPTED FROM model.py + +# ADAPTED FROM model.py # Derives a weight quantizer from the brevitas bases leaving bit-width and # signedness configurable def weight_quantizer(bits, _signed=True): # Brevitas quantizer base classes - from brevitas.quant.base import NarrowIntQuant, MaxStatsScaling - from brevitas.quant.solver import WeightQuantSolver from brevitas.inject.enum import RestrictValueType + from brevitas.quant.base import MaxStatsScaling, NarrowIntQuant + from brevitas.quant.solver import WeightQuantSolver # Derive a Quantizer from the brevitas bases class Quantizer(NarrowIntQuant, MaxStatsScaling, WeightQuantSolver): @@ -103,14 +106,12 @@ class Quantizer(IntBias): # signedness configurable def act_quantizer(bits, _signed=True): # Brevitas quantizer base classes + from brevitas.inject.enum import RestrictValueType from brevitas.quant.base import IntQuant, ParamFromRuntimePercentileScaling from brevitas.quant.solver import ActQuantSolver - from brevitas.inject.enum import RestrictValueType # Derive a Quantizer from the brevitas bases - class Quantizer( - IntQuant, ParamFromRuntimePercentileScaling, ActQuantSolver - ): + class Quantizer(IntQuant, ParamFromRuntimePercentileScaling, ActQuantSolver): # Configure the quantization bit-width bit_width = bits # Signedness of the quantization output @@ -141,7 +142,8 @@ def forward(self, x): # noqa: May be static "layer-norm": torch.nn.LayerNorm( # Note: Disable 
affine parameters as potential negative scale causes # streamlining issues later - normalized_shape=normalized_shape, elementwise_affine=False + normalized_shape=normalized_shape, + elementwise_affine=False, ), # PyTorch default 1-dimensional batch normalization. Needs to transpose # embedding and sequence dimension to normalized over the embedding @@ -149,11 +151,13 @@ def forward(self, x): # noqa: May be static "batch-norm": torch.nn.Sequential( # Note: Disable affine parameters as potential negative scale causes # streamlining issues later - Transpose(), torch.nn.LazyBatchNorm1d(affine=False), Transpose() + Transpose(), + torch.nn.LazyBatchNorm1d(affine=False), + Transpose(), ), # No normalization by a PyTorch built-in identity layer. Should not # appear in the graph. - "none": torch.nn.Identity() + "none": torch.nn.Identity(), } # Select the normalization layer by key @@ -172,7 +176,7 @@ def get_mask(key, length): # probability each "random": torch.where( # noqa: Confused by types? torch.rand(length, length) > 0.5, -torch.inf, 0.0 - ) + ), } # Select the mask type by key return masks[key] @@ -181,9 +185,7 @@ def get_mask(key, length): # Single-layer scaled dot-product attention block with MLP and normalization class TransformerBlock(torch.nn.Module): # Initializes the model and registers the module parameters - def __init__( - self, num_heads, emb_dim, mlp_dim, seq_len, bias, norm, mask, bits - ): + def __init__(self, num_heads, emb_dim, mlp_dim, seq_len, bias, norm, mask, bits): # Initialize the PyTorch Module superclass super().__init__() @@ -197,7 +199,7 @@ def __init__( # Quantize at the output act_quant=act_quantizer(bits, _signed=True), # Pass quantization information on to the next layer. 
- return_quant_tensor=True + return_quant_tensor=True, ) # Quantized scaled dot-product attention operator self.sdp = QuantMultiheadAttention( @@ -232,28 +234,24 @@ def __init__( # No quantization in front of the input projections as this is # either done by a standalone quantizer preceding the whole block in_proj_input_quant=None, - # Quantize the output projections weights as configured out_proj_weight_quant=weight_quantizer(bits, _signed=True), # Quantize the bias of the output projections as configured out_proj_bias_quant=bias_quantizer(bits, _signed=True), # Quantize the input to the output projection as configured out_proj_input_quant=act_quantizer(bits, _signed=True), - # Quantizer the key after projections as configured k_transposed_quant=act_quantizer(bits, _signed=True), # Quantize the queries after projections as configured q_scaled_quant=act_quantizer(bits, _signed=True), # Quantize the values after projection as configured v_quant=act_quantizer(bits, _signed=True), - # No output quantization for now, as stacking multiple layers # results in multiple multi-thresholds in succession out_proj_output_quant=None, - # Return the quantization parameters so the next layer can # quantize the bias - return_quant_tensor=True + return_quant_tensor=True, ) # Residual branch addition skipping over the attention layer self.residual_sdp = QuantEltwiseAdd( @@ -266,7 +264,7 @@ def __init__( # fine and does not require re-quantization. output_quant=None, # Pass quantization information on to the next layer. - return_quant_tensor=True + return_quant_tensor=True, ) # Normalization following the attention layer self.norm_sdp = torch.nn.Sequential( @@ -284,7 +282,7 @@ def __init__( # Quantize at the output act_quant=act_quantizer(bits, _signed=True), # Pass quantization information on to the next layer. 
- return_quant_tensor=True + return_quant_tensor=True, ), # First mlp layer projecting to the mlp dimension QuantLinear( @@ -309,7 +307,7 @@ def __init__( output_quant=None, # Return the quantization parameters so the next layer can # quantize the bias - return_quant_tensor=True + return_quant_tensor=True, ), # Use the ReLU activation function instead of the more commonly used # GELU, as the latter is not mapped easily to hardware with FINN @@ -318,7 +316,7 @@ def __init__( act_quant=act_quantizer(bits, _signed=False), # Return the quantization parameters so the next layer can # quantize the bias - return_quant_tensor=True + return_quant_tensor=True, ), # Second mlp layer projecting back to the embedding dimension QuantLinear( @@ -342,7 +340,7 @@ def __init__( # quantized element-wise addition taking care of quantization output_quant=None, # Pass quantization information on to the next layer. - return_quant_tensor=True + return_quant_tensor=True, ), ) # Residual branch addition skipping over the MLP layer @@ -359,7 +357,7 @@ def __init__( # Note: Not for the last layer to allow this to be combined with # standard pytorch calls like .detach() or .numpy(), which are # not directly available on QuantTensor. 
- return_quant_tensor=True + return_quant_tensor=True, ) # Normalization following the attention layer self.norm_mlp = torch.nn.Sequential( @@ -378,9 +376,7 @@ def forward(self, x): # Quantize the input to the attention block q = self.sdp_input_quant(x) # Scaled dot-product attention with residual branch and normalization - x = self.norm_sdp( - self.residual_sdp(x, self.sdp(q, q, q, attn_mask=mask)[0]) - ) + x = self.norm_sdp(self.residual_sdp(x, self.sdp(q, q, q, attn_mask=mask)[0])) # MLP layer with residual branch and normalization return self.norm_mlp(self.residual_mlp(x, self.mlp(x))) @@ -399,7 +395,7 @@ def __init__(self, input_quant, output_quant, return_quant_tensor): # Quantize the outputs after adding input and positional encoding output_quant=output_quant, # Returns quantization information to the next layer - return_quant_tensor=return_quant_tensor + return_quant_tensor=return_quant_tensor, ) # Forward pass adding positional encoding to the input tensor @@ -426,14 +422,7 @@ def forward(self, x): # Quantized learned positional encoding layer class QuantLearnedPositionalEncoding(torch.nn.Module): # Initializes the model and registers the module parameters - def __init__( - self, - seq_len, - emb_dim, - input_quant, - output_quant, - return_quant_tensor - ): + def __init__(self, seq_len, emb_dim, input_quant, output_quant, return_quant_tensor): # Initialize the PyTorch Module superclass super().__init__() # Adds the quantized input and positional encoding @@ -444,7 +433,7 @@ def __init__( # Quantize the outputs after adding input and positional encoding output_quant=output_quant, # Returns quantization information to the next layer - return_quant_tensor=return_quant_tensor + return_quant_tensor=return_quant_tensor, ) # Register a parameter tensor representing the not quantized positional # encoding @@ -467,7 +456,7 @@ def forward(self, x): # Lazy version of the learned encoding not requiring input dimensions at # initialization, inferring these at the 
first forward pass class LazyQuantLearnedPositionalEncoding( - torch.nn.modules.lazy.LazyModuleMixin, QuantLearnedPositionalEncoding # noqa + torch.nn.modules.lazy.LazyModuleMixin, QuantLearnedPositionalEncoding # noqa ): # Once initialized, this will become a QuantLearnedPositionalEncoding as # defined above @@ -520,7 +509,7 @@ def __init__(self, input_quant, output_quant, return_quant_tensor): # Quantize the outputs after adding input and positional encoding output_quant=output_quant, # Returns quantization information to the next layer - return_quant_tensor=return_quant_tensor + return_quant_tensor=return_quant_tensor, ) # Forward pass adding positional encoding to the input tensor @@ -530,9 +519,7 @@ def forward(self, x): _, seq, emb = x.shape # Binary positional encoding fills the embedding dimension with the bit # pattern corresponding to the position in the sequence - pos = torch.as_tensor([ - [(n & (1 << bit)) >> bit for bit in range(emb)] for n in range(seq) - ]) + pos = torch.as_tensor([[(n & (1 << bit)) >> bit for bit in range(emb)] for n in range(seq)]) # Move the encoding tensor to the same device as the input tensor pos = pos.to(x.device, dtype=x.dtype) # Add the quantized encoding tp the quantized input @@ -542,28 +529,22 @@ def forward(self, x): # Gets the positional encoding layer from configuration key, quantizers and # shape -def get_positional_encoding( - key, input_quant, output_quant, return_quant_tensor -): +def get_positional_encoding(key, input_quant, output_quant, return_quant_tensor): # Dictionary mapping keys to supported normalization layer implementations masks = { # No positional encoding - "none": QuantIdentity( - act_quant=input_quant, return_quant_tensor=return_quant_tensor - ), + "none": QuantIdentity(act_quant=input_quant, return_quant_tensor=return_quant_tensor), # Fixed, sinusoidal positional encoding according to Vaswani et al. 
with # added quantizers "sinusoidal": QuantSinusoidalPositionalEncoding( input_quant, output_quant, return_quant_tensor ), # Fixed, binary positional encoding with quantizers - "binary": QuantBinaryPositionalEncoding( - input_quant, output_quant, return_quant_tensor - ), + "binary": QuantBinaryPositionalEncoding(input_quant, output_quant, return_quant_tensor), # Learned positional encoding with quantizers "learned": LazyQuantLearnedPositionalEncoding( input_quant, output_quant, return_quant_tensor - ) + ), } # Select the positional encoding type by key return masks[key] @@ -583,31 +564,31 @@ def unpack_from_quant(tensor: torch.Tensor | QuantTensor): class DummyTransformer(torch.nn.Module): # Initializes the model and registers the module parameters def __init__( - self, - # Number of layers of attention blocks - num_layers, - # Number of attention heads per block - num_heads, - # Size of embedding dimension going into/out of the attention block - emb_dim, - # Size of MLP dimension in each attention block - mlp_dim, - # Length of the input sequence, i.e., context size - seq_len, - # Enables bias term added to Linear layers - bias, - # Quantization bit-width: For now all layers are quantized to the - # same bit-width - bits, - # Type of normalization layer to use in the transformer blocks - # Options are: layer-norm, batch-norm and none - norm="none", - # Type of attention mask to use - # Options are: none, causal or const - mask="none", - # Type of positional encoding to use at the input - # Options are: none, sinusoidal, binary, learned - positional_encoding="none" + self, + # Number of layers of attention blocks + num_layers, + # Number of attention heads per block + num_heads, + # Size of embedding dimension going into/out of the attention block + emb_dim, + # Size of MLP dimension in each attention block + mlp_dim, + # Length of the input sequence, i.e., context size + seq_len, + # Enables bias term added to Linear layers + bias, + # Quantization bit-width: For 
now all layers are quantized to the + # same bit-width + bits, + # Type of normalization layer to use in the transformer blocks + # Options are: layer-norm, batch-norm and none + norm="none", + # Type of attention mask to use + # Options are: none, causal or const + mask="none", + # Type of positional encoding to use at the input + # Options are: none, sinusoidal, binary, learned + positional_encoding="none", ): # Initialize the PyTorch Module superclass super().__init__() @@ -623,15 +604,16 @@ def __init__( # bit-width as the input output_quant=None, # Pass quantization information on to the next layer - return_quant_tensor=True + return_quant_tensor=True, ) # Sequence of num_layers transformer encoder blocks - self.encoder = torch.nn.Sequential(*[ - TransformerBlock( - num_heads, emb_dim, mlp_dim, seq_len, bias, norm, mask, bits - ) for _ in range(num_layers) - ]) + self.encoder = torch.nn.Sequential( + *[ + TransformerBlock(num_heads, emb_dim, mlp_dim, seq_len, bias, norm, mask, bits) + for _ in range(num_layers) + ] + ) # Model forward pass taking an input sequence and returning a single set of # class probabilities @@ -642,7 +624,9 @@ def forward(self, x): # single output from the model. 
return unpack_from_quant(self.encoder(self.pos(x))) -### ADAPTED FROM export.py + +# ADAPTED FROM export.py + # Check whether a layer is a normalization layer of some supported type def is_norm_layer(module): @@ -672,21 +656,18 @@ def patch_non_affine_norms(model: torch.nn.Module): # noqa: Shadows model if hasattr(module, "running_var"): # Patch the affine bias by all 1 tensor of the same shape, # type and device as the running variance - module.weight = torch.nn.Parameter( - torch.ones_like(module.running_var) - ) + module.weight = torch.nn.Parameter(torch.ones_like(module.running_var)) # Check whether affine bias parameters are missing if hasattr(module, "bias") and module.bias is None: # There need to be running statistics to patch the scales if hasattr(module, "running_mean"): # Patch the affine bias by all 0 tensor of the same shape, # type and device as the running mean - module.bias = torch.nn.Parameter( - torch.zeros_like(module.running_var) - ) + module.bias = torch.nn.Parameter(torch.zeros_like(module.running_var)) # Return the patched model container return model + template_folding_yaml = """ # Per operator type default configurations defaults: @@ -780,30 +761,31 @@ def patch_non_affine_norms(model: torch.nn.Module): # noqa: Shadows model # ... """ + class bench_transformer(bench): def step_export_onnx(self, output_onnx_path): # Generates a dummy transformer block, # not used for actual models (RadioML, GPT, etc.) 
# Load the parameters file - #params = dvc.api.params_show("params.yaml") + # params = dvc.api.params_show("params.yaml") # Seed all RNGs seed(self.params["seed"]) # Make PyTorch behave deterministically if possible torch.use_deterministic_algorithms(mode=True, warn_only=True) # Create a model instance from the configuration parameters - #model = DummyTransformer(**params["model"]) + # model = DummyTransformer(**params["model"]) model = DummyTransformer( - num_layers = self.params["model_num_layers"], - num_heads = self.params["model_num_heads"], - emb_dim = self.params["model_emb_dim"], - mlp_dim = self.params["model_mlp_dim"], - seq_len = self.params["model_seq_len"], - bias = self.params["model_bias"], - bits = self.params["model_bits"], - norm = self.params["model_norm"], - mask = self.params["model_mask"], - positional_encoding = self.params["model_positional_encoding"], + num_layers=self.params["model_num_layers"], + num_heads=self.params["model_num_heads"], + emb_dim=self.params["model_emb_dim"], + mlp_dim=self.params["model_mlp_dim"], + seq_len=self.params["model_seq_len"], + bias=self.params["model_bias"], + bits=self.params["model_bits"], + norm=self.params["model_norm"], + mask=self.params["model_mask"], + positional_encoding=self.params["model_positional_encoding"], ) # Get the configured sequence length and embedding dimension to generate @@ -813,7 +795,7 @@ def step_export_onnx(self, output_onnx_path): with torch.no_grad(): # Check whether GPU training is available and select the appropriate # device - device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') + device = torch.device("cuda" if torch.cuda.is_available() else "cpu") # Move the model to the training device model = model.to(device) # Multiple passes of calibration might be necessary for larger/deep @@ -840,13 +822,11 @@ def step_export_onnx(self, output_onnx_path): self.build_inputs["input_npy_path"] = "inp.npy" self.build_inputs["output_npy_path"] = "out.npy" # Export the 
model graph to QONNX - #export_qonnx(model, (x,), "attention.onnx", **self.params["export"]) - export_qonnx(model, (x,), output_onnx_path, - opset_version = 14, - do_constant_folding = True) + # export_qonnx(model, (x,), "attention.onnx", **self.params["export"]) + export_qonnx(model, (x,), output_onnx_path, opset_version=14, do_constant_folding=True) def step_build_setup(self): - #with open("params.yaml") as file: + # with open("params.yaml") as file: # params = yaml.safe_load(file) # Seed all RNGs seed(self.params["seed"]) @@ -863,41 +843,38 @@ def step_build_setup(self): else: # for GPTs (why is this different?) model = ModelWrapper(self.build_inputs["onnx_path"]) - _, seq_len, emb_dim = model.get_tensor_shape("/emb_add/input_quant/export_handler/Quant_output_0") + _, seq_len, emb_dim = model.get_tensor_shape( + "/emb_add/input_quant/export_handler/Quant_output_0" + ) # Read the input value range information for the dataset from the parameters # Note: Consider calibrating this on the fly from the dataset - value_range = [ -100, +100 ] # params["build"]["range"] # TODO: make configurable? + value_range = [-100, +100] # params["build"]["range"] # TODO: make configurable? 
input_range = tuple(np.array([value_range]).T) # Construct the seed range information of the input tensor range_info = RangeInfo(shape=(1, seq_len, emb_dim), range=input_range) - + # Prepare config files # TODO: make configurable - # TODO: log intermediate files such as inp.npy, folding.yaml, or specialize_layers.jon as artifacts, maybe create in unique temp dirs + # TODO: log intermediate files such as inp.npy, folding.yaml, + # or specialize_layers.jon as artifacts, maybe create in unique temp dirs specialize_layers_dict = { - "Defaults": { - "preferred_impl_style": ["rtl", ["MVAU", "Thresholding"]] - }, - "": { - "preferred_impl_style": "" - } + "Defaults": {"preferred_impl_style": ["rtl", ["MVAU", "Thresholding"]]}, + "": {"preferred_impl_style": ""}, } with open("specialize_layers.json", "w") as f: - json.dump(specialize_layers_dict, f, indent=2) + json.dump(specialize_layers_dict, f, indent=2) with open("folding.yaml", "w") as f: - f.write(template_folding_yaml) - + f.write(template_folding_yaml) # Create a configuration for building the scaled dot-product attention # operator to a hardware accelerator cfg = build_cfg.DataflowBuildConfig( - folding_config_file = "folding.yaml", - specialize_layers_config_file = "specialize_layers.json", - standalone_thresholds = True, - max_multithreshold_bit_width = 16, - mvau_wwidth_max = 2048, - + folding_config_file="folding.yaml", + specialize_layers_config_file="specialize_layers.json", + standalone_thresholds=True, + max_multithreshold_bit_width=16, + mvau_wwidth_max=2048, verify_steps=[ # Verify the model after converting to the FINN onnx dialect build_cfg.VerificationStepType.QONNX_TO_FINN_PYTHON, @@ -908,7 +885,8 @@ def step_build_setup(self): # converting to HLS build_cfg.VerificationStepType.TIDY_UP_PYTHON, # Verify the model after generating C++ HLS and applying folding - #build_cfg.VerificationStepType.FOLDED_HLS_CPPSIM, #only inserted if live FIFO-sizing is off + # only inserted if live FIFO-sizing is off: + # 
build_cfg.VerificationStepType.FOLDED_HLS_CPPSIM, # No RTL Simulation support for now ], # File with test inputs for verification @@ -963,17 +941,17 @@ def step_build_setup(self): # model before creating the stitched IP # Note: end-to-end verification of the stitched IP in RTL simulation # is still not possible due to missing float IPs - #node_by_node_cppsim, #only inserted if live FIFO-sizing is off + # node_by_node_cppsim, #only inserted if live FIFO-sizing is off # Only for debugging for now, does not work if "vivado" style # StreamingFIFOs are used # node_by_node_rtlsim, "step_create_stitched_ip", # "step_measure_rtlsim_performance", # not possible due to float components - "step_out_of_context_synthesis", # for synthesis results (e.g. utilization) - "step_synthesize_bitfile", + "step_out_of_context_synthesis", # for synthesis results (e.g. utilization) + "step_synthesize_bitfile", "step_make_driver", "step_deployment_package", - ] + ], ) # TESTING custom vs live FIFO-sizing @@ -981,14 +959,16 @@ def step_build_setup(self): # insert default FIFO-sizing step (behind step_generate_estimate_reports) for i in range(len(cfg.steps)): if cfg.steps[i] == "step_generate_estimate_reports": - cfg.steps.insert(i+1, "step_set_fifo_depths") + cfg.steps.insert(i + 1, "step_set_fifo_depths") else: # insert Christoph's custom FIFO-sizing step (behind step_hw_ipgen) for i in range(len(cfg.steps)): if cfg.steps[i] == "step_hw_ipgen": - cfg.steps.insert(i+1, set_fifo_depths(seq_len, emb_dim, uram_threshold=seq_len)) + cfg.steps.insert( + i + 1, set_fifo_depths(seq_len, emb_dim, uram_threshold=seq_len) + ) # also enable cppsim, which doesn't work with virtual FIFOs - cfg.steps.insert(i+2, node_by_node_cppsim) + cfg.steps.insert(i + 2, node_by_node_cppsim) cfg.verify_steps.append(build_cfg.VerificationStepType.FOLDED_HLS_CPPSIM) return cfg diff --git a/src/finn/benchmarking/templates.py b/src/finn/benchmarking/templates.py index c8bf944380..44c2ebced8 100644 --- 
a/src/finn/benchmarking/templates.py +++ b/src/finn/benchmarking/templates.py @@ -1,5 +1,6 @@ # Template strings for benchmarking +# flake8: noqa # power report scripting based on Lucas Reuter: template_open = """ diff --git a/src/finn/benchmarking/util.py b/src/finn/benchmarking/util.py index 23ecc0a984..1e08bd2501 100644 --- a/src/finn/benchmarking/util.py +++ b/src/finn/benchmarking/util.py @@ -1,8 +1,10 @@ # Utility functions for benchmarking -import os, shutil import json +import os +import shutil import xml.etree.ElementTree as ET + def _find_rows_and_headers(table): rows = table.findall("tablerow") headers = [] @@ -13,6 +15,7 @@ def _find_rows_and_headers(table): break return (rows, headers) + def summarize_table(table): table_summary = {} table_summary["headers"] = [] @@ -38,6 +41,7 @@ def summarize_table(table): return table_summary + def summarize_section(section): section_summary = {} section_summary["tables"] = [] @@ -54,6 +58,7 @@ def summarize_section(section): return section_summary + def power_xml_to_dict(xml_path): tree = ET.parse(xml_path) root = tree.getroot() @@ -65,6 +70,7 @@ def power_xml_to_dict(xml_path): return result + def delete_dir_contents(dir): for filename in os.listdir(dir): file_path = os.path.join(dir, filename) @@ -74,7 +80,8 @@ def delete_dir_contents(dir): elif os.path.isdir(file_path): shutil.rmtree(file_path) except Exception as e: - print('Failed to delete %s. Reason: %s' % (file_path, e)) + print("Failed to delete %s. 
Reason: %s" % (file_path, e)) + def merge_dicts(a: dict, b: dict): for key in b: @@ -87,6 +94,7 @@ def merge_dicts(a: dict, b: dict): a[key] = b[key] return a + def merge_logs(log_a, log_b, log_out): # merges json log (list of nested dicts) b into a, not vice versa (TODO) @@ -98,8 +106,8 @@ def merge_logs(log_a, log_b, log_out): for idx, run_a in enumerate(a): for run_b in b: if run_a["run_id"] == run_b["run_id"]: - #a[idx] |= run_b # requires Python >= 3.9 - #a[idx] = {**run_a, **run_b} + # a[idx] |= run_b # requires Python >= 3.9 + # a[idx] = {**run_a, **run_b} a[idx] = merge_dicts(run_a, run_b) break diff --git a/src/finn/builder/custom_step_library/conv1d.py b/src/finn/builder/custom_step_library/conv1d.py index 5545f66536..f6de8edaae 100644 --- a/src/finn/builder/custom_step_library/conv1d.py +++ b/src/finn/builder/custom_step_library/conv1d.py @@ -1,9 +1,10 @@ from qonnx.core.modelwrapper import ModelWrapper -from finn.builder.build_dataflow_config import DataflowBuildConfig from qonnx.transformation.change_3d_tensors_to_4d import Change3DTo4DTensors from qonnx.transformation.general import GiveUniqueNodeNames + import finn.transformation.fpgadataflow.convert_to_hw_layers as to_hw import finn.transformation.streamline.absorb as absorb +from finn.builder.build_dataflow_config import DataflowBuildConfig def step_pre_streamline(model: ModelWrapper, cfg: DataflowBuildConfig): @@ -11,6 +12,7 @@ def step_pre_streamline(model: ModelWrapper, cfg: DataflowBuildConfig): model = model.transform(absorb.AbsorbScalarMulAddIntoTopK()) return model + def step_convert_final_layers(model: ModelWrapper, cfg: DataflowBuildConfig): model = model.transform(to_hw.InferChannelwiseLinearLayer()) model = model.transform(to_hw.InferLabelSelectLayer()) diff --git a/src/finn/builder/custom_step_library/mobilenet.py b/src/finn/builder/custom_step_library/mobilenet.py index 6a2d8053b2..0c251ad299 100644 --- a/src/finn/builder/custom_step_library/mobilenet.py +++ 
b/src/finn/builder/custom_step_library/mobilenet.py @@ -1,12 +1,7 @@ -from finn.benchmarking.bench_base import bench from qonnx.core.modelwrapper import ModelWrapper from qonnx.transformation.change_datalayout import ChangeDataLayoutQuantAvgPool2d from qonnx.transformation.double_to_single_float import DoubleToSingleFloat -from qonnx.transformation.general import ( - ApplyConfig, - GiveReadableTensorNames, - GiveUniqueNodeNames, -) +from qonnx.transformation.general import ApplyConfig, GiveReadableTensorNames, GiveUniqueNodeNames from qonnx.transformation.infer_data_layouts import InferDataLayouts from qonnx.transformation.infer_datatypes import InferDataTypes from qonnx.transformation.infer_shapes import InferShapes @@ -116,4 +111,4 @@ def step_mobilenet_convert_to_hw_layers_separate_th(model: ModelWrapper, cfg: Da model = model.transform(InferShapes()) model = model.transform(GiveUniqueNodeNames()) model = model.transform(GiveReadableTensorNames()) - return model \ No newline at end of file + return model diff --git a/src/finn/builder/custom_step_library/resnet.py b/src/finn/builder/custom_step_library/resnet.py index a4082b1adf..3e1c61063b 100644 --- a/src/finn/builder/custom_step_library/resnet.py +++ b/src/finn/builder/custom_step_library/resnet.py @@ -34,7 +34,6 @@ from qonnx.transformation.double_to_single_float import DoubleToSingleFloat from qonnx.transformation.fold_constants import FoldConstants from qonnx.transformation.general import ( - ApplyConfig, ConvertDivToMul, ConvertSubToAdd, GiveReadableTensorNames, @@ -52,7 +51,7 @@ from qonnx.transformation.remove import RemoveIdentityOps import finn.transformation.fpgadataflow.convert_to_hw_layers as to_hw -from finn.builder.build_dataflow_config import DataflowBuildConfig, ShellFlowType +from finn.builder.build_dataflow_config import DataflowBuildConfig from finn.transformation.move_reshape import RemoveCNVtoFCFlatten from finn.transformation.streamline.absorb import ( Absorb1BitMulIntoConv, diff --git 
a/src/finn/builder/custom_step_library/transformer.py b/src/finn/builder/custom_step_library/transformer.py index 5b0d39c756..79cfa29353 100644 --- a/src/finn/builder/custom_step_library/transformer.py +++ b/src/finn/builder/custom_step_library/transformer.py @@ -4,6 +4,7 @@ # Copies (deep-copies) python objects import copy +import json # Numpy for loading and comparing the verification input/output import numpy as np @@ -11,8 +12,6 @@ # YAML for loading experiment configurations import yaml -import json - # QONNX quantization data types from qonnx.core.datatype import DataType @@ -113,10 +112,7 @@ from finn.transformation.fpgadataflow.set_exec_mode import SetExecMode # Splitting and removing of FIFOs from the model graph -from finn.transformation.fpgadataflow.set_fifo_depths import ( - RemoveShallowFIFOs, - SplitLargeFIFOs, -) +from finn.transformation.fpgadataflow.set_fifo_depths import RemoveShallowFIFOs, SplitLargeFIFOs # Graph transformation setting the folding, i.e., parallelization configuration from finn.transformation.fpgadataflow.set_folding import SetFolding @@ -130,15 +126,12 @@ # Folds quantizers into weight tensor initializers, needed for lowering # convolutions to MatMuls from finn.transformation.qonnx.fold_quant_weights import FoldQuantWeights -from finn.transformation.qonnx.quant_act_to_multithreshold import ( - default_filter_function_generator, -) +from finn.transformation.qonnx.quant_act_to_multithreshold import default_filter_function_generator # Cleanup transformation getting rid of 3d data layout from finn.transformation.squeeze import Squeeze from finn.transformation.streamline.absorb import ( AbsorbAddIntoMultiThreshold, - AbsorbMulIntoMultiThreshold, AbsorbSignBiasIntoMultiThreshold, ) @@ -148,14 +141,10 @@ # FINN streamlining transformations removing nodes without real effect from the # graph -from finn.transformation.streamline.remove import ( - RemoveIdentityReshape, - RemoveIdentityTranspose, -) +from 
finn.transformation.streamline.remove import RemoveIdentityReshape, RemoveIdentityTranspose # FINN streamlining transformations reordering the graph from finn.transformation.streamline.reorder import ( - MoveAddPastMul, MoveMulPastAdd, MoveSqueezePastMatMul, MoveSqueezePastMultiThreshold, diff --git a/src/finn/qnn-data/templates/driver/validate.py b/src/finn/qnn-data/templates/driver/validate.py index 16f1e7a029..0e2bc27114 100644 --- a/src/finn/qnn-data/templates/driver/validate.py +++ b/src/finn/qnn-data/templates/driver/validate.py @@ -27,14 +27,15 @@ # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. import argparse -import os +import json import numpy as np -from PIL import Image +import os from dataset_loading import FileQueue, ImgQueue -import json -from pynq import PL from driver import io_shape_dict from driver_base import FINNExampleOverlay +from PIL import Image +from pynq import PL + def img_resize(img, size): w, h = img.size @@ -49,13 +50,15 @@ def img_resize(img, size): ow = int(size * w / h) return img.resize((ow, oh), Image.BILINEAR) + def img_center_crop(img, size): crop_height, crop_width = (size, size) image_width, image_height = img.size - crop_top = int(round((image_height - crop_height) / 2.)) - crop_left = int(round((image_width - crop_width) / 2.)) + crop_top = int(round((image_height - crop_height) / 2.0)) + crop_left = int(round((image_width - crop_width) / 2.0)) return img.crop((crop_left, crop_top, crop_left + crop_width, crop_top + crop_height)) + def pre_process(img_np): img = Image.fromarray(img_np.astype(np.uint8)) img = img_resize(img, 256) @@ -63,9 +66,10 @@ def pre_process(img_np): img = np.array(img, dtype=np.uint8) return img -def setup_dataloader(val_path, label_file_path = None, batch_size=100, n_images = 50000): + +def setup_dataloader(val_path, label_file_path=None, batch_size=100, n_images=50000): if label_file_path is None: - val_folders = [ f.name for f in os.scandir(val_path) if f.is_dir() ] + 
val_folders = [f.name for f in os.scandir(val_path) if f.is_dir()] val_folders = sorted(val_folders) assert len(val_folders) == 1000, "Expected 1000 subfolders in ILSVRC2012 val" files = [] @@ -74,18 +78,19 @@ def setup_dataloader(val_path, label_file_path = None, batch_size=100, n_images current_files = sorted(os.listdir(os.path.join(val_path, folder))) current_files = [os.path.join(folder, file) for file in current_files] files.extend(current_files) - labels.extend([idx]*len(current_files)) + labels.extend([idx] * len(current_files)) files = files[:n_images] else: - files = ['ILSVRC2012_val_{:08d}.JPEG'.format(i) for i in range(1,n_images+1)] + files = ["ILSVRC2012_val_{:08d}.JPEG".format(i) for i in range(1, n_images + 1)] labels = np.loadtxt(label_file_path, dtype=int, usecols=1) file_queue = FileQueue() - file_queue.load_epochs(list(zip(files,labels)), shuffle=False) + file_queue.load_epochs(list(zip(files, labels)), shuffle=False) img_queue = ImgQueue(maxsize=batch_size) img_queue.start_loaders(file_queue, num_threads=1, img_dir=val_path, transform=pre_process) return img_queue + if __name__ == "__main__": parser = argparse.ArgumentParser( description="Validate top-1 accuracy for FINN-generated accelerator" @@ -93,7 +98,9 @@ def setup_dataloader(val_path, label_file_path = None, batch_size=100, n_images parser.add_argument( "--batchsize", help="number of samples for inference", type=int, default=100 ) - parser.add_argument("--dataset", help="dataset to use (mnist, cifar10, cifar100, imagenet)", default="") + parser.add_argument( + "--dataset", help="dataset to use (mnist, cifar10, cifar100, imagenet)", default="" + ) parser.add_argument( "--platform", help="Target platform: zynq-iodma alveo", default="zynq-iodma" ) @@ -154,6 +161,7 @@ def setup_dataloader(val_path, label_file_path = None, batch_size=100, n_images ) elif dataset == "cifar100": from dataset_loading import cifar + trainx, trainy, testx, testy, valx, valy = cifar.load_cifar_data( dataset_root, 
download=True, one_hot=False, cifar10=False ) @@ -184,7 +192,7 @@ def setup_dataloader(val_path, label_file_path = None, batch_size=100, n_images ibuf_normal = test_imgs[i].reshape(driver.ishape_normal()) exp = test_labels[i] obuf_normal = driver.execute(ibuf_normal) - #obuf_normal = obuf_normal.reshape(bsize, -1)[:,0] + # obuf_normal = obuf_normal.reshape(bsize, -1)[:,0] if obuf_normal.shape[1] > 1: obuf_normal = np.argmax(obuf_normal, axis=1) ret = np.bincount(obuf_normal.flatten() == exp.flatten(), minlength=2) @@ -202,7 +210,7 @@ def setup_dataloader(val_path, label_file_path = None, batch_size=100, n_images exp = np.array(lbls) ibuf_normal = imgs.reshape(driver.ishape_normal()) obuf_normal = driver.execute(ibuf_normal) - #obuf_normal = obuf_normal.reshape(bsize, -1)[:,0] + # obuf_normal = obuf_normal.reshape(bsize, -1)[:,0] if obuf_normal.shape[1] > 1: obuf_normal = np.argmax(obuf_normal, axis=1) ret = np.bincount(obuf_normal.flatten() == exp.flatten(), minlength=2) From 3a84a57f8584e669cfa80bdc65465aa52d8a21bb Mon Sep 17 00:00:00 2001 From: Felix Jentzsch Date: Fri, 23 May 2025 16:08:48 +0200 Subject: [PATCH 115/125] Change log level --- src/finn/benchmarking/bench_base.py | 40 ++++++++++++++++------------- 1 file changed, 22 insertions(+), 18 deletions(-) diff --git a/src/finn/benchmarking/bench_base.py b/src/finn/benchmarking/bench_base.py index 5cebe09878..01e42b9c2a 100644 --- a/src/finn/benchmarking/bench_base.py +++ b/src/finn/benchmarking/bench_base.py @@ -3,6 +3,7 @@ import os import shutil import subprocess +import yaml from shutil import copy as shcopy from shutil import copytree @@ -18,8 +19,6 @@ from finn.builder.build_dataflow_config import DataflowBuildConfig from finn.util.basic import alveo_default_platform, alveo_part_map, part_map -# TODO: merge this file into bench.py once most functionality has been moved to builder - def start_test_batch_fast(results_path, project_path, run_target, pairs): # Prepare tcl script @@ -109,7 +108,7 @@ def 
__init__(self, params, task_id, run_id, work_dir, artifacts_dir, save_dir, d # TODO: coordinate with new builder loggin setup # Setup some basic global default configuration - # TODO: are these class members even used anymore? + # TODO: clean up or remove these attributes if "synth_clk_period_ns" in params: self.clock_period_ns = params["synth_clk_period_ns"] else: @@ -136,6 +135,23 @@ def __init__(self, params, task_id, run_id, work_dir, artifacts_dir, save_dir, d else: self.params["shell_flow_type"] = build_cfg.ShellFlowType.VIVADO_ZYNQ + # Load custom (= non build_dataflow_config) parameters from topology-specific .yml + custom_params = [ + "model_dir", # used to setup onnx/npy input + "model_path", # used to setup onnx/npy input + # model-gen parameters, such as seed, simd, pe, etc. + # TODO: separate these more cleanly from builder options + ] + + dut_yaml_name = self.params["dut"] + ".yml" + dut_path = os.path.join(os.path.dirname(__file__), "dut", dut_yaml_name) + if os.path.isfile(dut_path): + with open(dut_path, "r") as f: + dut_cfg = yaml.load(f, Loader=yaml.SafeLoader) + for key in dut_cfg: + if key in custom_params: + self.params[key] = dut_cfg[key] + # Clear FINN tmp build dir before every run print("Clearing FINN BUILD DIR ahead of run") delete_dir_contents(os.environ["FINN_BUILD_DIR"]) @@ -244,15 +260,6 @@ def step_parse_builder_output(self, build_dir): def steps_full_build_flow(self): # Default step sequence for benchmarking a full FINN builder flow - - # LIST OF ADDITIONAL YAML OPTIONS (beyond DataflowBuildConfig) - custom_params = [ - "model_dir", # used to setup onnx/npy input - "model_path", # used to setup onnx/npy input - # model-gen parameters, such as seed, simd, pe, etc. 
- # TODO: separate these from builder options - ] - # MODEL CREATION/IMPORT # TODO: track fixed input onnx models with DVC if "model_dir" in self.params: @@ -281,7 +288,8 @@ def steps_full_build_flow(self): # enable extra performance optimizations (physopt) # TODO: check OMX synth strategy again! cfg.vitis_opt_strategy = build_cfg.VitisOptStrategy.PERFORMANCE_BEST - cfg.verbose = False + cfg.verbose = True + cfg.console_log_level = "ERROR" cfg.enable_build_pdb_debug = False # cfg.stitched_ip_gen_dcp = False # only needed for further manual integration cfg.force_python_rtlsim = False @@ -294,14 +302,10 @@ def steps_full_build_flow(self): # cfg.large_fifo_mem_style # Overwrite build config settings with run-specific YAML build definition + # TODO: warn/error if there are unrecognized options set? for key in self.params: if hasattr(cfg, key): setattr(cfg, key, self.params[key]) - else: - if key not in custom_params: - pass - # TODO: be more strict? support custom extra options like MetaFi uses? - # raise Exception("Unrecognized builder config defined in YAML: %s" % key) # Default of 1M cycles is insufficient for MetaFi (6M) and RN-50 (2.5M) # TODO: make configurable or set on pipeline level? 
From 6054c6b02a7ba4f9aaff94fb21bbf2e401e4166b Mon Sep 17 00:00:00 2001 From: Felix Jentzsch Date: Fri, 23 May 2025 17:50:38 +0200 Subject: [PATCH 116/125] dvc pull before saving dvclive experiments --- ci/.gitlab-bench.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/ci/.gitlab-bench.yml b/ci/.gitlab-bench.yml index b5d17d7fdc..6ddeb11858 100644 --- a/ci/.gitlab-bench.yml +++ b/ci/.gitlab-bench.yml @@ -75,5 +75,7 @@ Result Collection: # Also run on failure of previous tasks to collect partial results - when: always script: + # pulling models seems to be needed for dvclive to save experiments, even though they are not used or modified + - dvc pull - python3.10 ci/collect.py - dvc exp push -f -j 4 -r push git@github.com:eki-project/finn-plus.git From 8dd0c0854df0c121c2c1afe7c62970775d8d0ba2 Mon Sep 17 00:00:00 2001 From: Felix Jentzsch Date: Fri, 23 May 2025 22:16:01 +0200 Subject: [PATCH 117/125] Fix report dir creation --- src/finn/builder/build_dataflow.py | 8 ++++---- src/finn/transformation/fpgadataflow/make_driver.py | 2 +- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/src/finn/builder/build_dataflow.py b/src/finn/builder/build_dataflow.py index 2d38be3ab3..020571a6ad 100644 --- a/src/finn/builder/build_dataflow.py +++ b/src/finn/builder/build_dataflow.py @@ -171,7 +171,7 @@ def build_dataflow_cfg(model_filename, cfg: DataflowBuildConfig): print(f"Final outputs will be generated in {cfg.output_dir}") print(f"Build log is at {cfg.output_dir}/build_dataflow.log") # create the output dir if it doesn't exist - os.makedirs(cfg.output_dir, exist_ok=True) + os.makedirs(os.path.join(cfg.output_dir, "report"), exist_ok=True) # set up logger logpath = os.path.join(cfg.output_dir, "build_dataflow.log") @@ -285,7 +285,7 @@ def build_dataflow_cfg(model_filename, cfg: DataflowBuildConfig): "status": "failed", "tool_version": os.path.basename(os.environ.get("XILINX_VIVADO")), } - with open(cfg.output_dir + "/report/metadata_builder.json", "w") as f: + 
with open(os.path.join(cfg.output_dir, "report/metadata_builder.json"), "w") as f: json.dump(metadata, f, indent=2) return -1 # A user error shouldn't be need to be fixed using PDB @@ -297,7 +297,7 @@ def build_dataflow_cfg(model_filename, cfg: DataflowBuildConfig): "status": "failed", "tool_version": os.path.basename(os.environ.get("XILINX_VIVADO")), } - with open(cfg.output_dir + "/report/metadata_builder.json", "w") as f: + with open(os.path.join(cfg.output_dir, "report/metadata_builder.json"), "w") as f: json.dump(metadata, f, indent=2) return -1 @@ -308,7 +308,7 @@ def build_dataflow_cfg(model_filename, cfg: DataflowBuildConfig): "status": "ok", "tool_version": os.path.basename(os.environ.get("XILINX_VIVADO")), } - with open(cfg.output_dir + "/report/metadata_builder.json", "w") as f: + with open(os.path.join(cfg.output_dir, "report/metadata_builder.json"), "w") as f: json.dump(metadata, f, indent=2) print("Completed successfully") return 0 diff --git a/src/finn/transformation/fpgadataflow/make_driver.py b/src/finn/transformation/fpgadataflow/make_driver.py index 76880cb558..4b1e70369b 100644 --- a/src/finn/transformation/fpgadataflow/make_driver.py +++ b/src/finn/transformation/fpgadataflow/make_driver.py @@ -312,7 +312,7 @@ class MakePYNQDriverIODMA(Transformation): under the runtime_weights/ subfolder of the pynq_driver_dir. 
""" - def __init__(self, platform, validation_datset): + def __init__(self, platform, validation_datset=None): super().__init__() self.platform = platform self.validation_datset = validation_datset From 807950b2a8764b1da3a68cc8ef5b6976b3456e77 Mon Sep 17 00:00:00 2001 From: Felix Jentzsch Date: Mon, 26 May 2025 12:01:49 +0200 Subject: [PATCH 118/125] Use live FIFO sizes for MNv1, RN50 --- models.dvc | 6 +++--- src/finn/benchmarking/dut/mobilenetv1.yml | 2 +- src/finn/benchmarking/dut/resnet50.yml | 2 +- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/models.dvc b/models.dvc index 784500a21f..35b5292128 100644 --- a/models.dvc +++ b/models.dvc @@ -1,6 +1,6 @@ outs: -- md5: 5db49af689e7827c32280837e0c80470.dir - size: 202993533 - nfiles: 40 +- md5: 20c3f996d17ef035c8189c0d0ac44cf6.dir + size: 203029833 + nfiles: 42 hash: md5 path: models diff --git a/src/finn/benchmarking/dut/mobilenetv1.yml b/src/finn/benchmarking/dut/mobilenetv1.yml index bb3b26f436..16a68f4143 100644 --- a/src/finn/benchmarking/dut/mobilenetv1.yml +++ b/src/finn/benchmarking/dut/mobilenetv1.yml @@ -1,5 +1,5 @@ model_path: models/mobilenetv1/mobilenetv1-w4a4_pre_post_tidy_opset-11.onnx -folding_config_file: models/mobilenetv1/ZCU102_folding_config.json +folding_config_file: models/mobilenetv1/ZCU102_folding_config_live_fifo.json specialize_layers_config_file: models/mobilenetv1/ZCU102_specialize_layers.json steps: diff --git a/src/finn/benchmarking/dut/resnet50.yml b/src/finn/benchmarking/dut/resnet50.yml index 3a3211aad1..c8779e5654 100644 --- a/src/finn/benchmarking/dut/resnet50.yml +++ b/src/finn/benchmarking/dut/resnet50.yml @@ -1,5 +1,5 @@ model_path: models/resnet50/resnet50_w1a2_exported.onnx -folding_config_file: models/resnet50/U250_folding_config.json +folding_config_file: models/resnet50/U250_folding_config_live_fifo.json specialize_layers_config_file: models/resnet50/U250_specialize_layers.json vitis_floorplan_file: models/resnet50/floorplan_resnet50.json From 
94abf2c7c795650476b35fc4bbd180451524d9e3 Mon Sep 17 00:00:00 2001 From: Felix Jentzsch Date: Mon, 26 May 2025 14:30:49 +0200 Subject: [PATCH 119/125] Make console and log output more consistent --- src/finn/benchmarking/bench.py | 13 ++- src/finn/builder/build_dataflow.py | 120 +++++++++++----------- src/finn/builder/build_dataflow_config.py | 8 +- 3 files changed, 70 insertions(+), 71 deletions(-) diff --git a/src/finn/benchmarking/bench.py b/src/finn/benchmarking/bench.py index 995b3b565c..765e14e587 100644 --- a/src/finn/benchmarking/bench.py +++ b/src/finn/benchmarking/bench.py @@ -165,13 +165,12 @@ def get_default_session_options_new(): result = bench_object.run() if result == "skipped": log_dict["status"] = "skipped" - print("Run skipped") + print("BENCH RUN SKIPPED") else: log_dict["status"] = "ok" - print("Run successfully completed") except Exception: log_dict["status"] = "failed" - print("Run failed: " + traceback.format_exc()) + print("BENCH RUN FAILED WITH EXCEPTION: " + traceback.format_exc()) exit_code = 1 log_dict["output"] = bench_object.output_dict @@ -183,8 +182,12 @@ def get_default_session_options_new(): with open(builder_log_path, "r") as f: builder_log = json.load(f) if builder_log["status"] == "failed": - print("Run failed (builder reported failure)") + print("BENCH RUN FAILED (BUILDER REPORTED FAILURE)") exit_code = 1 + else: + print("BENCH RUN COMPLETED (BUILDER REPORTED SUCCESS)") + else: + print("BENCH RUN COMPLETED") # log metadata of this run to its own report directory log_path = os.path.join(bench_object.report_dir, "metadata_bench.json") @@ -196,5 +199,5 @@ def get_default_session_options_new(): # save local artifacts of this run (e.g., full build dir, detailed debug info) bench_object.save_local_artifacts_collection() - print("Stopping job") + print("STOPPING JOB") return exit_code diff --git a/src/finn/builder/build_dataflow.py b/src/finn/builder/build_dataflow.py index 020571a6ad..f96f205e72 100644 --- 
a/src/finn/builder/build_dataflow.py +++ b/src/finn/builder/build_dataflow.py @@ -39,8 +39,10 @@ import sys import time from qonnx.core.modelwrapper import ModelWrapper +from rich import print as rprint from rich.console import Console from rich.logging import RichHandler +from rich.traceback import Traceback from finn.builder.build_dataflow_config import DataflowBuildConfig, default_build_dataflow_steps from finn.builder.build_dataflow_steps import build_dataflow_step_lookup @@ -159,21 +161,41 @@ def resolve_step_filename(step_name: str, cfg: DataflowBuildConfig, step_delta: return filename +def log_and_exit(cfg: DataflowBuildConfig, time_per_step: dict = None, exit_code: int = 0): + if exit_code: + print("Build failed") + status = "failed" + else: + print("Build completed successfully") + status = "ok" + + # Generate metadata_builder.json + metadata = { + "status": status, + "tool_version": os.path.basename(os.environ.get("XILINX_VIVADO")), + } + with open(os.path.join(cfg.output_dir, "report/metadata_builder.json"), "w") as f: + json.dump(metadata, f, indent=2) + + # Generate time_per_step.json + if time_per_step is not None: + time_per_step["total_build_time"] = sum(time_per_step.values()) + with open(os.path.join(cfg.output_dir, "report/time_per_step.json"), "w") as f: + json.dump(time_per_step, f, indent=2) + + return exit_code + + def build_dataflow_cfg(model_filename, cfg: DataflowBuildConfig): """Best-effort build a dataflow accelerator using the given configuration. 
:param model_filename: ONNX model filename to build :param cfg: Build configuration """ - finn_build_dir = os.environ["FINN_BUILD_DIR"] - - print(f"Intermediate outputs will be generated in {finn_build_dir}") - print(f"Final outputs will be generated in {cfg.output_dir}") - print(f"Build log is at {cfg.output_dir}/build_dataflow.log") - # create the output dir if it doesn't exist + # Create the output (report) dir if it doesn't exist os.makedirs(os.path.join(cfg.output_dir, "report"), exist_ok=True) - # set up logger + # Set up logger logpath = os.path.join(cfg.output_dir, "build_dataflow.log") if cfg.verbose: logging.basicConfig( @@ -195,15 +217,16 @@ def build_dataflow_cfg(model_filename, cfg: DataflowBuildConfig): log = logging.getLogger("build_dataflow") - # mirror stdout and stderr to log + # Mirror stdout and stderr to log sys.stdout = PrintLogger(log, logging.INFO, sys.stdout) sys.stderr = PrintLogger(log, logging.ERROR, sys.stderr) console = Console(file=sys.stdout.console) + # Set up console logger if cfg.console_log_level != "NONE": - # set up console logger - consoleHandler = RichHandler(show_time=True, show_path=False, console=console) - + consoleHandler = RichHandler( + show_time=True, log_time_format="[%Y-%m-%d %H:%M:%S]", show_path=False, console=console + ) if cfg.console_log_level == "DEBUG": consoleHandler.setLevel(logging.DEBUG) elif cfg.console_log_level == "INFO": @@ -216,9 +239,13 @@ def build_dataflow_cfg(model_filename, cfg: DataflowBuildConfig): consoleHandler.setLevel(logging.CRITICAL) logging.getLogger().addHandler(consoleHandler) - # Setup done, start processing + print(f"Intermediate outputs will be generated in {os.environ['FINN_BUILD_DIR']}") + print(f"Final outputs will be generated in {cfg.output_dir}") + print(f"Build log is at {cfg.output_dir}/build_dataflow.log") + + # Setup done, start build flow try: - # if start_step is specified, override the input model + # If start_step is specified, override the input model if 
cfg.start_step is None: print(f"Building dataflow accelerator from {model_filename}") model = ModelWrapper(model_filename) @@ -240,7 +267,7 @@ def build_dataflow_cfg(model_filename, cfg: DataflowBuildConfig): model = ModelWrapper(intermediate_model_filename) assert type(model) is ModelWrapper - # start processing + # Start processing step_num = 1 time_per_step = dict() build_dataflow_steps = resolve_build_steps(cfg) @@ -249,7 +276,7 @@ def build_dataflow_cfg(model_filename, cfg: DataflowBuildConfig): step_name = transform_step.__name__ print(f"Running step: {step_name} [{step_num}/{len(build_dataflow_steps)}]") - # run the step + # Run the step step_start = time.time() model = transform_step(model, cfg) step_end = time.time() @@ -263,55 +290,24 @@ def build_dataflow_cfg(model_filename, cfg: DataflowBuildConfig): step_num += 1 except KeyboardInterrupt: print("KeyboardInterrupt detected. Aborting...") - print("Build failed") - return -1 + return log_and_exit(cfg, time_per_step, -1) except (Exception, FINNError) as e: - # Print full traceback if we are on debug log level - # or encountered a non-user error - print_full_traceback = True - if issubclass(type(e), FINNUserError) and log.level != logging.DEBUG: - print_full_traceback = False - - extype, value, tb = sys.exc_info() - if print_full_traceback: - # print exception info and traceback - log.error("FINN Internal compiler error:") - console.print_exception(show_locals=False) + if issubclass(type(e), FINNUserError): + # Handle FINN USER ERROR + log.error(f"FINN ERROR: {e}") else: - console.print(f"[bold red]FINN Error: [/bold red]{e}") - log.error(f"{e}") - print("Build failed") - metadata = { - "status": "failed", - "tool_version": os.path.basename(os.environ.get("XILINX_VIVADO")), - } - with open(os.path.join(cfg.output_dir, "report/metadata_builder.json"), "w") as f: - json.dump(metadata, f, indent=2) - return -1 # A user error shouldn't be need to be fixed using PDB - - # start postmortem debug if configured - 
if cfg.enable_build_pdb_debug: - pdb.post_mortem(tb) - print("Build failed") - metadata = { - "status": "failed", - "tool_version": os.path.basename(os.environ.get("XILINX_VIVADO")), - } - with open(os.path.join(cfg.output_dir, "report/metadata_builder.json"), "w") as f: - json.dump(metadata, f, indent=2) - return -1 - - time_per_step["total_build_time"] = sum(time_per_step.values()) - with open(os.path.join(cfg.output_dir, "report/time_per_step.json"), "w") as f: - json.dump(time_per_step, f, indent=2) - metadata = { - "status": "ok", - "tool_version": os.path.basename(os.environ.get("XILINX_VIVADO")), - } - with open(os.path.join(cfg.output_dir, "report/metadata_builder.json"), "w") as f: - json.dump(metadata, f, indent=2) - print("Completed successfully") - return 0 + # Handle remaining errors (= FINN INTERNAL COMPILER ERROR) + log.error(f"FINN INTERNAL COMPILER ERROR: {e}") + + # Print traceback for internal errors or if in debug mode + if not issubclass(type(e), FINNUserError) or log.level == logging.DEBUG: + rprint(Traceback(show_locals=False)) + # Start postmortem debug if configured + if cfg.enable_build_pdb_debug: + pdb.post_mortem(e.__traceback__) + + return log_and_exit(cfg, time_per_step, -1) + return log_and_exit(cfg, time_per_step, 0) def build_dataflow_directory(path_to_cfg_dir: str): diff --git a/src/finn/builder/build_dataflow_config.py b/src/finn/builder/build_dataflow_config.py index b14bcab1d4..57204c5745 100644 --- a/src/finn/builder/build_dataflow_config.py +++ b/src/finn/builder/build_dataflow_config.py @@ -350,14 +350,14 @@ class DataflowBuildConfig(DataClassJSONMixin, DataClassYAMLMixin): #: Whether pdb postmortem debuggig will be launched when the build fails enable_build_pdb_debug: Optional[bool] = False - #: When True, additional verbose information will be written to the log file. - #: Otherwise, these additional information will be suppressed. + #: When True, additional information (level = DEBUG) will be written to the log file. 
+ #: Otherwise, this additional information will be suppressed (level = INFO). verbose: Optional[bool] = False #: Log level to be used on the command line for finn-plus internal logging. - #: This is different from the log level used for the build process, + #: This is different from the log level used for build_dataflow.log, #: which is controlled using the verbose flag. - console_log_level: Optional[LogLevel] = LogLevel.NONE + console_log_level: Optional[LogLevel] = LogLevel.ERROR #: If given, only run the steps in the list. If not, run default steps. #: See `default_build_dataflow_steps` for the default list of steps. From 9899d542efbee4756842d2b01ce15e5f47539680 Mon Sep 17 00:00:00 2001 From: Felix Jentzsch Date: Mon, 26 May 2025 16:25:51 +0200 Subject: [PATCH 120/125] More verbose benchmarking logging --- src/finn/benchmarking/bench.py | 25 +++++++++++++++++-------- 1 file changed, 17 insertions(+), 8 deletions(-) diff --git a/src/finn/benchmarking/bench.py b/src/finn/benchmarking/bench.py index 765e14e587..5f29959712 100644 --- a/src/finn/benchmarking/bench.py +++ b/src/finn/benchmarking/bench.py @@ -134,8 +134,9 @@ def get_default_session_options_new(): # Run benchmark # TODO: integrate this loop (especially status logging) into the bench class - # TODO: log stdout of individual tasks of the job array into seperate files as artifacts - # (GitLab web interface is not readable), coordinate with new logging + successful_runs = [] + skipped_runs = [] + failed_runs = [] for run, run_id in enumerate(selected_runs): print( "Starting run %d/%d (id %d of %d total runs)" @@ -165,12 +166,14 @@ def get_default_session_options_new(): result = bench_object.run() if result == "skipped": log_dict["status"] = "skipped" - print("BENCH RUN SKIPPED") + print("BENCH RUN %d SKIPPED" % run_id) + skipped_runs.append(run_id) else: log_dict["status"] = "ok" except Exception: log_dict["status"] = "failed" - print("BENCH RUN FAILED WITH EXCEPTION: " + traceback.format_exc()) + 
print("BENCH RUN %d FAILED WITH EXCEPTION: %s" % (run_id, traceback.format_exc())) + failed_runs.append(run_id) exit_code = 1 log_dict["output"] = bench_object.output_dict @@ -182,12 +185,15 @@ def get_default_session_options_new(): with open(builder_log_path, "r") as f: builder_log = json.load(f) if builder_log["status"] == "failed": - print("BENCH RUN FAILED (BUILDER REPORTED FAILURE)") + print("BENCH RUN %d FAILED (BUILDER REPORTED FAILURE)" % run_id) + failed_runs.append(run_id) exit_code = 1 else: - print("BENCH RUN COMPLETED (BUILDER REPORTED SUCCESS)") + print("BENCH RUN %d COMPLETED (BUILDER REPORTED SUCCESS)" % run_id) + successful_runs.append(run_id) else: - print("BENCH RUN COMPLETED") + print("BENCH RUN %d COMPLETED" % run_id) + successful_runs.append(run_id) # log metadata of this run to its own report directory log_path = os.path.join(bench_object.report_dir, "metadata_bench.json") @@ -199,5 +205,8 @@ def get_default_session_options_new(): # save local artifacts of this run (e.g., full build dir, detailed debug info) bench_object.save_local_artifacts_collection() - print("STOPPING JOB") + print("STOPPING JOB %d (of %d total jobs)" % (task_id, task_count)) + print("JOB %d SUCCESSFUL RUNS: %s" % (task_id, successful_runs)) + print("JOB %d SKIPPED RUNS: %s" % (task_id, skipped_runs)) + print("JOB %d FAILED RUNS: %s" % (task_id, failed_runs)) return exit_code From 83a328d79cb0035a4f7cc015e6ed1761d43142a3 Mon Sep 17 00:00:00 2001 From: Felix Jentzsch Date: Mon, 26 May 2025 17:56:57 +0200 Subject: [PATCH 121/125] Fix paths to moved report files --- notebooks/advanced/4_advanced_builder_settings.ipynb | 4 ++-- .../cybersecurity/3-build-accelerator-with-finn.ipynb | 4 ++-- src/finn/builder/build_dataflow.py | 4 ++++ tests/end2end/test_end2end_cybsec_mlp.py | 4 ++-- tests/fpgadataflow/test_fifosizing.py | 2 +- tests/util/test_build_dataflow.py | 6 +++--- 6 files changed, 14 insertions(+), 10 deletions(-) diff --git 
a/notebooks/advanced/4_advanced_builder_settings.ipynb b/notebooks/advanced/4_advanced_builder_settings.ipynb index 1e544cf513..73ae7f555c 100644 --- a/notebooks/advanced/4_advanced_builder_settings.ipynb +++ b/notebooks/advanced/4_advanced_builder_settings.ipynb @@ -964,7 +964,7 @@ "source": [ "import json\n", "\n", - "with open(build_dir+\"/output_pre_and_post_proc/auto_folding_config.json\", 'r') as json_file:\n", + "with open(build_dir+\"/output_pre_and_post_proc/report/auto_folding_config.json\", 'r') as json_file:\n", " folding_config = json.load(json_file)\n", "\n", "print(json.dumps(folding_config, indent=1))" @@ -1035,7 +1035,7 @@ "metadata": {}, "outputs": [], "source": [ - "with open(build_dir+\"/output_pre_and_post_proc/auto_folding_config.json\", 'r') as json_file:\n", + "with open(build_dir+\"/output_pre_and_post_proc/report/auto_folding_config.json\", 'r') as json_file:\n", " folding_config = json.load(json_file)\n", "\n", "# Set all ram_style to LUT RAM\n", diff --git a/notebooks/end2end_example/cybersecurity/3-build-accelerator-with-finn.ipynb b/notebooks/end2end_example/cybersecurity/3-build-accelerator-with-finn.ipynb index 7a23a3628e..39ae1dd5f6 100644 --- a/notebooks/end2end_example/cybersecurity/3-build-accelerator-with-finn.ipynb +++ b/notebooks/end2end_example/cybersecurity/3-build-accelerator-with-finn.ipynb @@ -323,7 +323,7 @@ "source": [ "assert os.path.exists(rtlsim_output_dir + \"/report/ooc_synth_and_timing.json\")\n", "assert os.path.exists(rtlsim_output_dir + \"/report/rtlsim_performance.json\")\n", - "assert os.path.exists(rtlsim_output_dir + \"/final_hw_config.json\")" + "assert os.path.exists(rtlsim_output_dir + \"/report/final_hw_config.json\")" ] }, { @@ -410,7 +410,7 @@ "metadata": {}, "outputs": [], "source": [ - "! cat {rtlsim_output_dir}/final_hw_config.json" + "! 
cat {rtlsim_output_dir}/report/final_hw_config.json" ] }, { diff --git a/src/finn/builder/build_dataflow.py b/src/finn/builder/build_dataflow.py index f96f205e72..91dec71140 100644 --- a/src/finn/builder/build_dataflow.py +++ b/src/finn/builder/build_dataflow.py @@ -292,6 +292,10 @@ def build_dataflow_cfg(model_filename, cfg: DataflowBuildConfig): print("KeyboardInterrupt detected. Aborting...") return log_and_exit(cfg, time_per_step, -1) except (Exception, FINNError) as e: + # Re-raise exception if we are in a PyTest session so we don't miss it + if "PYTEST_CURRENT_TEST" in os.environ: + raise + if issubclass(type(e), FINNUserError): # Handle FINN USER ERROR log.error(f"FINN ERROR: {e}") diff --git a/tests/end2end/test_end2end_cybsec_mlp.py b/tests/end2end/test_end2end_cybsec_mlp.py index 4770066117..cf75fd273b 100644 --- a/tests/end2end/test_end2end_cybsec_mlp.py +++ b/tests/end2end/test_end2end_cybsec_mlp.py @@ -165,8 +165,8 @@ def test_end2end_cybsec_mlp_build(self): ) build.build_dataflow_cfg(model_file, cfg) # check the generated files - assert os.path.isfile(output_dir + "/time_per_step.json") - assert os.path.isfile(output_dir + "/final_hw_config.json") + assert os.path.isfile(output_dir + "/report/time_per_step.json") + assert os.path.isfile(output_dir + "/report/final_hw_config.json") assert os.path.isfile(output_dir + "/template_specialize_layers_config.json") assert os.path.isfile(output_dir + "/driver/driver.py") est_cycles_report = output_dir + "/report/estimate_layer_cycles.json" diff --git a/tests/fpgadataflow/test_fifosizing.py b/tests/fpgadataflow/test_fifosizing.py index bb89e8ab84..97686235d0 100644 --- a/tests/fpgadataflow/test_fifosizing.py +++ b/tests/fpgadataflow/test_fifosizing.py @@ -95,7 +95,7 @@ def test_fifosizing_linear(method, topology): cfg_cmp.auto_fifo_depths = False cfg_cmp.target_fps = None cfg_cmp.generate_outputs = [build_cfg.DataflowOutputType.STITCHED_IP] - cfg_cmp.folding_config_file = tmp_output_dir + 
"/final_hw_config.json" + cfg_cmp.folding_config_file = tmp_output_dir + "/report/final_hw_config.json" build.build_dataflow_cfg(tmp_output_dir_cmp + "/model.onnx", cfg_cmp) model0 = ModelWrapper(tmp_output_dir + "/intermediate_models/step_create_stitched_ip.onnx") diff --git a/tests/util/test_build_dataflow.py b/tests/util/test_build_dataflow.py index 18f574bc8f..65d1942bed 100644 --- a/tests/util/test_build_dataflow.py +++ b/tests/util/test_build_dataflow.py @@ -48,9 +48,9 @@ def test_end2end_build_dataflow_directory(): build_dataflow_directory(target_dir) # check the generated files output_dir = target_dir + "/output_tfc_w1a1_Pynq-Z1" - assert os.path.isfile(output_dir + "/time_per_step.json") - assert os.path.isfile(output_dir + "/auto_folding_config.json") - assert os.path.isfile(output_dir + "/final_hw_config.json") + assert os.path.isfile(output_dir + "/report/time_per_step.json") + assert os.path.isfile(output_dir + "/report/auto_folding_config.json") + assert os.path.isfile(output_dir + "/report/final_hw_config.json") assert os.path.isfile(output_dir + "/template_specialize_layers_config.json") assert os.path.isfile(output_dir + "/stitched_ip/ip/component.xml") assert os.path.isfile(output_dir + "/driver/driver.py") From b8c9f74c39442efc412cb81e09def97f87f9ed57 Mon Sep 17 00:00:00 2001 From: Felix Jentzsch Date: Tue, 27 May 2025 17:29:33 +0200 Subject: [PATCH 122/125] Increase logging robustness, verbosity --- src/finn/benchmarking/bench.py | 40 +++++++++-- src/finn/benchmarking/bench_base.py | 3 - src/finn/builder/build_dataflow.py | 101 ++++++++++++++++------------ 3 files changed, 92 insertions(+), 52 deletions(-) diff --git a/src/finn/benchmarking/bench.py b/src/finn/benchmarking/bench.py index 5f29959712..1a478a466c 100644 --- a/src/finn/benchmarking/bench.py +++ b/src/finn/benchmarking/bench.py @@ -2,6 +2,7 @@ import json import onnxruntime as ort import os +import sys import time import traceback import yaml @@ -19,6 +20,24 @@ dut["transformer"] = 
bench_transformer +class PrefixPrinter(object): + """ + Create a custom stream handler that adds a prefix + """ + + def __init__(self, prefix, originalstream): + self.console = originalstream + self.prefix = prefix + self.linebuf = "" + + def write(self, buf): + for line in buf.rstrip().splitlines(): + self.console.write(f"[{self.prefix}] " + line + "\n") + + def flush(self): + self.console.flush() + + def start_bench_run(config_name): exit_code = 0 # Attempt to work around onnxruntime issue on Slurm-managed clusters: @@ -130,21 +149,23 @@ def get_default_session_options_new(): while idx < total_runs: selected_runs.append(idx) idx = idx + task_count - print("This job will perform %d out of %d total runs" % (len(selected_runs), total_runs)) + print( + "STARTING JOB %d. IT WILL PERFORM %d OUT OF %d TOTAL RUNS" + % (task_id, len(selected_runs), total_runs) + ) # Run benchmark - # TODO: integrate this loop (especially status logging) into the bench class successful_runs = [] skipped_runs = [] failed_runs = [] for run, run_id in enumerate(selected_runs): print( - "Starting run %d/%d (id %d of %d total runs)" + "STARTING RUN %d/%d (ID %d OF %d TOTAL RUNS)" % (run + 1, len(selected_runs), run_id, total_runs) ) params = config_expanded[run_id] - print("Run parameters: %s" % (str(params))) + print("RUN %d PARAMETERS: %s" % (run_id, str(params))) log_dict = {"run_id": run_id, "task_id": task_id, "params": params} @@ -159,11 +180,18 @@ def get_default_session_options_new(): # expect DUT-specific YAML definition instead bench_object = bench(params, task_id, run_id, work_dir, artifacts_dir, save_dir) else: - print("ERROR: no DUT specified") + print("ERROR: NO DUT SPECIFIED") return 1 + # Wrap stdout/stderr with an additional prefix to identify the run in the live console + original_stdout = sys.stdout + original_stderr = sys.stderr + sys.stdout = PrefixPrinter("RUN %d (%s)" % (run_id, params["dut"]), sys.stdout) + sys.stderr = PrefixPrinter("RUN %d (%s)" % (run_id, 
params["dut"]), sys.stderr) try: result = bench_object.run() + sys.stdout = original_stdout + sys.stderr = original_stderr if result == "skipped": log_dict["status"] = "skipped" print("BENCH RUN %d SKIPPED" % run_id) @@ -171,6 +199,8 @@ def get_default_session_options_new(): else: log_dict["status"] = "ok" except Exception: + sys.stdout = original_stdout + sys.stderr = original_stderr log_dict["status"] = "failed" print("BENCH RUN %d FAILED WITH EXCEPTION: %s" % (run_id, traceback.format_exc())) failed_runs.append(run_id) diff --git a/src/finn/benchmarking/bench_base.py b/src/finn/benchmarking/bench_base.py index 01e42b9c2a..e0bea7ee13 100644 --- a/src/finn/benchmarking/bench_base.py +++ b/src/finn/benchmarking/bench_base.py @@ -104,9 +104,6 @@ def __init__(self, params, task_id, run_id, work_dir, artifacts_dir, save_dir, d self.save_dir = save_dir self.debug = debug - # TODO: setup a logger so output can go to console (with task id prefix) - # TODO: coordinate with new builder loggin setup - # Setup some basic global default configuration # TODO: clean up or remove these attributes if "synth_clk_period_ns" in params: diff --git a/src/finn/builder/build_dataflow.py b/src/finn/builder/build_dataflow.py index 91dec71140..7760cdbae7 100644 --- a/src/finn/builder/build_dataflow.py +++ b/src/finn/builder/build_dataflow.py @@ -161,41 +161,12 @@ def resolve_step_filename(step_name: str, cfg: DataflowBuildConfig, step_delta: return filename -def log_and_exit(cfg: DataflowBuildConfig, time_per_step: dict = None, exit_code: int = 0): - if exit_code: - print("Build failed") - status = "failed" - else: - print("Build completed successfully") - status = "ok" - - # Generate metadata_builder.json - metadata = { - "status": status, - "tool_version": os.path.basename(os.environ.get("XILINX_VIVADO")), - } - with open(os.path.join(cfg.output_dir, "report/metadata_builder.json"), "w") as f: - json.dump(metadata, f, indent=2) - - # Generate time_per_step.json - if time_per_step is not 
None: - time_per_step["total_build_time"] = sum(time_per_step.values()) - with open(os.path.join(cfg.output_dir, "report/time_per_step.json"), "w") as f: - json.dump(time_per_step, f, indent=2) - - return exit_code - - -def build_dataflow_cfg(model_filename, cfg: DataflowBuildConfig): - """Best-effort build a dataflow accelerator using the given configuration. - - :param model_filename: ONNX model filename to build - :param cfg: Build configuration - """ - # Create the output (report) dir if it doesn't exist - os.makedirs(os.path.join(cfg.output_dir, "report"), exist_ok=True) - - # Set up logger +def setup_logging(cfg: DataflowBuildConfig): + # Set up global logger, the force=True has the following effects: + # - If multiple build are run in a row, the log file will be re-created for each, + # which is needed if the file was deleted/moved or the output dir changed + # - In a PyTest session, this logger will replace the PyTest log handlers, so logs + # (+ captured warnings!) will end up in the log file instead of being collected by PyTest logpath = os.path.join(cfg.output_dir, "build_dataflow.log") if cfg.verbose: logging.basicConfig( @@ -203,6 +174,7 @@ def build_dataflow_cfg(model_filename, cfg: DataflowBuildConfig): format="[%(asctime)s]%(levelname)s: %(pathname)s:%(lineno)d: %(message)s", filename=logpath, filemode="w", + force=True, ) else: logging.basicConfig( @@ -210,19 +182,21 @@ def build_dataflow_cfg(model_filename, cfg: DataflowBuildConfig): format="[%(asctime)s]%(levelname)s: %(message)s", filename=logpath, filemode="w", + force=True, ) - # Capture all warnings.warn calls of qonnx,... + # Capture all warnings.warn calls of qonnx, ... 
logging.captureWarnings(True) - log = logging.getLogger("build_dataflow") - # Mirror stdout and stderr to log - sys.stdout = PrintLogger(log, logging.INFO, sys.stdout) - sys.stderr = PrintLogger(log, logging.ERROR, sys.stderr) + log = logging.getLogger("build_dataflow") + if not isinstance(sys.stdout, PrintLogger): + # Prevent redirecting stdout/stderr multiple times + sys.stdout = PrintLogger(log, logging.INFO, sys.stdout) + sys.stderr = PrintLogger(log, logging.ERROR, sys.stderr) console = Console(file=sys.stdout.console) - # Set up console logger + # Mirror a configurable log level to console (default = ERROR) if cfg.console_log_level != "NONE": consoleHandler = RichHandler( show_time=True, log_time_format="[%Y-%m-%d %H:%M:%S]", show_path=False, console=console @@ -239,6 +213,45 @@ def build_dataflow_cfg(model_filename, cfg: DataflowBuildConfig): consoleHandler.setLevel(logging.CRITICAL) logging.getLogger().addHandler(consoleHandler) + return log + + +def exit_buildflow(cfg: DataflowBuildConfig, time_per_step: dict = None, exit_code: int = 0): + if exit_code: + print("Build failed") + status = "failed" + else: + print("Build completed successfully") + status = "ok" + + # Generate metadata_builder.json + metadata = { + "status": status, + "tool_version": os.path.basename(os.environ.get("XILINX_VIVADO")), + } + with open(os.path.join(cfg.output_dir, "report/metadata_builder.json"), "w") as f: + json.dump(metadata, f, indent=2) + + # Generate time_per_step.json + if time_per_step is not None: + time_per_step["total_build_time"] = sum(time_per_step.values()) + with open(os.path.join(cfg.output_dir, "report/time_per_step.json"), "w") as f: + json.dump(time_per_step, f, indent=2) + + return exit_code + + +def build_dataflow_cfg(model_filename, cfg: DataflowBuildConfig): + """Best-effort build a dataflow accelerator using the given configuration. 
+ + :param model_filename: ONNX model filename to build + :param cfg: Build configuration + """ + log = setup_logging(cfg) + + # Create the output (report) dir if it doesn't exist + os.makedirs(os.path.join(cfg.output_dir, "report"), exist_ok=True) + print(f"Intermediate outputs will be generated in {os.environ['FINN_BUILD_DIR']}") print(f"Final outputs will be generated in {cfg.output_dir}") print(f"Build log is at {cfg.output_dir}/build_dataflow.log") @@ -290,7 +303,7 @@ def build_dataflow_cfg(model_filename, cfg: DataflowBuildConfig): step_num += 1 except KeyboardInterrupt: print("KeyboardInterrupt detected. Aborting...") - return log_and_exit(cfg, time_per_step, -1) + return exit_buildflow(cfg, time_per_step, -1) except (Exception, FINNError) as e: # Re-raise exception if we are in a PyTest session so we don't miss it if "PYTEST_CURRENT_TEST" in os.environ: @@ -310,8 +323,8 @@ def build_dataflow_cfg(model_filename, cfg: DataflowBuildConfig): if cfg.enable_build_pdb_debug: pdb.post_mortem(e.__traceback__) - return log_and_exit(cfg, time_per_step, -1) - return log_and_exit(cfg, time_per_step, 0) + return exit_buildflow(cfg, time_per_step, -1) + return exit_buildflow(cfg, time_per_step, 0) def build_dataflow_directory(path_to_cfg_dir: str): From c6cce9877333f1543df32d58d064a8554833b952 Mon Sep 17 00:00:00 2001 From: Felix Jentzsch Date: Tue, 27 May 2025 18:19:50 +0200 Subject: [PATCH 123/125] Switch RN-50 to U280 --- ci/cfg/regression_extended.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ci/cfg/regression_extended.yml b/ci/cfg/regression_extended.yml index f40c11ab11..a95dfa06d8 100644 --- a/ci/cfg/regression_extended.yml +++ b/ci/cfg/regression_extended.yml @@ -2,7 +2,7 @@ # ResNet-50 { "dut": ["resnet50"], - "board": ["U250"], + "board": ["U280"], "synth_clk_period_ns": [4], "rtlsim_batch_size": [3], # no deployment package because Alveo deployment is not yet supported by CI From dbfd95509d673663c39361ed09a6ba3caa919583 Mon Sep 17 
00:00:00 2001 From: Felix Jentzsch Date: Tue, 27 May 2025 19:31:29 +0200 Subject: [PATCH 124/125] Extend launch_process_helper and use it in more places --- src/finn/core/rtlsim_exec.py | 12 +- .../fpgadataflow/create_stitched_ip.py | 19 ++-- .../fpgadataflow/make_zynq_proj.py | 26 +++-- .../fpgadataflow/vitis_build.py | 53 ++++----- src/finn/util/basic.py | 104 ++++++++++-------- src/finn/util/hls.py | 13 +-- 6 files changed, 115 insertions(+), 112 deletions(-) diff --git a/src/finn/core/rtlsim_exec.py b/src/finn/core/rtlsim_exec.py index 61f2762039..46616599cb 100644 --- a/src/finn/core/rtlsim_exec.py +++ b/src/finn/core/rtlsim_exec.py @@ -28,8 +28,8 @@ import numpy as np import os -import sys from qonnx.custom_op.registry import getCustomOp +from subprocess import CalledProcessError from finn.util.basic import ( get_liveness_threshold_cycles, @@ -39,6 +39,7 @@ ) from finn.util.data_packing import npy_to_rtlsim_input, rtlsim_output_to_npy from finn.util.deps import get_deps_path +from finn.util.exception import FINNError from finn.util.logging import log try: @@ -294,11 +295,12 @@ def rtlsim_exec_cppxsi( # write compilation command to a file for easy re-running/debugging with open(sim_base + "/compile_rtlsim.sh", "w") as f: f.write(" ".join(build_cmd)) - stdout, stderr = launch_process_helper(build_cmd, cwd=sim_base) + try: + launch_process_helper(build_cmd, cwd=sim_base, print_stdout=False) + except CalledProcessError: + raise FINNError("Failed to compile rtlsim executable") if not os.path.isfile(sim_base + "/rtlsim_xsi"): - print(stdout) - print(stderr, file=sys.stderr) - raise RuntimeError("Failed to compile rtlsim executable") + raise FINNError("Failed to compile rtlsim executable") # launch the rtlsim executable # important to specify LD_LIBRARY_PATH here for XSI to work correctly diff --git a/src/finn/transformation/fpgadataflow/create_stitched_ip.py b/src/finn/transformation/fpgadataflow/create_stitched_ip.py index 7a8d38182d..39bed71c82 100644 --- 
a/src/finn/transformation/fpgadataflow/create_stitched_ip.py +++ b/src/finn/transformation/fpgadataflow/create_stitched_ip.py @@ -30,14 +30,15 @@ import json import multiprocessing as mp import os -import subprocess from qonnx.custom_op.registry import getCustomOp from qonnx.transformation.base import Transformation from qonnx.util.basic import get_num_default_workers from shutil import copytree +from subprocess import CalledProcessError from finn.transformation.fpgadataflow.replace_verilog_relpaths import ReplaceVerilogRelPaths -from finn.util.basic import make_build_dir +from finn.util.basic import launch_process_helper, make_build_dir +from finn.util.exception import FINNError from finn.util.fpgadataflow import is_hls_node, is_rtl_node from finn.util.logging import log @@ -633,14 +634,12 @@ def apply(self, model): f.write("vivado -mode batch -source make_project.tcl\n") f.write("cd {}\n".format(working_dir)) bash_command = ["bash", make_project_sh] - process_compile = subprocess.Popen( - bash_command, stdout=subprocess.PIPE, stderr=subprocess.PIPE - ) - (_, stderr_data) = process_compile.communicate() - stderr_stripped = stderr_data.decode().strip() - if stderr_stripped != "" and stderr_stripped is not None: - log.critical(stderr_stripped) # Decode bytes and log as critical + try: + launch_process_helper(bash_command, print_stdout=False) + except CalledProcessError: + # Check success manually by looking for wrapper HDL + pass # wrapper may be created in different location depending on Vivado version if not os.path.isfile(wrapper_filename): @@ -649,7 +648,7 @@ def apply(self, model): if os.path.isfile(wrapper_filename_alt): model.set_metadata_prop("wrapper_filename", wrapper_filename_alt) else: - raise Exception( + raise FINNError( """CreateStitchedIP failed, no wrapper HDL found under %s or %s. 
Please check logs under the parent directory.""" % (wrapper_filename, wrapper_filename_alt) diff --git a/src/finn/transformation/fpgadataflow/make_zynq_proj.py b/src/finn/transformation/fpgadataflow/make_zynq_proj.py index 59d4293323..e280fba016 100644 --- a/src/finn/transformation/fpgadataflow/make_zynq_proj.py +++ b/src/finn/transformation/fpgadataflow/make_zynq_proj.py @@ -29,13 +29,13 @@ import math import os -import subprocess from qonnx.core.modelwrapper import ModelWrapper from qonnx.custom_op.registry import getCustomOp from qonnx.transformation.base import Transformation from qonnx.transformation.general import GiveReadableTensorNames, GiveUniqueNodeNames from qonnx.transformation.infer_data_layouts import InferDataLayouts from shutil import copy +from subprocess import CalledProcessError from finn.transformation.fpgadataflow.create_dataflow_partition import CreateDataflowPartition from finn.transformation.fpgadataflow.create_stitched_ip import CreateStitchedIP @@ -47,9 +47,14 @@ from finn.transformation.fpgadataflow.instrumentation import GenerateInstrumentationIP from finn.transformation.fpgadataflow.prepare_ip import PrepareIP from finn.transformation.fpgadataflow.specialize_layers import SpecializeLayers -from finn.util.basic import make_build_dir, pynq_native_port_width, pynq_part_map +from finn.util.basic import ( + launch_process_helper, + make_build_dir, + pynq_native_port_width, + pynq_part_map, +) from finn.util.deps import get_deps_path -from finn.util.logging import log +from finn.util.exception import FINNError from . 
import templates @@ -399,16 +404,15 @@ def apply(self, model): # call the synthesis script bash_command = ["bash", synth_project_sh] - process_compile = subprocess.Popen( - bash_command, stdout=subprocess.PIPE, stderr=subprocess.PIPE - ) - _, stderr_data = process_compile.communicate() - stderr_stripped = stderr_data.decode().strip() - if stderr_stripped != "" and stderr_stripped is not None: - log.critical(stderr_stripped) # Decode bytes and log as critical + try: + launch_process_helper(bash_command, print_stdout=False) + except CalledProcessError: + # Check success manually by looking for bitfile + pass + bitfile_name = vivado_pynq_proj_dir + "/finn_zynq_link.runs/impl_1/top_wrapper.bit" if not os.path.isfile(bitfile_name): - raise Exception( + raise FINNError( "Synthesis failed, no bitfile found. Check logs under %s" % vivado_pynq_proj_dir ) deploy_bitfile_name = vivado_pynq_proj_dir + "/resizer.bit" diff --git a/src/finn/transformation/fpgadataflow/vitis_build.py b/src/finn/transformation/fpgadataflow/vitis_build.py index 222c9c2336..1c5a5eff91 100644 --- a/src/finn/transformation/fpgadataflow/vitis_build.py +++ b/src/finn/transformation/fpgadataflow/vitis_build.py @@ -29,7 +29,6 @@ import json import os -import subprocess from qonnx.core.modelwrapper import ModelWrapper from qonnx.custom_op.registry import getCustomOp from qonnx.transformation.base import Transformation @@ -38,6 +37,7 @@ GiveUniqueNodeNames, RemoveUnusedTensors, ) +from subprocess import CalledProcessError from finn.builder.build_dataflow_config import FpgaMemoryType, VitisOptStrategy from finn.transformation.fpgadataflow.create_dataflow_partition import CreateDataflowPartition @@ -49,8 +49,8 @@ from finn.transformation.fpgadataflow.insert_iodma import InsertIODMA from finn.transformation.fpgadataflow.prepare_ip import PrepareIP from finn.transformation.fpgadataflow.specialize_layers import SpecializeLayers -from finn.util.basic import make_build_dir -from finn.util.logging import log +from 
finn.util.basic import launch_process_helper, make_build_dir +from finn.util.exception import FINNError from . import templates @@ -142,16 +142,14 @@ def apply(self, model): f.write("vivado -mode batch -source gen_xo.tcl\n") f.write("cd {}\n".format(working_dir)) bash_command = ["bash", package_xo_sh] - process_compile = subprocess.Popen( - bash_command, stdout=subprocess.PIPE, stderr=subprocess.PIPE - ) - _, stderr_data = process_compile.communicate() - stderr_stripped = stderr_data.decode().strip() - if stderr_stripped != "" and stderr_stripped is not None: - log.critical(stderr_stripped) # Decode bytes and log as critical - assert os.path.isfile(xo_path), ( - "Vitis .xo file not created, check logs under %s" % vivado_proj_dir - ) + try: + launch_process_helper(bash_command, print_stdout=False) + except CalledProcessError: + # Check success manually by looking for .xo file + pass + if not os.path.isfile(xo_path): + raise FINNError("Vitis .xo file not created, check logs under %s" % vivado_proj_dir) + return (model, False) @@ -327,18 +325,17 @@ def apply(self, model): ) f.write("cd {}\n".format(working_dir)) bash_command = ["bash", script] - process_compile = subprocess.Popen( - bash_command, stdout=subprocess.PIPE, stderr=subprocess.PIPE - ) - _, stderr_data = process_compile.communicate() - stderr_stripped = stderr_data.decode().strip() - if stderr_stripped != "" and stderr_stripped is not None: - log.critical(stderr_stripped) # Decode bytes and log as critical - # TODO rename xclbin appropriately here? + + try: + launch_process_helper(bash_command, print_stdout=False) + except CalledProcessError: + # Check success manually by looking for .xclbin file + pass xclbin = link_dir + "/a.xclbin" - assert os.path.isfile(xclbin), ( - "Vitis .xclbin file not created, check logs under %s" % link_dir - ) + if not os.path.isfile(xclbin): + raise FINNError("Vitis .xclbin file not created, check logs under %s" % link_dir) + + # TODO rename xclbin appropriately here? 
model.set_metadata_prop("bitfile", xclbin) # run Vivado to gen xml report @@ -350,13 +347,7 @@ def apply(self, model): f.write("vivado -mode batch -source %s\n" % (link_dir + "/gen_report_xml.tcl")) f.write("cd {}\n".format(working_dir)) bash_command = ["bash", gen_rep_xml_sh] - process_genxml = subprocess.Popen( - bash_command, stdout=subprocess.PIPE, stderr=subprocess.PIPE - ) - _, stderr_data = process_genxml.communicate() - stderr_stripped = stderr_data.decode().strip() - if stderr_stripped != "" and stderr_stripped is not None: - log.critical(stderr_stripped) # Decode bytes and log as critical + launch_process_helper(bash_command, print_stdout=False) # filename for the synth utilization report synth_report_filename = link_dir + "/synth_report.xml" model.set_metadata_prop("vivado_synth_rpt", synth_report_filename) diff --git a/src/finn/util/basic.py b/src/finn/util/basic.py index acb8bb1303..7f7e658146 100644 --- a/src/finn/util/basic.py +++ b/src/finn/util/basic.py @@ -151,6 +151,65 @@ def make_build_dir(prefix: str = "", return_as_path: bool = False) -> str | Path return str(tmpdir) +def launch_process_helper(args, proc_env=None, cwd=None, print_stdout=True): + """Helper function to launch a process in a way that facilitates logging + stdout/stderr with Python loggers. 
+ Returns (cmd_out, cmd_err) if successful, raises CalledProcessError otherwise.""" + process = subprocess.run(args, capture_output=True, env=proc_env, cwd=cwd, text=True) + cmd_out = process.stdout.strip() + cmd_err = process.stderr.strip() + + # Handle stdout + if cmd_out: + if print_stdout is True: + log.info(cmd_out) + else: + # Print with DEBUG level regardless + log.debug(cmd_out) + + # Handle stderr, depending on return code + if process.returncode == 0: + # Process completed successfully, log stderr only as WARNING + if cmd_err: + log.warning(cmd_err) + else: + # Process failed, log stderr as ERROR + if cmd_err: + log.error(cmd_err) + + # Log additional ERROR message + if isinstance(args, list): + cmd = " ".join(args) + else: + cmd = args + log.error(f"Launched process returned non-zero exit code ({process.returncode}): {cmd}") + + # Raise CalledProcessError for non-zero return code + process.check_returncode() + return (cmd_out, cmd_err) + + +def which(program): + "Python equivalent of the shell cmd 'which'." 
+ + # source: + # https://stackoverflow.com/questions/377017/test-if-executable-exists-in-python + def is_exe(fpath): + return os.path.isfile(fpath) and os.access(fpath, os.X_OK) + + fpath, fname = os.path.split(program) + if fpath: + if is_exe(program): + return program + else: + for path in os.environ["PATH"].split(os.pathsep): + exe_file = os.path.join(path, program) + if is_exe(exe_file): + return exe_file + + return None + + class CppBuilder: """Builds the g++ compiler command to produces the executable of the c++ code in code_gen_dir which is passed to the function build() of this class.""" @@ -194,50 +253,7 @@ def build(self, code_gen_dir): f.write("#!/bin/bash \n") f.write(bash_compile + "\n") bash_command = ["bash", self.compile_script] - process_compile = subprocess.Popen( - bash_command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True - ) - _, stderr_data = process_compile.communicate() - if stderr_data.strip(): - log.critical(stderr_data.strip()) # Decode bytes and log as critical - - -def launch_process_helper(args, proc_env=None, cwd=None, print_stdout=True): - """Helper function to launch a process in a way that facilitates logging - stdout/stderr with Python loggers. - Returns (cmd_out, cmd_err).""" - if proc_env is None: - proc_env = os.environ.copy() - with subprocess.Popen( - args, stdout=subprocess.PIPE, stderr=subprocess.PIPE, env=proc_env, cwd=cwd, text=True - ) as proc: - (cmd_out, cmd_err) = proc.communicate() - if cmd_out.strip() and print_stdout is True: - log.info(cmd_out.strip()) - if cmd_err.strip(): - log.critical(cmd_err.strip()) - return (cmd_out, cmd_err) - - -def which(program): - "Python equivalent of the shell cmd 'which'." 
- - # source: - # https://stackoverflow.com/questions/377017/test-if-executable-exists-in-python - def is_exe(fpath): - return os.path.isfile(fpath) and os.access(fpath, os.X_OK) - - fpath, fname = os.path.split(program) - if fpath: - if is_exe(program): - return program - else: - for path in os.environ["PATH"].split(os.pathsep): - exe_file = os.path.join(path, program) - if is_exe(exe_file): - return exe_file - - return None + launch_process_helper(bash_command, print_stdout=False) mem_primitives_versal = { diff --git a/src/finn/util/hls.py b/src/finn/util/hls.py index b1b88dbafe..dc153c0f52 100644 --- a/src/finn/util/hls.py +++ b/src/finn/util/hls.py @@ -27,10 +27,7 @@ # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. import os -import subprocess - -from finn.util.basic import which -from finn.util.logging import log +from finn.util.basic import launch_process_helper, which class CallHLS: @@ -65,10 +62,4 @@ def build(self, code_gen_dir): f.write("cd {}\n".format(working_dir)) f.close() bash_command = ["bash", self.ipgen_script] - process_compile = subprocess.Popen( - bash_command, stdout=subprocess.PIPE, stderr=subprocess.PIPE - ) - _, stderr_data = process_compile.communicate() - stderr_stripped = stderr_data.decode().strip() - if stderr_stripped != "" and stderr_stripped is not None: - log.critical(stderr_stripped) # Decode bytes and log as critical + launch_process_helper(bash_command, print_stdout=False) From 8abf5fe7a5ce3fca76f3bc4c7eb91e553f348eb8 Mon Sep 17 00:00:00 2001 From: Felix Jentzsch Date: Wed, 28 May 2025 10:33:58 +0200 Subject: [PATCH 125/125] Fix build dir creation --- src/finn/builder/build_dataflow.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/finn/builder/build_dataflow.py b/src/finn/builder/build_dataflow.py index 7760cdbae7..2184531443 100644 --- a/src/finn/builder/build_dataflow.py +++ b/src/finn/builder/build_dataflow.py @@ -247,11 +247,11 @@ def build_dataflow_cfg(model_filename, cfg: 
DataflowBuildConfig): :param model_filename: ONNX model filename to build :param cfg: Build configuration """ - log = setup_logging(cfg) - # Create the output (report) dir if it doesn't exist os.makedirs(os.path.join(cfg.output_dir, "report"), exist_ok=True) + log = setup_logging(cfg) + print(f"Intermediate outputs will be generated in {os.environ['FINN_BUILD_DIR']}") print(f"Final outputs will be generated in {cfg.output_dir}") print(f"Build log is at {cfg.output_dir}/build_dataflow.log")