diff --git a/.gitignore b/.gitignore index 7ddc2c6d67..2d48ddac55 100644 --- a/.gitignore +++ b/.gitignore @@ -106,3 +106,4 @@ bench_input bench_output bench_save bench_work +/models diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index a2f9527976..23eb8c39fe 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -1,3 +1,5 @@ +include: ci/.gitlab-setup.yml + stages: - sync - build @@ -17,6 +19,9 @@ variables: CPU_CORES: description: "Select number of CPU cores and test workers" value: "32" + CPU_CORES_BENCH: + description: "Select number of CPU cores for benchmark runs" + value: "8" PARALLEL_JOBS: description: "Number of parallel Slurm array jobs per Benchmark job" value: "1" @@ -30,15 +35,14 @@ variables: description: "Optional QoS option (include --qos, e.g., --qos express)" value: "" MANUAL_CFG_PATH: - description: "Use this config file instead of configs stored in the repo. Path must be accessible to runner" + description: "Name (in ci/cfg/) or path (relative to LOCAL_CFG_DIR) of benchmarking config to run" value: "" workflow: name: '$PIPELINE_NAME' rules: - # Run pipeline for GitHub PRs to dev or main (does not support PRs from forks) + # Run pipeline for GitHub PRs to dev (does not support PRs from forks) - if: $CI_PIPELINE_SOURCE == "external_pull_request_event" && $CI_EXTERNAL_PULL_REQUEST_TARGET_BRANCH_NAME == "dev" - - if: $CI_PIPELINE_SOURCE == "external_pull_request_event" && $CI_EXTERNAL_PULL_REQUEST_TARGET_BRANCH_NAME == "main" # Run pipeline for pushes to dev or main - if: $CI_COMMIT_BRANCH == "dev" || $CI_COMMIT_BRANCH == "main" # Run pipeline if manually triggered via API or web GUI @@ -67,38 +71,6 @@ Sync finn-dev: - git pull upstream dev - git push origin finn-dev -.n2_setup_general: &n2_setup_general - - module load lang/Python/3.10.4-GCCcore-11.3.0 - - module load devel/Autoconf/2.71-GCCcore-11.3.0 - - module load lang/Bison/3.8.2-GCCcore-11.3.0 - - module load lang/flex/2.6.4-GCCcore-11.3.0 - - module load compiler/GCC/11.3.0 - - module load 
lib/pybind11/2.9.2-GCCcore-11.3.0 - - module load devel/Boost/1.79.0-GCC-11.3.0 - - module load lib/fmt/9.1.0-GCCcore-11.3.0 - - ulimit -s unlimited # Increase stack size limit - -.n2_setup_xilinx_2022_2: &n2_setup_xilinx_2022_2 - - module load fpga - - module load xilinx/xrt/2.14 # includes Vitis/Vivado 2022.2 - # module load will set PLATFORM_REPO_PATHS to one specific platform, revert to top-level PLATFORM_PATH - - export PLATFORM_REPO_PATHS=$PLATFORM_PATH - -.n2_setup_xilinx_2024_2: &n2_setup_xilinx_2024_2 - - module load fpga - - module load xilinx/xrt/2.14 # includes Vitis/Vivado 2022.2 - - module swap xilinx/vitis/24.2 # switch to Vitis/Vivado 2024.2 - # module load will set PLATFORM_REPO_PATHS to one specific platform, revert to top-level PLATFORM_PATH - - export PLATFORM_REPO_PATHS=$PLATFORM_PATH - -.setup_venv_from_whl: &setup_venv_from_whl - # Move everything to working directory (e.g., RAMdisk) - - cp -dfR . $PATH_WORKDIR - - cd $PATH_WORKDIR - # Create fresh virtual environment and install finn-plus from .whl (artifact) - - python3 -m venv finn-plus-venv - - finn-plus-venv/bin/pip install dist/*.whl - Build: id_tokens: CI_JOB_JWT: @@ -113,8 +85,8 @@ Build: # Otherwise run - when: always before_script: - - *n2_setup_general - - *n2_setup_xilinx_2022_2 + - !reference [.n2_setup_general, before_script] + - !reference [.n2_setup_xilinx_2022_2, before_script] # Install current version of Poetry - python3 -m venv poetry-install - poetry-install/bin/pip install poetry @@ -151,6 +123,9 @@ FINN Test Suite 2022.2: # Do not run if test suite has been deselected - if: $TEST_SUITE == "none" when: never + # Do not run for PRs to dev (run only for pushes) + - if: $CI_PIPELINE_SOURCE == "external_pull_request_event" && $CI_EXTERNAL_PULL_REQUEST_TARGET_BRANCH_NAME == "dev" + when: never # Always run, as long as there was no prior failure - when: on_success cache: @@ -159,13 +134,10 @@ FINN Test Suite 2022.2: paths: - deps variables: - GIT_STRATEGY: empty # Do not pull 
repository, use PyPI installation instead + GIT_STRATEGY: empty # Do not pull repository, install from wheel (artifact) instead SCHEDULER_PARAMETERS: "-A $PROJECT_ACCOUNT -p $SLURM_PARTITION -t $SLURM_TIMEOUT $SLURM_QOS --nodes 1 --ntasks 1 --cpus-per-task $CPU_CORES --exclusive" PYTEST_PARALLEL: "$CPU_CORES" - before_script: - - *n2_setup_general - - *n2_setup_xilinx_2022_2 - - *setup_venv_from_whl + extends: .setup_full_2022_2 script: # Launch additional monitoring - $JOB_MONITORING_DIR/monitor.sh $JOB_MONITORING_DIR/$CI_PIPELINE_ID/$HOSTNAME.log & @@ -182,8 +154,71 @@ FINN Test Suite 2022.2: junit: reports/*.xml FINN Test Suite 2024.2: - extends: FINN Test Suite 2022.2 - before_script: - - *n2_setup_general - - *n2_setup_xilinx_2024_2 - - *setup_venv_from_whl + extends: + - FINN Test Suite 2022.2 + - .setup_full_2024_2 + rules: + # Do not run on a schedule + - if: $CI_PIPELINE_SOURCE == "schedule" + when: never + # Do not run if test suite has been deselected + - if: $TEST_SUITE == "none" + when: never + # Always run, as long as there was no prior failure + - when: on_success + +Bench (Manual): + stage: test + rules: + # Do not run on a schedule + - if: $CI_PIPELINE_SOURCE == "schedule" + when: never + - if: $MANUAL_CFG_PATH != "" + trigger: + include: ci/.gitlab-bench.yml + strategy: depend + forward: + pipeline_variables: true + variables: + PARENT_PIPELINE_ID: $CI_PIPELINE_ID + BENCH_CFG: "manual" + +Bench (Basic): + stage: test + rules: + # Do not run on a schedule + - if: $CI_PIPELINE_SOURCE == "schedule" + when: never + - if: $MANUAL_CFG_PATH == "" + trigger: + include: ci/.gitlab-bench.yml + strategy: depend + forward: + pipeline_variables: true + variables: + PARENT_PIPELINE_ID: $CI_PIPELINE_ID + parallel: + matrix: + - BENCH_CFG: [regression_basic] + +Bench (Extended): + stage: test + rules: + # Do not run on a schedule + - if: $CI_PIPELINE_SOURCE == "schedule" + when: never + # Do not run for PRs to dev (run only for pushes) + - if: $CI_PIPELINE_SOURCE 
== "external_pull_request_event" && $CI_EXTERNAL_PULL_REQUEST_TARGET_BRANCH_NAME == "dev" + when: never + - if: $MANUAL_CFG_PATH == "" + trigger: + include: ci/.gitlab-bench.yml + strategy: depend + forward: + pipeline_variables: true + variables: + PARENT_PIPELINE_ID: $CI_PIPELINE_ID + PARALLEL_JOBS: "4" + parallel: + matrix: + - BENCH_CFG: [regression_extended, microbenchmark_basic] diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 048a3becda..10ff4d4415 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -43,6 +43,7 @@ repos: - id: check-merge-conflict - id: check-xml - id: check-yaml + args: ['--unsafe'] - id: debug-statements exclude: '^src/finn/builder/build_dataflow.py$' - id: end-of-file-fixer diff --git a/ci/.gitlab-bench.yml b/ci/.gitlab-bench.yml new file mode 100644 index 0000000000..6ddeb11858 --- /dev/null +++ b/ci/.gitlab-bench.yml @@ -0,0 +1,81 @@ +include: ci/.gitlab-setup.yml + +stages: + - build + - measure + - collect + +variables: + BENCH_CFG: + description: "Select config, usually provided by parent pipeline" + value: "" + +workflow: + name: "bench_$BENCH_CFG" + +FINN Build: + id_tokens: + CI_JOB_JWT: + aud: https://git.uni-paderborn.de + stage: build + needs: + - job: Build + pipeline: $PARENT_PIPELINE_ID + variables: + SCHEDULER_PARAMETERS: "-A $PROJECT_ACCOUNT -p $SLURM_PARTITION -t $SLURM_TIMEOUT $SLURM_QOS --nodes 1 --ntasks 1 --cpus-per-task $CPU_CORES_BENCH --exclusive --array 0-$( expr $PARALLEL_JOBS - 1 )" + NUM_DEFAULT_WORKERS: "$CPU_CORES_BENCH" + extends: .setup_full_2022_2 + script: + # Launch additional monitoring + - $JOB_MONITORING_DIR/monitor.sh $JOB_MONITORING_DIR/$CI_PIPELINE_ID/$HOSTNAME.log & + # Launch benchmarking script via FINN CLI, includes deps update and environment preparation + # TODO: cache dvc pull + - | + source finn-plus-venv/bin/activate + dvc pull + finn bench --dependency-path ./deps --build-path $FINN_BUILD_DIR --num-workers $CPU_CORES_BENCH --bench_config $BENCH_CFG + 
cache: + key: $CI_COMMIT_SHA + policy: pull + paths: + - deps + artifacts: + name: "build_artifacts" + when: always + paths: + - build_artifacts/ + +Measurement: + id_tokens: + CI_JOB_JWT: + aud: https://git.uni-paderborn.de + stage: measure + tags: + - board + rules: + # Also run on failure of previous tasks to measure partial results + - when: always + script: + # Run as root and activate the PYNQ venv manually to use PYNQ outside of the typical Jupyter environment + - sudo bash -c "source /etc/profile.d/pynq_venv.sh && export XILINX_XRT=/usr && python ci/measure.py" + artifacts: + name: "measurement_artifacts" + when: always + paths: + - measurement_artifacts/ + +Result Collection: + id_tokens: + CI_JOB_JWT: + aud: https://git.uni-paderborn.de + stage: collect + tags: + - image_build + rules: + # Also run on failure of previous tasks to collect partial results + - when: always + script: + # pulling models seems to be needed for dvclive to save experiments, even though they are not used or modified + - dvc pull + - python3.10 ci/collect.py + - dvc exp push -f -j 4 -r push git@github.com:eki-project/finn-plus.git diff --git a/ci/.gitlab-setup.yml b/ci/.gitlab-setup.yml new file mode 100644 index 0000000000..5dad320a34 --- /dev/null +++ b/ci/.gitlab-setup.yml @@ -0,0 +1,49 @@ +# This file defines some basic scripts used to setup the FINN environment on the runner + +.n2_setup_general: + before_script: + - module load lang/Python/3.10.4-GCCcore-11.3.0 + - module load devel/Autoconf/2.71-GCCcore-11.3.0 + - module load lang/Bison/3.8.2-GCCcore-11.3.0 + - module load lang/flex/2.6.4-GCCcore-11.3.0 + - module load compiler/GCC/11.3.0 + - module load lib/pybind11/2.9.2-GCCcore-11.3.0 + - module load devel/Boost/1.79.0-GCC-11.3.0 + - module load lib/fmt/9.1.0-GCCcore-11.3.0 + - ulimit -s unlimited # Increase stack size limit + +.n2_setup_xilinx_2022_2: + before_script: + - module load fpga + - module load xilinx/xrt/2.14 # includes Vitis/Vivado 2022.2 + # module load will 
set PLATFORM_REPO_PATHS to one specific platform, revert to top-level PLATFORM_PATH + - export PLATFORM_REPO_PATHS=$PLATFORM_PATH + +.n2_setup_xilinx_2024_2: + before_script: + - module load fpga + - module load xilinx/xrt/2.14 # includes Vitis/Vivado 2022.2 + - module swap xilinx/vitis/24.2 # switch to Vitis/Vivado 2024.2 + # module load will set PLATFORM_REPO_PATHS to one specific platform, revert to top-level PLATFORM_PATH + - export PLATFORM_REPO_PATHS=$PLATFORM_PATH + +.setup_venv_from_whl: + before_script: + # Move everything to working directory (e.g., RAMdisk) + - cp -dfR . $PATH_WORKDIR + - cd $PATH_WORKDIR + # Create fresh virtual environment and install finn-plus from .whl (artifact) + - python3 -m venv finn-plus-venv + - finn-plus-venv/bin/pip install dist/*.whl + +.setup_full_2022_2: + before_script: + - !reference [.n2_setup_general, before_script] + - !reference [.n2_setup_xilinx_2022_2, before_script] + - !reference [.setup_venv_from_whl, before_script] + +.setup_full_2024_2: + before_script: + - !reference [.n2_setup_general, before_script] + - !reference [.n2_setup_xilinx_2024_2, before_script] + - !reference [.setup_venv_from_whl, before_script] diff --git a/ci/cfg/live_fifosizing.yml b/ci/cfg/live_fifosizing.yml new file mode 100644 index 0000000000..f121bacf6d --- /dev/null +++ b/ci/cfg/live_fifosizing.yml @@ -0,0 +1,50 @@ +[ + # Real models + { + "dut": ["vgg10"], + "live_fifo_sizing": [True], + "generate_outputs": [["bitfile", "pynq_driver", "deployment_package"]] + }, + { + "dut": ["mobilenetv1"], + "live_fifo_sizing": [True], + "generate_outputs": [["bitfile", "pynq_driver", "deployment_package"]] + }, + { + "dut": ["resnet50"], + "live_fifo_sizing": [True], + "generate_outputs": [["bitfile", "pynq_driver", "deployment_package"]] + }, + + # Synthetic non-linear models + { + "dut": ["synthetic_nonlinear"], + "dim": [64], + "kernel_size": [5], + "ch": [8], + "simd": [8], + "pe": [8], + "parallel_window": [1], + + "lb_num_layers": [1], + 
"rb_num_layers": [4, 8, 16], + + "live_fifo_sizing": [True], + "generate_outputs": [["bitfile", "pynq_driver", "deployment_package"]] + }, + { + "dut": ["synthetic_nonlinear"], + "dim": [64], + "kernel_size": [5], + "ch": [8], + "simd": [1], + "pe": [1], + "parallel_window": [0], + + "lb_num_layers": [1], + "rb_num_layers": [4, 8, 16], + + "live_fifo_sizing": [True], + "generate_outputs": [["bitfile", "pynq_driver", "deployment_package"]] + } +] diff --git a/ci/cfg/microbenchmark_basic.yml b/ci/cfg/microbenchmark_basic.yml new file mode 100644 index 0000000000..e9a102e51c --- /dev/null +++ b/ci/cfg/microbenchmark_basic.yml @@ -0,0 +1,48 @@ +[ + # MVAU Test + { + "dut": ["mvau"], + "idt": ["INT4","INT2"], + "wdt": ["INT4"], + "act": ["INT4"], + + "sparsity_type": ["none"], + "sparsity_amount": [0], + + "nhw": [[1,32,32]], + "mw": [64], + "mh": [64], + "sf": [-1], + "nf": [-1], + "m": [1], + + "mem_mode": ["internal_embedded"], + "ram_style": ["distributed"], + "ram_style_thr": ["distributed"], + + "dut_duplication": [1], + + "generate_outputs": [["estimate_reports", "stitched_ip", "rtlsim_performance", "out_of_context_synth", "bitfile", "pynq_driver", "deployment_package"]] + }, + + # Transformer Dummy + { + "dut": ["transformer"], + "seed": [12], + + "calibration_passes": [32], + + "model_num_heads": [1], + "model_num_layers": [1], + "model_bias":[true], + "model_emb_dim": [32], + "model_mlp_dim": [192], + "model_seq_len": [64], + "model_bits": [2], + "model_norm": ["none"], + "model_mask": ["none"], + "model_positional_encoding": ["binary"], + + "generate_outputs": [["estimate_reports", "bitfile", "pynq_driver", "deployment_package"]] + } +] diff --git a/ci/cfg/regression_basic.yml b/ci/cfg/regression_basic.yml new file mode 100644 index 0000000000..9a7604fe19 --- /dev/null +++ b/ci/cfg/regression_basic.yml @@ -0,0 +1,10 @@ +[ + { + "dut": ["vgg10"], + "generate_outputs": [["estimate_reports", "bitfile", "pynq_driver", "deployment_package"]] + }, + { + "dut": 
["mobilenetv1"], + "generate_outputs": [["estimate_reports", "bitfile", "pynq_driver", "deployment_package"]] + } +] diff --git a/ci/cfg/regression_extended.yml b/ci/cfg/regression_extended.yml new file mode 100644 index 0000000000..a95dfa06d8 --- /dev/null +++ b/ci/cfg/regression_extended.yml @@ -0,0 +1,48 @@ +[ + # ResNet-50 + { + "dut": ["resnet50"], + "board": ["U280"], + "synth_clk_period_ns": [4], + "rtlsim_batch_size": [3], + # no deployment package because Alveo deployment is not yet supported by CI + "generate_outputs": [["estimate_reports", "rtlsim_performance", "stitched_ip", "out_of_context_synth", "bitfile"]] + }, + + # 4x GPT Transformer models (currently disabled due to streamlining issues!) + # { + # "dut": ["transformer"], + # "seed": [12], + # "model_dir": ["models/gpt_a_6b_gpt2-s256-t2048-l2-h4-e256", + # "models/gpt_b_4b_gpt2-s256-t2048-l2-h4-e256", + # "models/gpt_c_gpt2-s512-t2048-l2-h4-e512", + # "models/gpt_d_gpt2-s256-t2048-l1-h2-e256"], + # "board": ["U280"], + # "synth_clk_period_ns": [10], + # "generate_outputs": [["estimate_reports", "stitched_ip", "out_of_context_synth"]] + # } + + # 5x RadioML Transformer models + { + "dut": ["transformer"], + "seed": [12], + "model_dir": ["models/rml_transformer_0", + "models/rml_transformer_a", + "models/rml_transformer_b", + "models/rml_transformer_c", + "models/rml_transformer_d",], + "board": ["RFSoC2x2"], + "synth_clk_period_ns": [10], + "generate_outputs": [["estimate_reports", "bitfile", "pynq_driver", "deployment_package"]] + }, + + # 1x RadioML Conformer model + { + "dut": ["transformer"], + "seed": [12], + "model_dir": ["models/rml_conformer"], + "board": ["RFSoC2x2"], + "synth_clk_period_ns": [10], + "generate_outputs": [["estimate_reports", "bitfile", "pynq_driver", "deployment_package"]] + } +] diff --git a/ci/collect.py b/ci/collect.py new file mode 100644 index 0000000000..c7042abf25 --- /dev/null +++ b/ci/collect.py @@ -0,0 +1,413 @@ +import json +import os +import shutil +from 
dvclive.live import Live + + +def delete_dir_contents(dir): + for filename in os.listdir(dir): + file_path = os.path.join(dir, filename) + try: + if os.path.isfile(file_path) or os.path.islink(file_path): + os.unlink(file_path) + elif os.path.isdir(file_path): + shutil.rmtree(file_path) + except Exception as e: + print("Failed to delete %s. Reason: %s" % (file_path, e)) + + +def log_dvc_metric(live, prefix, name, value): + # sanitize '/' in name because DVC uses it to nest metrics (which we do via prefix) + live.log_metric(prefix + name.replace("/", "-"), value, plot=False) + + +def open_json_report(id, report_name): + # look in both, build & measurement, artifacts + path1 = os.path.join("build_artifacts", "runs_output", "run_%d" % (id), "reports", report_name) + path2 = os.path.join( + "measurement_artifacts", "runs_output", "run_%d" % (id), "reports", report_name + ) + if os.path.isfile(path1): + with open(path1, "r") as f: + report = json.load(f) + return report + elif os.path.isfile(path2): + with open(path2, "r") as f: + report = json.load(f) + return report + else: + return None + + +def log_all_metrics_from_report(id, live, report_name, prefix=""): + report = open_json_report(id, report_name) + if report: + for key in report: + log_dvc_metric(live, prefix, key, report[key]) + + +def log_metrics_from_report(id, live, report_name, keys, prefix=""): + report = open_json_report(id, report_name) + if report: + for key in keys: + if key in report: + log_dvc_metric(live, prefix, key, report[key]) + + +def log_nested_metrics_from_report(id, live, report_name, key_top, keys, prefix=""): + report = open_json_report(id, report_name) + if report: + if key_top in report: + for key in keys: + if key in report[key_top]: + log_dvc_metric(live, prefix, key, report[key_top][key]) + + +if __name__ == "__main__": + # Go through all runs found in the artifacts and log their results to DVC + run_dir_list = os.listdir(os.path.join("build_artifacts", "runs_output")) + 
print("Looking for runs in build artifacts") + run_ids = [] + for run_dir in run_dir_list: + if run_dir.startswith("run_"): + run_id = int(run_dir[4:]) + run_ids.append(run_id) + run_ids.sort() + print("Found %d runs" % len(run_ids)) + + follow_up_bench_cfg = list() + # Prepare (local) output directory where follow-up bench configs will be stored + output_cfg_dir = os.path.join( + os.environ.get("LOCAL_CFG_DIR_STORE"), "lfs", "CI_" + os.environ.get("CI_PIPELINE_ID") + ) + output_folding_dir = os.path.join(output_cfg_dir, "folding") + output_cfg_path = os.path.join(output_cfg_dir, "follow-up.json") + + for id in run_ids: + print("Processing run %d" % id) + experiment_name = "CI_" + os.environ.get("CI_PIPELINE_ID") + "_" + str(id) + experiment_msg = ( + "[CI] " + + os.environ.get("CI_PIPELINE_NAME") + + " (" + + os.environ.get("CI_PIPELINE_ID") + + "_" + + str(id) + + ")" + ) + # TODO: cache images once we switch to a cache provider that works with DVC Studio + with Live(exp_name=experiment_name, exp_message=experiment_msg, cache_images=False) as live: + # PARAMS + # input parameters logged by benchmarking infrastructure + metadata_bench = open_json_report(id, "metadata_bench.json") + params = {"params": metadata_bench["params"]} + live.log_params(params) + + # optional metadata logged by builder + metadata_builder = open_json_report(id, "metadata_builder.json") + if metadata_builder: + metadata = { + "metadata": { + "tool_version": metadata_builder["tool_version"], + } + } + live.log_params(metadata) + + # optional dut_info.json (additional information generated during model generation) + dut_info_report = open_json_report(id, "dut_info.json") + if dut_info_report: + dut_info = {"dut_info": dut_info_report} + live.log_params(dut_info) + + # METRICS + # TODO: for microbenchmarks, only summarize results for target node (surrounding SDP?) 
+ # TODO: make all logs consistent (at generation), e.g., BRAM vs BRAM18 vs BRAM36) + + # status + status = metadata_bench["status"] + if status == "ok": + # mark as failed if either bench or builder indicates failure + if metadata_builder: + status_builder = metadata_builder["status"] + if status_builder == "failed": + status = "failed" + log_dvc_metric(live, "", "status", status) + + # verification steps + if "output" in metadata_bench: + if "builder_verification" in metadata_bench["output"]: + log_dvc_metric( + live, + "", + "verification", + metadata_bench["output"]["builder_verification"]["verification"], + ) + + # estimate_layer_resources.json + log_nested_metrics_from_report( + id, + live, + "estimate_layer_resources.json", + "total", + [ + "LUT", + "DSP", + "BRAM_18K", + "URAM", + ], + prefix="estimate/resources/", + ) + + # estimate_layer_resources_hls.json + log_nested_metrics_from_report( + id, + live, + "estimate_layer_resources_hls.json", + "total", + [ + "LUT", + "FF", + "DSP", + "DSP48E", + "DSP58E", # TODO: aggregate/unify DSP reporting + "BRAM_18K", + "URAM", + ], + prefix="hls_estimate/resources/", + ) + + # estimate_network_performance.json + log_metrics_from_report( + id, + live, + "estimate_network_performance.json", + [ + "critical_path_cycles", + "max_cycles", + "max_cycles_node_name", + "estimated_throughput_fps", + "estimated_latency_ns", + ], + prefix="estimate/performance/", + ) + + # rtlsim_performance.json + log_metrics_from_report( + id, + live, + "rtlsim_performance.json", + [ + "N", + "TIMEOUT", + "latency_cycles", + "cycles", + "fclk[mhz]", + "throughput[images/s]", + "stable_throughput[images/s]", + # add INPUT_DONE, OUTPUT_DONE, number transactions? 
+ ], + prefix="rtlsim/performance/", + ) + + # fifo_sizing.json + log_metrics_from_report( + id, live, "fifo_sizing.json", ["total_fifo_size_kB"], prefix="fifosizing/" + ) + + # stitched IP DCP synth resource report + log_nested_metrics_from_report( + id, + live, + "post_synth_resources_dcp.json", + "(top)", + [ + "LUT", + "FF", + "SRL", + "DSP", + "BRAM_18K", + "BRAM_36K", + "URAM", + ], + prefix="synth(dcp)/resources/", + ) + + # stitched IP DCP synth resource breakdown + # TODO: generalize to all build flows and bitfile synth + layer_categories = ["MAC", "Eltwise", "Thresholding", "FIFO", "DWC", "SWG", "Other"] + for category in layer_categories: + log_nested_metrics_from_report( + id, + live, + "res_breakdown_build_output.json", + category, + [ + "LUT", + "FF", + "SRL", + "DSP", + "BRAM_18K", + "BRAM_36K", + "URAM", + ], + prefix="synth(dcp)/resources(breakdown)/" + category + "/", + ) + + # ooc_synth_and_timing.json (OOC synth / step_out_of_context_synthesis) + log_metrics_from_report( + id, + live, + "ooc_synth_and_timing.json", + [ + "LUT", + "LUTRAM", + "FF", + "DSP", + "BRAM", + "BRAM_18K", + "BRAM_36K", + "URAM", + ], + prefix="synth(ooc)/resources/", + ) + log_metrics_from_report( + id, + live, + "ooc_synth_and_timing.json", + [ + "WNS", + "fmax_mhz", + # add TNS? what is "delay"? 
+ ], + prefix="synth(ooc)/timing/", + ) + + # post_synth_resources.json (shell synth / step_synthesize_bitfile) + log_nested_metrics_from_report( + id, + live, + "post_synth_resources.json", + "(top)", + [ + "LUT", + "FF", + "SRL", + "DSP", + "BRAM_18K", + "BRAM_36K", + "URAM", + ], + prefix="synth/resources/", + ) + + # post synth timing report + # TODO: only exported as post_route_timing.rpt, not .json + + # instrumentation measurement + log_all_metrics_from_report( + id, live, "measured_performance.json", prefix="measurement/performance/" + ) + + # IODMA validation accuracy + log_metrics_from_report( + id, + live, + "validation.json", + [ + "top-1_accuracy", + ], + prefix="measurement/validation/", + ) + + # power measurement + # TODO + + # live fifosizing report + graph png + log_metrics_from_report( + id, + live, + "fifo_sizing_report.json", + [ + "error", + "fifo_size_total_kB", + ], + prefix="fifosizing/live/", + ) + + image = os.path.join( + "measurement_artifacts", + "runs_output", + "run_%d" % (id), + "reports", + "fifo_sizing_graph.png", + ) + if os.path.isfile(image): + live.log_image("fifosizing_pass_1", image) + + # time_per_step.json + log_metrics_from_report(id, live, "time_per_step.json", ["total_build_time"]) + + # ARTIFACTS + # Log build reports as they come from GitLab artifacts, + # but copy them to a central dir first so all runs share the same path + run_report_dir1 = os.path.join( + "build_artifacts", "runs_output", "run_%d" % (id), "reports" + ) + run_report_dir2 = os.path.join( + "measurement_artifacts", "runs_output", "run_%d" % (id), "reports" + ) + dvc_report_dir = "reports" + os.makedirs(dvc_report_dir, exist_ok=True) + delete_dir_contents(dvc_report_dir) + if os.path.isdir(run_report_dir1): + shutil.copytree(run_report_dir1, dvc_report_dir, dirs_exist_ok=True) + if os.path.isdir(run_report_dir2): + shutil.copytree(run_report_dir2, dvc_report_dir, dirs_exist_ok=True) + live.log_artifact(dvc_report_dir) + + # Prepare benchmarking config 
for follow-up runs after live FIFO-sizing + folding_config_lfs_path = os.path.join( + "measurement_artifacts", + "runs_output", + "run_%d" % (id), + "reports", + "folding_config_lfs.json", + ) + if os.path.isfile(folding_config_lfs_path): + # Copy folding config produced by live FIFO-sizing + output_folding_path = os.path.join(output_folding_dir, experiment_name + ".json") + os.makedirs(output_folding_dir, exist_ok=True) + print( + "Saving lfs-generated folding config of this run to use in future builds: %s" + % output_folding_path + ) + shutil.copy(folding_config_lfs_path, output_folding_path) + + # Create benchmarking config + metadata_bench = open_json_report(id, "metadata_bench.json") + configuration = dict() + for key in metadata_bench["params"]: + # wrap in list + configuration[key] = [metadata_bench["params"][key]] + # overwrite FIFO-related params + import_folding_path = os.path.join( + os.environ.get("LOCAL_CFG_DIR"), + "lfs", + "CI_" + os.environ.get("CI_PIPELINE_ID"), + "folding", + experiment_name + ".json", + ) + configuration["live_fifo_sizing"] = [False] + configuration["auto_fifo_depths"] = [False] + configuration["target_fps"] = ["None"] + configuration["folding_config_file"] = [import_folding_path] + + follow_up_bench_cfg.append(configuration) + + # Save aggregated benchmarking config for follow-up job + if follow_up_bench_cfg: + print("Saving follow-up bench config for lfs: %s" % output_cfg_path) + with open(output_cfg_path, "w") as f: + json.dump(follow_up_bench_cfg, f, indent=2) + + print("Done") diff --git a/ci/measure.py b/ci/measure.py new file mode 100644 index 0000000000..42db938d33 --- /dev/null +++ b/ci/measure.py @@ -0,0 +1,95 @@ +import os +import shutil +import subprocess +import sys + + +def delete_dir_contents(dir): + for filename in os.listdir(dir): + file_path = os.path.join(dir, filename) + try: + if os.path.isfile(file_path) or os.path.islink(file_path): + os.unlink(file_path) + elif os.path.isdir(file_path): + 
shutil.rmtree(file_path) + except Exception as e: + print("Failed to delete %s. Reason: %s" % (file_path, e)) + + +if __name__ == "__main__": + exit_code = 0 + print("Looking for deployment packages in artifacts..") + # Find deployment packages from artifacts + artifacts_in_dir = os.path.join("build_artifacts", "runs_output") + artifacts_out_dir = os.path.join("measurement_artifacts", "runs_output") + for run in os.listdir(artifacts_in_dir): + run_in_dir = os.path.join(artifacts_in_dir, run) + run_out_dir = os.path.join(artifacts_out_dir, run) + reports_dir = os.path.join(run_out_dir, "reports") + deploy_archive = os.path.join(run_in_dir, "deploy.zip") + extract_dir = "measurement" + if os.path.isfile(deploy_archive): + print("Found deployment package in %s, extracting.." % run_in_dir) + + # Extract to temporary dir + shutil.unpack_archive(deploy_archive, extract_dir) + + # Run driver + print("Running driver..") + # run validate.py (from IODMA driver) if present, otherwise driver.py (instrumentation) + # TODO: unify IODMA/instrumentation shell & driver + if os.path.isfile(f"{extract_dir}/driver/validate.py"): + result = subprocess.run( + [ + "python", + f"{extract_dir}/driver/validate.py", + "--bitfile", + f"{extract_dir}/bitfile/finn-accel.bit", + "--settingsfile", + f"{extract_dir}/driver/settings.json", + "--reportfile", + f"{extract_dir}/validation.json", + "--dataset_root", + "/home/xilinx/datasets", # TODO: env var + ] + ) + else: + result = subprocess.run( + [ + "python", + f"{extract_dir}/driver/driver.py", + "--bitfile", + f"{extract_dir}/bitfile/finn-accel.bit", + "--settingsfile", + f"{extract_dir}/driver/settings.json", + "--reportfile", + f"{extract_dir}/measured_performance.json", + ] + ) + if result.returncode != 0: + print("Driver reported error!") + exit_code = 1 + else: + print("Driver finished successfully.") + + # Copy results back to artifact directory + for report in [ + "measured_performance.json", + "fifo_sizing_report.json", + 
"fifo_depth_export.json", + "fifo_sizing_graph.png", + "folding_config_lfs.json", + "validation.json", + ]: + report_path = os.path.join(extract_dir, report) + if os.path.isfile(report_path): + print("Copying %s to %s" % (report_path, reports_dir)) + os.makedirs(reports_dir, exist_ok=True) + shutil.copy(report_path, reports_dir) + + print("Clearing temporary directory..") + # Clear temporary dir + delete_dir_contents(extract_dir) + print("Done.") + print("Processed all deployment packages.") + sys.exit(exit_code) diff --git a/models.dvc b/models.dvc new file mode 100644 index 0000000000..35b5292128 --- /dev/null +++ b/models.dvc @@ -0,0 +1,6 @@ +outs: +- md5: 20c3f996d17ef035c8189c0d0ac44cf6.dir + size: 203029833 + nfiles: 42 + hash: md5 + path: models diff --git a/notebooks/advanced/4_advanced_builder_settings.ipynb b/notebooks/advanced/4_advanced_builder_settings.ipynb index 1e544cf513..73ae7f555c 100644 --- a/notebooks/advanced/4_advanced_builder_settings.ipynb +++ b/notebooks/advanced/4_advanced_builder_settings.ipynb @@ -964,7 +964,7 @@ "source": [ "import json\n", "\n", - "with open(build_dir+\"/output_pre_and_post_proc/auto_folding_config.json\", 'r') as json_file:\n", + "with open(build_dir+\"/output_pre_and_post_proc/report/auto_folding_config.json\", 'r') as json_file:\n", " folding_config = json.load(json_file)\n", "\n", "print(json.dumps(folding_config, indent=1))" @@ -1035,7 +1035,7 @@ "metadata": {}, "outputs": [], "source": [ - "with open(build_dir+\"/output_pre_and_post_proc/auto_folding_config.json\", 'r') as json_file:\n", + "with open(build_dir+\"/output_pre_and_post_proc/report/auto_folding_config.json\", 'r') as json_file:\n", " folding_config = json.load(json_file)\n", "\n", "# Set all ram_style to LUT RAM\n", diff --git a/notebooks/end2end_example/cybersecurity/3-build-accelerator-with-finn.ipynb b/notebooks/end2end_example/cybersecurity/3-build-accelerator-with-finn.ipynb index 7a23a3628e..39ae1dd5f6 100644 --- 
a/notebooks/end2end_example/cybersecurity/3-build-accelerator-with-finn.ipynb +++ b/notebooks/end2end_example/cybersecurity/3-build-accelerator-with-finn.ipynb @@ -323,7 +323,7 @@ "source": [ "assert os.path.exists(rtlsim_output_dir + \"/report/ooc_synth_and_timing.json\")\n", "assert os.path.exists(rtlsim_output_dir + \"/report/rtlsim_performance.json\")\n", - "assert os.path.exists(rtlsim_output_dir + \"/final_hw_config.json\")" + "assert os.path.exists(rtlsim_output_dir + \"/report/final_hw_config.json\")" ] }, { @@ -410,7 +410,7 @@ "metadata": {}, "outputs": [], "source": [ - "! cat {rtlsim_output_dir}/final_hw_config.json" + "! cat {rtlsim_output_dir}/report/final_hw_config.json" ] }, { diff --git a/src/finn/benchmarking/bench.py b/src/finn/benchmarking/bench.py new file mode 100644 index 0000000000..1a478a466c --- /dev/null +++ b/src/finn/benchmarking/bench.py @@ -0,0 +1,242 @@ +import itertools +import json +import onnxruntime as ort +import os +import sys +import time +import traceback +import yaml + +from finn.benchmarking.bench_base import bench +from finn.benchmarking.dut.mvau import bench_mvau +from finn.benchmarking.dut.synthetic_nonlinear import bench_synthetic_nonlinear +from finn.benchmarking.dut.transformer import bench_transformer +from finn.benchmarking.util import delete_dir_contents + +# Register custom bench subclasses that offer more control than YAML-based flow +dut = dict() +dut["mvau"] = bench_mvau +dut["synthetic_nonlinear"] = bench_synthetic_nonlinear +dut["transformer"] = bench_transformer + + +class PrefixPrinter(object): + """ + Create a custom stream handler that adds a prefix + """ + + def __init__(self, prefix, originalstream): + self.console = originalstream + self.prefix = prefix + self.linebuf = "" + + def write(self, buf): + for line in buf.rstrip().splitlines(): + self.console.write(f"[{self.prefix}] " + line + "\n") + + def flush(self): + self.console.flush() + + +def start_bench_run(config_name): + exit_code = 0 + # Attempt 
def start_bench_run(config_name):
    """Execute the benchmark runs assigned to this (Slurm array) task.

    Loads a YAML benchmark config, expands it into all parameter
    combinations (grid search), selects the subset of runs assigned to
    this array task, and executes each run via the registered bench class
    for its DUT.

    Args:
        config_name: Name of a config in ci/cfg/ ("manual" selects the
            config named by MANUAL_CFG_PATH), or a direct file path for
            local (non-Slurm) test runs.

    Returns:
        0 if all runs succeeded, 1 if any run failed or no DUT was
        specified, None on early abort (missing config file or this
        array task has no runs assigned).

    Fix vs. previous version: a run that raised an exception or was
    skipped could additionally be appended to ``successful_runs`` (or a
    doubly-failed run appended twice to ``failed_runs``) because the
    builder-log check ran unconditionally. Each run is now recorded in
    exactly one of successful/skipped/failed. stdout/stderr restoration
    is also moved to a ``finally`` block so the prefix wrapper can never
    leak past a run.
    """
    exit_code = 0

    # Attempt to work around onnxruntime issue on Slurm-managed clusters:
    # See https://github.com/microsoft/onnxruntime/issues/8313
    # This seems to happen only when assigned CPU cores are not contiguous
    _default_session_options = ort.capi._pybind_state.get_default_session_options()

    def get_default_session_options_new():
        _default_session_options.inter_op_num_threads = 1
        _default_session_options.intra_op_num_threads = 1
        return _default_session_options

    ort.capi._pybind_state.get_default_session_options = get_default_session_options_new

    try:
        # Launched via SLURM, expect additional CI env vars
        job_id = int(os.environ["SLURM_JOB_ID"])
        # original experiment dir (before potential copy to ramdisk):
        # experiment_dir = os.environ.get("EXPERIMENT_DIR")
        experiment_dir = os.environ.get("CI_PROJECT_DIR")
        save_dir = os.path.join(
            os.environ.get("LOCAL_ARTIFACT_DIR"),
            "CI_" + os.environ.get("CI_PIPELINE_ID") + "_" + os.environ.get("CI_PIPELINE_NAME"),
        )
        work_dir = os.environ["PATH_WORKDIR"]

        # Gather benchmarking configs
        if config_name == "manual":
            # First check if the repo contains a config with this name (in ci/cfg/*)
            config_path = os.path.join("ci", "cfg", os.environ.get("MANUAL_CFG_PATH") + ".yml")
            if not os.path.exists(config_path):
                # Otherwise look in LOCAL_CFG_DIR for the filename
                config_path = os.path.join(
                    os.environ.get("LOCAL_CFG_DIR"), os.environ.get("MANUAL_CFG_PATH")
                )
        else:
            config_path = os.path.join("ci", "cfg", config_name + ".yml")
        print("Job launched with SLURM ID: %d" % (job_id))
    except KeyError:
        # Launched without SLURM, assume test run on local machine
        job_id = 0
        experiment_dir = "bench_output/" + time.strftime("%d_%H_%M")
        save_dir = "bench_save/" + time.strftime("%d_%H_%M")
        work_dir = "bench_work"
        os.makedirs(work_dir, exist_ok=True)
        delete_dir_contents(work_dir)
        config_path = config_name  # expect caller to provide direct path to a single config file
        print("Local test job launched without SLURM")

    try:
        # Launched as SLURM job array
        array_id = int(os.environ["SLURM_ARRAY_JOB_ID"])
        task_id = int(os.environ["SLURM_ARRAY_TASK_ID"])
        task_count = int(os.environ["SLURM_ARRAY_TASK_COUNT"])
        print(
            "Launched as job array (Array ID: %d, Task ID: %d, Task count: %d)"
            % (array_id, task_id, task_count)
        )
    except KeyError:
        # Launched as single (SLURM or non-SLURM) job
        array_id = job_id
        task_id = 0
        task_count = 1
        print("Launched as single job")

    # Prepare result directory
    artifacts_dir = os.path.join(experiment_dir, "build_artifacts")
    os.makedirs(artifacts_dir, exist_ok=True)
    print("Collecting results in path: %s" % artifacts_dir)

    # Prepare local save dir for large artifacts (e.g., build output, tmp dir dump for debugging)
    os.makedirs(save_dir, exist_ok=True)
    print("Saving additional artifacts in path: %s" % save_dir)

    # Load config
    print("Loading config %s" % (config_path))
    if os.path.exists(config_path):
        with open(config_path, "r") as f:
            config = yaml.load(f, Loader=yaml.SafeLoader)
    else:
        print("ERROR: config file not found")
        return

    # Expand all specified config combinations (gridsearch)
    config_expanded = []
    for param_set in config:
        param_set_expanded = list(
            dict(zip(param_set.keys(), x)) for x in itertools.product(*param_set.values())
        )
        config_expanded.extend(param_set_expanded)

    # Save config (only first job of array) for logging purposes
    if task_id == 0:
        with open(os.path.join(artifacts_dir, "bench_config.json"), "w") as f:
            json.dump(config, f, indent=2)
        with open(os.path.join(artifacts_dir, "bench_config_exp.json"), "w") as f:
            json.dump(config_expanded, f, indent=2)

    # Determine which runs this job will work on (strided assignment across the array)
    total_runs = len(config_expanded)
    if total_runs <= task_count:
        if task_id < total_runs:
            selected_runs = [task_id]
        else:
            return
    else:
        selected_runs = list(range(task_id, total_runs, task_count))
    print(
        "STARTING JOB %d. IT WILL PERFORM %d OUT OF %d TOTAL RUNS"
        % (task_id, len(selected_runs), total_runs)
    )

    # Run benchmark
    successful_runs = []
    skipped_runs = []
    failed_runs = []
    for run, run_id in enumerate(selected_runs):
        print(
            "STARTING RUN %d/%d (ID %d OF %d TOTAL RUNS)"
            % (run + 1, len(selected_runs), run_id, total_runs)
        )

        params = config_expanded[run_id]
        print("RUN %d PARAMETERS: %s" % (run_id, str(params)))

        log_dict = {"run_id": run_id, "task_id": task_id, "params": params}

        # Create bench object for respective DUT
        if "dut" in params:
            if params["dut"] in dut:
                bench_object = dut[params["dut"]](
                    params, task_id, run_id, work_dir, artifacts_dir, save_dir
                )
            else:
                # If no custom bench subclass is defined, fall back to base class,
                # expect DUT-specific YAML definition instead
                bench_object = bench(params, task_id, run_id, work_dir, artifacts_dir, save_dir)
        else:
            print("ERROR: NO DUT SPECIFIED")
            return 1

        # Wrap stdout/stderr with an additional prefix to identify the run in the live console
        original_stdout = sys.stdout
        original_stderr = sys.stderr
        sys.stdout = PrefixPrinter("RUN %d (%s)" % (run_id, params["dut"]), sys.stdout)
        sys.stderr = PrefixPrinter("RUN %d (%s)" % (run_id, params["dut"]), sys.stderr)
        result = None
        run_error = None
        try:
            result = bench_object.run()
        except Exception:
            # Capture the traceback here; exc info is cleared once the handler exits
            run_error = traceback.format_exc()
        finally:
            # Always restore the real streams, even if the run raised
            sys.stdout = original_stdout
            sys.stderr = original_stderr

        if run_error is not None:
            log_dict["status"] = "failed"
            print("BENCH RUN %d FAILED WITH EXCEPTION: %s" % (run_id, run_error))
            failed_runs.append(run_id)
            exit_code = 1
        elif result == "skipped":
            log_dict["status"] = "skipped"
            print("BENCH RUN %d SKIPPED" % run_id)
            skipped_runs.append(run_id)
        else:
            log_dict["status"] = "ok"

        log_dict["output"] = bench_object.output_dict

        # examine status reported by builder (which catches all exceptions before they reach us)
        # we could also fail the pipeline if functional verification fails (TODO)
        # Only consult the builder log for runs that completed normally, so each
        # run is counted in exactly one of successful/skipped/failed.
        if log_dict["status"] == "ok":
            builder_log_path = os.path.join(bench_object.report_dir, "metadata_builder.json")
            if os.path.isfile(builder_log_path):
                with open(builder_log_path, "r") as f:
                    builder_log = json.load(f)
                if builder_log["status"] == "failed":
                    print("BENCH RUN %d FAILED (BUILDER REPORTED FAILURE)" % run_id)
                    failed_runs.append(run_id)
                    exit_code = 1
                else:
                    print("BENCH RUN %d COMPLETED (BUILDER REPORTED SUCCESS)" % run_id)
                    successful_runs.append(run_id)
            else:
                print("BENCH RUN %d COMPLETED" % run_id)
                successful_runs.append(run_id)

        # log metadata of this run to its own report directory
        log_path = os.path.join(bench_object.report_dir, "metadata_bench.json")
        with open(log_path, "w") as f:
            json.dump(log_dict, f, indent=2)

        # save GitLab artifacts of this run (e.g., reports and deployment package)
        bench_object.save_artifacts_collection()
        # save local artifacts of this run (e.g., full build dir, detailed debug info)
        bench_object.save_local_artifacts_collection()

    print("STOPPING JOB %d (of %d total jobs)" % (task_id, task_count))
    print("JOB %d SUCCESSFUL RUNS: %s" % (task_id, successful_runs))
    print("JOB %d SKIPPED RUNS: %s" % (task_id, skipped_runs))
    print("JOB %d FAILED RUNS: %s" % (task_id, failed_runs))
    return exit_code
def start_test_batch_fast(results_path, project_path, run_target, pairs):
    """Generate and run a Vivado power-report batch for several switching settings.

    Writes a tcl script that opens the given project/run and appends one
    power-report section per (toggle_rate, static_prob) pair, invokes Vivado
    in batch mode via a small shell wrapper, and converts each resulting XML
    report to JSON next to it.

    Args:
        results_path: Directory where Vivado writes the .xml reports and
            where the .json conversions are placed.
        project_path: Path to the Vivado project to open.
        run_target: Name of the run to open (substituted for $RUN$).
        pairs: Iterable of (toggle_rate, static_prob) tuples.
    """
    # Prepare tcl script: one report section per requested pair. Placeholders
    # in the freshly appended template section are substituted each iteration.
    script = template_open.replace("$PROJ_PATH$", project_path)
    script = script.replace("$RUN$", run_target)
    for toggle_rate, static_prob in pairs:
        script = script + template_single_test
        script = script.replace("$TOGGLE_RATE$", str(toggle_rate))
        script = script.replace("$STATIC_PROB$", str(static_prob))
        script = script.replace("$REPORT_PATH$", results_path)
        script = script.replace("$REPORT_NAME$", f"{toggle_rate}_{static_prob}")
    tcl_path = os.path.join(os.getcwd(), "power_report.tcl")
    with open(tcl_path, "w") as tcl_file:
        tcl_file.write(script)

    # Prepare bash wrapper that invokes Vivado in batch mode.
    # Fix: the file handle was previously also named `script`, shadowing the
    # tcl script string above.
    bash_script = os.path.join(os.getcwd(), "report_power.sh")
    with open(bash_script, "w") as sh_file:
        sh_file.write("#!/bin/bash \n")
        sh_file.write(f"vivado -mode batch -source {tcl_path}\n")

    # Run script (blocking); subprocess.run replaces Popen + communicate
    subprocess.run(["bash", bash_script])

    # Parse results: convert each XML power report to JSON
    for toggle_rate, static_prob in pairs:
        power_report_dict = power_xml_to_dict(f"{results_path}/{toggle_rate}_{static_prob}.xml")
        power_report_json = f"{results_path}/{toggle_rate}_{static_prob}.json"
        with open(power_report_json, "w") as json_file:
            json.dump(power_report_dict, json_file, indent=2)
def sim_power_report(results_path, project_path, in_width, out_width, dtype_width, sim_duration_ns):
    """Run a simulation-based (SAIF-annotated) Vivado power report.

    Generates a random-stimulus Verilog testbench and a tcl script that
    simulates the implemented design for sim_duration_ns, dumps switching
    activity to a SAIF file, and produces a power report, which is then
    converted from XML to JSON.

    Args:
        results_path: Directory for the resulting sim.xml / sim.json reports.
        project_path: Path to the Vivado project to open.
        in_width: Input stream width substituted into the testbench.
        out_width: Output stream width substituted into the testbench.
        dtype_width: Element bit width; also bounds the random stimulus range.
        sim_duration_ns: Simulation duration in nanoseconds (truncated to int).
    """
    cwd = os.getcwd()

    # Prepare tcl script
    script = template_open.replace("$PROJ_PATH$", project_path)
    script = script.replace("$RUN$", "impl_1")
    script = script + template_sim_power
    script = script.replace("$TB_FILE_PATH$", cwd + "/switching_simulation_tb.v")
    script = script.replace("$SAIF_FILE_PATH$", cwd + "/switching.saif")
    script = script.replace("$SIM_DURATION_NS$", str(int(sim_duration_ns)))
    script = script.replace("$REPORT_PATH$", results_path)
    script = script.replace("$REPORT_NAME$", "sim")
    with open(cwd + "/power_report.tcl", "w") as tcl_file:
        tcl_file.write(script)

    # Prepare testbench with matching stream widths and random stimulus range
    testbench = template_switching_simulation_tb.replace("$INSTREAM_WIDTH$", str(in_width))
    testbench = testbench.replace("$OUTSTREAM_WIDTH$", str(out_width))
    testbench = testbench.replace("$DTYPE_WIDTH$", str(dtype_width))
    testbench = testbench.replace(
        "$RANDOM_FUNCTION$", "$urandom_range(0, {max})".format(max=2**dtype_width - 1)
    )
    with open(cwd + "/switching_simulation_tb.v", "w") as tb_file:
        tb_file.write(testbench)

    # Prepare shell wrapper.
    # Fix: the file handle was previously named `script`, shadowing the tcl
    # script string above (same issue as in start_test_batch_fast).
    bash_script = cwd + "/report_power.sh"
    with open(bash_script, "w") as sh_file:
        sh_file.write("#!/bin/bash \n")
        sh_file.write(f"vivado -mode batch -source {cwd}/power_report.tcl\n")

    # Run Vivado (blocking); subprocess.run replaces Popen + communicate
    subprocess.run(["bash", bash_script])

    # Parse results: convert the XML power report to JSON
    power_report_dict = power_xml_to_dict(f"{results_path}/sim.xml")
    with open(f"{results_path}/sim.json", "w") as json_file:
        json.dump(power_report_dict, json_file, indent=2)
    def __init__(self, params, task_id, run_id, work_dir, artifacts_dir, save_dir, debug=True):
        """Set up one benchmark run: defaults, target platform, and artifact dirs.

        Args:
            params: Expanded parameter dict for this run; mutated in place to
                record derived defaults (clock period, board, shell flow, ...).
            task_id: Index of this Slurm array task.
            run_id: Global index of this run in the expanded config.
            work_dir: Scratch directory (e.g., RAMdisk) for build files.
            artifacts_dir: Directory for pipeline (GitLab) artifacts.
            save_dir: Directory for large local-only artifacts.
            debug: If True, the whole FINN_BUILD_DIR is saved as a local
                artifact for debugging.

        Side effects: clears FINN_BUILD_DIR and the buildflow scratch dir,
        and creates the report directory.
        """
        super().__init__()
        self.params = params
        self.task_id = task_id
        self.run_id = run_id
        self.work_dir = work_dir
        self.artifacts_dir = artifacts_dir
        self.save_dir = save_dir
        self.debug = debug

        # Setup some basic global default configuration
        # TODO: clean up or remove these attributes
        if "synth_clk_period_ns" in params:
            self.clock_period_ns = params["synth_clk_period_ns"]
        else:
            self.clock_period_ns = 10  # default target clock period in ns
        self.params["synth_clk_period_ns"] = self.clock_period_ns

        # TODO: do not allow multiple targets in a single bench job due to measurement?
        if "board" in params:
            self.board = params["board"]
        else:
            self.board = "RFSoC2x2"  # default target board
        self.params["board"] = self.board

        # Resolve the FPGA part: explicit param wins, else look up by board
        if "part" in params:
            self.part = params["part"]
        elif self.board in part_map:
            self.part = part_map[self.board]
        else:
            raise Exception("No part specified for board %s" % self.board)

        # Alveo boards go through the Vitis flow, everything else through Zynq
        if self.board in alveo_part_map:
            self.params["shell_flow_type"] = build_cfg.ShellFlowType.VITIS_ALVEO
            self.params["vitis_platform"] = alveo_default_platform[self.board]
        else:
            self.params["shell_flow_type"] = build_cfg.ShellFlowType.VIVADO_ZYNQ

        # Load custom (= non build_dataflow_config) parameters from topology-specific .yml
        custom_params = [
            "model_dir",  # used to setup onnx/npy input
            "model_path",  # used to setup onnx/npy input
            # model-gen parameters, such as seed, simd, pe, etc.
            # TODO: separate these more cleanly from builder options
        ]

        dut_yaml_name = self.params["dut"] + ".yml"
        dut_path = os.path.join(os.path.dirname(__file__), "dut", dut_yaml_name)
        if os.path.isfile(dut_path):
            with open(dut_path, "r") as f:
                dut_cfg = yaml.load(f, Loader=yaml.SafeLoader)
            # Only whitelisted custom keys are copied into the run params
            for key in dut_cfg:
                if key in custom_params:
                    self.params[key] = dut_cfg[key]

        # Clear FINN tmp build dir before every run
        print("Clearing FINN BUILD DIR ahead of run")
        delete_dir_contents(os.environ["FINN_BUILD_DIR"])

        # Initialize dictionary to collect all benchmark results
        # TODO: remove completely or only use for meta data,
        # actual results go into run-specific .json files within /report
        self.output_dict = {}

        # Inputs (e.g., ONNX model, golden I/O pair, folding config, etc.)
        self.build_inputs = {}

        # Collect tuples of (name, source path, archive?) to save as pipeline artifacts
        self.artifacts_collection = []

        # Collect tuples of (name, source path, archive?) to save as local artifacts
        self.local_artifacts_collection = []
        if self.debug:
            # Save entire FINN_BUILD_DIR
            # TODO: add option to only save upon error/exception
            self.local_artifacts_collection.append(
                ("debug_finn_tmp", os.environ["FINN_BUILD_DIR"], True)
            )

        # SETUP
        # Use a temporary dir for buildflow-related files (next to FINN_BUILD_DIR)
        # Ensure it exists but is empty (clear potential artifacts from previous runs)
        tmp_buildflow_dir = os.path.join(self.work_dir, "buildflow")
        os.makedirs(tmp_buildflow_dir, exist_ok=True)
        delete_dir_contents(tmp_buildflow_dir)
        self.build_inputs["build_dir"] = os.path.join(
            tmp_buildflow_dir, "build_output"
        )  # TODO remove in favor of self.build_dir
        self.build_dir = os.path.join(tmp_buildflow_dir, "build_output")
        self.report_dir = os.path.join(self.build_dir, "report")
        os.makedirs(self.report_dir, exist_ok=True)

        # Save full build dir as local artifact
        self.local_artifacts_collection.append(("build_output", self.build_dir, False))
        # Save reports and deployment package as pipeline artifacts
        self.artifacts_collection.append(("reports", self.report_dir, False))
        self.artifacts_collection.append(
            ("reports", os.path.join(self.build_dir, "build_dataflow.log"), False)
        )
        self.artifacts_collection.append(("deploy", os.path.join(self.build_dir, "deploy"), True))
to save as local artifacts + self.local_artifacts_collection = [] + if self.debug: + # Save entire FINN_BUILD_DIR + # TODO: add option to only save upon error/exception + self.local_artifacts_collection.append( + ("debug_finn_tmp", os.environ["FINN_BUILD_DIR"], True) + ) + + # SETUP + # Use a temporary dir for buildflow-related files (next to FINN_BUILD_DIR) + # Ensure it exists but is empty (clear potential artifacts from previous runs) + tmp_buildflow_dir = os.path.join(self.work_dir, "buildflow") + os.makedirs(tmp_buildflow_dir, exist_ok=True) + delete_dir_contents(tmp_buildflow_dir) + self.build_inputs["build_dir"] = os.path.join( + tmp_buildflow_dir, "build_output" + ) # TODO remove in favor of self.build_dir + self.build_dir = os.path.join(tmp_buildflow_dir, "build_output") + self.report_dir = os.path.join(self.build_dir, "report") + os.makedirs(self.report_dir, exist_ok=True) + + # Save full build dir as local artifact + self.local_artifacts_collection.append(("build_output", self.build_dir, False)) + # Save reports and deployment package as pipeline artifacts + self.artifacts_collection.append(("reports", self.report_dir, False)) + self.artifacts_collection.append( + ("reports", os.path.join(self.build_dir, "build_dataflow.log"), False) + ) + self.artifacts_collection.append(("deploy", os.path.join(self.build_dir, "deploy"), True)) + + def save_artifact(self, target_path, source_path, archive=False): + if os.path.isdir(source_path): + if archive: + os.makedirs(os.path.dirname(target_path), exist_ok=True) + shutil.make_archive(target_path, "zip", source_path) + else: + os.makedirs(target_path, exist_ok=True) + copytree(source_path, target_path, dirs_exist_ok=True) + elif os.path.isfile(source_path): + os.makedirs(target_path, exist_ok=True) + shcopy(source_path, target_path) + + def save_artifacts_collection(self): + # this should be called upon successful or failed completion of a run + for name, source_path, archive in self.artifacts_collection: + 
    def save_artifacts_collection(self):
        """Copy all registered pipeline artifacts to the GitLab artifacts dir."""
        # this should be called upon successful or failed completion of a run
        for name, source_path, archive in self.artifacts_collection:
            target_path = os.path.join(
                self.artifacts_dir, "runs_output", "run_%d" % (self.run_id), name
            )
            self.save_artifact(target_path, source_path, archive)

    def save_local_artifacts_collection(self):
        """Copy all registered large/local artifacts to the local save dir."""
        # this should be called upon successful or failed completion of a run
        for name, source_path, archive in self.local_artifacts_collection:
            target_path = os.path.join(self.save_dir, name, "run_%d" % (self.run_id))
            self.save_artifact(target_path, source_path, archive)

    # must be defined by subclass
    def step_export_onnx(self):
        """Generate the input ONNX model; no-op in the base class."""
        pass

    # can be overwritten by subclass if setup is too complex for YAML definition
    def step_build_setup(self):
        """Build the DataflowBuildConfig from the DUT's YAML definition.

        Raises:
            Exception: If no <dut>.yml exists next to this module under dut/.
        """
        dut_yaml_name = self.params["dut"] + ".yml"
        dut_path = os.path.join(os.path.dirname(__file__), "dut", dut_yaml_name)
        if os.path.isfile(dut_path):
            with open(dut_path, "r") as f:
                return DataflowBuildConfig.from_yaml(f)
        else:
            raise Exception("No DUT-specific YAML build definition found")

    # defaults to normal build flow, may be overwritten by subclass
    def run(self):
        """Entry point for one benchmark run; returns "skipped" or None."""
        return self.steps_full_build_flow()

    def step_parse_builder_output(self, build_dir):
        """Extract the functional-verification verdict from the builder output.

        Scans build_dir/verification_output for .npy dumps whose filenames end
        in the step's status string and records an overall success/fail verdict
        in self.output_dict. Does nothing if no verification output exists.
        """
        # TODO: output as .json or even add as new build step
        # CHECK FOR VERIFICATION STEP SUCCESS
        if os.path.exists(os.path.join(build_dir, "verification_output")):
            # Collect all verification output filenames
            outputs = glob.glob(os.path.join(build_dir, "verification_output/*.npy"))
            # Extract the verification status for each verification output by matching
            # to the SUCCESS string contained in the filename
            # NOTE(review): an empty outputs list yields all(...) == True and thus
            # reports "success" — confirm this is intended.
            status = all([out.split("_")[-1].split(".")[0] == "SUCCESS" for out in outputs])

            # Construct a dictionary reporting the verification status as string
            self.output_dict["builder_verification"] = {
                "verification": {True: "success", False: "fail"}[status]
            }
            # TODO: mark job as failed if verification fails?
    def steps_full_build_flow(self):
        """Default benchmark sequence: obtain a model, then run the FINN builder.

        Resolves the input ONNX model (pre-supplied dir, explicit path, or
        generated by the DUT subclass), assembles the build config, overlays
        run-specific parameters, runs build_dataflow_cfg, and parses the
        builder output.

        Returns:
            "skipped" if the DUT could not generate a model for the given
            parameters, otherwise None.
        """
        # Default step sequence for benchmarking a full FINN builder flow
        # MODEL CREATION/IMPORT
        # TODO: track fixed input onnx models with DVC
        if "model_dir" in self.params:
            # input ONNX model and verification input/output pairs are provided
            model_dir = self.params["model_dir"]
            self.build_inputs["onnx_path"] = os.path.join(model_dir, "model.onnx")
            self.build_inputs["input_npy_path"] = os.path.join(model_dir, "inp.npy")
            self.build_inputs["output_npy_path"] = os.path.join(model_dir, "out.npy")
        elif "model_path" in self.params:
            self.build_inputs["onnx_path"] = self.params["model_path"]
        else:
            # input ONNX model (+ optional I/O pair for verification) will be generated
            self.build_inputs["onnx_path"] = os.path.join(
                self.build_inputs["build_dir"], "model_export.onnx"
            )
            if self.step_export_onnx(self.build_inputs["onnx_path"]) == "skipped":
                # microbenchmarks might skip because no model can be generated for given params
                return "skipped"

        # BUILD SETUP
        # Initialize from YAML (default) or custom script (if dedicated subclass is defined)
        cfg = self.step_build_setup()

        # Set some global defaults (could still be overwritten by run-specific YAML)
        cfg.output_dir = self.build_inputs["build_dir"]
        # enable extra performance optimizations (physopt)
        # TODO: check OMX synth strategy again!
        cfg.vitis_opt_strategy = build_cfg.VitisOptStrategy.PERFORMANCE_BEST
        cfg.verbose = True
        cfg.console_log_level = "ERROR"
        cfg.enable_build_pdb_debug = False
        # cfg.stitched_ip_gen_dcp = False  # only needed for further manual integration
        cfg.force_python_rtlsim = False
        cfg.split_large_fifos = True
        cfg.save_intermediate_models = True  # Save the intermediate model graphs
        cfg.verify_save_full_context = True  # Output full context dump for verification steps
        cfg.enable_instrumentation = True
        # rtlsim_use_vivado_comps  # TODO ?
        # cfg.default_swg_exception
        # cfg.large_fifo_mem_style

        # Overwrite build config settings with run-specific YAML build definition
        # TODO: warn/error if there are unrecognized options set?
        # Any run param whose name matches a cfg attribute overrides the default
        for key in self.params:
            if hasattr(cfg, key):
                setattr(cfg, key, self.params[key])

        # Default of 1M cycles is insufficient for MetaFi (6M) and RN-50 (2.5M)
        # TODO: make configurable or set on pipeline level?
        os.environ["LIVENESS_THRESHOLD"] = "10000000"

        # BUILD
        build.build_dataflow_cfg(self.build_inputs["onnx_path"], cfg)

        # ANALYSIS
        self.step_parse_builder_output(self.build_inputs["build_dir"])
    def _make_single_mvau_model(
        self,
        W,
        numInputVectors,
        pe,
        simd,
        m,
        wdt,
        idt,
        odt,
        T=None,
        tdt=None,
        mem_mode="const",
        ram_style="auto",
        ram_style_thresholds="auto",
    ):
        """Build a single-node MVAU_hls ONNX model around weight matrix W.

        Args:
            W: Weight matrix of shape (MW, MH); MW/MH are derived from it.
            numInputVectors: [N] for dense or [N, H, W] for conv inputs.
            pe, simd, m: Folding/parallelism attributes for the node.
            wdt, idt, odt: FINN DataTypes for weights, input, and output.
            T: Optional threshold matrix; if None, noActivation is set.
            tdt: DataType of the thresholds (required when T is given).
            mem_mode, ram_style, ram_style_thresholds: Node implementation
                attributes passed through to the custom op.

        Returns:
            ModelWrapper with minimized weight/accumulator bit widths.
        """
        mw = W.shape[0]
        mh = W.shape[1]

        # there are two ways to implement bipolar weights and inputs for
        # MatrixVectorActivation:
        # - specify their datatypes as such
        # - specify their datatypes as BINARY as use binaryXnorMode
        if wdt == DataType["BIPOLAR"] and idt == DataType["BIPOLAR"]:
            # we'll internally convert weights/inputs to binary and specify the
            # datatypes as such, and also set the binaryXnorMode attribute to 1
            export_wdt = DataType["BINARY"]
            export_idt = DataType["BINARY"]
            binary_xnor_mode = 1
        else:
            export_wdt = wdt
            export_idt = idt
            binary_xnor_mode = 0

        # numInputVectors for dense = [N]
        # numInputVectors for conv = [N, H, W]
        inp = helper.make_tensor_value_info("inp", TensorProto.FLOAT, numInputVectors + [mw])
        outp = helper.make_tensor_value_info("outp", TensorProto.FLOAT, numInputVectors + [mh])
        if T is not None:
            # thresholds present: activation output with bias ActVal
            no_act = 0
            node_inp_list = ["inp", "weights", "thresh"]
            if odt == DataType["BIPOLAR"]:
                actval = 0
            else:
                actval = odt.min()
        else:
            # no thresholds
            node_inp_list = ["inp", "weights"]
            actval = 0
            no_act = 1
        mvau_node = helper.make_node(
            "MVAU_hls",  # TODO: add rtl support (configurable as param)
            node_inp_list,
            ["outp"],
            domain="finn.custom_op.fpgadataflow.hls",
            backend="fpgadataflow",
            MW=mw,
            MH=mh,
            SIMD=simd,
            PE=pe,
            M=m,
            numInputVectors=numInputVectors,
            inputDataType=export_idt.name,
            weightDataType=export_wdt.name,
            outputDataType=odt.name,
            ActVal=actval,
            binaryXnorMode=binary_xnor_mode,
            noActivation=no_act,
            resType="lut",
            mem_mode=mem_mode,
            ram_style=ram_style,
            ram_style_thresholds=ram_style_thresholds,
            runtime_writeable_weights=0,
        )

        graph = helper.make_graph(
            nodes=[mvau_node], name="mvau_graph", inputs=[inp], outputs=[outp]
        )
        model = qonnx_make_model(graph, producer_name="mvau-model")
        model = ModelWrapper(model)

        model.set_tensor_datatype("inp", idt)
        model.set_tensor_datatype("outp", odt)
        model.set_tensor_datatype("weights", wdt)
        # model.set_tensor_shape("weights", (channels, 1, k_h, k_w)) from VVAU
        if binary_xnor_mode:
            # convert bipolar to binary
            model.set_initializer("weights", (W + 1) / 2)
        else:
            model.set_initializer("weights", W)
        if T is not None:
            model.set_tensor_datatype("thresh", tdt)
            model.set_initializer("thresh", T)

        # Minimize weight & accumulator width to obtain realistic resource consumption
        # model = model.transform(InferShapes())
        model = model.transform(MinimizeWeightBitWidth())
        model = model.transform(MinimizeAccumulatorWidth())
        model = model.transform(InferDataTypes())

        return model
    def step_export_onnx(self, onnx_export_path):
        """Generate the MVAU microbenchmark model and save it to onnx_export_path.

        Derives SIMD/PE from the folding params, generates (optionally
        sparsified) random weights and thresholds, builds the single-node
        MVAU model, logs model statistics to report/dut_info.json, and saves
        the ONNX file.

        Returns:
            "skipped" if the parameter combination is infeasible (invalid
            folding, inconsistent sparsity settings), otherwise None.
        """
        # Read params
        idt = self.params["idt"]
        wdt = self.params["wdt"]
        act = self.params["act"]

        numInputVectors = self.params["nhw"]
        mw = self.params["mw"]
        mh = self.params["mh"]
        sf = self.params["sf"]
        nf = self.params["nf"]
        m = self.params["m"]

        mem_mode = self.params["mem_mode"]
        ram_style = self.params["ram_style"]
        ram_style_thr = self.params["ram_style_thr"]

        output_dict = {}

        # convert string to FINN DataType
        idt = DataType[idt]
        wdt = DataType[wdt]
        if act is not None:
            act = DataType[act]

        # Determine and log folding (sf/nf of -1 select maximum folding)
        if sf == -1:
            sf = mw
        simd = mw // sf
        if nf == -1:
            nf = mh
        pe = mh // nf
        if mw % simd != 0 or mh % pe != 0:
            print("Invalid simd/pe configuration, skipping")
            return "skipped"
        if m > 1 and (simd != mw or pe != mh):
            print("M > 1 not possible for non-max simd/pe, skipping")
            return "skipped"
        output_dict["simd"] = simd
        output_dict["pe"] = pe

        # Generate weights (fixed seed for reproducible benchmark inputs)
        np.random.seed(123456)  # TODO: verify or switch to modern numpy random generation

        W = gen_finn_dt_tensor(wdt, (mw, mh))

        if "sparsity_type" in self.params:
            sparsity_type = self.params["sparsity_type"]
        else:
            sparsity_type = "none"

        if sparsity_type == "none":
            # no sparsity requested: reject a contradictory non-zero amount
            if "sparsity_amount" in self.params:
                if self.params["sparsity_amount"] > 0:
                    print("sparsity amount > 0 not applicable for none sparsity, skipping")
                    return "skipped"
        else:
            if self.params["sparsity_amount"] == 0:
                print("sparsity amount = 0 not applicable for selected sparsity, skipping")
                return "skipped"
            # apply the requested sparsity pattern to W
            if sparsity_type == "unstructured":
                # zero a random subset of individual weights
                idx = np.random.choice(
                    mw * mh, size=int(self.params["sparsity_amount"] * mw * mh), replace=False
                )
                W = np.reshape(W, -1)
                W[idx] = 0.0
                W = np.reshape(W, (mw, mh))
            elif sparsity_type == "rows_random":
                idx_mw = np.random.choice(
                    mw, size=int(self.params["sparsity_amount"] * mw), replace=False
                )
                W[idx_mw, :] = 0.0
            elif sparsity_type == "cols_random":
                idx_mh = np.random.choice(
                    mh, size=int(self.params["sparsity_amount"] * mh), replace=False
                )
                W[:, idx_mh] = 0.0
            elif sparsity_type == "rows_regular":
                # zero every k-th row to reach the requested amount
                if self.params["sparsity_amount"] == 0.25:
                    idx_mw = np.arange(0, mw, step=4)
                elif self.params["sparsity_amount"] == 0.5:
                    idx_mw = np.arange(0, mw, step=2)
                elif self.params["sparsity_amount"] == 0.75:
                    idx_mw = np.concatenate(
                        (
                            np.arange(0, mw, step=4),
                            np.arange(1, mw, step=4),
                            np.arange(2, mw, step=4),
                        )
                    )
                else:
                    print("regular sparsity only applicable for amount 0.25/0.5/0.75, skipping")
                    return "skipped"
                W[idx_mw, :] = 0.0
            elif sparsity_type == "cols_regular":
                if self.params["sparsity_amount"] == 0.25:
                    idx_mh = np.arange(0, mh, step=4)
                elif self.params["sparsity_amount"] == 0.5:
                    idx_mh = np.arange(0, mh, step=2)
                elif self.params["sparsity_amount"] == 0.75:
                    idx_mh = np.concatenate(
                        (
                            np.arange(0, mh, step=4),
                            np.arange(1, mh, step=4),
                            np.arange(2, mh, step=4),
                        )
                    )
                else:
                    print("regular sparsity only applicable for amount 0.25/0.5/0.75, skipping")
                    return "skipped"
                W[:, idx_mh] = 0.0

            else:
                print("ERROR: unknown sparsity type")
                raise Exception("ERROR: unknown sparsity type")

        # TODO: implement enforce option which prevents naturally occurring sparsity
        # params["sparsity_enforce"]
        # TODO: implement distribution option which selects between uniform/normal/??
        # params["sparsity_distribution"]

        # log resulting sparsity statistics
        # could be higher than selected due to naturally occurring sparsity
        num_zeros = (W == 0).sum()
        num_ones = (W == 1).sum() + (W == -1).sum()
        num_p2 = 0
        # count weights that are (negative) powers of two ("easy" multiplications)
        for w in np.nditer(W):
            if w != 0 and w != 1 and w != -1:
                if w > 0:
                    if math.log2(w).is_integer():
                        num_p2 = num_p2 + 1
                else:
                    if math.log2(-w).is_integer():
                        num_p2 = num_p2 + 1
        output_dict["zero_weights"] = round(num_zeros / W.size, 2)
        output_dict["easy_weights"] = round((num_zeros + num_ones + num_p2) / W.size, 2)

        # Generate thresholds
        if act is None:
            # no activation, produce accumulators
            T = None
            tdt = None
            if wdt == DataType["BIPOLAR"] and idt == DataType["BIPOLAR"]:
                odt = DataType["UINT32"]
            else:
                odt = DataType["INT32"]
        else:
            odt = act
            # set range for threshold values according to worst-case accumulator range
            # (not weight value specific)
            # this could result in some thresholds being clipped by MinimizeAccumulatorWidth
            # lower_range = calculate_matvec_accumulator_range(wdt.min() * np.ones_like(W), idt)
            # upper_range = calculate_matvec_accumulator_range(wdt.max() * np.ones_like(W), idt)
            # acc_min = min(min(lower_range), min(upper_range))
            # acc_max = max(max(lower_range), max(upper_range))
            # set range for threshold values according to actual accumulator range
            # for the generated weights
            (acc_min, acc_max) = calculate_matvec_accumulator_range(W, idt)
            n_steps = act.get_num_possible_values() - 1
            T = np.random.randint(acc_min, acc_max - 1, (mh, n_steps)).astype(np.float32)
            # provide non-decreasing thresholds
            T = np.sort(T, axis=1)
            # generate thresholds for activation
            if wdt == DataType["BIPOLAR"] and idt == DataType["BIPOLAR"]:
                tdt = DataType["UINT32"]
                # bias thresholds to be positive
                T = np.ceil((T + mw) / 2)
                assert (T >= 0).all()
            else:
                tdt = DataType["INT32"]

        # Create model
        model = self._make_single_mvau_model(
            W,
            numInputVectors,
            pe,
            simd,
            m,
            wdt,
            idt,
            odt,
            T,
            tdt,
            mem_mode,
            ram_style,
            ram_style_thr,
        )
        model = model.transform(GiveUniqueNodeNames())
        # node = model.get_nodes_by_op_type("MVAU_hls")[0]
        # inst = getCustomOp(node)

        # display results of analysis passes only for the first occurence of this op type
        self.target_node = "MVAU_hls"

        # log additional info about the generated model (e.g. SIMD/PE or sparsity)
        with open(self.build_inputs["build_dir"] + "/report/dut_info.json", "w") as f:
            json.dump(output_dict, f, indent=2)

        # TODO: also generate golden I/O pair for further verification steps
        model.save(onnx_export_path)
SIMD/PE or sparsity) + with open(self.build_inputs["build_dir"] + "/report/dut_info.json", "w") as f: + json.dump(output_dict, f, indent=2) + + # TODO: also generate golden I/O pair for further verification steps + model.save(onnx_export_path) + + def step_build_setup(self): + # create build config for synthetic microbenchmark models + cfg = build_cfg.DataflowBuildConfig( + # manual folding + target_fps=None, + steps=[ + "step_create_dataflow_partition", + "step_minimize_bit_width", + "step_generate_estimate_reports", + "step_hw_codegen", + "step_hw_ipgen", + "step_create_stitched_ip", + "step_measure_rtlsim_performance", + "step_out_of_context_synthesis", + "step_synthesize_bitfile", + "step_make_driver", + "step_deployment_package", + ], + ) + return cfg diff --git a/src/finn/benchmarking/dut/resnet50.yml b/src/finn/benchmarking/dut/resnet50.yml new file mode 100644 index 0000000000..c8779e5654 --- /dev/null +++ b/src/finn/benchmarking/dut/resnet50.yml @@ -0,0 +1,26 @@ +model_path: models/resnet50/resnet50_w1a2_exported.onnx +folding_config_file: models/resnet50/U250_folding_config_live_fifo.json +specialize_layers_config_file: models/resnet50/U250_specialize_layers.json +vitis_floorplan_file: models/resnet50/floorplan_resnet50.json + +steps: + - finn.builder.custom_step_library.resnet.step_resnet50_tidy # Custom step + - finn.builder.custom_step_library.resnet.step_resnet50_streamline # Custom step + - finn.builder.custom_step_library.resnet.step_resnet50_convert_to_hw # Custom step + - step_create_dataflow_partition + - step_specialize_layers + - step_apply_folding_config + - step_minimize_bit_width + - step_generate_estimate_reports + - step_set_fifo_depths + - step_hw_codegen + - step_hw_ipgen + - step_create_stitched_ip + - step_measure_rtlsim_performance + - step_out_of_context_synthesis + - step_synthesize_bitfile + - step_make_driver + - step_deployment_package + +# folding config comes with FIFO sizes +auto_fifo_depths: False diff --git 
a/src/finn/benchmarking/dut/synthetic_nonlinear.py b/src/finn/benchmarking/dut/synthetic_nonlinear.py new file mode 100644 index 0000000000..ff33436976 --- /dev/null +++ b/src/finn/benchmarking/dut/synthetic_nonlinear.py @@ -0,0 +1,288 @@ +import numpy as np +from onnx import TensorProto, helper +from qonnx.core.datatype import DataType +from qonnx.core.modelwrapper import ModelWrapper +from qonnx.custom_op.general.im2col import compute_conv_output_dim +from qonnx.transformation.general import ( + GiveRandomTensorNames, + GiveReadableTensorNames, + GiveUniqueNodeNames, + GiveUniqueParameterTensors, +) +from qonnx.transformation.infer_data_layouts import InferDataLayouts +from qonnx.transformation.infer_datatypes import InferDataTypes +from qonnx.transformation.infer_shapes import InferShapes +from qonnx.transformation.merge_onnx_models import MergeONNXModels +from qonnx.util.basic import gen_finn_dt_tensor, qonnx_make_model + +import finn.builder.build_dataflow_config as build_cfg +from finn.benchmarking.bench_base import bench + +from finn.util.basic import make_build_dir + + +def generate_random_threshold_values( + data_type, num_input_channels, num_steps, narrow=False, per_tensor=False +): + if per_tensor: + num_input_channels = 1 + if narrow: + num_steps -= 1 + + return np.random.randint( + data_type.min(), + data_type.max() + 1, + (num_input_channels, num_steps), + ).astype(np.float32) + + +def sort_thresholds_increasing(thresholds): + return np.sort(thresholds, axis=1) + + +def make_conv_building_block(ifm_dim, ch, kernel_size, simd, pe, parallel_window=0): + # hardcoded parameters + idt = DataType["UINT4"] + wdt = DataType["UINT4"] + odt = DataType["UINT4"] + tdt = DataType["UINT32"] + stride = 1 + in_ch = out_ch = ch # input channel = output channel for stacking + # pad so that input dim = output dim for stacking (only supports odd kernel_size for now) + pad = int(np.floor(kernel_size / 2)) + + total_pad = 2 * pad + out_feature_dim = 
compute_conv_output_dim(ifm_dim, kernel_size, stride, total_pad) + weights_shape = [in_ch * kernel_size * kernel_size, out_ch] + thresholds_shape = [1, odt.get_num_possible_values() - 1] + input_shape = [1, ifm_dim, ifm_dim, in_ch] + padding_out_shape = [1, ifm_dim + total_pad, ifm_dim + total_pad, in_ch] + inpgen_out_shape = [1, out_feature_dim, out_feature_dim, in_ch * kernel_size * kernel_size] + output_shape = [1, out_feature_dim, out_feature_dim, out_ch] + + assert input_shape == output_shape, "ERROR: Conv layer dimensions not stackable" + + padding_config = {} + padding_config["domain"] = "finn.custom_op.fpgadataflow.rtl" + padding_config["backend"] = "fpgadataflow" + padding_config["ImgDim"] = [ifm_dim, ifm_dim] + padding_config["NumChannels"] = in_ch + padding_config["SIMD"] = simd + padding_config["Padding"] = [pad, pad, pad, pad] + padding_config["inputDataType"] = idt.name + + inpgen_config = {} + inpgen_config["domain"] = "finn.custom_op.fpgadataflow.rtl" + inpgen_config["backend"] = "fpgadataflow" + inpgen_config["ConvKernelDim"] = [kernel_size, kernel_size] + inpgen_config["IFMChannels"] = in_ch + inpgen_config["IFMDim"] = [ifm_dim + total_pad, ifm_dim + total_pad] + inpgen_config["OFMDim"] = [ifm_dim, ifm_dim] + inpgen_config["inputDataType"] = idt.name + inpgen_config["outputDataType"] = idt.name + inpgen_config["SIMD"] = simd + inpgen_config["parallel_window"] = parallel_window + inpgen_config["Stride"] = [stride, stride] + inpgen_config["Dilation"] = [1, 1] + + mvau_config = {} + mvau_config["domain"] = "finn.custom_op.fpgadataflow.hls" + mvau_config["backend"] = "fpgadataflow" + mvau_config["numInputVectors"] = [1, ifm_dim, ifm_dim] + mvau_config["MW"] = in_ch * kernel_size * kernel_size + mvau_config["MH"] = in_ch + mvau_config["SIMD"] = simd if parallel_window == 0 else simd * kernel_size * kernel_size + mvau_config["PE"] = pe + mvau_config["resType"] = "lut" + mvau_config["mem_mode"] = "internal_embedded" # internal_decoupled + 
mvau_config["inputDataType"] = idt.name + mvau_config["weightDataType"] = wdt.name + mvau_config["outputDataType"] = odt.name + + top_in = helper.make_tensor_value_info("top_in", TensorProto.FLOAT, input_shape) + top_out = helper.make_tensor_value_info("top_out", TensorProto.FLOAT, output_shape) + value_info = [ + helper.make_tensor_value_info("weights", TensorProto.FLOAT, weights_shape), + helper.make_tensor_value_info("thresholds", TensorProto.FLOAT, thresholds_shape), + helper.make_tensor_value_info("padding_out", TensorProto.FLOAT, padding_out_shape), + helper.make_tensor_value_info("inpgen_out", TensorProto.FLOAT, inpgen_out_shape), + ] + + modelproto = qonnx_make_model( + helper.make_graph( + name="building_block", + inputs=[top_in], + outputs=[top_out], + value_info=value_info, + nodes=[ + helper.make_node("FMPadding_rtl", ["top_in"], ["padding_out"], **padding_config), + helper.make_node( + "ConvolutionInputGenerator_rtl", + ["padding_out"], + ["inpgen_out"], + **inpgen_config, + ), + helper.make_node( + "MVAU_hls", ["inpgen_out", "weights", "thresholds"], ["top_out"], **mvau_config + ), + ], + ) + ) + + model = ModelWrapper(modelproto) + model.set_tensor_datatype("top_in", idt) + model.set_tensor_layout("top_in", ["N", "H", "W", "C"]) + model.set_tensor_datatype("top_out", odt) + model.set_tensor_datatype("weights", wdt) + model.set_tensor_datatype("thresholds", tdt) + + weights = gen_finn_dt_tensor(wdt, weights_shape) + # TODO: thresholds are all the same + thresholds = generate_random_threshold_values( + tdt, out_ch, odt.get_num_possible_values() - 1, False, True + ) + thresholds = sort_thresholds_increasing(thresholds) + + model.set_initializer("weights", weights) + model.set_initializer("thresholds", thresholds) + + model = model.transform(GiveUniqueNodeNames()) + model = model.transform(InferShapes()) + model = model.transform(InferDataTypes()) + + return model + + +def combine_blocks(lb, rb, ifm_dim, ch, pe): + # assumes left branch (lb) and right 
branch (rb) each have a + # single (dynamic) input/output with the same shape + + # to avoid mix-ups, start by giving all tensors random names + lb = lb.transform(GiveRandomTensorNames()) + rb = rb.transform(GiveRandomTensorNames()) + # erase all node names to avoid conflict + for n in lb.graph.node: + n.name = "" + for n in rb.graph.node: + n.name = "" + + lb_input = lb.graph.input[0] + lb_output = lb.graph.output[0] + rb_input = rb.graph.input[0] + rb_output = rb.graph.output[0] + + top_in = helper.make_tensor_value_info("top_in", TensorProto.FLOAT, [1, ifm_dim, ifm_dim, ch]) + top_out = helper.make_tensor_value_info("top_out", TensorProto.FLOAT, [1, ifm_dim, ifm_dim, ch]) + + dup_config = {} + dup_config["domain"] = "finn.custom_op.fpgadataflow.hls" + dup_config["backend"] = "fpgadataflow" + dup_config["numInputVectors"] = [1, ifm_dim, ifm_dim] + dup_config["NumChannels"] = ch + dup_config["PE"] = pe + dup_config["NumOutputStreams"] = 2 + dup_config["inputDataType"] = lb.get_tensor_datatype(lb_input.name).name + # We always need to set outFIFODepths explictly for DuplicateStreams + # because it has no default value that corresponds automatically to NumOutputStreams + dup_config["outFIFODepths"] = [2] * 2 + + add_config = {} + add_config["domain"] = "finn.custom_op.fpgadataflow.hls" + add_config["backend"] = "fpgadataflow" + add_config["numInputVectors"] = [1, ifm_dim, ifm_dim] + add_config["NumChannels"] = ch + add_config["PE"] = pe + add_config["inputDataType"] = lb.get_tensor_datatype(lb_output.name).name + + nodes_lb = [node for node in lb.graph.node] + nodes_rb = [node for node in rb.graph.node] + nodes_new = ( + nodes_lb + + nodes_rb + + [ + helper.make_node( + "DuplicateStreams_hls", ["top_in"], [lb_input.name, rb_input.name], **dup_config + ), + helper.make_node( + "AddStreams_hls", [lb_output.name, rb_output.name], ["top_out"], **add_config + ), + ] + ) + + value_info_lb = [x for x in lb.graph.value_info] + value_info_rb = [x for x in 
rb.graph.value_info] + value_info_new = value_info_lb + value_info_rb + [lb_input, lb_output, rb_input, rb_output] + + initializer_lb = [x for x in lb.graph.initializer] + initializer_rb = [x for x in rb.graph.initializer] + initializer_new = initializer_lb + initializer_rb + modelproto = qonnx_make_model( + helper.make_graph( + name="branching_model", + inputs=[top_in], + outputs=[top_out], + value_info=value_info_new, + nodes=nodes_new, + ) + ) + + model = ModelWrapper(modelproto) + model.set_tensor_datatype("top_in", lb.get_tensor_datatype(lb_input.name)) + model.set_tensor_layout("top_in", lb.get_tensor_layout(lb_input.name)) + for i in initializer_new: + model.graph.initializer.append(i) + + # tidy-up + model = model.transform(InferShapes()) + model = model.transform(InferDataTypes()) + model = model.transform(InferDataLayouts()) + model = model.transform(GiveUniqueNodeNames()) + model = model.transform(GiveUniqueParameterTensors()) + model = model.transform(GiveReadableTensorNames()) + return model + + +class bench_synthetic_nonlinear(bench): + def step_export_onnx(self, onnx_export_path): + np.random.seed(0) + tmp_output_dir = make_build_dir("test_fifosizing") + + # TODO: allow manual folding/fifo config as input + # TODO: how to determine rtlsim_n automatically? 
+ + # conv parameters + dim = self.params["dim"] + kernel_size = self.params["kernel_size"] + ch = self.params["ch"] + simd = self.params["simd"] + pe = self.params["pe"] + parallel_window = self.params["parallel_window"] + + lb = None + for i in range(self.params["lb_num_layers"]): + new_block = make_conv_building_block( + dim, ch, kernel_size=kernel_size, simd=simd, pe=pe, parallel_window=parallel_window + ) + lb = new_block if lb is None else lb.transform(MergeONNXModels(new_block)) + lb.save(tmp_output_dir + "/lb.onnx") + + rb = None + for i in range(self.params["rb_num_layers"]): + new_block = make_conv_building_block( + dim, ch, kernel_size=kernel_size, simd=simd, pe=pe, parallel_window=parallel_window + ) + rb = new_block if rb is None else rb.transform(MergeONNXModels(new_block)) + rb.save(tmp_output_dir + "/rb.onnx") + + model = combine_blocks(lb, rb, dim, ch, pe=4) + model.save(onnx_export_path) + + def step_build_setup(self): + # create build config for synthetic test models + + cfg = build_cfg.DataflowBuildConfig( + # manual folding + target_fps=None, + ) + + return cfg diff --git a/src/finn/benchmarking/dut/transformer.py b/src/finn/benchmarking/dut/transformer.py new file mode 100644 index 0000000000..83002ef418 --- /dev/null +++ b/src/finn/benchmarking/dut/transformer.py @@ -0,0 +1,974 @@ +# Adapted from Christoph's attention-dummy repository + +# PyTorch base package: Math and Tensor Stuff +import json +import numpy as np +import random +import torch +from brevitas.export import export_qonnx + +# Brevitas: Quantized versions of PyTorch layers +from brevitas.nn import ( + QuantEltwiseAdd, + QuantIdentity, + QuantLinear, + QuantMultiheadAttention, + QuantReLU, +) + +# Brevitas wrapper around PyTorch tensors adding quantization information +from brevitas.quant_tensor import QuantTensor +from qonnx.core.modelwrapper import ModelWrapper + +# Range information structure for seeding the range analysis for converting +# quantized activations to 
MultiThreshold +from qonnx.util.range_analysis import RangeInfo + +# Progressbar +from tqdm import trange + +# FINN dataflow builder +import finn.builder.build_dataflow_config as build_cfg +from finn.benchmarking.bench_base import bench + +# Custom build steps required to streamline and convert the attention operator +from finn.builder.custom_step_library.transformer import ( + node_by_node_cppsim, + prepare_graph, + set_fifo_depths, + set_target_parallelization, + step_apply_folding_config, + step_convert_attention_to_hw, + step_convert_depth_wise_to_hw, + step_convert_elementwise_binary_to_hw, + step_convert_lookup_to_hw, + step_convert_split_concat_to_hw, + step_replicate_streams, + step_streamline, +) + + +# ADAPTED FROM utils.py +# Seeds all relevant random number generators to the same seed for +# reproducibility +def seed(s): + random.seed(s) + np.random.seed(s) + torch.manual_seed(s) + + +# ADAPTED FROM model.py +# Derives a weight quantizer from the brevitas bases leaving bit-width and +# signedness configurable +def weight_quantizer(bits, _signed=True): + # Brevitas quantizer base classes + from brevitas.inject.enum import RestrictValueType + from brevitas.quant.base import MaxStatsScaling, NarrowIntQuant + from brevitas.quant.solver import WeightQuantSolver + + # Derive a Quantizer from the brevitas bases + class Quantizer(NarrowIntQuant, MaxStatsScaling, WeightQuantSolver): + # Configure the quantization bit-width + bit_width = bits + # Signedness of the quantization output + signed = _signed + # Per tensor quantization, not per channel + scaling_per_output_channel = False + # What is this? Copied from PerTensorFloatScaling* + # Probably restricts the scale to be floating-point? 
+ restrict_scaling_type = RestrictValueType.FP + + # Return the derived quantizer configuration + return Quantizer + + +# Derives a bias quantizer from the brevitas bases leaving bit-width and +# signedness configurable +def bias_quantizer(bits, _signed=True): + # Brevitas quantizer base classes + from brevitas.quant import IntBias + + # Derive a Quantizer from the brevitas bases + class Quantizer(IntBias): + # Configure the quantization bit-width + bit_width = bits + # Signedness of the quantization output + signed = _signed + # Do not require the bit-width to be adjusted to fit the accumulator to + # which the bias is added + requires_input_bit_width = False + + # Return the derived quantizer configuration + return Quantizer + + +# Derives an activation quantizer from the brevitas bases leaving bit-width and +# signedness configurable +def act_quantizer(bits, _signed=True): + # Brevitas quantizer base classes + from brevitas.inject.enum import RestrictValueType + from brevitas.quant.base import IntQuant, ParamFromRuntimePercentileScaling + from brevitas.quant.solver import ActQuantSolver + + # Derive a Quantizer from the brevitas bases + class Quantizer(IntQuant, ParamFromRuntimePercentileScaling, ActQuantSolver): + # Configure the quantization bit-width + bit_width = bits + # Signedness of the quantization output + signed = _signed + # Per tensor quantization, not per channel + scaling_per_output_channel = False + # What is this? Copied from PerTensorFloatScaling* + # Probably restricts the scale to be floating-point? 
+ restrict_scaling_type = RestrictValueType.FP + + # Return the derived quantizer configuration + return Quantizer + + +# Gets the normalization layer from configuration key +def get_norm(key, normalized_shape): + # Transposes Sequence and Embedding dimensions + class Transpose(torch.nn.Module): + # Forward pass transposing the feature map + def forward(self, x): # noqa: May be static + # Transpose the last two dimensions of batch x seq x emb layout + return torch.transpose(x, dim0=-1, dim1=-2) + + # Dictionary mapping keys to supported normalization layer implementations + norms = { + # PyTorch default layer normalization. Needs to know the shape of the + # feature map to be normalized + "layer-norm": torch.nn.LayerNorm( + # Note: Disable affine parameters as potential negative scale causes + # streamlining issues later + normalized_shape=normalized_shape, + elementwise_affine=False, + ), + # PyTorch default 1-dimensional batch normalization. Needs to transpose + # embedding and sequence dimension to normalized over the embedding + # dimension, which is expected to be second. + "batch-norm": torch.nn.Sequential( + # Note: Disable affine parameters as potential negative scale causes + # streamlining issues later + Transpose(), + torch.nn.LazyBatchNorm1d(affine=False), + Transpose(), + ), + # No normalization by a PyTorch built-in identity layer. Should not + # appear in the graph. + "none": torch.nn.Identity(), + } + + # Select the normalization layer by key + return norms[key] + + +# Gets the attention mask from configuration key and shape +def get_mask(key, length): + # Dictionary mapping keys to supported normalization layer implementations + masks = { + # No attention mask + "none": None, + # Generate the upper triangular mask for causal attention + "causal": torch.nn.Transformer.generate_square_subsequent_mask(length), + # Square matrix with entries randomly set to -inf or 0.0 with 50% + # probability each + "random": torch.where( # noqa: Confused by types? 
+ torch.rand(length, length) > 0.5, -torch.inf, 0.0 + ), + } + # Select the mask type by key + return masks[key] + + +# Single-layer scaled dot-product attention block with MLP and normalization +class TransformerBlock(torch.nn.Module): + # Initializes the model and registers the module parameters + def __init__(self, num_heads, emb_dim, mlp_dim, seq_len, bias, norm, mask, bits): + # Initialize the PyTorch Module superclass + super().__init__() + + # Input quantizer to the scaled dot-product attention operations, shared + # by queries, keys and values inputs. It is important to have this + # quantizer separate and not preceding the fork node of the residual + # branches to avoid consecutive quantizers in the skip branch. + # Note: For some reason it seems not to be possible to use the + # in_proj_input_quant of the attention operator + self.sdp_input_quant = QuantIdentity( + # Quantize at the output + act_quant=act_quantizer(bits, _signed=True), + # Pass quantization information on to the next layer. + return_quant_tensor=True, + ) + # Quantized scaled dot-product attention operator + self.sdp = QuantMultiheadAttention( + # Size of the embedding dimension (input and output) + embed_dim=emb_dim, + # Number of attention heads + num_heads=num_heads, + # Enable a bias added to the input and output projections + bias=bias, + # Layout of the inputs: + # Batch x Sequence x Embedding (batch-first, True) + # Sequence x Batch x Embedding (batch-second, False) + batch_first=True, + # If query, key and value input are the same, packed input + # projections use a single, large linear projection to produce + # the actual query, key and value inputs. Otherwise, use + # separate linear projections on each individual input. + packed_in_proj=False, + # Brevitas has this as an unsigned quantizer by default, but + # finn can only handle signed quantizer + attn_output_weights_quant=act_quantizer(bits, _signed=True), + # Insert an additional quantizer in front ot the softmax. 
In our + # finn custom-op, this will be matched to the quantizer + # following the query and key matmul. + # Note: Disable to prevent the quantizer from tripping over -inf + # from the attention mask + softmax_input_quant=None, + # Quantize the input projections weights as configured + in_proj_weight_quant=weight_quantizer(bits, _signed=True), + # Quantize the bias of the input projections as configured + in_proj_bias_quant=bias_quantizer(bits, _signed=True), + # No quantization in front of the input projections as this is + # either done by a standalone quantizer preceding the whole block + in_proj_input_quant=None, + # Quantize the output projections weights as configured + out_proj_weight_quant=weight_quantizer(bits, _signed=True), + # Quantize the bias of the output projections as configured + out_proj_bias_quant=bias_quantizer(bits, _signed=True), + # Quantize the input to the output projection as configured + out_proj_input_quant=act_quantizer(bits, _signed=True), + # Quantizer the key after projections as configured + k_transposed_quant=act_quantizer(bits, _signed=True), + # Quantize the queries after projections as configured + q_scaled_quant=act_quantizer(bits, _signed=True), + # Quantize the values after projection as configured + v_quant=act_quantizer(bits, _signed=True), + # No output quantization for now, as stacking multiple layers + # results in multiple multi-thresholds in succession + out_proj_output_quant=None, + # Return the quantization parameters so the next layer can + # quantize the bias + return_quant_tensor=True, + ) + # Residual branch addition skipping over the attention layer + self.residual_sdp = QuantEltwiseAdd( + # Shared input activation quantizer such that the scales at both + # input branches are identical. This allows floating point scale + # factor to be streamlined past the add-node. + input_quant=act_quantizer(bits, _signed=True), + # Disable the output quantizer after the add operation. 
Output of + # the add will have one more bit than the inputs, which is probably + # fine and does not require re-quantization. + output_quant=None, + # Pass quantization information on to the next layer. + return_quant_tensor=True, + ) + # Normalization following the attention layer + self.norm_sdp = torch.nn.Sequential( + # Select the normalization layer implementation + get_norm(key=norm, normalized_shape=emb_dim), + # No quantizer to avoid consecutive quantizer in the MLP residual + # branch. See input quantizer in front of the first MLP layer. + ) + + # Quantized MLP following the scaled dot-product attention + self.mlp = torch.nn.Sequential( + # Quantize the inputs to the MLP block. Placed here to not have this + # at the input of the residual branch. + QuantIdentity( + # Quantize at the output + act_quant=act_quantizer(bits, _signed=True), + # Pass quantization information on to the next layer. + return_quant_tensor=True, + ), + # First mlp layer projecting to the mlp dimension + QuantLinear( + # Inputs have the size of the attention embedding dimension + emb_dim, + # Project to the configured mlp dimension, which is typically + # larger than the embedding dimension + mlp_dim, + # Enable the learned bias vector + bias=bias, + # Quantize weights to the same representation as all other + # layers + weight_quant=weight_quantizer(bits, _signed=True), + # Quantize the bias to the same representation as all other + # layers + bias_quant=bias_quantizer(bits, _signed=True), + # No input quantizer as this is directly preceded by a + # standalone quantizer + input_quant=None, + # Not output quantizer as this is directly followed by a + # quantized ReLU activation taking care of quantization + output_quant=None, + # Return the quantization parameters so the next layer can + # quantize the bias + return_quant_tensor=True, + ), + # Use the ReLU activation function instead of the more commonly used + # GELU, as the latter is not mapped easily to hardware with FINN + 
QuantReLU( + # Note: ReLU must be quantized to unsigned representation + act_quant=act_quantizer(bits, _signed=False), + # Return the quantization parameters so the next layer can + # quantize the bias + return_quant_tensor=True, + ), + # Second mlp layer projecting back to the embedding dimension + QuantLinear( + # Inputs have the configured mlp dimension, which is typically + # larger than the embedding dimension + mlp_dim, + # Project back to the size of the attention embedding dimension + emb_dim, + # Enable the learned bias vector + bias=bias, + # Quantize weights to the same representation as all other + # layers + weight_quant=weight_quantizer(bits, _signed=True), + # Quantize the bias to the same representation as all other + # layers + bias_quant=bias_quantizer(bits, _signed=True), + # No input quantizer as the inputs are already quantized by the + # preceding ReLU layer + input_quant=None, + # Not output quantizer as this is directly followed by a + # quantized element-wise addition taking care of quantization + output_quant=None, + # Pass quantization information on to the next layer. + return_quant_tensor=True, + ), + ) + # Residual branch addition skipping over the MLP layer + self.residual_mlp = QuantEltwiseAdd( + # Shared input activation quantizer such that the scales at both + # input branches are identical. This allows floating point scale + # factor to be streamlined past the add-node. + input_quant=act_quantizer(bits, _signed=True), + # Disable the output quantizer after the add operation. Output of + # the add will have one more bit than the inputs, which is probably + # fine and does not require re-quantization. + output_quant=None, + # Pass quantization information on to the next layer. + # Note: Not for the last layer to allow this to be combined with + # standard pytorch calls like .detach() or .numpy(), which are + # not directly available on QuantTensor. 
+ return_quant_tensor=True, + ) + # Normalization following the attention layer + self.norm_mlp = torch.nn.Sequential( + # Select the normalization layer implementation + get_norm(key=norm, normalized_shape=emb_dim), + # No quantizer to avoid consecutive quantizer in the SDP residual + # branch + ) + # Generate the attention mask according to configuration + self.mask = get_mask(mask, seq_len) + + # Forward pass through the transformer block + def forward(self, x): + # Move the mask to the same device as the input, just in case... + mask = self.mask.to(x.device) if self.mask is not None else None + # Quantize the input to the attention block + q = self.sdp_input_quant(x) + # Scaled dot-product attention with residual branch and normalization + x = self.norm_sdp(self.residual_sdp(x, self.sdp(q, q, q, attn_mask=mask)[0])) + # MLP layer with residual branch and normalization + return self.norm_mlp(self.residual_mlp(x, self.mlp(x))) + + +# Quantized sinusoidal positional encoding layer +class QuantSinusoidalPositionalEncoding(torch.nn.Module): + # Initializes the model and registers the module parameters + def __init__(self, input_quant, output_quant, return_quant_tensor): + # Initialize the PyTorch Module superclass + super().__init__() + # Adds the quantized input and positional encoding + self.add = QuantEltwiseAdd( + # Input quantization to be applied to the input as well as the + # positional encodings + input_quant=input_quant, + # Quantize the outputs after adding input and positional encoding + output_quant=output_quant, + # Returns quantization information to the next layer + return_quant_tensor=return_quant_tensor, + ) + + # Forward pass adding positional encoding to the input tensor + def forward(self, x): + # Get the size of the inputs to dynamically generate encodings of the + # same size + _, seq, emb = x.shape + # Start by enumerating all steps of the sequence + i = torch.as_tensor([[n] for n in range(seq)]) + # Scale factor adjusting the 
frequency/wavelength of the sinusoid + # depending on the embedding dimension index + f = torch.as_tensor([1e4 ** -(i / emb) for i in range(0, emb, 2)]) + # Prepare empty positional encoding tensor of the same size as the input + pos = torch.empty(seq, emb) + # Fill the positional encoding with alternating sine and cosine waves + pos[:, 0::2] = torch.sin(f * i) + pos[:, 1::2] = torch.cos(f * i) + # Move the encoding tensor to the same device as the input tensor + pos = pos.to(x.device, dtype=x.dtype) + # Add the quantized encoding to the quantized input + return self.add(x, pos) + + +# Quantized learned positional encoding layer +class QuantLearnedPositionalEncoding(torch.nn.Module): + # Initializes the model and registers the module parameters + def __init__(self, seq_len, emb_dim, input_quant, output_quant, return_quant_tensor): + # Initialize the PyTorch Module superclass + super().__init__() + # Adds the quantized input and positional encoding + self.add = QuantEltwiseAdd( + # Input quantization to be applied to the input as well as the + # positional encodings + input_quant=input_quant, + # Quantize the outputs after adding input and positional encoding + output_quant=output_quant, + # Returns quantization information to the next layer + return_quant_tensor=return_quant_tensor, + ) + # Register a parameter tensor representing the not quantized positional + # encoding + self.pos = torch.nn.Parameter(torch.empty(seq_len, emb_dim)) + # Reset/Initialize the parameter tensor + self.reset_parameters() + + # Resets/Initializes the positional encoding parameter tensor + def reset_parameters(self): + # Initialize the positional encoding from a normal distribution with + # zero mean and unit standard deviation + torch.nn.init.normal_(self.pos, mean=0, std=1) + + # Forward pass adding positional encoding to the input tensor + def forward(self, x): + # Add the quantized encoding to the quantized input + return self.add(x, self.pos) + + +# Lazy version of the learned 
encoding not requiring input dimensions at +# initialization, inferring these at the first forward pass +class LazyQuantLearnedPositionalEncoding( + torch.nn.modules.lazy.LazyModuleMixin, QuantLearnedPositionalEncoding # noqa +): + # Once initialized, this will become a QuantLearnedPositionalEncoding as + # defined above + cls_to_become = QuantLearnedPositionalEncoding + # Parameter tensor of the QuantLearnedPositionalEncoding is uninitialized + pos: torch.nn.UninitializedParameter + + # Initializes the model and registers the module parameters + def __init__(self, input_quant, output_quant, return_quant_tensor): + # Initialize the quantizer parts of QuantLearnedPositionalEncoding, + # leaving the dimensions empty + super().__init__(0, 0, input_quant, output_quant, return_quant_tensor) + # Register an uninitialized parameter tensor for the positional encoding + self.pos = torch.nn.UninitializedParameter() + + # Resets/Initializes the positional encoding parameter tensor + def reset_parameters(self): + # If this has already been initialized, delegate to the actual + # implementation + if not self.has_uninitialized_params(): + super().reset_parameters() + + # Initializes/Materializes the uninitialized parameter tensor given some + # sample input tensor to infer the dimensions + def initialize_parameters(self, x): + # Only materialize the parameter tensor if it is not yet initialized + if self.has_uninitialized_params(): + # Do not accumulate gradient information from initialization + with torch.no_grad(): + # Get the size of the inputs to generate encodings of the same + # size + _, seq, emb = x.shape + # Materialize the positional encoding parameter tensor + self.pos.materialize((seq, emb)) + # Properly initialize the parameters by resetting the values + self.reset_parameters() + + +# Quantized binary positional encoding layer +class QuantBinaryPositionalEncoding(torch.nn.Module): + # Initializes the model and registers the module parameters + def __init__(self, 
input_quant, output_quant, return_quant_tensor): + # Initialize the PyTorch Module superclass + super().__init__() + # Adds the quantized input and positional encoding + self.add = QuantEltwiseAdd( + # Input quantization to be applied to the input as well as the + # positional encodings + input_quant=input_quant, + # Quantize the outputs after adding input and positional encoding + output_quant=output_quant, + # Returns quantization information to the next layer + return_quant_tensor=return_quant_tensor, + ) + + # Forward pass adding positional encoding to the input tensor + def forward(self, x): + # Get the size of the inputs to dynamically generate encodings of the + # same size + _, seq, emb = x.shape + # Binary positional encoding fills the embedding dimension with the bit + # pattern corresponding to the position in the sequence + pos = torch.as_tensor([[(n & (1 << bit)) >> bit for bit in range(emb)] for n in range(seq)]) + # Move the encoding tensor to the same device as the input tensor + pos = pos.to(x.device, dtype=x.dtype) + # Add the quantized encoding tp the quantized input + # Note: Convert encoding to bipolar representation + return self.add(x, 2 * pos - 1) + + +# Gets the positional encoding layer from configuration key, quantizers and +# shape +def get_positional_encoding(key, input_quant, output_quant, return_quant_tensor): + # Dictionary mapping keys to supported normalization layer implementations + masks = { + # No positional encoding + "none": QuantIdentity(act_quant=input_quant, return_quant_tensor=return_quant_tensor), + # Fixed, sinusoidal positional encoding according to Vaswani et al. 
with + # added quantizers + "sinusoidal": QuantSinusoidalPositionalEncoding( + input_quant, output_quant, return_quant_tensor + ), + # Fixed, binary positional encoding with quantizers + "binary": QuantBinaryPositionalEncoding(input_quant, output_quant, return_quant_tensor), + # Learned positional encoding with quantizers + "learned": LazyQuantLearnedPositionalEncoding( + input_quant, output_quant, return_quant_tensor + ), + } + # Select the positional encoding type by key + return masks[key] + + +# Unpacks the standard PyTorch tensor from a brevitas QuantTensor +def unpack_from_quant(tensor: torch.Tensor | QuantTensor): + # If this is a QuantTensor we can extract the wrapped tensor + if isinstance(tensor, QuantTensor): + # The underlying tensor is wrapped as the value attribute + return tensor.value + # Assume this is already a plain PyTorch tensor + return tensor + + +# Dummy transformer encoder model +class DummyTransformer(torch.nn.Module): + # Initializes the model and registers the module parameters + def __init__( + self, + # Number of layers of attention blocks + num_layers, + # Number of attention heads per block + num_heads, + # Size of embedding dimension going into/out of the attention block + emb_dim, + # Size of MLP dimension in each attention block + mlp_dim, + # Length of the input sequence, i.e., context size + seq_len, + # Enables bias term added to Linear layers + bias, + # Quantization bit-width: For now all layers are quantized to the + # same bit-width + bits, + # Type of normalization layer to use in the transformer blocks + # Options are: layer-norm, batch-norm and none + norm="none", + # Type of attention mask to use + # Options are: none, causal or const + mask="none", + # Type of positional encoding to use at the input + # Options are: none, sinusoidal, binary, learned + positional_encoding="none", + ): + # Initialize the PyTorch Module superclass + super().__init__() + + # Positional encoding layer at the input + self.pos = 
get_positional_encoding( + # Select the implementation by configuration key + key=positional_encoding, + # Quantize the inputs to the positional encoding to the same + # bit-width as the input + input_quant=act_quantizer(bits, _signed=True), + # Quantize the sum of input and positional encoding to the same + # bit-width as the input + output_quant=None, + # Pass quantization information on to the next layer + return_quant_tensor=True, + ) + + # Sequence of num_layers transformer encoder blocks + self.encoder = torch.nn.Sequential( + *[ + TransformerBlock(num_heads, emb_dim, mlp_dim, seq_len, bias, norm, mask, bits) + for _ in range(num_layers) + ] + ) + + # Model forward pass taking an input sequence and returning a single set of + # class probabilities + def forward(self, x): + # Add positional encoding to the input and feed through the encoder + # stack + # Note: Get the wrapped value out of the QuantTensor to have only a + # single output from the model. + return unpack_from_quant(self.encoder(self.pos(x))) + + +# ADAPTED FROM export.py + + +# Check whether a layer is a normalization layer of some supported type +def is_norm_layer(module): + # Set of normalization layer (bases) which maybe need to be patched + norm_layers = { + # All BatchNorm and InstanceNorm variants derive from this baseclass + torch.nn.modules.batchnorm._NormBase, # noqa: Access to _NormBase + # LayerNorm has a unique implementation + torch.nn.LayerNorm, + } + # Check the module against all supported norm layer types + return any(isinstance(module, norm) for norm in norm_layers) + + +# Fixes export issues of normalization layers with disabled affine parameters. +# Somehow the export to ONNX trips when it encounters the weight and bias tensor +# to be 'None'. 
+def patch_non_affine_norms(model: torch.nn.Module): # noqa: Shadows model + # Iterate all modules in the model container + for name, module in model.named_modules(): + # If the module is a normalization layer it might require patching the + # affine parameters + if is_norm_layer(module): + # Check whether affine scale parameters are missing + if hasattr(module, "weight") and module.weight is None: + # There need to be running statistics to patch the scales + if hasattr(module, "running_var"): + # Patch the affine bias by all 1 tensor of the same shape, + # type and device as the running variance + module.weight = torch.nn.Parameter(torch.ones_like(module.running_var)) + # Check whether affine bias parameters are missing + if hasattr(module, "bias") and module.bias is None: + # There need to be running statistics to patch the scales + if hasattr(module, "running_mean"): + # Patch the affine bias by all 0 tensor of the same shape, + # type and device as the running mean + module.bias = torch.nn.Parameter(torch.zeros_like(module.running_var)) + # Return the patched model container + return model + + +template_folding_yaml = """ +# Per operator type default configurations +defaults: + # Scaled dot-product attention head implemented via HLS + ScaledDotProductAttention_hls: + # Type of memory to be used for internal buffer storage + # Options: auto, block, distributed, ultra + ram_style: block + # Type of memory to be used for threshold storage + # Options: auto, block, distributed + ram_style_thresholds: block + # Type of memory to be used fo the attention mask (if present) + # Options: auto, block, distributed + ram_style_mask: block + # Resource type to be used for implementing multiplications/MACs + # Options: auto, lut or dsp + mac_resource: lut + # Addition of two inputs (constants or streamed) implemented via HLS + ElementwiseAdd_hls: + # Type of memory to be used for internal buffer storage and/or constant + # parameter tensors + # Options: auto, block, 
distributed, ultra + ram_style: distributed + # Matrix vector activation unit implemented via HLS + MVAU_hls: + # Resource type to be used for implementing multiplications/MACs + # Options: auto, lut or dsp + resType: dsp + # Memory mode for weight storage + # Options: internal_embedded, internal_decoupled, external + mem_mode: internal_decoupled + # Type of memory to be used for weight storage if "internal_decoupled" + # Options: auto, block, distributed, ultra + ram_style: block + # Type of memory to be used for threshold storage + # Options: auto, block, distributed + ram_style_thresholds: block + # Makes weights writeable through AXI-lite interface at runtime + runtime_writeable_weights: 0 + # Matrix vector activation unit implemented via RTL + MVAU_rtl: + # Resource type to be used for implementing multiplications/MACs + # Options: auto, lut or dsp + # Note: RTL MVAU currently does not support LUT-based implementation + resType: dsp + # Memory mode for weight storage + # Options: internal_embedded, internal_decoupled, external + mem_mode: internal_decoupled + # Type of memory to be used for weight storage if "internal_decoupled" + # Options: auto, block, distributed, ultra + ram_style: block + # Makes weights writeable through AXI-lite interface at runtime + runtime_writeable_weights: 0 + # Multi-thresholds implemented via HLS (applies to standalone thresholds) + Thresholding_hls: + # Memory mode for threshold storage + # Options: internal_embedded, internal_decoupled + mem_mode: internal_decoupled + # Type of memory to be used for threshold storage if "internal_decoupled" + # Options: distributed, block + ram_style: distributed + # Makes thresholds writeable through AXI-lite interface at runtime + runtime_writeable_weights: 0 + # Multi-thresholds implemented via RTL (applies to standalone thresholds) + Thresholding_rtl: + # Decides to use BRAM, URAM or LUTs for threshold memory, depending on the + # depth of the thresholds + # Note: This combination forces 
"distributed" LUT implementation + depth_trigger_uram: 2147483647 # "infinity" + depth_trigger_bram: 2147483647 # "infinity" + # # Note: This combination forces "block" RAM implementation + # depth_trigger_uram: 0 + # depth_trigger_bram: 1 + # # Note: This combination forces "ultra" RAM implementation + # depth_trigger_uram: 1 + # depth_trigger_bram: 0 + # # Note: This combination is equivalent to "auto" + # depth_trigger_uram: 0 + # depth_trigger_bram: 0 + # Makes thresholds writeable through AXI-lite interface at runtime + runtime_writeable_weights: 0 + # FIFO implemented via RTL (there is no HLS FIFO implementation in FINN) + StreamingFIFO_rtl: + # RTL vs. IPI implementation of FIFOs + # Options: rtl, vivado + impl_style: rtl + # Resource type for FIFOs when impl_style is vivado + # Options: auto, block, distributed, ultra + ram_style: distributed + # Individual, named node-specific configurations here + # ... +""" + + +class bench_transformer(bench): + def step_export_onnx(self, output_onnx_path): + # Generates a dummy transformer block, + # not used for actual models (RadioML, GPT, etc.) 
+ + # Load the parameters file + # params = dvc.api.params_show("params.yaml") + # Seed all RNGs + seed(self.params["seed"]) + # Make PyTorch behave deterministically if possible + torch.use_deterministic_algorithms(mode=True, warn_only=True) + # Create a model instance from the configuration parameters + # model = DummyTransformer(**params["model"]) + model = DummyTransformer( + num_layers=self.params["model_num_layers"], + num_heads=self.params["model_num_heads"], + emb_dim=self.params["model_emb_dim"], + mlp_dim=self.params["model_mlp_dim"], + seq_len=self.params["model_seq_len"], + bias=self.params["model_bias"], + bits=self.params["model_bits"], + norm=self.params["model_norm"], + mask=self.params["model_mask"], + positional_encoding=self.params["model_positional_encoding"], + ) + + # Get the configured sequence length and embedding dimension to generate + # test inputs + seq, dim = self.params["model_seq_len"], self.params["model_emb_dim"] + # No gradient accumulation for calibration passes required + with torch.no_grad(): + # Check whether GPU training is available and select the appropriate + # device + device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + # Move the model to the training device + model = model.to(device) + # Multiple passes of calibration might be necessary for larger/deep + # models + for _ in trange(0, self.params["calibration_passes"], desc="calibrating"): + # Pass random data through the model to "calibrate" dummy quantizer. + # Large batch to have more calibration samples. Otherwise, there is + # too much deviation between this calibration and the verification + # samples. 
+ model(torch.rand(128, seq, dim, device=device)) + # Move the model back to the CPU + model = model.cpu() + # Prevent export issue for missing affine normalization parameters + model = patch_non_affine_norms(model) + # Switch model to evaluation mode to have it fixed for export + model = model.eval() + # Sample random input tensor in batch-first layout + x = torch.rand(1, seq, dim) + # Compute attention output + o = model(x) + # Save the input and output data for verification purposes later + np.save("inp.npy", x.detach().numpy()) + np.save("out.npy", o.detach().numpy()) + self.build_inputs["input_npy_path"] = "inp.npy" + self.build_inputs["output_npy_path"] = "out.npy" + # Export the model graph to QONNX + # export_qonnx(model, (x,), "attention.onnx", **self.params["export"]) + export_qonnx(model, (x,), output_onnx_path, opset_version=14, do_constant_folding=True) + + def step_build_setup(self): + # with open("params.yaml") as file: + # params = yaml.safe_load(file) + # Seed all RNGs + seed(self.params["seed"]) + # Extract sequence length and embedding dimension from parameters + if "model_seq_len" in self.params and "model_emb_dim" in self.params: + # for dummy Transformer DUT + seq_len, emb_dim = self.params["model_seq_len"], self.params["model_emb_dim"] + else: + # for real input models + inp_shape = np.load(self.build_inputs["input_npy_path"]).shape + if len(inp_shape) == 3: + # for RadioML Transformers + _, seq_len, emb_dim = inp_shape + else: + # for GPTs (why is this different?) + model = ModelWrapper(self.build_inputs["onnx_path"]) + _, seq_len, emb_dim = model.get_tensor_shape( + "/emb_add/input_quant/export_handler/Quant_output_0" + ) + + # Read the input value range information for the dataset from the parameters + # Note: Consider calibrating this on the fly from the dataset + value_range = [-100, +100] # params["build"]["range"] # TODO: make configurable? 
+ input_range = tuple(np.array([value_range]).T) + # Construct the seed range information of the input tensor + range_info = RangeInfo(shape=(1, seq_len, emb_dim), range=input_range) + + # Prepare config files + # TODO: make configurable + # TODO: log intermediate files such as inp.npy, folding.yaml, + # or specialize_layers.jon as artifacts, maybe create in unique temp dirs + specialize_layers_dict = { + "Defaults": {"preferred_impl_style": ["rtl", ["MVAU", "Thresholding"]]}, + "": {"preferred_impl_style": ""}, + } + with open("specialize_layers.json", "w") as f: + json.dump(specialize_layers_dict, f, indent=2) + with open("folding.yaml", "w") as f: + f.write(template_folding_yaml) + + # Create a configuration for building the scaled dot-product attention + # operator to a hardware accelerator + cfg = build_cfg.DataflowBuildConfig( + folding_config_file="folding.yaml", + specialize_layers_config_file="specialize_layers.json", + standalone_thresholds=True, + max_multithreshold_bit_width=16, + mvau_wwidth_max=2048, + verify_steps=[ + # Verify the model after converting to the FINN onnx dialect + build_cfg.VerificationStepType.QONNX_TO_FINN_PYTHON, + # Verify the model again using python mode after the default + # streamlining step + build_cfg.VerificationStepType.STREAMLINED_PYTHON, + # Verify the model again after tidy up transformations, right before + # converting to HLS + build_cfg.VerificationStepType.TIDY_UP_PYTHON, + # Verify the model after generating C++ HLS and applying folding + # only inserted if live FIFO-sizing is off: + # build_cfg.VerificationStepType.FOLDED_HLS_CPPSIM, + # No RTL Simulation support for now + ], + # File with test inputs for verification + verify_input_npy=self.build_inputs["input_npy_path"], + # File with expected test outputs for verification + verify_expected_output_npy=self.build_inputs["output_npy_path"], + # Build steps to execute + steps=[ + # Prepares the QONNX graph to be consumed by FINN: Cleanup, lowering + # and Quant to 
MultiThreshold conversion + prepare_graph(range_info=range_info), + # Unified exhaustive streamlining of complex model topologies + # including attention, residuals and splits + step_streamline, + # conversion of the scaled dot-product attention pattern to + # hardware, including cleanup and data layout squeezing + step_convert_attention_to_hw, + # Convert the elementwise binary operations to hardware operators. + # These include for example adding residual branches and positional + # encoding + step_convert_elementwise_binary_to_hw, + # Convert Lookup layers, e.g., token embedding, to hardware custom + # operators + step_convert_lookup_to_hw, + # Convert Split and Concat operators to hardware, e.g., splits + # contained in the GLU activation + step_convert_split_concat_to_hw, + # Convert depth-wise convolution MatMuls to VVUs + step_convert_depth_wise_to_hw, + # Properly replicate the stream feeding the query, key and value + # projections + step_replicate_streams, + # Convert most other layers supported by FINN to HW operators + "step_convert_to_hw", + # Specialize HW layer implementations as either HLS or RTL + "step_specialize_layers", + "step_create_dataflow_partition", + # Set the folding configuration to meet the cycles per sequence + # target + set_target_parallelization(seq_len, emb_dim), + # Apply folding configuration, specifying hardware implementation + # details + # Note: This triggers a verification step + step_apply_folding_config, + "step_minimize_bit_width", + # The ScaledDotProductAttention custom op does not define any + # estimates + "step_generate_estimate_reports", + "step_hw_codegen", + "step_hw_ipgen", + # Run additional node-by-node verification in RTL simulation of the + # model before creating the stitched IP + # Note: end-to-end verification of the stitched IP in RTL simulation + # is still not possible due to missing float IPs + # node_by_node_cppsim, #only inserted if live FIFO-sizing is off + # Only for debugging for now, does not 
work if "vivado" style + # StreamingFIFOs are used + # node_by_node_rtlsim, + "step_create_stitched_ip", + # "step_measure_rtlsim_performance", # not possible due to float components + "step_out_of_context_synthesis", # for synthesis results (e.g. utilization) + "step_synthesize_bitfile", + "step_make_driver", + "step_deployment_package", + ], + ) + + # TESTING custom vs live FIFO-sizing + if self.params.get("live_fifo_sizing"): + # insert default FIFO-sizing step (behind step_generate_estimate_reports) + for i in range(len(cfg.steps)): + if cfg.steps[i] == "step_generate_estimate_reports": + cfg.steps.insert(i + 1, "step_set_fifo_depths") + else: + # insert Christoph's custom FIFO-sizing step (behind step_hw_ipgen) + for i in range(len(cfg.steps)): + if cfg.steps[i] == "step_hw_ipgen": + cfg.steps.insert( + i + 1, set_fifo_depths(seq_len, emb_dim, uram_threshold=seq_len) + ) + # also enable cppsim, which doesn't work with virtual FIFOs + cfg.steps.insert(i + 2, node_by_node_cppsim) + cfg.verify_steps.append(build_cfg.VerificationStepType.FOLDED_HLS_CPPSIM) + + return cfg diff --git a/src/finn/benchmarking/dut/vgg10.yml b/src/finn/benchmarking/dut/vgg10.yml new file mode 100644 index 0000000000..99a9ab333d --- /dev/null +++ b/src/finn/benchmarking/dut/vgg10.yml @@ -0,0 +1,31 @@ +model_path: models/vgg10/radioml_w4a4_small_tidy.onnx +folding_config_file: models/vgg10/ZCU104_folding_config.json +specialize_layers_config_file: models/vgg10/ZCU104_specialize_layers.json + +steps: + - step_tidy_up + - finn.builder.custom_step_library.conv1d.step_pre_streamline # Custom step + - step_streamline + - step_convert_to_hw + - finn.builder.custom_step_library.conv1d.step_convert_final_layers # Custom step + - step_create_dataflow_partition + - step_specialize_layers + - step_target_fps_parallelization + - step_apply_folding_config + - step_minimize_bit_width + - step_generate_estimate_reports + - step_set_fifo_depths + - step_hw_codegen + - step_hw_ipgen + - 
step_create_stitched_ip + - step_measure_rtlsim_performance + - step_out_of_context_synthesis + - step_synthesize_bitfile + - step_make_driver + - step_deployment_package + +# folding config doesn't come with FIFO sizes +auto_fifo_depths: True +auto_fifo_strategy: largefifo_rtlsim + +standalone_thresholds: True diff --git a/src/finn/benchmarking/templates.py b/src/finn/benchmarking/templates.py new file mode 100644 index 0000000000..44c2ebced8 --- /dev/null +++ b/src/finn/benchmarking/templates.py @@ -0,0 +1,214 @@ +# Template strings for benchmarking + +# flake8: noqa + +# power report scripting based on Lucas Reuter: +template_open = """ +open_project $PROJ_PATH$ +open_run $RUN$ +""" + +template_single_test = """ +set_switching_activity -toggle_rate $TOGGLE_RATE$ -static_probability $STATIC_PROB$ -hier -type lut [get_cells -r finn_design_i/.*] +set_switching_activity -toggle_rate $TOGGLE_RATE$ -static_probability $STATIC_PROB$ -hier -type register [get_cells -r finn_design_i/.*] +set_switching_activity -deassert_resets +report_power -file $REPORT_PATH$/$REPORT_NAME$.xml -format xml +reset_switching_activity -hier -type lut [get_cells -r finn_design_i/.*] +reset_switching_activity -hier -type register [get_cells -r finn_design_i/.*] +""" + +# template_single_test_type = """ +# set_switching_activity -toggle_rate $TOGGLE_RATE$ -static_probability $STATIC_PROB$ -hier -type $SWITCH_TARGET$ [get_cells -r finn_design_i/.*] +# set_switching_activity -deassert_resets +# report_power -file $REPORT_PATH$/$REPORT_NAME$.xml -format xml +# reset_switching_activity -hier -type $SWITCH_TARGET$ [get_cells -r finn_design_i/.*] +# """ + +template_sim_power = """ +set_property SOURCE_SET sources_1 [get_filesets sim_1] +import_files -fileset sim_1 -norecurse $TB_FILE_PATH$ +set_property top switching_simulation_tb [get_filesets sim_1] +update_compile_order -fileset sim_1 + +launch_simulation -mode post-implementation -type functional +restart +open_saif $SAIF_FILE_PATH$ +log_saif 
[get_objects -r /switching_simulation_tb/dut/*] +run $SIM_DURATION_NS$ ns +close_saif + +read_saif $SAIF_FILE_PATH$ +report_power -file $REPORT_PATH$/$REPORT_NAME$.xml -format xml +""" + +# TODO: configurable clock frequency +template_switching_simulation_tb = """ +`timescale 1 ns/10 ps + +module switching_simulation_tb; +reg clk; +reg rst; + +//dut inputs +reg tready; +reg [$INSTREAM_WIDTH$-1:0] tdata; +reg tvalid; + +//dut outputs +wire [$OUTSTREAM_WIDTH$-1:0] accel_tdata; +wire accel_tready; +wire accel_tvalid; + +finn_design_wrapper dut( + .ap_clk(clk), + .ap_rst_n(rst), + .m_axis_0_tdata(accel_tdata), + .m_axis_0_tready(tready), + .m_axis_0_tvalid(accel_tvalid), + .s_axis_0_tdata(tdata), + .s_axis_0_tready(accel_tready), + .s_axis_0_tvalid(tvalid) + ); + +always + begin + clk = 0; + #2.5; + clk = 1; + #2.5; + end + +integer i; +initial + begin + tready = 0; + tdata = 0; + tvalid = 0; + rst = 0; + #50; + rst = 1; + tvalid = 1; + tready = 1; + while(1) + begin + for (i = 0; i < $INSTREAM_WIDTH$/$DTYPE_WIDTH$; i = i+1) begin + tdata[i*$DTYPE_WIDTH$ +: $DTYPE_WIDTH$] = $RANDOM_FUNCTION$; + end + #5; + end + end +endmodule +""" + +zynq_harness_template = """ +set FREQ_MHZ %s +set NUM_AXILITE %d +if {$NUM_AXILITE > 9} { + error "Maximum 10 AXI-Lite interfaces supported" +} +set NUM_AXIMM %d +set BOARD %s +set FPGA_PART %s +create_project finn_zynq_link ./ -part $FPGA_PART + +# set board part repo paths to find boards installed by FINN +set paths_prop [get_property BOARD_PART_REPO_PATHS [current_project]] +set paths_param [get_param board.repoPaths] +lappend paths_prop $::env(FINN_ROOT)/deps/board_files +lappend paths_param $::env(FINN_ROOT)/deps/board_files +set_property BOARD_PART_REPO_PATHS $paths_prop [current_project] +set_param board.repoPaths $paths_param + +if {$BOARD == "RFSoC2x2"} { + set_property board_part xilinx.com:rfsoc2x2:part0:1.1 [current_project] + set ZYNQ_TYPE "zynq_us+" +} else { + puts "Unrecognized board" +} + +create_bd_design "top" +if 
{$ZYNQ_TYPE == "zynq_us+"} { + set zynq_ps_vlnv [get_property VLNV [get_ipdefs "xilinx.com:ip:zynq_ultra_ps_e:*"]] + create_bd_cell -type ip -vlnv $zynq_ps_vlnv zynq_ps + apply_bd_automation -rule xilinx.com:bd_rule:zynq_ultra_ps_e -config {apply_board_preset "1" } [get_bd_cells zynq_ps] + set_property CONFIG.PSU__DISPLAYPORT__PERIPHERAL__ENABLE {0} [get_bd_cells zynq_ps] + #activate one slave port, deactivate the second master port + set_property -dict [list CONFIG.PSU__USE__S_AXI_GP2 {0}] [get_bd_cells zynq_ps] + set_property -dict [list CONFIG.PSU__USE__M_AXI_GP1 {0}] [get_bd_cells zynq_ps] + #set frequency of PS clock (this can't always be exactly met) + set_property -dict [list CONFIG.PSU__OVERRIDE__BASIC_CLOCK {0}] [get_bd_cells zynq_ps] + set_property -dict [list CONFIG.PSU__CRL_APB__PL0_REF_CTRL__FREQMHZ [expr int($FREQ_MHZ)]] [get_bd_cells zynq_ps] +} else { + puts "Unrecognized Zynq type" +} + +#instantiate axi interconnect, axi smartconnect +set interconnect_vlnv [get_property VLNV [get_ipdefs -all "xilinx.com:ip:axi_interconnect:*" -filter design_tool_contexts=~*IPI*]] +#set smartconnect_vlnv [get_property VLNV [get_ipdefs "xilinx.com:ip:smartconnect:*"]] +create_bd_cell -type ip -vlnv $interconnect_vlnv axi_interconnect_0 +#create_bd_cell -type ip -vlnv $smartconnect_vlnv smartconnect_0 +#set number of axilite interfaces, and number of axi master interfaces +#set_property -dict [list CONFIG.NUM_SI $NUM_AXIMM] [get_bd_cells smartconnect_0] +set_property -dict [list CONFIG.NUM_MI $NUM_AXILITE] [get_bd_cells axi_interconnect_0] + +#create reset controller and connect interconnects to PS +if {$ZYNQ_TYPE == "zynq_us+"} { + set axi_peripheral_base 0xA0000000 + #connect_bd_intf_net [get_bd_intf_pins smartconnect_0/M00_AXI] [get_bd_intf_pins zynq_ps/S_AXI_HP0_FPD] + connect_bd_intf_net [get_bd_intf_pins zynq_ps/M_AXI_HPM0_FPD] -boundary_type upper [get_bd_intf_pins axi_interconnect_0/S00_AXI] + #connect interconnect clocks and resets + apply_bd_automation 
-rule xilinx.com:bd_rule:clkrst -config { Clk {/zynq_ps/pl_clk0} Freq {} Ref_Clk0 {} Ref_Clk1 {} Ref_Clk2 {}} [get_bd_pins axi_interconnect_0/ACLK] + apply_bd_automation -rule xilinx.com:bd_rule:clkrst -config { Clk {/zynq_ps/pl_clk0} Freq {} Ref_Clk0 {} Ref_Clk1 {} Ref_Clk2 {}} [get_bd_pins axi_interconnect_0/S00_ACLK] + #apply_bd_automation -rule xilinx.com:bd_rule:clkrst -config { Clk {/zynq_ps/pl_clk0} Freq {} Ref_Clk0 {} Ref_Clk1 {} Ref_Clk2 {}} [get_bd_pins zynq_ps/saxihp0_fpd_aclk] +} +#connect_bd_net [get_bd_pins axi_interconnect_0/ARESETN] [get_bd_pins smartconnect_0/aresetn] + +#procedure used by below IP instantiations to map BD address segments based on the axi interface aperture +proc assign_axi_addr_proc {axi_intf_path} { + #global variable holds current base address + global axi_peripheral_base + #infer range + set range [expr 2**[get_property CONFIG.ADDR_WIDTH [get_bd_intf_pins $axi_intf_path]]] + set range [expr $range < 4096 ? 4096 : $range] + #align base address to range + set offset [expr ($axi_peripheral_base + ($range-1)) & ~($range-1)] + #perform assignment + assign_bd_address [get_bd_addr_segs $axi_intf_path/Reg*] -offset $offset -range $range + #advance base address + set axi_peripheral_base [expr $offset + $range] +} + +#custom IP instantiations/connections start here +%s + +#finalize clock and reset connections for interconnects +if {$ZYNQ_TYPE == "zynq_us+"} { + apply_bd_automation -rule xilinx.com:bd_rule:clkrst -config { Clk {/zynq_ps/pl_clk0} } [get_bd_pins axi_interconnect_0/M*_ACLK] +} + +save_bd_design +assign_bd_address +validate_bd_design + +set_property SYNTH_CHECKPOINT_MODE "Hierarchical" [ get_files top.bd ] +make_wrapper -files [get_files top.bd] -import -fileset sources_1 -top + +#set_property strategy Flow_PerfOptimized_high [get_runs synth_1] +#set_property STEPS.SYNTH_DESIGN.ARGS.DIRECTIVE AlternateRoutability [get_runs synth_1] +#set_property STEPS.SYNTH_DESIGN.ARGS.RETIMING true [get_runs synth_1] +#set_property 
strategy Performance_ExtraTimingOpt [get_runs impl_1] +#set_property STEPS.OPT_DESIGN.ARGS.DIRECTIVE Explore [get_runs impl_1] +#set_property STEPS.POST_ROUTE_PHYS_OPT_DESIGN.ARGS.DIRECTIVE AggressiveExplore [get_runs impl_1] +#set_property STEPS.PHYS_OPT_DESIGN.ARGS.DIRECTIVE AggressiveExplore [get_runs impl_1] +#set_property STEPS.POST_ROUTE_PHYS_OPT_DESIGN.IS_ENABLED true [get_runs impl_1] + +# out-of-context synth can't be used for bitstream generation +# set_property -name {STEPS.SYNTH_DESIGN.ARGS.MORE OPTIONS} -value {-mode out_of_context} -objects [get_runs synth_1] +launch_runs -to_step write_bitstream impl_1 +wait_on_run [get_runs impl_1] + +# generate synthesis report +open_run impl_1 +report_utilization -hierarchical -hierarchical_depth 4 -file synth_report.xml -format xml +close_project +""" diff --git a/src/finn/benchmarking/util.py b/src/finn/benchmarking/util.py new file mode 100644 index 0000000000..1e08bd2501 --- /dev/null +++ b/src/finn/benchmarking/util.py @@ -0,0 +1,118 @@ +# Utility functions for benchmarking +import json +import os +import shutil +import xml.etree.ElementTree as ET + + +def _find_rows_and_headers(table): + rows = table.findall("tablerow") + headers = [] + + for row in rows: + headers = row.findall("tableheader") + if len(headers) > 0: + break + return (rows, headers) + + +def summarize_table(table): + table_summary = {} + table_summary["headers"] = [] + rows, headers = _find_rows_and_headers(table) + + if len(headers) > 0: + string = "Header: " + for header in headers: + table_summary["headers"].append(header.attrib["contents"]) + string = string + header.attrib["contents"] + " " + # print(string.rstrip()) + + for row in rows: + cells = row.findall("tablecell") + if len(cells) > 0: + cell_name = cells[0].attrib["contents"] + string = cell_name + table_summary[cell_name] = [] + for cell in cells[1:]: + table_summary[cell_name].append(cell.attrib["contents"]) + string = string + cell.attrib["contents"] + " " + # 
print(string.rstrip()) + + return table_summary + + +def summarize_section(section): + section_summary = {} + section_summary["tables"] = [] + section_summary["subsections"] = {} + + # print("Section:", section.attrib["title"]) + tables = section.findall("table") + sub_sections = section.findall("section") + for table in tables: + section_summary["tables"].append(summarize_table(table)) + # print("") + for sub_section in sub_sections: + section_summary["subsections"][sub_section.attrib["title"]] = summarize_section(sub_section) + + return section_summary + + +def power_xml_to_dict(xml_path): + tree = ET.parse(xml_path) + root = tree.getroot() + sections = root.findall("section") + result = {} + + for section in sections: + result[section.attrib["title"]] = summarize_section(section) + + return result + + +def delete_dir_contents(dir): + for filename in os.listdir(dir): + file_path = os.path.join(dir, filename) + try: + if os.path.isfile(file_path) or os.path.islink(file_path): + os.unlink(file_path) + elif os.path.isdir(file_path): + shutil.rmtree(file_path) + except Exception as e: + print("Failed to delete %s. 
Reason: %s" % (file_path, e)) + + +def merge_dicts(a: dict, b: dict): + for key in b: + if key in a: + if isinstance(a[key], dict) and isinstance(b[key], dict): + merge_dicts(a[key], b[key]) + elif a[key] != b[key]: + raise Exception("ERROR: Dict merge conflict") + else: + a[key] = b[key] + return a + + +def merge_logs(log_a, log_b, log_out): + # merges json log (list of nested dicts) b into a, not vice versa (TODO) + + with open(log_a, "r") as f: + a = json.load(f) + with open(log_b, "r") as f: + b = json.load(f) + + for idx, run_a in enumerate(a): + for run_b in b: + if run_a["run_id"] == run_b["run_id"]: + # a[idx] |= run_b # requires Python >= 3.9 + # a[idx] = {**run_a, **run_b} + a[idx] = merge_dicts(run_a, run_b) + break + + # also sort by run id + out = sorted(a, key=lambda x: x["run_id"]) + + with open(log_out, "w") as f: + json.dump(out, f, indent=2) diff --git a/src/finn/builder/build_dataflow.py b/src/finn/builder/build_dataflow.py index 8bb8a850f7..2184531443 100644 --- a/src/finn/builder/build_dataflow.py +++ b/src/finn/builder/build_dataflow.py @@ -39,8 +39,10 @@ import sys import time from qonnx.core.modelwrapper import ModelWrapper +from rich import print as rprint from rich.console import Console from rich.logging import RichHandler +from rich.traceback import Traceback from finn.builder.build_dataflow_config import DataflowBuildConfig, default_build_dataflow_steps from finn.builder.build_dataflow_steps import build_dataflow_step_lookup @@ -159,21 +161,12 @@ def resolve_step_filename(step_name: str, cfg: DataflowBuildConfig, step_delta: return filename -def build_dataflow_cfg(model_filename, cfg: DataflowBuildConfig): - """Best-effort build a dataflow accelerator using the given configuration. 
- - :param model_filename: ONNX model filename to build - :param cfg: Build configuration - """ - finn_build_dir = os.environ["FINN_BUILD_DIR"] - - print(f"Intermediate outputs will be generated in {finn_build_dir}") - print(f"Final outputs will be generated in {cfg.output_dir}") - print(f"Build log is at {cfg.output_dir}/build_dataflow.log") - # create the output dir if it doesn't exist - os.makedirs(cfg.output_dir, exist_ok=True) - - # set up logger +def setup_logging(cfg: DataflowBuildConfig): + # Set up global logger, the force=True has the following effects: + # - If multiple build are run in a row, the log file will be re-created for each, + # which is needed if the file was deleted/moved or the output dir changed + # - In a PyTest session, this logger will replace the PyTest log handlers, so logs + # (+ captured warnings!) will end up in the log file instead of being collected by PyTest logpath = os.path.join(cfg.output_dir, "build_dataflow.log") if cfg.verbose: logging.basicConfig( @@ -181,6 +174,7 @@ def build_dataflow_cfg(model_filename, cfg: DataflowBuildConfig): format="[%(asctime)s]%(levelname)s: %(pathname)s:%(lineno)d: %(message)s", filename=logpath, filemode="w", + force=True, ) else: logging.basicConfig( @@ -188,22 +182,25 @@ def build_dataflow_cfg(model_filename, cfg: DataflowBuildConfig): format="[%(asctime)s]%(levelname)s: %(message)s", filename=logpath, filemode="w", + force=True, ) - # Capture all warnings.warn calls of qonnx,... + # Capture all warnings.warn calls of qonnx, ... 
logging.captureWarnings(True) + # Mirror stdout and stderr to log log = logging.getLogger("build_dataflow") - - # mirror stdout and stderr to log - sys.stdout = PrintLogger(log, logging.INFO, sys.stdout) - sys.stderr = PrintLogger(log, logging.ERROR, sys.stderr) + if not isinstance(sys.stdout, PrintLogger): + # Prevent rediricting stdout/sterr multiple times + sys.stdout = PrintLogger(log, logging.INFO, sys.stdout) + sys.stderr = PrintLogger(log, logging.ERROR, sys.stderr) console = Console(file=sys.stdout.console) + # Mirror a configurable log level to console (default = ERROR) if cfg.console_log_level != "NONE": - # set up console logger - consoleHandler = RichHandler(show_time=True, show_path=False, console=console) - + consoleHandler = RichHandler( + show_time=True, log_time_format="[%Y-%m-%d %H:%M:%S]", show_path=False, console=console + ) if cfg.console_log_level == "DEBUG": consoleHandler.setLevel(logging.DEBUG) elif cfg.console_log_level == "INFO": @@ -216,9 +213,52 @@ def build_dataflow_cfg(model_filename, cfg: DataflowBuildConfig): consoleHandler.setLevel(logging.CRITICAL) logging.getLogger().addHandler(consoleHandler) - # Setup done, start processing + return log + + +def exit_buildflow(cfg: DataflowBuildConfig, time_per_step: dict = None, exit_code: int = 0): + if exit_code: + print("Build failed") + status = "failed" + else: + print("Build completed successfully") + status = "ok" + + # Generate metadata_builder.json + metadata = { + "status": status, + "tool_version": os.path.basename(os.environ.get("XILINX_VIVADO")), + } + with open(os.path.join(cfg.output_dir, "report/metadata_builder.json"), "w") as f: + json.dump(metadata, f, indent=2) + + # Generate time_per_step.json + if time_per_step is not None: + time_per_step["total_build_time"] = sum(time_per_step.values()) + with open(os.path.join(cfg.output_dir, "report/time_per_step.json"), "w") as f: + json.dump(time_per_step, f, indent=2) + + return exit_code + + +def build_dataflow_cfg(model_filename, 
cfg: DataflowBuildConfig): + """Best-effort build a dataflow accelerator using the given configuration. + + :param model_filename: ONNX model filename to build + :param cfg: Build configuration + """ + # Create the output (report) dir if it doesn't exist + os.makedirs(os.path.join(cfg.output_dir, "report"), exist_ok=True) + + log = setup_logging(cfg) + + print(f"Intermediate outputs will be generated in {os.environ['FINN_BUILD_DIR']}") + print(f"Final outputs will be generated in {cfg.output_dir}") + print(f"Build log is at {cfg.output_dir}/build_dataflow.log") + + # Setup done, start build flow try: - # if start_step is specified, override the input model + # If start_step is specified, override the input model if cfg.start_step is None: print(f"Building dataflow accelerator from {model_filename}") model = ModelWrapper(model_filename) @@ -240,7 +280,7 @@ def build_dataflow_cfg(model_filename, cfg: DataflowBuildConfig): model = ModelWrapper(intermediate_model_filename) assert type(model) is ModelWrapper - # start processing + # Start processing step_num = 1 time_per_step = dict() build_dataflow_steps = resolve_build_steps(cfg) @@ -249,11 +289,11 @@ def build_dataflow_cfg(model_filename, cfg: DataflowBuildConfig): step_name = transform_step.__name__ print(f"Running step: {step_name} [{step_num}/{len(build_dataflow_steps)}]") - # run the step + # Run the step step_start = time.time() model = transform_step(model, cfg) step_end = time.time() - time_per_step[step_name] = step_end - step_start + time_per_step[step_name] = round(step_end - step_start) chkpt_name = f"{step_name}.onnx" if cfg.save_intermediate_models: intermediate_model_dir = os.path.join(cfg.output_dir, "intermediate_models") @@ -263,36 +303,28 @@ def build_dataflow_cfg(model_filename, cfg: DataflowBuildConfig): step_num += 1 except KeyboardInterrupt: print("KeyboardInterrupt detected. 
Aborting...") - print("Build failed") - return -1 + return exit_buildflow(cfg, time_per_step, -1) except (Exception, FINNError) as e: - # Print full traceback if we are on debug log level - # or encountered a non-user error - print_full_traceback = True - if issubclass(type(e), FINNUserError) and log.level != logging.DEBUG: - print_full_traceback = False - - extype, value, tb = sys.exc_info() - if print_full_traceback: - # print exception info and traceback - log.error("FINN Internal compiler error:") - console.print_exception(show_locals=False) - else: - console.print(f"[bold red]FINN Error: [/bold red]{e}") - log.error(f"{e}") - print("Build failed") - return -1 # A user error shouldn't be need to be fixed using PDB - - # start postmortem debug if configured - if cfg.enable_build_pdb_debug: - pdb.post_mortem(tb) - print("Build failed") - return -1 + # Re-raise exception if we are in a PyTest session so we don't miss it + if "PYTEST_CURRENT_TEST" in os.environ: + raise - with open(os.path.join(cfg.output_dir, "time_per_step.json"), "w") as f: - json.dump(time_per_step, f, indent=2) - print("Completed successfully") - return 0 + if issubclass(type(e), FINNUserError): + # Handle FINN USER ERROR + log.error(f"FINN ERROR: {e}") + else: + # Handle remaining errors (= FINN INTERNAL COMPILER ERROR) + log.error(f"FINN INTERNAL COMPILER ERROR: {e}") + + # Print traceback for interal errors or if in debug mode + if not issubclass(type(e), FINNUserError) or log.level == logging.DEBUG: + rprint(Traceback(show_locals=False)) + # Start postmortem debug if configured + if cfg.enable_build_pdb_debug: + pdb.post_mortem(e.__traceback__) + + return exit_buildflow(cfg, time_per_step, -1) + return exit_buildflow(cfg, time_per_step, 0) def build_dataflow_directory(path_to_cfg_dir: str): diff --git a/src/finn/builder/build_dataflow_config.py b/src/finn/builder/build_dataflow_config.py index 02e1d66d54..57204c5745 100644 --- a/src/finn/builder/build_dataflow_config.py +++ 
b/src/finn/builder/build_dataflow_config.py @@ -173,16 +173,16 @@ class DataflowBuildConfig(DataClassJSONMixin, DataClassYAMLMixin): """ #: Directory where the final build outputs will be written into - output_dir: str + output_dir: Optional[str] = None #: Target clock frequency (in nanoseconds) for Vivado synthesis. #: e.g. synth_clk_period_ns=5.0 will target a 200 MHz clock. #: If hls_clk_period_ns is not specified it will default to this value. - synth_clk_period_ns: float + synth_clk_period_ns: Optional[float] = None #: Which output(s) to generate from the build flow. See documentation of #: DataflowOutputType for available options. - generate_outputs: List[DataflowOutputType] + generate_outputs: Optional[List[DataflowOutputType]] = None #: (Optional) Path to configuration JSON file in which user can specify #: a preferred implementation style (HLS or RTL) for each node. @@ -350,14 +350,14 @@ class DataflowBuildConfig(DataClassJSONMixin, DataClassYAMLMixin): #: Whether pdb postmortem debuggig will be launched when the build fails enable_build_pdb_debug: Optional[bool] = False - #: When True, additional verbose information will be written to the log file. - #: Otherwise, these additional information will be suppressed. + #: When True, additional information (level = DEBUG) will be written to the log file. + #: Otherwise, this additional information will be suppressed (level = INFO). verbose: Optional[bool] = False #: Log level to be used on the command line for finn-plus internal logging. - #: This is different from the log level used for the build process, + #: This is different from the log level used for build_dataflow.log, #: which is controlled using the verbose flag. - console_log_level: Optional[LogLevel] = LogLevel.NONE + console_log_level: Optional[LogLevel] = LogLevel.ERROR #: If given, only run the steps in the list. If not, run default steps. #: See `default_build_dataflow_steps` for the default list of steps. 
@@ -395,6 +395,9 @@ class DataflowBuildConfig(DataClassJSONMixin, DataClassYAMLMixin): #: If set to commit hash specified version will be used cpp_driver_version: Optional[str] = "latest" + #: Specify validation dataset to be used for deployment of the PYNQ driver + validation_dataset: Optional[str] = None + def _resolve_hls_clk_period(self): if self.hls_clk_period_ns is None: # use same clk for synth and hls if not explicitly specified diff --git a/src/finn/builder/build_dataflow_steps.py b/src/finn/builder/build_dataflow_steps.py index 6de7f1dc0f..aab45b9972 100644 --- a/src/finn/builder/build_dataflow_steps.py +++ b/src/finn/builder/build_dataflow_steps.py @@ -418,7 +418,9 @@ def step_target_fps_parallelization(model: ModelWrapper, cfg: DataflowBuildConfi "depth_trigger_uram", "depth_trigger_bram", ] - extract_model_config_to_json(model, cfg.output_dir + "/auto_folding_config.json", hw_attrs) + extract_model_config_to_json( + model, cfg.output_dir + "/report/auto_folding_config.json", hw_attrs + ) return model @@ -507,6 +509,7 @@ def step_hw_ipgen(model: ModelWrapper, cfg: DataflowBuildConfig): report_dir = cfg.output_dir + "/report" os.makedirs(report_dir, exist_ok=True) estimate_layer_resources_hls = model.analysis(hls_synth_res_estimation) + estimate_layer_resources_hls["total"] = aggregate_dict_keys(estimate_layer_resources_hls) with open(report_dir + "/estimate_layer_resources_hls.json", "w") as f: json.dump(estimate_layer_resources_hls, f, indent=2) @@ -663,7 +666,7 @@ def step_set_fifo_depths(model: ModelWrapper, cfg: DataflowBuildConfig): model = model.transform(ApplyConfig(cfg.folding_config_file)) # extract the final configuration and save it as json - extract_model_config_to_json(model, cfg.output_dir + "/final_hw_config.json", hw_attrs) + extract_model_config_to_json(model, cfg.output_dir + "/report/final_hw_config.json", hw_attrs) # perform FIFO splitting and shallow FIFO removal only after the final config # json file has been written. 
otherwise, since these transforms may add/remove @@ -827,7 +830,9 @@ def step_make_driver(model: ModelWrapper, cfg: DataflowBuildConfig): ) ) else: - model = model.transform(MakePYNQDriverIODMA(cfg._resolve_driver_platform())) + model = model.transform( + MakePYNQDriverIODMA(cfg._resolve_driver_platform(), cfg.validation_dataset) + ) shutil.copytree(model.get_metadata_prop("pynq_driver_dir"), driver_dir, dirs_exist_ok=True) log.info("PYNQ Python driver written into " + driver_dir) elif DataflowOutputType.CPP_DRIVER in cfg.generate_outputs: diff --git a/src/finn/builder/custom_step_library/__init__.py b/src/finn/builder/custom_step_library/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/src/finn/builder/custom_step_library/conv1d.py b/src/finn/builder/custom_step_library/conv1d.py new file mode 100644 index 0000000000..f6de8edaae --- /dev/null +++ b/src/finn/builder/custom_step_library/conv1d.py @@ -0,0 +1,20 @@ +from qonnx.core.modelwrapper import ModelWrapper +from qonnx.transformation.change_3d_tensors_to_4d import Change3DTo4DTensors +from qonnx.transformation.general import GiveUniqueNodeNames + +import finn.transformation.fpgadataflow.convert_to_hw_layers as to_hw +import finn.transformation.streamline.absorb as absorb +from finn.builder.build_dataflow_config import DataflowBuildConfig + + +def step_pre_streamline(model: ModelWrapper, cfg: DataflowBuildConfig): + model = model.transform(Change3DTo4DTensors()) + model = model.transform(absorb.AbsorbScalarMulAddIntoTopK()) + return model + + +def step_convert_final_layers(model: ModelWrapper, cfg: DataflowBuildConfig): + model = model.transform(to_hw.InferChannelwiseLinearLayer()) + model = model.transform(to_hw.InferLabelSelectLayer()) + model = model.transform(GiveUniqueNodeNames()) + return model diff --git a/src/finn/builder/custom_step_library/mobilenet.py b/src/finn/builder/custom_step_library/mobilenet.py new file mode 100644 index 0000000000..0c251ad299 --- /dev/null +++ 
b/src/finn/builder/custom_step_library/mobilenet.py @@ -0,0 +1,114 @@ +from qonnx.core.modelwrapper import ModelWrapper +from qonnx.transformation.change_datalayout import ChangeDataLayoutQuantAvgPool2d +from qonnx.transformation.double_to_single_float import DoubleToSingleFloat +from qonnx.transformation.general import ApplyConfig, GiveReadableTensorNames, GiveUniqueNodeNames +from qonnx.transformation.infer_data_layouts import InferDataLayouts +from qonnx.transformation.infer_datatypes import InferDataTypes +from qonnx.transformation.infer_shapes import InferShapes +from qonnx.transformation.lower_convs_to_matmul import LowerConvsToMatMul +from qonnx.transformation.remove import RemoveIdentityOps + +import finn.transformation.fpgadataflow.convert_to_hw_layers as to_hw +import finn.transformation.streamline.absorb as absorb +import finn.transformation.streamline.reorder as reorder +from finn.builder.build_dataflow_config import ( + DataflowBuildConfig, + ShellFlowType, + VerificationStepType, +) +from finn.builder.build_dataflow_steps import verify_step +from finn.transformation.streamline import Streamline +from finn.transformation.streamline.collapse_repeated import CollapseRepeatedMul +from finn.transformation.streamline.round_thresholds import RoundAndClipThresholds + + +def step_mobilenet_streamline(model: ModelWrapper, cfg: DataflowBuildConfig): + model = model.transform(Streamline()) + additional_streamline_transformations = [ + DoubleToSingleFloat(), + reorder.MoveMulPastDWConv(), + absorb.AbsorbMulIntoMultiThreshold(), + ChangeDataLayoutQuantAvgPool2d(), + InferDataLayouts(), + reorder.MoveTransposePastScalarMul(), + absorb.AbsorbTransposeIntoFlatten(), + reorder.MoveFlattenPastAffine(), + reorder.MoveFlattenPastTopK(), + reorder.MoveScalarMulPastMatMul(), + CollapseRepeatedMul(), + RemoveIdentityOps(), + RoundAndClipThresholds(), + ] + for trn in additional_streamline_transformations: + model = model.transform(trn) + model = 
model.transform(GiveUniqueNodeNames()) + model = model.transform(GiveReadableTensorNames()) + model = model.transform(InferDataTypes()) + + if VerificationStepType.STREAMLINED_PYTHON in cfg._resolve_verification_steps(): + verify_step(model, cfg, "streamlined_python", need_parent=False) + + return model + + +def step_mobilenet_lower_convs(model: ModelWrapper, cfg: DataflowBuildConfig): + model = model.transform(LowerConvsToMatMul()) + model = model.transform(absorb.AbsorbTransposeIntoMultiThreshold()) + model = model.transform(absorb.AbsorbConsecutiveTransposes()) + model = model.transform(GiveUniqueNodeNames()) + model = model.transform(GiveReadableTensorNames()) + model = model.transform(InferDataTypes()) + model = model.transform(RoundAndClipThresholds()) + model = model.transform(InferDataLayouts()) + return model + + +def step_mobilenet_convert_to_hw_layers(model: ModelWrapper, cfg: DataflowBuildConfig): + model = model.transform(to_hw.InferPool()) + model = model.transform(to_hw.InferConvInpGen()) + model = model.transform(to_hw.InferVectorVectorActivation()) + model = model.transform(to_hw.InferQuantizedMatrixVectorActivation()) + model = model.transform(to_hw.InferChannelwiseLinearLayer()) + model = model.transform(to_hw.InferLabelSelectLayer()) + model = model.transform(InferShapes()) + model = model.transform(GiveUniqueNodeNames()) + model = model.transform(GiveReadableTensorNames()) + return model + + +def step_mobilenet_slr_floorplan(model: ModelWrapper, cfg: DataflowBuildConfig): + if cfg.shell_flow_type == ShellFlowType.VITIS_ALVEO: + try: + from finnexperimental.analysis.partitioning import partition + + # apply partitioning of the model, restricting the first and last layers + # to SLR0 + default_slr = 0 + abs_anchors = [(0, [default_slr]), (-1, [default_slr])] + floorplan = partition( + model, + cfg.synth_clk_period_ns, + cfg.board, + abs_anchors=abs_anchors, + multivariant=False, + )[0] + # apply floorplan to model + model = 
model.transform(ApplyConfig(floorplan)) + print("SLR floorplanning applied") + except Exception: + print("No SLR floorplanning applied") + return model + + +def step_mobilenet_convert_to_hw_layers_separate_th(model: ModelWrapper, cfg: DataflowBuildConfig): + model = model.transform(to_hw.InferPool()) + model = model.transform(to_hw.InferConvInpGen()) + model = model.transform(to_hw.InferThresholdingLayer()) + model = model.transform(to_hw.InferVectorVectorActivation()) + model = model.transform(to_hw.InferQuantizedMatrixVectorActivation()) + model = model.transform(to_hw.InferChannelwiseLinearLayer()) + model = model.transform(to_hw.InferLabelSelectLayer()) + model = model.transform(InferShapes()) + model = model.transform(GiveUniqueNodeNames()) + model = model.transform(GiveReadableTensorNames()) + return model diff --git a/src/finn/builder/custom_step_library/resnet.py b/src/finn/builder/custom_step_library/resnet.py new file mode 100644 index 0000000000..3e1c61063b --- /dev/null +++ b/src/finn/builder/custom_step_library/resnet.py @@ -0,0 +1,208 @@ +# Copyright (C) 2020-2022, Xilinx, Inc. +# Copyright (C) 2022-2024, Advanced Micro Devices, Inc. +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# * Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# * Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# * Neither the name of FINN nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. 
+# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +from qonnx.core.datatype import DataType +from qonnx.core.modelwrapper import ModelWrapper +from qonnx.transformation.batchnorm_to_affine import BatchNormToAffine +from qonnx.transformation.composed import ComposedTransformation +from qonnx.transformation.double_to_single_float import DoubleToSingleFloat +from qonnx.transformation.fold_constants import FoldConstants +from qonnx.transformation.general import ( + ConvertDivToMul, + ConvertSubToAdd, + GiveReadableTensorNames, + GiveUniqueNodeNames, + GiveUniqueParameterTensors, + RemoveStaticGraphInputs, + RemoveUnusedTensors, + SortGraph, +) +from qonnx.transformation.infer_data_layouts import InferDataLayouts +from qonnx.transformation.infer_datatypes import InferDataTypes +from qonnx.transformation.infer_shapes import InferShapes +from qonnx.transformation.insert_topk import InsertTopK +from qonnx.transformation.lower_convs_to_matmul import LowerConvsToMatMul +from qonnx.transformation.remove import RemoveIdentityOps + +import finn.transformation.fpgadataflow.convert_to_hw_layers as to_hw +from finn.builder.build_dataflow_config import DataflowBuildConfig +from finn.transformation.move_reshape import RemoveCNVtoFCFlatten +from 
finn.transformation.streamline.absorb import ( + Absorb1BitMulIntoConv, + Absorb1BitMulIntoMatMul, + AbsorbAddIntoMultiThreshold, + AbsorbConsecutiveTransposes, + AbsorbMulIntoMultiThreshold, + AbsorbScalarMulAddIntoTopK, + AbsorbTransposeIntoMultiThreshold, + FactorOutMulSignMagnitude, +) +from finn.transformation.streamline.collapse_repeated import ( + CollapseRepeatedAdd, + CollapseRepeatedMul, +) + +# just for not linear +from finn.transformation.streamline.reorder import ( + MoveAddPastConv, + MoveAddPastMul, + MoveLinearPastEltwiseAdd, + MoveLinearPastFork, + MoveMaxPoolPastMultiThreshold, + MoveScalarAddPastMatMul, + MoveScalarLinearPastInvariants, + MoveScalarMulPastConv, + MoveScalarMulPastMatMul, + MoveTransposePastEltwise, + MoveTransposePastFork, + MoveTransposePastJoinAdd, +) +from finn.transformation.streamline.round_thresholds import RoundAndClipThresholds +from finn.transformation.streamline.sign_to_thres import ConvertSignToThres + + +def step_resnet50_tidy(model: ModelWrapper, cfg: DataflowBuildConfig): + model = model.transform(GiveUniqueParameterTensors()) + model = model.transform(InferShapes()) + model = model.transform(FoldConstants()) + model = model.transform(RemoveStaticGraphInputs()) + model = model.transform(GiveUniqueNodeNames()) + model = model.transform(GiveReadableTensorNames()) + model = model.transform(InferDataTypes()) + model = model.transform(InsertTopK()) + model = model.transform(InferShapes()) + model = model.transform(GiveUniqueNodeNames()) + model = model.transform(GiveReadableTensorNames()) + model = model.transform(InferDataTypes()) + return model + + +def step_resnet50_streamline_linear(model: ModelWrapper, cfg: DataflowBuildConfig): + streamline_transformations = [ + AbsorbScalarMulAddIntoTopK(), # before MoveAddPastMul to avoid int->float + ConvertSubToAdd(), + ConvertDivToMul(), + RemoveIdentityOps(), + CollapseRepeatedMul(), + BatchNormToAffine(), + ConvertSignToThres(), + MoveAddPastMul(), + 
MoveScalarAddPastMatMul(), + MoveAddPastConv(), + MoveScalarMulPastMatMul(), + MoveScalarMulPastConv(), + MoveScalarLinearPastInvariants(), + MoveAddPastMul(), + CollapseRepeatedAdd(), + CollapseRepeatedMul(), + AbsorbAddIntoMultiThreshold(), + FactorOutMulSignMagnitude(), + MoveMaxPoolPastMultiThreshold(), + AbsorbMulIntoMultiThreshold(), + Absorb1BitMulIntoMatMul(), + Absorb1BitMulIntoConv(), + RoundAndClipThresholds(), + ] + for trn in streamline_transformations: + model = model.transform(trn) + model = model.transform(GiveUniqueNodeNames()) + return model + + +def step_resnet50_streamline_nonlinear(model: ModelWrapper, cfg: DataflowBuildConfig): + streamline_transformations = [ + MoveLinearPastEltwiseAdd(), + MoveLinearPastFork(), + ] + for trn in streamline_transformations: + model = model.transform(trn) + model = model.transform(GiveUniqueNodeNames()) + return model + + +def step_resnet50_streamline(model: ModelWrapper, cfg: DataflowBuildConfig): + for iter_id in range(4): + model = step_resnet50_streamline_linear(model, cfg) + model = step_resnet50_streamline_nonlinear(model, cfg) + + # big loop tidy up + model = model.transform(RemoveUnusedTensors()) + model = model.transform(GiveReadableTensorNames()) + model = model.transform(InferDataTypes()) + model = model.transform(SortGraph()) + + model = model.transform(DoubleToSingleFloat()) + + # Lower convolutions and streamline resulting transposes + model = model.transform(LowerConvsToMatMul()) + model = model.transform( + ComposedTransformation( + [ + MoveTransposePastJoinAdd(), + MoveTransposePastFork(), + MoveTransposePastEltwise(), + AbsorbConsecutiveTransposes(), + AbsorbTransposeIntoMultiThreshold(), + ] + ) + ) + return model + + +def step_resnet50_convert_to_hw(model: ModelWrapper, cfg: DataflowBuildConfig): + model.set_tensor_datatype(model.graph.input[0].name, DataType["UINT8"]) + model = model.transform(InferDataLayouts()) + model = model.transform(DoubleToSingleFloat()) + model = 
model.transform(InferDataTypes()) + model = model.transform(SortGraph()) + + to_hw_transformations = [ + to_hw.InferChannelwiseLinearLayer, + to_hw.InferPool, + AbsorbConsecutiveTransposes, + RoundAndClipThresholds, + to_hw.InferQuantizedMatrixVectorActivation, + to_hw.InferThresholdingLayer, + to_hw.InferConvInpGen, + to_hw.InferDuplicateStreamsLayer, + to_hw.InferAddStreamsLayer, + to_hw.InferLabelSelectLayer, + ] + for trn in to_hw_transformations: + model = model.transform(trn()) + model = model.transform(InferDataLayouts()) + model = model.transform(GiveUniqueNodeNames()) + model = model.transform(InferDataTypes()) + + model = model.transform(RemoveCNVtoFCFlatten()) + model = model.transform(GiveReadableTensorNames()) + model = model.transform(RemoveUnusedTensors()) + model = model.transform(SortGraph()) + + return model diff --git a/src/finn/builder/custom_step_library/transformer.py b/src/finn/builder/custom_step_library/transformer.py new file mode 100644 index 0000000000..79cfa29353 --- /dev/null +++ b/src/finn/builder/custom_step_library/transformer.py @@ -0,0 +1,772 @@ +# ADAPTED FROM Christoph's radioml-transformer repository, specifically these files: +# build_steps.py +# custom/apply_config.py + +# Copies (deep-copies) python objects +import copy +import json + +# Numpy for loading and comparing the verification input/output +import numpy as np + +# YAML for loading experiment configurations +import yaml + +# QONNX quantization data types +from qonnx.core.datatype import DataType + +# QONNX wrapper of ONNX model graphs +from qonnx.core.modelwrapper import ModelWrapper + +# Converts ONNX graph nodes to QONNX custom-ops if possible +from qonnx.custom_op.registry import getCustomOp + +# Converts BatchNorm operation to affine transformation +from qonnx.transformation.batchnorm_to_affine import BatchNormToAffine + +# Transformation for exhaustively composing transformations +from qonnx.transformation.composed import ComposedTransformation + +# If we have a 
convolution with a bias tensors input, QONNX and later FINN +# expect the bias to be expressed as a standalone Add node following the Conv +# node. +from qonnx.transformation.extract_conv_bias import ExtractBiasFromConv + +# Collapses chains of constants into a single constant operation or even +# initializer tensors. +from qonnx.transformation.fold_constants import FoldConstants + +# Converts Gemm operation to MatMul with extracted standalone bias op +from qonnx.transformation.gemm_to_matmul import GemmToMatMul + +# QONNX graph transformations for renaming and cleaning up +from qonnx.transformation.general import ( + GiveReadableTensorNames, + GiveUniqueNodeNames, + GiveUniqueParameterTensors, + RemoveStaticGraphInputs, + RemoveUnusedTensors, + Transformation, +) + +# QONNX graph transformations for annotating the graph with datatype and shape +# information +from qonnx.transformation.infer_datatypes import InferDataTypes +from qonnx.transformation.infer_shapes import InferShapes + +# Converts Conv to Im2Col and MatMul with extracted standalone bias op +from qonnx.transformation.lower_convs_to_matmul import LowerConvsToMatMul + +# Transposes the initializer tensors of a Quant node instead of having a +# standalone Transpose following +from qonnx.transformation.quant_constant_folding import FoldTransposeIntoQuantInit + +# Range information structure for seeding the range analysis for converting +# quantized activations to MultiThreshold +from qonnx.util.range_analysis import RangeInfo + +# FINN dataflow builder configuration +from finn.builder.build_dataflow_config import DataflowBuildConfig, VerificationStepType + +# FINN verification after build/graph transformation steps +from finn.builder.build_dataflow_steps import verify_step + +# Detects the attention pattern and converts to hardware custom op +from finn.transformation.fpgadataflow.attention import ( + AbsorbMultiThresholdIntoScaledDotProductAttention, + InferScaledDotProductAttention, +) + +# Mult-Head 
Attention support +from finn.transformation.fpgadataflow.attention_heads import ( + InferMultiHeads, + MoveMergeMultiHeadsPastMultiThreshold, + MoveSplitMultiHeadsPastMultiThreshold, + UnrollMultiHeadAttention, +) +from finn.transformation.fpgadataflow.compile_cppsim import CompileCppSim + +# Converts (infers) ONNX and QONNX nodes to FINN hardware CustomOps +from finn.transformation.fpgadataflow.convert_to_hw_layers import ( + InferConcatLayer, + InferElementwiseBinaryOperation, + InferLookupLayer, + InferSplitLayer, + InferSqueeze, + InferUnsqueeze, + InferVectorVectorActivation, +) +from finn.transformation.fpgadataflow.hlssynth_ip import HLSSynthIP + +# Inserts data-width converter and FIFO nodes into the model graph +from finn.transformation.fpgadataflow.insert_dwc import InsertDWC +from finn.transformation.fpgadataflow.insert_fifo import InsertFIFO + +# Transformations preparing the operators for synthesis and simulation +from finn.transformation.fpgadataflow.prepare_cppsim import PrepareCppSim +from finn.transformation.fpgadataflow.prepare_ip import PrepareIP +from finn.transformation.fpgadataflow.prepare_rtlsim import PrepareRTLSim + +# Converts fork-nodes to ReplicateStream hardware operator +from finn.transformation.fpgadataflow.replicate_stream import InferReplicateStream +from finn.transformation.fpgadataflow.set_exec_mode import SetExecMode + +# Splitting and removing of FIFOs from the model graph +from finn.transformation.fpgadataflow.set_fifo_depths import RemoveShallowFIFOs, SplitLargeFIFOs + +# Graph transformation setting the folding, i.e., parallelization configuration +from finn.transformation.fpgadataflow.set_folding import SetFolding + +# Specializes each layer's implementation style: HLS or RTL implementation +from finn.transformation.fpgadataflow.specialize_layers import SpecializeLayers + +# Standard QONNX to FINN conversion function +from finn.transformation.qonnx.convert_qonnx_to_finn import ConvertQONNXtoFINN + +# Folds quantizers into 
weight tensor initializers, needed for lowering +# convolutions to MatMuls +from finn.transformation.qonnx.fold_quant_weights import FoldQuantWeights +from finn.transformation.qonnx.quant_act_to_multithreshold import default_filter_function_generator + +# Cleanup transformation getting rid of 3d data layout +from finn.transformation.squeeze import Squeeze +from finn.transformation.streamline.absorb import ( + AbsorbAddIntoMultiThreshold, + AbsorbSignBiasIntoMultiThreshold, +) + +# FINN streamlining transformations fusing/collapsing operations of the same +# kind +from finn.transformation.streamline.collapse_repeated import CollapseRepeatedTranspose + +# FINN streamlining transformations removing nodes without real effect from the +# graph +from finn.transformation.streamline.remove import RemoveIdentityReshape, RemoveIdentityTranspose + +# FINN streamlining transformations reordering the graph +from finn.transformation.streamline.reorder import ( + MoveMulPastAdd, + MoveSqueezePastMatMul, + MoveSqueezePastMultiThreshold, + MoveTransposePastEltwise, + MoveTransposePastFork, + MoveTransposePastJoinAdd, + MoveTransposePastJoinConcat, + MoveTransposePastJoinMul, + MoveTransposePastSplit, +) +from finn.transformation.streamline.streamline_plus import StreamlinePlus as Streamline + +# Execute onnx model graphs from the dataflow parent for verification +from finn.util.test import execute_parent + + +# Prepares the graph to be consumed by FINN: +# 1. Some graph cleanup removing unused tensors, nodes without effect and +# folding constants, i.e., collapsing chains of operations on constant tensors +# 2. Lowers some "more complex" operations: converts Conv and Gemm to MatMul and +# BatchNorm to Mul and Add operations followed by some necessary cleanup +# 3. 
Converts all QONNX Quant nodes to MultiThreshold operations which can +# absorb scales and biases during streamlining +def prepare_graph(range_info: RangeInfo): + # Wrap the actual transformation/build step function + def step_prepare_graph(model: ModelWrapper, cfg: DataflowBuildConfig): + # Exhaustively apply the set of cleanup transformations + model = model.transform( + ComposedTransformation( + [ + # Adds shape and datatype annotations to all tensors in this graph + InferDataTypes(), + InferShapes(), + # Cleanup the graph by removing redundant, unnecessary and constant + # nodes and tensors and give unique names to everything remaining + GiveUniqueNodeNames(), + GiveReadableTensorNames(), + RemoveStaticGraphInputs(), + RemoveUnusedTensors(), + GiveUniqueParameterTensors(), + FoldConstants(), + # Remove unnecessary shape and layout transformations + RemoveIdentityReshape(), + RemoveIdentityTranspose(), + # Redo shape and datatype annotations after removing nodes and + # tensors + InferShapes(), + InferDataTypes(), + ] + ) + ) + # If configured, run a verification of the transformed model on some + # sample inputs + if VerificationStepType.TIDY_UP_PYTHON in cfg._resolve_verification_steps(): # noqa + verify_step(model, cfg, "tidied_up_python", need_parent=False) + # Exhaustively apply the lowering transformations + model = model.transform( + ComposedTransformation( + [ + # Moves the bias input to the Conv operator as a separate Add node + # behind the Conv node + ExtractBiasFromConv(), + # Converts Gemm nodes to MatMul (+ bias) + GemmToMatMul(), + # Need to do some constant and weight folding first + FoldConstants(), + FoldTransposeIntoQuantInit(), + FoldQuantWeights(), + # Annotate the graph with shape and data type information + InferShapes(), + InferDataTypes(), + # Converts Conv layers to MatMul + LowerConvsToMatMul(), + # Converts BatchNorm to affine scale and bias + BatchNormToAffine(), + # Annotate the graph with shape and data type information + 
InferShapes(), + InferDataTypes(), + ] + ) + ) + # If configured, run a verification of the transformed model on some + # sample inputs + if VerificationStepType.QONNX_TO_FINN_PYTHON in cfg._resolve_verification_steps(): # noqa + verify_step(model, cfg, "lowered_python", need_parent=False) + + # Apply the standard QONNX to FINN conversion step to convert the + # remaining quantizers not yet covered by the new range analysis based + # method + model = model.transform( + ConvertQONNXtoFINN( + filter_function=default_filter_function_generator( + max_multithreshold_bit_width=cfg.max_multithreshold_bit_width + ) + ) + ) + # If configured, run a verification of the transformed model on some + # sample inputs + if VerificationStepType.QONNX_TO_FINN_PYTHON in cfg._resolve_verification_steps(): # noqa + verify_step(model, cfg, "prepared_graph_python", need_parent=False) + # Return the transformed model + return model + + # Return the wrapped transformation step function + return step_prepare_graph + + +# Applies the custom set of exhaustive streamlining transformations, also taking +# special topology like attention, residuals, splits and transposes into account +def step_streamline(model: ModelWrapper, cfg: DataflowBuildConfig): + # These should not be applied exhaustively with the other streamlining + # transformations to not end up in cycles. + # Note: This is essential to allow some Add operations to be + # absorbed by the next round's AbsorbSignBiasIntoMultiThreshold + model = model.transform(MoveMulPastAdd()) + model = model.transform(AbsorbSignBiasIntoMultiThreshold()) + # Exhaustively apply the following set of transformations to streamline the + # graph with the overall goal of collecting scales and biases in front of + # MultiThreshold operations or, alternatively, at the end of the graph. + # Note: Contains some sets of nested exhaustive transformations meant for + # particular architectural patterns, e.g., residual topologies. 
+ model = model.transform(Streamline()) + # If configured, run a verification of the transformed model on some + # sample inputs + if VerificationStepType.STREAMLINED_PYTHON in cfg._resolve_verification_steps(): # noqa + verify_step(model, cfg, "streamlined_python", need_parent=False) + # Return the transformed model + return model + + +# Converts scaled dot-product attention operations to FINN hardware operations +# Note: This includes some necessary cleanup after converting the pattern, in +# particular squeezing the data layouts throughout the graph +def step_convert_attention_to_hw(model: ModelWrapper, _: DataflowBuildConfig): + # Try to infer reshaping of attention heads + model = model.transform(InferMultiHeads()) # noqa: Duplicate + # Try to mode the mult-head splitting past the multi thresholds + model = model.transform(MoveSplitMultiHeadsPastMultiThreshold()) + # Moving multi-head splitting past multi thresholds might enable absorbing + # adds into thresholds once again + model = model.transform(AbsorbAddIntoMultiThreshold()) + # Try to infer a ScaledDotProductAttention custom op + model = model.transform(InferScaledDotProductAttention()) + # Parallelize attention head in the onnx graph + model = model.transform(UnrollMultiHeadAttention()) + # Swap the order of merging the multi heads and applying thresholds + model = model.transform(MoveMergeMultiHeadsPastMultiThreshold()) + # If applicable, absorb the final thresholds into the attention operator + model = model.transform(AbsorbMultiThresholdIntoScaledDotProductAttention()) + # Squeeze (i.e., remove dimensions of size 1) the data layouts throughout + # the graph to treat the time dimension as the batch dimension for all MVU + # and Threshold operators + model = model.transform(Squeeze()) + # Squeezing might have turned further transpose and reshape operations into + # identities (those which just swapped around the dimensions of size 1) + model = model.transform( + ComposedTransformation( + [ + # Move 
transposes around to some place where they could be removed + # later, i.e., where they collapse into identities + MoveTransposePastFork(), + MoveTransposePastSplit(), + MoveTransposePastJoinConcat(), + MoveTransposePastEltwise(), + MoveTransposePastJoinMul(), + MoveTransposePastJoinAdd(), + CollapseRepeatedTranspose(), + # Remove identity shape/layout transformations + RemoveIdentityTranspose(), + RemoveIdentityReshape(), + # Squeeze operators can be moved past MatMuls and thresholding + MoveSqueezePastMatMul(), + MoveSqueezePastMultiThreshold(), + ] + ) + ) + # Squeezing might enable absorbing adds into thresholds once again + model = model.transform(AbsorbAddIntoMultiThreshold()) + # If applicable, absorb the final thresholds into the attention operator + # Note: Might be applicable again after squeezing a transpose away + model = model.transform(AbsorbMultiThresholdIntoScaledDotProductAttention()) + # We should do another round of streamlining to be sure and support more + # general architectural patterns, we are not aware of yet... 
+ model = model.transform(Streamline()) + # Convert Squeeze and Unsqueeze operators to hardware operations + model = model.transform(InferSqueeze()) + model = model.transform(InferUnsqueeze()) + # Return the model with attention and multi-heads mapped to hardware + # operators + return model + + +# Function running the transformations to convert elementwise binary operations +# to their hardware implementations +def step_convert_elementwise_binary_to_hw(model: ModelWrapper, _): + # Convert elementwise operations to hardware operators + # Note: Do not convert the final Mul operator at the output + return model.transform( + InferElementwiseBinaryOperation(InferElementwiseBinaryOperation.reject_output_dequant) + ) + + +# Converts Split and Concat operations to hardware custom operators +def step_convert_split_concat_to_hw(model: ModelWrapper, _): + return model.transform(InferSplitLayer()).transform(InferConcatLayer()) + + +# Function running the transformations to convert Gather, i.e., index lookup, +# nodes to their hardware implementations +def step_convert_lookup_to_hw(model: ModelWrapper, _): + # Iterate all nodes in the graph keeping track of the index + for index, node in enumerate(model.graph.node): + # If this is a Gather node, force the input (index) type annotation + if node.op_type == "Gather": + # Force to unsigned 64-bit integer for now + model.set_tensor_datatype(node.input[1], DataType["UINT64"]) + # Get the value info for the input tensor to have access to the ONNX + # datatype of the tensor + value_info = model.get_tensor_valueinfo(node.input[1]) + # Force the container datatype of the input to be a float + value_info.type.tensor_type.elem_type = 1 + # Convert Gather to Lookup layers + return model.transform(InferLookupLayer()) + + +# Converts depth-wise convolution to hardware operator calling the +# InferVectorVectorActivation transformation +def step_convert_depth_wise_to_hw(model: ModelWrapper, _: DataflowBuildConfig): + return 
model.transform(InferVectorVectorActivation()) + + +# Function running the InferReplicateStream transformation +def step_replicate_streams(model: ModelWrapper, _): + # Properly replicate the stream feeding the query, key and value projections + return model.transform(InferReplicateStream()) + + +# Custom step for setting the parallelism to meet the target of T^2 cycles per +# sequence +def set_target_parallelization(seq_len: int, emb_dim: int): # noqa: emb_dim + # The wrapping function is a generator and this is the actual build step + # function taking the model and build configuration + def step_set_target_parallelization(model: ModelWrapper, cfg: DataflowBuildConfig): + # Run over all nodes in the model graph to look for attention operators, + # which are currently not handled by the SetFolding transformation + for index, node in enumerate(model.graph.node): + # Only handle attention operations here + if node.op_type == "ScaledDotProductAttention_hls": + # Convert this to the custom-op instance for easy access to node + # attributes + inst = getCustomOp(node) + # Set the sequence and embedding dimension folding to meet the + # T^2 cycles target, i.e., fully parallel along the embedding + # dimension and fully sequential along the sequence dimension + inst.set_nodeattr("EmbFold", 1) + inst.set_nodeattr("SeqFold", seq_len) + # Apply the built-in folding configuration transformation with the + # T^2 target cycles + model = model.transform( + SetFolding(seq_len**2, cfg.mvau_wwidth_max, cfg.folding_two_pass_relaxation) + ) + # TODO: Extract the folding configuration + # Return the model with configured parallelization + return model + + # Return the wrapped build step function + return step_set_target_parallelization + + +# Applies configuration dictionary to the model graph +class ApplyConfig(Transformation): + # Initializes the transformation with the configuration dictionary + def __init__(self, config): + # Initialize the transformation base class + 
super().__init__() + # Register the configuration dictionary to be used in apply() + self.config = config + + # Applies the transform to a whole model graph + def apply(self, model: ModelWrapper): # noqa + # Get the model graph out of the model wrapper object + graph = model.graph + # Iterate all nodes in the graph keeping track of the index + for index, node in enumerate(graph.node): + # A node should not be named "defaults"... + assert node.name != "defaults", "Node has reserved name 'defaults'" + # Convert this to the custom-op instance for easy access to node + # attributes + inst = getCustomOp(node) + # Apply the per operator type default configurations to the node + if node.op_type in self.config["defaults"]: + # Run over all default options to be applied to this node + for key, value in self.config["defaults"][node.op_type].items(): + # Set the nodes attribute to the default option value + inst.set_nodeattr(key, value) + # If there is an individual, node-specific configuration apply + # this next, potentially overriding the defaults set above + if node.name in self.config: + # Run over all node-specific options to be applied to this + # node + for key, value in self.config[node.name].items(): + # Set the nodes attribute to the option value + inst.set_nodeattr(key, value) + # Return model with configuration applied + # Note: Do not consider this as modifying the graph. This does not have + # to be reapplied multiple times. 
+ return model, False + + +# Custom build step trying to set appropriate FIFO sizes for the transformer +def set_fifo_depths(seq_len: int, emb_dim: int, uram_threshold: int = 32): # noqa: emb_dim + # The wrapping function is a generator and this is the actual build step + # function taking the model and build configuration + def step_set_fifo_depths(model: ModelWrapper, cfg: DataflowBuildConfig): + # Run over all nodes in the model graph + for index, node in enumerate(model.graph.node): + # Convert this to the custom-op instance for easy access to node + # attributes + inst = getCustomOp(node) + # Extract the FIFO depths configuration of the node + in_depths = inst.get_nodeattr("inFIFODepths") + out_depths = inst.get_nodeattr("outFIFODepths") + + # Number of inputs and outputs to/from the node + num_inputs = len(node.input) + num_outputs = len(node.output) + + # If the input/output has only default configurations, fill with as + # many shallow FIFOs as there are inputs, to avoid later problems + # with to few FIFO depths specified + if in_depths == [2] and num_inputs > 1: + in_depths = num_inputs * [2] + if out_depths == [2] and num_outputs > 1: + out_depths = num_outputs * [2] + + # Special case: Attention needs properly sized input FIFOs + if node.op_type == "ScaledDotProductAttention_hls": + # Each folded input stream needs to be buffered completely + # TODO: Not exactly sure whether this is always correct or just + # the worst-case + in_depths = [inst.get_number_input_values(i) for i in range(num_inputs)] + # Note: No special treatment of the output FIFO + # out_depths = ... 
+ + # Special case: Adding residual branches needs to buffer the inputs + # to avoid deadlocks if one branch is running faster/slower + if node.op_type == "ElementwiseAdd_hls": + # Only relevant if for join-node operations, i.e., node actually + # consumes two branches, potentially operating at a different + # rate + if model.is_join_node(node): + # Set both inputs to buffer as many cycles as we target for + # the attention operations, i.e., the T^2 cycles per + # sequence target + # TODO: Not exactly sure whether this is always correct or + # just the worst-case + # TODO: Currently we do not really have a reliable way of + # figuring out which of the two is the longer/deeper branch + # in terms of cycles to set a corresponding buffer only to + # the shorter branch. + in_depths = [seq_len**2, seq_len**2] + # Note: No special treatment of the output FIFO + # out_depths = ... + + # Set the updated FIFO depths attributes + inst.set_nodeattr("inFIFODepths", in_depths) + inst.set_nodeattr("outFIFODepths", out_depths) + + # The following partially mirrors (or even copies from) the build-in + # step_set_fifo_depths using only manual FIFO depths and our YAML-based + # folding configuration. 
+ + # Insert data-width converters + model = model.transform(InsertDWC()) + # Insert FIFOs between all operators (inserts shallow, depths 2 FIFOs if + # no other depth is specified) + model = model.transform(InsertFIFO(create_shallow_fifos=True)) + # Specialize the implementation variant of the (newly added FIFO) layers + model = model.transform(SpecializeLayers(cfg._resolve_fpga_part())) # noqa: Access _ method + model = model.transform(GiveUniqueNodeNames()) + model = model.transform(GiveReadableTensorNames()) + + # Only applies if a configuration file is given + if cfg.folding_config_file is not None: + # Load the configuration dictionary form YAML file + with open(cfg.folding_config_file, "r") as file: + # Load YAML string + config = yaml.safe_load(file) + # Assign unique names to the nodes which can be matched by + # individual per-node configuration options + model = model.transform(GiveUniqueNodeNames()) + # Apply the configuration dictionary to the model graph + model = model.transform(ApplyConfig(config)) + + # Run over all nodes in the model graph once again to modify the + # inserted FIFOs + # Note: This overwrites the folding configuration... 
+ # TODO: Find a better way to handle this + for index, node in enumerate(model.graph.node): + # Modify all RTL FIFO operators + if node.op_type == "StreamingFIFO_rtl": + # Convert this to the custom-op instance for easy access to node + # attributes + inst = getCustomOp(node) + # Check the depth of the FIFO: If this is not a shallow FIFO, + # implement this via the vivado strategy in URAM + if inst.get_nodeattr("depth") >= uram_threshold: + # Change the implementation style to vivado + inst.set_nodeattr("impl_style", "vivado") + # Set the resource type for the memory to URAM + inst.set_nodeattr("ram_style", "ultra") + + # Hardware attributes to be extracted from each node + hw_attrs = { + "PE", + "SIMD", + "parallel_window", + "ram_style", + "ram_style_thresholds", + "ram_style_mask", + "depth", + "impl_style", + "resType", + "mac_resource", + "mem_mode", + "runtime_writeable_weights", + "inFIFODepths", + "outFIFODepths", + "depth_trigger_uram", + "depth_trigger_bram", + } + + # Start collecting the configuration from the model graph as a + # dictionary + config = {"defaults": {}} + # Iterate all nodes in the graph keeping track of the index + for index, node in enumerate(model.graph.node): + # Convert this to the custom-op instance for easy access to node + # attributes + inst = getCustomOp(node) + # Prepare the node-specific configuration entry for this node + config[node.name] = {} + # Collect attribute values for all specified hardware attributes + for key in hw_attrs: + # Some hardware attributes may not be present for all nodes or + # op-types, this will be signaled via exception + try: + # Try extracting the configuration value from the node + # custom-op instance + config[node.name][key] = inst.get_nodeattr(key) + # Missing attributes are signaled va AttributeError + except AttributeError: + # Can be safely ignored here + pass + # Cleanup: If no attribute is present for this node, there is no + # need to keep this in the configuration dictionary as there 
is + # nothing to be restored later + if not config[node.name]: + # Remove the entry form the configuration dictionary + del config[node.name] + + # Create/Open a YAML file to store the configuration for later reuse + # TODO: make consistent with .json report in default step + with open(cfg.output_dir + "/report/final_hw_config.yaml", "w") as file: + # Store the configuration dictionary as YAML code + yaml.safe_dump(config, file) + + # Perform FIFO splitting and shallow FIFO removal only after the final + # config file has been written. Otherwise, since these transforms may + # add/remove FIFOs, we get name mismatch problems when trying to reuse + # the final config. + if cfg.split_large_fifos: + model = model.transform(SplitLargeFIFOs()) + model = model.transform(RemoveShallowFIFOs()) + + # generate a dedicated report about final FIFO sizes + fifo_info = {} + fifo_info["fifo_depths"] = {} + fifo_info["fifo_sizes"] = {} + total_fifo_size = 0 + for node in model.get_nodes_by_op_type("StreamingFIFO_rtl"): + node_inst = getCustomOp(node) + fifo_info["fifo_depths"][node.name] = node_inst.get_nodeattr("depth") + fifo_info["fifo_sizes"][ + node.name + ] = node_inst.get_instream_width() * node_inst.get_nodeattr("depth") + total_fifo_size += fifo_info["fifo_sizes"][node.name] + fifo_info["total_fifo_size_kB"] = int(total_fifo_size / 8.0 / 1000.0) + + with open(cfg.output_dir + "/report/fifo_sizing.json", "w") as f: + json.dump(fifo_info, f, indent=2) + + # After FIFOs are ready to go, call PrepareIP and HLSSynthIP again + # this will only run for the new nodes (e.g. 
FIFOs and DWCs) + model = model.transform( + PrepareIP(cfg._resolve_fpga_part(), cfg._resolve_hls_clk_period()) # noqa + ) + model = model.transform(HLSSynthIP()) + + # Return the model with configured parallelization + return model + + # Return the wrapped build step function + return step_set_fifo_depths + + +# Custom step applying our custom format of folding configuration to the graph +def step_apply_folding_config(model: ModelWrapper, cfg: DataflowBuildConfig): + # Only applies if a configuration file is given + if cfg.folding_config_file is not None: + # Load the configuration dictionary form YAML file + with open(cfg.folding_config_file, "r") as file: + # Load YAML string + config = yaml.safe_load(file) + # Assign unique names to the nodes which can be matched by + # individual per-node configuration options + model = model.transform(GiveUniqueNodeNames()) + # Apply the configuration dictionary to the model graph + model = model.transform(ApplyConfig(config)) + # If configured, run a verification of the transformed model on some sample + # inputs + if VerificationStepType.FOLDED_HLS_CPPSIM in cfg._resolve_verification_steps(): # noqa + # Prepare C++ Simulation for verification + model = model.transform(PrepareCppSim()) + model = model.transform(CompileCppSim()) + model = model.transform(SetExecMode("cppsim")) + # Execute a verification step of the model with inputs specified in + # build configuration + verify_step(model, cfg, "folded_hls_cppsim", need_parent=True) + + # Return model with configuration applied + return model + + +# Runs a node-by-node C++ simulation of the model saving the fill execution +# context +def node_by_node_cppsim(model: ModelWrapper, cfg: DataflowBuildConfig): + # Save the original model + original = model + # Copy the model + model = copy.deepcopy(model) + # Set model execution mode to C++ simulation + model = model.transform(SetExecMode("cppsim")) + # Generates the C++ source and compiles the C++ simulation + model = 
model.transform(GiveUniqueNodeNames()) + model = model.transform(PrepareCppSim()) + model = model.transform(CompileCppSim()) + + # Load the verification input/output pair + inp = np.load(cfg.verify_input_npy) # noqa + out = np.load(cfg.verify_expected_output_npy) + + # Path to the parent model wrapping the streaming dataflow partition and the + # wrapped child model, i.e., the inside of the streaming dataflow partition + parent = f"{cfg.output_dir}/intermediate_models/dataflow_parent.onnx" + child = f"{cfg.output_dir}/intermediate_models/verify_cppsim.onnx" + # Save the child model prepared for C++ simulation + model.save(child) + # Load the parent model to pass to verification execution + parent_model = ModelWrapper(parent) + + # Reshape the input/output to match the model + inp = inp.reshape(parent_model.get_tensor_shape(model.graph.input[0].name)) + out = out.reshape(parent_model.get_tensor_shape(model.graph.output[0].name)) + + # Execute the onnx model to collect the result + # context = execute_onnx(model, context, return_full_exec_context=True) + context = execute_parent(parent, child, inp, return_full_ctx=True) + # Extract the output tensor from the execution context + model_out = context[parent_model.graph.output[0].name] + # Compare input to output + result = {True: "SUCCESS", False: "FAIL"}[np.allclose(out, model_out)] + # Save the verification outputs into the configured build directory + verification_output = f"{cfg.output_dir}/verification_output/" + # Save the verification execution context + np.savez(f"{verification_output}/verify_cppsim_{result}.npz", **context) + # Return the original, unmodified model + return original + + +# Runs a node-by-node RTL simulation of the model saving the fill execution +# context +def node_by_node_rtlsim(model: ModelWrapper, cfg: DataflowBuildConfig): + # Save the original model + original = model + # Copy the model + model = copy.deepcopy(model) + # Set model execution mode to RTL simulation + model = 
model.transform(SetExecMode("rtlsim")) + # Generates the C++ source and compiles the RTL simulation + model = model.transform(GiveUniqueNodeNames()) + model = model.transform(PrepareIP(cfg._resolve_fpga_part(), cfg.synth_clk_period_ns)) # noqa + model = model.transform(HLSSynthIP()) + model = model.transform(PrepareRTLSim()) + + # Load the verification input/output pair + inp = np.load(cfg.verify_input_npy) # noqa + out = np.load(cfg.verify_expected_output_npy) + + # Path to the parent model wrapping the streaming dataflow partition and the + # wrapped child model, i.e., the inside of the streaming dataflow partition + parent = f"{cfg.output_dir}/intermediate_models/dataflow_parent.onnx" + child = f"{cfg.output_dir}/intermediate_models/verify_rtlsim.onnx" + # Save the child model prepared for RTL simulation + model.save(child) + # Load the parent model to pass to verification execution + parent_model = ModelWrapper(parent) + + # Reshape the input/output to match the model + inp = inp.reshape(parent_model.get_tensor_shape(model.graph.input[0].name)) + out = out.reshape(parent_model.get_tensor_shape(model.graph.output[0].name)) + + # Execute the onnx model to collect the result + # context = execute_onnx(model, context, return_full_exec_context=True) + context = execute_parent(parent, child, inp, return_full_ctx=True) + # Extract the output tensor from the execution context + model_out = context[parent_model.graph.output[0].name] + # Compare input to output + result = {True: "SUCCESS", False: "FAIL"}[np.allclose(out, model_out)] + # Save the verification outputs into the configured build directory + verification_output = f"{cfg.output_dir}/verification_output/" + # Save the verification execution context + np.savez(f"{verification_output}/verify_rtlsim_{result}.npz", **context) + # Return the original, unmodified model + return original diff --git a/src/finn/core/rtlsim_exec.py b/src/finn/core/rtlsim_exec.py index 61f2762039..46616599cb 100644 --- 
a/src/finn/core/rtlsim_exec.py +++ b/src/finn/core/rtlsim_exec.py @@ -28,8 +28,8 @@ import numpy as np import os -import sys from qonnx.custom_op.registry import getCustomOp +from subprocess import CalledProcessError from finn.util.basic import ( get_liveness_threshold_cycles, @@ -39,6 +39,7 @@ ) from finn.util.data_packing import npy_to_rtlsim_input, rtlsim_output_to_npy from finn.util.deps import get_deps_path +from finn.util.exception import FINNError from finn.util.logging import log try: @@ -294,11 +295,12 @@ def rtlsim_exec_cppxsi( # write compilation command to a file for easy re-running/debugging with open(sim_base + "/compile_rtlsim.sh", "w") as f: f.write(" ".join(build_cmd)) - stdout, stderr = launch_process_helper(build_cmd, cwd=sim_base) + try: + launch_process_helper(build_cmd, cwd=sim_base, print_stdout=False) + except CalledProcessError: + raise FINNError("Failed to compile rtlsim executable") if not os.path.isfile(sim_base + "/rtlsim_xsi"): - print(stdout) - print(stderr, file=sys.stderr) - raise RuntimeError("Failed to compile rtlsim executable") + raise FINNError("Failed to compile rtlsim executable") # launch the rtlsim executable # important to specify LD_LIBRARY_PATH here for XSI to work correctly diff --git a/src/finn/custom_op/fpgadataflow/hls/streamingfifo_hls.py b/src/finn/custom_op/fpgadataflow/hls/streamingfifo_hls.py index f17bc48fc6..e7d02a4915 100644 --- a/src/finn/custom_op/fpgadataflow/hls/streamingfifo_hls.py +++ b/src/finn/custom_op/fpgadataflow/hls/streamingfifo_hls.py @@ -66,7 +66,7 @@ def strm_decl(self): ) ) self.code_gen_dict["$STREAMDECLARATIONS$"].append( - 'hls::stream> out_{} ("out_{}");'.format( + 'hls::stream> out0_{} ("out0_{}");'.format( self.get_outstream_width(), self.hls_sname(), self.hls_sname() ) ) @@ -88,7 +88,7 @@ def docompute(self): VirtualFIFO(in_fifo, out_fifo, mode, depth, occupancy, max_occupancy); // FIFO -> AXI-Stream - move(out_fifo, out_%s); + move(out_fifo, out0_%s); """ % (self.hls_sname(), 
self.hls_sname()) ] @@ -99,7 +99,7 @@ def blackboxfunction(self): out_packed_bits = self.get_outstream_width() out_packed_hls_type = "ap_uint<%d>" % out_packed_bits self.code_gen_dict["$BLACKBOXFUNCTION$"] = [ - """void %s(hls::stream<%s > &in0_%s, hls::stream<%s > &out_%s, ap_uint<32> mode, + """void %s(hls::stream<%s > &in0_%s, hls::stream<%s > &out0_%s, ap_uint<32> mode, ap_uint<32> depth, ap_uint<32> &occupancy, ap_uint<32> &max_occupancy)""" % ( self.onnx_node.name, @@ -115,7 +115,7 @@ def pragmas(self): "#pragma HLS INTERFACE axis port=in0_" + self.hls_sname() ] self.code_gen_dict["$PRAGMAS$"].append( - "#pragma HLS INTERFACE axis port=out_" + self.hls_sname() + "#pragma HLS INTERFACE axis port=out0_" + self.hls_sname() ) self.code_gen_dict["$PRAGMAS$"].append("#pragma HLS INTERFACE s_axilite port=mode") self.code_gen_dict["$PRAGMAS$"].append("#pragma HLS INTERFACE s_axilite port=depth") diff --git a/src/finn/interface/run_finn.py b/src/finn/interface/run_finn.py index ca5faef96d..a01b70bfb4 100644 --- a/src/finn/interface/run_finn.py +++ b/src/finn/interface/run_finn.py @@ -260,6 +260,30 @@ def run(dependency_path: str, build_path: str, num_workers: int, script: str) -> ) +@click.command(help="Run a given benchmark configuration.") +@click.option("--bench_config", help="Name or path of experiment configuration file", required=True) +@click.option("--dependency-path", "-d", default="") +@click.option("--num-workers", "-n", default=-1, show_default=True) +@click.option( + "--build-path", + "-b", + help="Specify a build temp path of your choice", + default="", +) +def bench(bench_config: str, dependency_path: str, num_workers: int, build_path: str) -> None: + console = Console() + build_dir = Path(build_path).expanduser() if build_path != "" else None + dep_path = Path(dependency_path).expanduser() if dependency_path != "" else None + prepare_finn(dep_path, Path(), build_dir, num_workers) + console.rule("RUNNING BENCHMARK") + + # Late import because we need 
prepare_finn to setup remaining dependencies first + from finn.benchmarking.bench import start_bench_run + + exit_code = start_bench_run(bench_config) + sys.exit(exit_code) + + @click.command(help="Run a given test. Uses /tmp/FINN_TMP as the temporary file location") @click.option( "--variant", @@ -385,6 +409,7 @@ def main() -> None: main_group.add_command(config) main_group.add_command(deps) main_group.add_command(build) + main_group.add_command(bench) main_group.add_command(test) main_group.add_command(run) main_group() diff --git a/src/finn/qnn-data/templates/driver/validate.py b/src/finn/qnn-data/templates/driver/validate.py index c8bc1c009d..0e2bc27114 100644 --- a/src/finn/qnn-data/templates/driver/validate.py +++ b/src/finn/qnn-data/templates/driver/validate.py @@ -27,9 +27,69 @@ # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. import argparse +import json import numpy as np +import os +from dataset_loading import FileQueue, ImgQueue from driver import io_shape_dict from driver_base import FINNExampleOverlay +from PIL import Image +from pynq import PL + + +def img_resize(img, size): + w, h = img.size + if (w <= h and w == size) or (h <= w and h == size): + return img + if w < h: + ow = size + oh = int(size * h / w) + return img.resize((ow, oh), Image.BILINEAR) + else: + oh = size + ow = int(size * w / h) + return img.resize((ow, oh), Image.BILINEAR) + + +def img_center_crop(img, size): + crop_height, crop_width = (size, size) + image_width, image_height = img.size + crop_top = int(round((image_height - crop_height) / 2.0)) + crop_left = int(round((image_width - crop_width) / 2.0)) + return img.crop((crop_left, crop_top, crop_left + crop_width, crop_top + crop_height)) + + +def pre_process(img_np): + img = Image.fromarray(img_np.astype(np.uint8)) + img = img_resize(img, 256) + img = img_center_crop(img, 224) + img = np.array(img, dtype=np.uint8) + return img + + +def setup_dataloader(val_path, label_file_path=None, batch_size=100, 
n_images=50000): + if label_file_path is None: + val_folders = [f.name for f in os.scandir(val_path) if f.is_dir()] + val_folders = sorted(val_folders) + assert len(val_folders) == 1000, "Expected 1000 subfolders in ILSVRC2012 val" + files = [] + labels = [] + for idx, folder in enumerate(val_folders): + current_files = sorted(os.listdir(os.path.join(val_path, folder))) + current_files = [os.path.join(folder, file) for file in current_files] + files.extend(current_files) + labels.extend([idx] * len(current_files)) + files = files[:n_images] + else: + files = ["ILSVRC2012_val_{:08d}.JPEG".format(i) for i in range(1, n_images + 1)] + labels = np.loadtxt(label_file_path, dtype=int, usecols=1) + + file_queue = FileQueue() + file_queue.load_epochs(list(zip(files, labels)), shuffle=False) + img_queue = ImgQueue(maxsize=batch_size) + img_queue.start_loaders(file_queue, num_threads=1, img_dir=val_path, transform=pre_process) + return img_queue + if __name__ == "__main__": parser = argparse.ArgumentParser( @@ -38,7 +98,9 @@ parser.add_argument( "--batchsize", help="number of samples for inference", type=int, default=100 ) - parser.add_argument("--dataset", help="dataset to use (mnist of cifar10)", required=True) + parser.add_argument( + "--dataset", help="dataset to use (mnist, cifar10, cifar100, imagenet)", default="" + ) parser.add_argument( "--platform", help="Target platform: zynq-iodma alveo", default="zynq-iodma" ) @@ -48,14 +110,43 @@ parser.add_argument( "--dataset_root", help="dataset root dir for download/reuse", default="/tmp" ) + parser.add_argument( + "--reportfile", + help="Name of output .json report file", + type=str, + default="validation.json", + ) + parser.add_argument( + "--settingsfile", help="Name of optional input .json settings file", type=str, default="" + ) # parse arguments args = parser.parse_args() bsize = args.batchsize dataset = args.dataset bitfile = args.bitfile platform = args.platform + reportfile = args.reportfile + settingsfile = 
args.settingsfile dataset_root = args.dataset_root + # overwrite settings if specified in settings file + if settingsfile != "": + with open(settingsfile, "r") as f: + settings = json.load(f) + if "validation_dataset" in settings: + dataset = settings["validation_dataset"] + + # program FPGA and load driver + PL.reset() # reset PYNQ cache + driver = FINNExampleOverlay( + bitfile_name=bitfile, + platform=platform, + io_shape_dict=io_shape_dict, + batch_size=bsize, + runtime_weight_dir="runtime_weights/", + ) + + # prepare dataset if dataset == "mnist": from dataset_loading import mnist @@ -68,40 +159,73 @@ trainx, trainy, testx, testy, valx, valy = cifar.load_cifar_data( dataset_root, download=True, one_hot=False ) + elif dataset == "cifar100": + from dataset_loading import cifar + + trainx, trainy, testx, testy, valx, valy = cifar.load_cifar_data( + dataset_root, download=True, one_hot=False, cifar10=False + ) + elif dataset == "imagenet": + val_dir = dataset_root + "/ImageNet/2012/val" + label_file = dataset_root + "/ImageNet/2012/val.txt" + img_queue = setup_dataloader(val_dir, label_file, bsize) + total = 50000 else: raise Exception("Unrecognized dataset") - test_imgs = testx - test_labels = testy - - ok = 0 - nok = 0 - total = test_imgs.shape[0] + # run accelerator on dataset + if dataset in ["mnist", "cifar10", "cifar100"]: + test_imgs = testx + test_labels = testy - driver = FINNExampleOverlay( - bitfile_name=bitfile, - platform=platform, - io_shape_dict=io_shape_dict, - batch_size=bsize, - runtime_weight_dir="runtime_weights/", - ) + ok = 0 + nok = 0 + total = test_imgs.shape[0] - n_batches = int(total / bsize) + n_batches = int(total / bsize) - test_imgs = test_imgs.reshape(n_batches, bsize, -1) - test_labels = test_labels.reshape(n_batches, bsize) + test_imgs = test_imgs.reshape(n_batches, bsize, -1) + test_labels = test_labels.reshape(n_batches, bsize) - for i in range(n_batches): - ibuf_normal = test_imgs[i].reshape(driver.ibuf_packed_device[0].shape) - 
exp = test_labels[i] - driver.copy_input_data_to_device(ibuf_normal) - driver.execute_on_buffers() - obuf_normal = np.empty_like(driver.obuf_packed_device[0]) - driver.copy_output_data_from_device(obuf_normal) - ret = np.bincount(obuf_normal.flatten() == exp.flatten()) - nok += ret[0] - ok += ret[1] - print("batch %d / %d : total OK %d NOK %d" % (i + 1, n_batches, ok, nok)) + print("Starting validation..") + for i in range(n_batches): + ibuf_normal = test_imgs[i].reshape(driver.ishape_normal()) + exp = test_labels[i] + obuf_normal = driver.execute(ibuf_normal) + # obuf_normal = obuf_normal.reshape(bsize, -1)[:,0] + if obuf_normal.shape[1] > 1: + obuf_normal = np.argmax(obuf_normal, axis=1) + ret = np.bincount(obuf_normal.flatten() == exp.flatten(), minlength=2) + nok += ret[0] + ok += ret[1] + print("batch %d / %d : total OK %d NOK %d" % (i + 1, n_batches, ok, nok)) + elif dataset in ["imagenet"]: + ok = 0 + nok = 0 + i = 0 + print("Starting validation..") + while not img_queue.last_batch: + imgs, lbls = img_queue.get_batch(bsize, timeout=None) + imgs = np.array(imgs) + exp = np.array(lbls) + ibuf_normal = imgs.reshape(driver.ishape_normal()) + obuf_normal = driver.execute(ibuf_normal) + # obuf_normal = obuf_normal.reshape(bsize, -1)[:,0] + if obuf_normal.shape[1] > 1: + obuf_normal = np.argmax(obuf_normal, axis=1) + ret = np.bincount(obuf_normal.flatten() == exp.flatten(), minlength=2) + nok += ret[0] + ok += ret[1] + i += 1 + print("batch %d : total OK %d NOK %d" % (i, ok, nok)) + # calculate top-1 accuracy acc = 100.0 * ok / (total) print("Final accuracy: %f" % acc) + + # write report to file + report = { + "top-1_accuracy": acc, + } + with open(reportfile, "w") as f: + json.dump(report, f, indent=2) diff --git a/src/finn/transformation/fpgadataflow/create_stitched_ip.py b/src/finn/transformation/fpgadataflow/create_stitched_ip.py index 7a8d38182d..39bed71c82 100644 --- a/src/finn/transformation/fpgadataflow/create_stitched_ip.py +++ 
b/src/finn/transformation/fpgadataflow/create_stitched_ip.py @@ -30,14 +30,15 @@ import json import multiprocessing as mp import os -import subprocess from qonnx.custom_op.registry import getCustomOp from qonnx.transformation.base import Transformation from qonnx.util.basic import get_num_default_workers from shutil import copytree +from subprocess import CalledProcessError from finn.transformation.fpgadataflow.replace_verilog_relpaths import ReplaceVerilogRelPaths -from finn.util.basic import make_build_dir +from finn.util.basic import launch_process_helper, make_build_dir +from finn.util.exception import FINNError from finn.util.fpgadataflow import is_hls_node, is_rtl_node from finn.util.logging import log @@ -633,14 +634,12 @@ def apply(self, model): f.write("vivado -mode batch -source make_project.tcl\n") f.write("cd {}\n".format(working_dir)) bash_command = ["bash", make_project_sh] - process_compile = subprocess.Popen( - bash_command, stdout=subprocess.PIPE, stderr=subprocess.PIPE - ) - (_, stderr_data) = process_compile.communicate() - stderr_stripped = stderr_data.decode().strip() - if stderr_stripped != "" and stderr_stripped is not None: - log.critical(stderr_stripped) # Decode bytes and log as critical + try: + launch_process_helper(bash_command, print_stdout=False) + except CalledProcessError: + # Check success manually by looking for wrapper HDL + pass # wrapper may be created in different location depending on Vivado version if not os.path.isfile(wrapper_filename): @@ -649,7 +648,7 @@ def apply(self, model): if os.path.isfile(wrapper_filename_alt): model.set_metadata_prop("wrapper_filename", wrapper_filename_alt) else: - raise Exception( + raise FINNError( """CreateStitchedIP failed, no wrapper HDL found under %s or %s. 
Please check logs under the parent directory.""" % (wrapper_filename, wrapper_filename_alt) diff --git a/src/finn/transformation/fpgadataflow/make_driver.py b/src/finn/transformation/fpgadataflow/make_driver.py index e58c33906c..4b1e70369b 100644 --- a/src/finn/transformation/fpgadataflow/make_driver.py +++ b/src/finn/transformation/fpgadataflow/make_driver.py @@ -312,9 +312,10 @@ class MakePYNQDriverIODMA(Transformation): under the runtime_weights/ subfolder of the pynq_driver_dir. """ - def __init__(self, platform): + def __init__(self, platform, validation_datset=None): super().__init__() self.platform = platform + self.validation_datset = validation_datset def apply(self, model): # create a temporary folder for the generated driver @@ -428,8 +429,16 @@ def apply(self, model): ) shutil.copy(validate_template, validate_py) - # generate weight files for runtime-writable layers + # generate settings.json for generated driver + if self.validation_datset is not None: + settings = { + "validation_dataset": self.validation_datset, + } + settingsfile = pynq_driver_dir + "/settings.json" + with open(settingsfile, "w") as f: + json.dump(settings, f, indent=2) + # generate weight files for runtime-writable layers for sdp_ind, sdp_node in enumerate(model.graph.node): assert sdp_node.op_type == "StreamingDataflowPartition" # get dataflow model diff --git a/src/finn/transformation/fpgadataflow/make_zynq_proj.py b/src/finn/transformation/fpgadataflow/make_zynq_proj.py index 59d4293323..e280fba016 100644 --- a/src/finn/transformation/fpgadataflow/make_zynq_proj.py +++ b/src/finn/transformation/fpgadataflow/make_zynq_proj.py @@ -29,13 +29,13 @@ import math import os -import subprocess from qonnx.core.modelwrapper import ModelWrapper from qonnx.custom_op.registry import getCustomOp from qonnx.transformation.base import Transformation from qonnx.transformation.general import GiveReadableTensorNames, GiveUniqueNodeNames from qonnx.transformation.infer_data_layouts import 
InferDataLayouts from shutil import copy +from subprocess import CalledProcessError from finn.transformation.fpgadataflow.create_dataflow_partition import CreateDataflowPartition from finn.transformation.fpgadataflow.create_stitched_ip import CreateStitchedIP @@ -47,9 +47,14 @@ from finn.transformation.fpgadataflow.instrumentation import GenerateInstrumentationIP from finn.transformation.fpgadataflow.prepare_ip import PrepareIP from finn.transformation.fpgadataflow.specialize_layers import SpecializeLayers -from finn.util.basic import make_build_dir, pynq_native_port_width, pynq_part_map +from finn.util.basic import ( + launch_process_helper, + make_build_dir, + pynq_native_port_width, + pynq_part_map, +) from finn.util.deps import get_deps_path -from finn.util.logging import log +from finn.util.exception import FINNError from . import templates @@ -399,16 +404,15 @@ def apply(self, model): # call the synthesis script bash_command = ["bash", synth_project_sh] - process_compile = subprocess.Popen( - bash_command, stdout=subprocess.PIPE, stderr=subprocess.PIPE - ) - _, stderr_data = process_compile.communicate() - stderr_stripped = stderr_data.decode().strip() - if stderr_stripped != "" and stderr_stripped is not None: - log.critical(stderr_stripped) # Decode bytes and log as critical + try: + launch_process_helper(bash_command, print_stdout=False) + except CalledProcessError: + # Check success manually by looking for bitfile + pass + bitfile_name = vivado_pynq_proj_dir + "/finn_zynq_link.runs/impl_1/top_wrapper.bit" if not os.path.isfile(bitfile_name): - raise Exception( + raise FINNError( "Synthesis failed, no bitfile found. 
Check logs under %s" % vivado_pynq_proj_dir ) deploy_bitfile_name = vivado_pynq_proj_dir + "/resizer.bit" diff --git a/src/finn/transformation/fpgadataflow/vitis_build.py b/src/finn/transformation/fpgadataflow/vitis_build.py index 222c9c2336..1c5a5eff91 100644 --- a/src/finn/transformation/fpgadataflow/vitis_build.py +++ b/src/finn/transformation/fpgadataflow/vitis_build.py @@ -29,7 +29,6 @@ import json import os -import subprocess from qonnx.core.modelwrapper import ModelWrapper from qonnx.custom_op.registry import getCustomOp from qonnx.transformation.base import Transformation @@ -38,6 +37,7 @@ GiveUniqueNodeNames, RemoveUnusedTensors, ) +from subprocess import CalledProcessError from finn.builder.build_dataflow_config import FpgaMemoryType, VitisOptStrategy from finn.transformation.fpgadataflow.create_dataflow_partition import CreateDataflowPartition @@ -49,8 +49,8 @@ from finn.transformation.fpgadataflow.insert_iodma import InsertIODMA from finn.transformation.fpgadataflow.prepare_ip import PrepareIP from finn.transformation.fpgadataflow.specialize_layers import SpecializeLayers -from finn.util.basic import make_build_dir -from finn.util.logging import log +from finn.util.basic import launch_process_helper, make_build_dir +from finn.util.exception import FINNError from . 
import templates @@ -142,16 +142,14 @@ def apply(self, model): f.write("vivado -mode batch -source gen_xo.tcl\n") f.write("cd {}\n".format(working_dir)) bash_command = ["bash", package_xo_sh] - process_compile = subprocess.Popen( - bash_command, stdout=subprocess.PIPE, stderr=subprocess.PIPE - ) - _, stderr_data = process_compile.communicate() - stderr_stripped = stderr_data.decode().strip() - if stderr_stripped != "" and stderr_stripped is not None: - log.critical(stderr_stripped) # Decode bytes and log as critical - assert os.path.isfile(xo_path), ( - "Vitis .xo file not created, check logs under %s" % vivado_proj_dir - ) + try: + launch_process_helper(bash_command, print_stdout=False) + except CalledProcessError: + # Check success manually by looking for .xo file + pass + if not os.path.isfile(xo_path): + raise FINNError("Vitis .xo file not created, check logs under %s" % vivado_proj_dir) + return (model, False) @@ -327,18 +325,17 @@ def apply(self, model): ) f.write("cd {}\n".format(working_dir)) bash_command = ["bash", script] - process_compile = subprocess.Popen( - bash_command, stdout=subprocess.PIPE, stderr=subprocess.PIPE - ) - _, stderr_data = process_compile.communicate() - stderr_stripped = stderr_data.decode().strip() - if stderr_stripped != "" and stderr_stripped is not None: - log.critical(stderr_stripped) # Decode bytes and log as critical - # TODO rename xclbin appropriately here? + + try: + launch_process_helper(bash_command, print_stdout=False) + except CalledProcessError: + # Check success manually by looking for .xo file + pass xclbin = link_dir + "/a.xclbin" - assert os.path.isfile(xclbin), ( - "Vitis .xclbin file not created, check logs under %s" % link_dir - ) + if not os.path.isfile(xclbin): + raise FINNError("Vitis .xclbin file not created, check logs under %s" % link_dir) + + # TODO rename xclbin appropriately here? 
model.set_metadata_prop("bitfile", xclbin) # run Vivado to gen xml report @@ -350,13 +347,7 @@ def apply(self, model): f.write("vivado -mode batch -source %s\n" % (link_dir + "/gen_report_xml.tcl")) f.write("cd {}\n".format(working_dir)) bash_command = ["bash", gen_rep_xml_sh] - process_genxml = subprocess.Popen( - bash_command, stdout=subprocess.PIPE, stderr=subprocess.PIPE - ) - _, stderr_data = process_genxml.communicate() - stderr_stripped = stderr_data.decode().strip() - if stderr_stripped != "" and stderr_stripped is not None: - log.critical(stderr_stripped) # Decode bytes and log as critical + launch_process_helper(bash_command, print_stdout=False) # filename for the synth utilization report synth_report_filename = link_dir + "/synth_report.xml" model.set_metadata_prop("vivado_synth_rpt", synth_report_filename) diff --git a/src/finn/util/basic.py b/src/finn/util/basic.py index acb8bb1303..7f7e658146 100644 --- a/src/finn/util/basic.py +++ b/src/finn/util/basic.py @@ -151,6 +151,65 @@ def make_build_dir(prefix: str = "", return_as_path: bool = False) -> str | Path return str(tmpdir) +def launch_process_helper(args, proc_env=None, cwd=None, print_stdout=True): + """Helper function to launch a process in a way that facilitates logging + stdout/stderr with Python loggers. 
+ Returns (cmd_out, cmd_err) if successful, raises CalledProcessError otherwise.""" + process = subprocess.run(args, capture_output=True, env=proc_env, cwd=cwd, text=True) + cmd_out = process.stdout.strip() + cmd_err = process.stderr.strip() + + # Handle stdout + if cmd_out: + if print_stdout is True: + log.info(cmd_out) + else: + # Print with DEBUG level regardless + log.debug(cmd_out) + + # Handle stderr, depending on return code + if process.returncode == 0: + # Process completed successfully, log stderr only as WARNING + if cmd_err: + log.warning(cmd_err) + else: + # Process failed, log stderr as ERROR + if cmd_err: + log.error(cmd_err) + + # Log additional ERROR message + if isinstance(args, list): + cmd = " ".join(args) + else: + cmd = args + log.error(f"Launched process returned non-zero exit code ({process.returncode}): {cmd}") + + # Raise CalledProcessError for non-zero return code + process.check_returncode() + return (cmd_out, cmd_err) + + +def which(program): + "Python equivalent of the shell cmd 'which'." 
+ + # source: + # https://stackoverflow.com/questions/377017/test-if-executable-exists-in-python + def is_exe(fpath): + return os.path.isfile(fpath) and os.access(fpath, os.X_OK) + + fpath, fname = os.path.split(program) + if fpath: + if is_exe(program): + return program + else: + for path in os.environ["PATH"].split(os.pathsep): + exe_file = os.path.join(path, program) + if is_exe(exe_file): + return exe_file + + return None + + class CppBuilder: """Builds the g++ compiler command to produces the executable of the c++ code in code_gen_dir which is passed to the function build() of this class.""" @@ -194,50 +253,7 @@ def build(self, code_gen_dir): f.write("#!/bin/bash \n") f.write(bash_compile + "\n") bash_command = ["bash", self.compile_script] - process_compile = subprocess.Popen( - bash_command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True - ) - _, stderr_data = process_compile.communicate() - if stderr_data.strip(): - log.critical(stderr_data.strip()) # Decode bytes and log as critical - - -def launch_process_helper(args, proc_env=None, cwd=None, print_stdout=True): - """Helper function to launch a process in a way that facilitates logging - stdout/stderr with Python loggers. - Returns (cmd_out, cmd_err).""" - if proc_env is None: - proc_env = os.environ.copy() - with subprocess.Popen( - args, stdout=subprocess.PIPE, stderr=subprocess.PIPE, env=proc_env, cwd=cwd, text=True - ) as proc: - (cmd_out, cmd_err) = proc.communicate() - if cmd_out.strip() and print_stdout is True: - log.info(cmd_out.strip()) - if cmd_err.strip(): - log.critical(cmd_err.strip()) - return (cmd_out, cmd_err) - - -def which(program): - "Python equivalent of the shell cmd 'which'." 
- - # source: - # https://stackoverflow.com/questions/377017/test-if-executable-exists-in-python - def is_exe(fpath): - return os.path.isfile(fpath) and os.access(fpath, os.X_OK) - - fpath, fname = os.path.split(program) - if fpath: - if is_exe(program): - return program - else: - for path in os.environ["PATH"].split(os.pathsep): - exe_file = os.path.join(path, program) - if is_exe(exe_file): - return exe_file - - return None + launch_process_helper(bash_command, print_stdout=False) mem_primitives_versal = { diff --git a/src/finn/util/hls.py b/src/finn/util/hls.py index b1b88dbafe..dc153c0f52 100644 --- a/src/finn/util/hls.py +++ b/src/finn/util/hls.py @@ -27,10 +27,7 @@ # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. import os -import subprocess - -from finn.util.basic import which -from finn.util.logging import log +from finn.util.basic import launch_process_helper, which class CallHLS: @@ -65,10 +62,4 @@ def build(self, code_gen_dir): f.write("cd {}\n".format(working_dir)) f.close() bash_command = ["bash", self.ipgen_script] - process_compile = subprocess.Popen( - bash_command, stdout=subprocess.PIPE, stderr=subprocess.PIPE - ) - _, stderr_data = process_compile.communicate() - stderr_stripped = stderr_data.decode().strip() - if stderr_stripped != "" and stderr_stripped is not None: - log.critical(stderr_stripped) # Decode bytes and log as critical + launch_process_helper(bash_command, print_stdout=False) diff --git a/tests/end2end/test_end2end_cybsec_mlp.py b/tests/end2end/test_end2end_cybsec_mlp.py index 4770066117..cf75fd273b 100644 --- a/tests/end2end/test_end2end_cybsec_mlp.py +++ b/tests/end2end/test_end2end_cybsec_mlp.py @@ -165,8 +165,8 @@ def test_end2end_cybsec_mlp_build(self): ) build.build_dataflow_cfg(model_file, cfg) # check the generated files - assert os.path.isfile(output_dir + "/time_per_step.json") - assert os.path.isfile(output_dir + "/final_hw_config.json") + assert os.path.isfile(output_dir + 
"/report/time_per_step.json") + assert os.path.isfile(output_dir + "/report/final_hw_config.json") assert os.path.isfile(output_dir + "/template_specialize_layers_config.json") assert os.path.isfile(output_dir + "/driver/driver.py") est_cycles_report = output_dir + "/report/estimate_layer_cycles.json" diff --git a/tests/fpgadataflow/test_fifosizing.py b/tests/fpgadataflow/test_fifosizing.py index bb89e8ab84..97686235d0 100644 --- a/tests/fpgadataflow/test_fifosizing.py +++ b/tests/fpgadataflow/test_fifosizing.py @@ -95,7 +95,7 @@ def test_fifosizing_linear(method, topology): cfg_cmp.auto_fifo_depths = False cfg_cmp.target_fps = None cfg_cmp.generate_outputs = [build_cfg.DataflowOutputType.STITCHED_IP] - cfg_cmp.folding_config_file = tmp_output_dir + "/final_hw_config.json" + cfg_cmp.folding_config_file = tmp_output_dir + "/report/final_hw_config.json" build.build_dataflow_cfg(tmp_output_dir_cmp + "/model.onnx", cfg_cmp) model0 = ModelWrapper(tmp_output_dir + "/intermediate_models/step_create_stitched_ip.onnx") diff --git a/tests/util/test_build_dataflow.py b/tests/util/test_build_dataflow.py index 18f574bc8f..65d1942bed 100644 --- a/tests/util/test_build_dataflow.py +++ b/tests/util/test_build_dataflow.py @@ -48,9 +48,9 @@ def test_end2end_build_dataflow_directory(): build_dataflow_directory(target_dir) # check the generated files output_dir = target_dir + "/output_tfc_w1a1_Pynq-Z1" - assert os.path.isfile(output_dir + "/time_per_step.json") - assert os.path.isfile(output_dir + "/auto_folding_config.json") - assert os.path.isfile(output_dir + "/final_hw_config.json") + assert os.path.isfile(output_dir + "/report/time_per_step.json") + assert os.path.isfile(output_dir + "/report/auto_folding_config.json") + assert os.path.isfile(output_dir + "/report/final_hw_config.json") assert os.path.isfile(output_dir + "/template_specialize_layers_config.json") assert os.path.isfile(output_dir + "/stitched_ip/ip/component.xml") assert os.path.isfile(output_dir + 
"/driver/driver.py")