diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index ebdad54bee..a2f9527976 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -93,11 +93,11 @@ Sync finn-dev: .setup_venv_from_whl: &setup_venv_from_whl # Move everything to working directory (e.g., RAMdisk) - - cp -dfR .. $PATH_WORKDIR + - cp -dfR . $PATH_WORKDIR - cd $PATH_WORKDIR # Create fresh virtual environment and install finn-plus from .whl (artifact) - python3 -m venv finn-plus-venv - - finn-plus-venv/bin/pip install ./finn-plus/dist/*.whl + - finn-plus-venv/bin/pip install dist/*.whl Build: id_tokens: @@ -171,8 +171,8 @@ FINN Test Suite 2022.2: - $JOB_MONITORING_DIR/monitor.sh $JOB_MONITORING_DIR/$CI_PIPELINE_ID/$HOSTNAME.log & # Launch FINN via test command, includes preparation of (cached) dependencies - | - source ./finn-plus-venv/bin/activate - finn test --variant $TEST_SUITE --dependency-path ./finn-plus/deps --build-path $FINN_BUILD_DIR --num-workers 1 --num-test-workers $PYTEST_PARALLEL + source finn-plus-venv/bin/activate + finn test --variant $TEST_SUITE --dependency-path ./deps --build-path $FINN_BUILD_DIR --num-workers 1 --num-test-workers $PYTEST_PARALLEL artifacts: name: "test_reports" when: always diff --git a/custom_hls/instrumentation.template.cpp b/custom_hls/instrumentation.template.cpp new file mode 100644 index 0000000000..bf15d77a87 --- /dev/null +++ b/custom_hls/instrumentation.template.cpp @@ -0,0 +1,307 @@ +/****************************************************************************** + * Copyright (c) 2023, Xilinx, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, + * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION). HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, + * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR + * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF + * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + ******************************************************************************* + * @brief Instrumentation wrapper module for FINN IP characterization. + * @author Thomas B. Preusser + * @details + * Instrumentation wrapper intercepting the feature map input to and + * the feature map output from a FINN IP to measure processing latency and + * initiation interval in terms of clock cycles. The most recent readings + * are exposed via AXI-light. + * This wrapper can run the FINN IP detached from an external data source + * and sink by feeding LFSR-generated data and sinking the output without + * backpressure. + * This module is currently not integrated with the FINN compiler. 
It must + * be instantiated and integrated with the rest of the system in a manual + * process. + * + * @param PENDING maximum number of feature maps in the FINN dataflow pipeline + * @param ILEN number of input transactions per IFM + * @param OLEN number of output transactions per OFM + * @param KO number of subwords within output payload vector + * @param TI type of input payload vector + * @param TO type of output payload vector + *******************************************************************************/ + + #include + #include + #include + #include + + // Module Configuration + constexpr unsigned PENDING = @PENDING@; // Max. feature maps in flight + constexpr unsigned ILEN = @ILEN@; // Input words per IFM + constexpr unsigned OLEN = @OLEN@; // Output words per OFM + constexpr unsigned KO = @KO@; // Subwords within OFM transaction word + using TI = @TI@; // IFM transaction word + using TO = @TO@; // OFM transaction word + + //--------------------------------------------------------------------------- + // Utility Functions + static constexpr unsigned clog2 (unsigned x) { return x<2? 0 : 1+clog2((x+1)/2); } + static constexpr unsigned clog2nz(unsigned x) { return std::max(1u, clog2(x)); } + + template + static void move( + hls::stream &src, + hls::stream &dst + ) { + #pragma HLS pipeline II=1 style=flp + dst.write(src.read()); + } + + template + static void move( + hls::stream> &src, + hls::stream &dst + ) { + #pragma HLS pipeline II=1 style=flp + dst.write(src.read().data); + } + + template + class Payload { + public: + using type = T; + }; + template + class Payload> { + public: + using type = T; + }; + + /** + * Computes a checksum over a forwarded stream assumed to carry frames of + * N words further subdivided into K subwords. + * - Subword slicing can be customized typically by using a lambda. + * The provided DefaultSubwordSlicer assumes an `ap_(u)int`-like word + * type with a member `width` and a range-based slicing operator. 
It + * further assumes a little-endian arrangement of subwords within words + * for the canonical subword stream order. + * - Subwords wider than 23 bits are folded using bitwise XOR across + * slices of 23 bits starting from the LSB. + * - The folded subword values are weighted according to their position + * in the stream relative to the start of frame by a periodic weight + * sequence 1, 2, 3, ... + * - The weighted folded subword values are reduced to a checksum by an + * accumulation modulo 2^24. + * - A checksum is emitted for each completed frame. It is the concatenation + * of an 8-bit (modulo 256) frame counter and the 24-bit frame checksum. + */ + template + class DefaultSubwordSlicer { + static_assert(T::width%K == 0, "Word size must be subword multiple."); + static constexpr unsigned W = T::width/K; + public: + ap_uint operator()(T const &x, unsigned const j) const { + #pragma HLS inline + return x((j+1)*W-1, j*W); + } + }; + + //--------------------------------------------------------------------------- + // Instrumentation Core + template< + unsigned PENDING, + unsigned ILEN, + unsigned OLEN, + unsigned KO, + typename TI, + typename TO + > + void instrument( + hls::stream &finnix, + hls::stream &finnox, + ap_uint<32> cfg, // [0] - 0:hold, 1:lfsr; [31:16] - LFSR seed + ap_uint<32> &status, // [0] - timestamp overflow; [1] - timestamp underflow + ap_uint<32> &latency, + ap_uint<32> &interval, + ap_uint<32> &checksum, + ap_uint<32> &min_latency + ) { + #pragma HLS pipeline II=1 style=flp + + // Timestamp Management State + using clock_t = ap_uint<32>; + static clock_t cnt_clk = 0; + #pragma HLS reset variable=cnt_clk + hls::stream timestamps; + #pragma HLS stream variable=timestamps depth=PENDING + static bool timestamp_ovf = false; + static bool timestamp_unf = false; + #pragma HLS reset variable=timestamp_ovf + #pragma HLS reset variable=timestamp_unf + + // Input Feed & Generation + constexpr unsigned LFSR_WIDTH = (TI::width+15)/16 * 16; + static 
ap_uint icnt = 0; + static ap_uint lfsr; + #pragma HLS reset variable=icnt + #pragma HLS reset variable=lfsr off + if(!finnix.full()) { + + bool const first = icnt == 0; + bool wr; + if(first) { + // Start of new feature map + wr = cfg[0]; + for(unsigned i = 0; i < LFSR_WIDTH; i += 16) { + #pragma HLS unroll + lfsr(15+i, i) = cfg(31, 16) ^ (i>>4)*33331; + } + } + else { + // Advance LFSR + wr = true; + for(unsigned i = 0; i < LFSR_WIDTH; i += 16) { + #pragma HLS unroll + lfsr(15+i, i) = (lfsr(15+i, i) >> 1) ^ ap_uint<16>(lfsr[i]? 0 : 0x8805); + } + } + + if(wr) { + finnix.write_nb(lfsr); + if(first) timestamp_ovf |= !timestamps.write_nb(cnt_clk); + icnt = icnt == ILEN-1? decltype(icnt)(0) : decltype(icnt)(icnt + 1); + } + } + + // Output Tracking + static ap_uint ocnt = 0; + #pragma HLS reset variable=ocnt + static clock_t ts1 = 0; // last output timestamp + static clock_t last_latency = 0; + static clock_t last_interval = 0; + static clock_t cur_min_latency = ~0; + #pragma HLS reset variable=ts1 + #pragma HLS reset variable=last_latency + #pragma HLS reset variable=last_interval + #pragma HLS reset variable=cur_min_latency + + static ap_uint<8> pkts = 0; + #pragma HLS reset variable=pkts + static ap_uint< 2> coeff[3]; + static ap_uint<24> psum; + static ap_uint<32> last_checksum = 0; + #pragma HLS reset variable=coeff off + #pragma HLS reset variable=psum off + #pragma HLS reset variable=last_checksum + + TO oval; + if(finnox.read_nb(oval)) { + // Start of new output feature map + if(ocnt == 0) { + for(unsigned i = 0; i < 3; i++) coeff[i] = i+1; + psum = 0; + } + + // Update checksum + for(unsigned j = 0; j < KO; j++) { + #pragma HLS unroll + auto const v0 = DefaultSubwordSlicer()(oval, j); + constexpr unsigned W = 1 + (decltype(v0)::width-1)/23; + ap_uint v = v0; + ap_uint< 23> w = 0; + for(unsigned k = 0; k < W; k++) w ^= v(23*k+22, 23*k); + psum += (coeff[j%3][1]? (w, ap_uint<1>(0)) : ap_uint<24>(0)) + (coeff[j%3][0]? 
w : ap_uint<23>(0)); + } + + // Re-align coefficients + for(unsigned j = 0; j < 3; j++) { + #pragma HLS unroll + ap_uint<3> const cc = coeff[j] + ap_uint<3>(KO%3); + coeff[j] = cc(1, 0) + cc[2]; + } + + // Track frame position + if(ocnt != OLEN-1) ocnt++; + else { + clock_t ts0; + if(!timestamps.read_nb(ts0)) timestamp_unf = true; + else { + last_latency = cnt_clk - ts0; // completion - start + last_interval = cnt_clk - ts1; // completion - previous completion + cur_min_latency = std::min(cur_min_latency, last_latency); + ts1 = cnt_clk; // mark completion ^ + } + ocnt = 0; + + last_checksum = (pkts++, psum); + } + } + + // Advance Timestamp Counter + cnt_clk++; + + // Copy Status Outputs + status = timestamp_ovf | (timestamp_unf << 1); + latency = last_latency; + interval = last_interval; + checksum = last_checksum; + min_latency = cur_min_latency; + + } // instrument() + + void instrumentation_wrapper( + hls::stream &finnix, + hls::stream &finnox, + ap_uint<32> cfg, + ap_uint<32> &status, + ap_uint<32> &latency, + ap_uint<32> &interval, + ap_uint<32> &checksum, + ap_uint<32> &min_latency + ) { + #pragma HLS interface axis port=finnix + #pragma HLS interface axis port=finnox + #pragma HLS interface s_axilite bundle=ctrl port=cfg + #pragma HLS interface s_axilite bundle=ctrl port=status + #pragma HLS interface s_axilite bundle=ctrl port=latency + #pragma HLS interface s_axilite bundle=ctrl port=interval + #pragma HLS interface s_axilite bundle=ctrl port=checksum + #pragma HLS interface s_axilite bundle=ctrl port=min_latency + #pragma HLS interface ap_ctrl_none port=return + + #pragma HLS dataflow disable_start_propagation + static hls::stream finnix0; + static hls::stream::type> finnox0; + #pragma HLS stream variable=finnix0 depth=2 + #pragma HLS stream variable=finnox0 depth=2 + + // AXI-Stream -> FIFO + move(finnox, finnox0); + + // Main + instrument(finnix0, finnox0, cfg, status, latency, interval, checksum, min_latency); + + // FIFO -> AXI-Stream + move(finnix0, 
finnix); + + } // instrumentation_wrapper diff --git a/custom_hls/instrumentation_sim.template.tcl b/custom_hls/instrumentation_sim.template.tcl new file mode 100644 index 0000000000..4875d799e2 --- /dev/null +++ b/custom_hls/instrumentation_sim.template.tcl @@ -0,0 +1,67 @@ +# Copyright (c) 2023 Advanced Micro Devices, Inc. +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# * Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# * Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# * Neither the name of AMD nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +set fpga_part @FPGA_PART@ +#set output_root ".." 
+# path to IP folder for instrumentation wrapper, change as needed +#set instrwrp_ip_dir "$output_root/instrumentation_wrapper/project_instrwrap/sol1/impl/ip" +# path to IP folder for FINN IP, change as needed +#set finn_ip_dir "$output_root/stitched_ip/ip" + +create_project -force instr_sim_proj instr_sim_proj/ -part $fpga_part +create_bd_design "dut" +update_compile_order -fileset sources_1 +#set_property ip_repo_paths [list $instrwrp_ip_dir] [current_project] +set_property ip_repo_paths [concat [get_property ip_repo_paths [current_project]] @IP_DIRS_STR@] [current_project] +update_ip_catalog + + +create_bd_cell -type ip -vlnv xilinx_finn:finn:finn_design:1.0 finn_design_0 +create_bd_cell -type ip -vlnv xilinx.com:hls:instrumentation_wrapper:1.0 instrumentation_wrap_0 +connect_bd_intf_net [get_bd_intf_pins instrumentation_wrap_0/finnix] [get_bd_intf_pins finn_design_0/s_axis_0] +connect_bd_intf_net [get_bd_intf_pins finn_design_0/m_axis_0] [get_bd_intf_pins instrumentation_wrap_0/finnox] +make_bd_intf_pins_external [get_bd_intf_pins instrumentation_wrap_0/s_axi_ctrl] +make_bd_pins_external [get_bd_pins instrumentation_wrap_0/ap_clk] +make_bd_pins_external [get_bd_pins instrumentation_wrap_0/ap_rst_n] +connect_bd_net [get_bd_ports ap_clk_0] [get_bd_pins finn_design_0/ap_clk] +connect_bd_net [get_bd_ports ap_rst_n_0] [get_bd_pins finn_design_0/ap_rst_n] + +save_bd_design + +update_compile_order -fileset sources_1 +make_wrapper -files [get_files instr_sim_proj/instr_sim_proj.srcs/sources_1/bd/dut/dut.bd] -top +add_files -norecurse instr_sim_proj/instr_sim_proj.gen/sources_1/bd/dut/hdl/dut_wrapper.v + +set_property SOURCE_SET sources_1 [get_filesets sim_1] +add_files -fileset sim_1 ./instrwrap_testbench.sv +update_compile_order -fileset sim_1 + +set_property synth_checkpoint_mode None [get_files instr_sim_proj/instr_sim_proj.srcs/sources_1/bd/dut/dut.bd] +generate_target Simulation [get_files instr_sim_proj/instr_sim_proj.srcs/sources_1/bd/dut/dut.bd] 
+launch_simulation -simset sim_1 -mode behavioral +run all diff --git a/custom_hls/instrumentation_tb.template.sv b/custom_hls/instrumentation_tb.template.sv new file mode 100644 index 0000000000..933104c623 --- /dev/null +++ b/custom_hls/instrumentation_tb.template.sv @@ -0,0 +1,172 @@ +// Copyright (c) 2023 Advanced Micro Devices, Inc. +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// +// * Redistributions of source code must retain the above copyright notice, this +// list of conditions and the following disclaimer. +// +// * Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// +// * Neither the name of AMD nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +// DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +// FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +// DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +// SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ + +module tb #( + // sampling period (in cycles) for reading instrumentation wrapper registers + // TODO: make configurable or adjust automatically? + int unsigned INSTR_READ_PERIOD = 10000, + // 16-bit LFSR seed for generating fixed random data + int unsigned LFSR_SEED = 1 +)(); + + +// Clock & Reset +logic ap_clk = 0; +always #5ns ap_clk = !ap_clk; +logic ap_rst_n = 0; +uwire ap_rst = !ap_rst_n; + +// wires for instrumentation wrapper AXI lite interface +logic [31:0] axilite_ctrl_araddr = 'x; +uwire axilite_ctrl_arready; +logic axilite_ctrl_arvalid = 0; +logic [31:0] axilite_ctrl_awaddr = 'x; +uwire axilite_ctrl_awready; +logic axilite_ctrl_awvalid = 0; +uwire axilite_ctrl_bready = 1; +uwire [1:0]axilite_ctrl_bresp; +uwire axilite_ctrl_bvalid; +uwire [31:0]axilite_ctrl_rdata; +logic axilite_ctrl_rready = 1; +uwire [1:0]axilite_ctrl_rresp; +uwire axilite_ctrl_rvalid; +logic [31:0] axilite_ctrl_wdata = 'x; +uwire axilite_ctrl_wready; +uwire [3:0]axilite_ctrl_wstrb = 4'b1111; +logic axilite_ctrl_wvalid = 0; + + + + +dut_wrapper dut_wrapper_inst ( + .ap_clk_0(ap_clk), .ap_rst_n_0(ap_rst_n), + .s_axi_ctrl_0_araddr(axilite_ctrl_araddr), + .s_axi_ctrl_0_arready(axilite_ctrl_arready), + .s_axi_ctrl_0_arvalid(axilite_ctrl_arvalid), + .s_axi_ctrl_0_awaddr(axilite_ctrl_awaddr), + .s_axi_ctrl_0_awready(axilite_ctrl_awready), + .s_axi_ctrl_0_awvalid(axilite_ctrl_awvalid), + .s_axi_ctrl_0_bready(axilite_ctrl_bready), + .s_axi_ctrl_0_bresp(axilite_ctrl_bresp), + .s_axi_ctrl_0_bvalid(axilite_ctrl_bvalid), + .s_axi_ctrl_0_rdata(axilite_ctrl_rdata), + .s_axi_ctrl_0_rready(axilite_ctrl_rready), + .s_axi_ctrl_0_rresp(axilite_ctrl_rresp), + .s_axi_ctrl_0_rvalid(axilite_ctrl_rvalid), + .s_axi_ctrl_0_wdata(axilite_ctrl_wdata), + .s_axi_ctrl_0_wready(axilite_ctrl_wready), + .s_axi_ctrl_0_wstrb(axilite_ctrl_wstrb), + .s_axi_ctrl_0_wvalid(axilite_ctrl_wvalid) +); + +//--------------------------------------------------------------------------- + +initial begin + $timeformat(-9, 2, " 
ns"); + // perform reset + repeat(100) @(posedge ap_clk); + ap_rst_n <= 1; + $display("Reset complete"); + repeat(100) @(posedge ap_clk); + // instrumentation wrapper configuration: + // set up LFSR seed + start data generation + output sink + axilite_ctrl_awaddr <= 'h10; + axilite_ctrl_awvalid <= 1; + axilite_ctrl_wdata <= (LFSR_SEED << 16) | 'b11; + axilite_ctrl_wvalid <= 1; + repeat(8) begin + @(posedge ap_clk); + if(axilite_ctrl_wready && axilite_ctrl_awready) break; + end + axilite_ctrl_wvalid <= 0; + axilite_ctrl_awvalid <= 0; + axilite_ctrl_awaddr <= 'x; + axilite_ctrl_wdata <= 'x; + while(1) begin + axilite_ctrl_araddr <= 'h18; + axilite_ctrl_arvalid <= 1; + repeat(8) begin + @(posedge ap_clk); + if(axilite_ctrl_rvalid) begin + $display("[t=%0t] STATUS_I = %0d", $time, axilite_ctrl_rdata); + break; + end + end + axilite_ctrl_araddr <= 'h20; + axilite_ctrl_arvalid <= 1; + repeat(8) begin + @(posedge ap_clk); + if(axilite_ctrl_rvalid) begin + $display("[t=%0t] STATUS_O = %0d", $time, axilite_ctrl_rdata); + break; + end + end + axilite_ctrl_araddr <= 'h28; + axilite_ctrl_arvalid <= 1; + repeat(8) begin + @(posedge ap_clk); + if(axilite_ctrl_rvalid) begin + $display("[t=%0t] LATENCY = %0d", $time, axilite_ctrl_rdata); + break; + end + end + axilite_ctrl_araddr <= 'h38; + axilite_ctrl_arvalid <= 1; + repeat(8) begin + @(posedge ap_clk); + if(axilite_ctrl_rvalid) begin + $display("[t=%0t] INTERVAL = %0d", $time, axilite_ctrl_rdata); + break; + end + end + axilite_ctrl_araddr <= 'h48; + axilite_ctrl_arvalid <= 1; + repeat(8) begin + @(posedge ap_clk); + if(axilite_ctrl_rvalid) begin + $display("[t=%0t] CHECKSUM = %8x", $time, axilite_ctrl_rdata); + if(axilite_ctrl_rdata) begin + $display("Nonzero checksum detected, stopping simulation"); + $finish; + // TODO: simulate for configurable number of frames, like this: + // if(axilite_ctrl_rdata[31:24] == 47) begin + // $display("Frame number 48 detected, stopping simulation"); + // $finish; + // end + end + break; + 
end + end + axilite_ctrl_arvalid <= 0; + repeat(INSTR_READ_PERIOD) @(posedge ap_clk); + end +end + + +endmodule : tb diff --git a/notebooks/end2end_example/bnn-pynq/cnv_end2end_example.ipynb b/notebooks/end2end_example/bnn-pynq/cnv_end2end_example.ipynb index 2b01f24557..014a13db27 100644 --- a/notebooks/end2end_example/bnn-pynq/cnv_end2end_example.ipynb +++ b/notebooks/end2end_example/bnn-pynq/cnv_end2end_example.ipynb @@ -456,8 +456,8 @@ "metadata": {}, "outputs": [], "source": [ - "from finn.transformation.fpgadataflow.make_driver import MakePYNQDriver\n", - "model = model.transform(MakePYNQDriver(\"zynq-iodma\"))" + "from finn.transformation.fpgadataflow.make_driver import MakePYNQDriverIODMA\n", + "model = model.transform(MakePYNQDriverIODMA(\"zynq-iodma\"))" ] }, { diff --git a/notebooks/end2end_example/bnn-pynq/tfc_end2end_example.ipynb b/notebooks/end2end_example/bnn-pynq/tfc_end2end_example.ipynb index b0510b0fdb..de6de23d3f 100644 --- a/notebooks/end2end_example/bnn-pynq/tfc_end2end_example.ipynb +++ b/notebooks/end2end_example/bnn-pynq/tfc_end2end_example.ipynb @@ -751,8 +751,8 @@ "metadata": {}, "outputs": [], "source": [ - "from finn.transformation.fpgadataflow.make_driver import MakePYNQDriver\n", - "model = model.transform(MakePYNQDriver(\"zynq-iodma\"))" + "from finn.transformation.fpgadataflow.make_driver import MakePYNQDriverIODMA\n", + "model = model.transform(MakePYNQDriverIODMA(\"zynq-iodma\"))" ] }, { diff --git a/src/finn/builder/build_dataflow_config.py b/src/finn/builder/build_dataflow_config.py index 42e015226d..c124b213ac 100644 --- a/src/finn/builder/build_dataflow_config.py +++ b/src/finn/builder/build_dataflow_config.py @@ -340,6 +340,10 @@ class DataflowBuildConfig(DataClassJSONMixin, DataClassYAMLMixin): #: debug signals in the generated hardware) enable_hw_debug: Optional[bool] = False + #: Whether the accelerator will be simulated and synthesized with an + #: instrumentation wrapper attached to accurately measure performance. 
+ enable_instrumentation: Optional[bool] = False + #: Whether pdb postmortem debuggig will be launched when the build fails enable_build_pdb_debug: Optional[bool] = False diff --git a/src/finn/builder/build_dataflow_steps.py b/src/finn/builder/build_dataflow_steps.py index b8d421b5bc..bf7ae19feb 100644 --- a/src/finn/builder/build_dataflow_steps.py +++ b/src/finn/builder/build_dataflow_steps.py @@ -79,7 +79,12 @@ from finn.transformation.fpgadataflow.hlssynth_ip import HLSSynthIP from finn.transformation.fpgadataflow.insert_dwc import InsertDWC from finn.transformation.fpgadataflow.insert_fifo import InsertFIFO -from finn.transformation.fpgadataflow.make_driver import MakeCPPDriver, MakePYNQDriver +from finn.transformation.fpgadataflow.insert_tlastmarker import InsertTLastMarker +from finn.transformation.fpgadataflow.make_driver import ( + MakeCPPDriver, + MakePYNQDriverInstrumentation, + MakePYNQDriverIODMA, +) from finn.transformation.fpgadataflow.make_zynq_proj import ZynqBuild from finn.transformation.fpgadataflow.minimize_accumulator_width import MinimizeAccumulatorWidth from finn.transformation.fpgadataflow.minimize_weight_bit_width import MinimizeWeightBitWidth @@ -617,6 +622,26 @@ def step_create_stitched_ip(model: ModelWrapper, cfg: DataflowBuildConfig): """Create stitched IP for a graph after all HLS IP blocks have been generated. Depends on the DataflowOutputType.STITCHED_IP output product.""" + # introduce tLAST marker, required for instrumentation + if cfg.enable_instrumentation: + model = model.transform( + InsertTLastMarker( + # only insert marker on output (input TLAST is ignored for these use-cases anyway) + both=False, + # use ap_axiu instead of qdma_axis + external=False, + # static number of iterations (based on what the compiler/folding sets up) + dynamic=False, + ) + ) + # give a proper name to the inserted node, important for codegen + # TODO: deal with multi-I/O accelerators? 
+ model.graph.node[-1].name = "TLastMarker_0" + # re-run codegen and HLS IP gen, will affect only the new TLastMarker layer assuming + # all other IPs have been generated already + model = model.transform(PrepareIP(cfg._resolve_fpga_part(), cfg._resolve_hls_clk_period())) + model = model.transform(HLSSynthIP()) + if DataflowOutputType.STITCHED_IP in cfg.generate_outputs: stitched_ip_dir = cfg.output_dir + "/stitched_ip" model = model.transform( @@ -717,7 +742,14 @@ def step_make_driver(model: ModelWrapper, cfg: DataflowBuildConfig): driver_dir = os.path.join(cfg.output_dir, "driver") if DataflowOutputType.PYNQ_DRIVER in cfg.generate_outputs: # generate PYNQ driver - model = model.transform(MakePYNQDriver(cfg._resolve_driver_platform())) + if cfg.enable_instrumentation: + model = model.transform( + MakePYNQDriverInstrumentation( + cfg._resolve_driver_platform(), cfg.synth_clk_period_ns + ) + ) + else: + model = model.transform(MakePYNQDriverIODMA(cfg._resolve_driver_platform())) shutil.copytree(model.get_metadata_prop("pynq_driver_dir"), driver_dir, dirs_exist_ok=True) log.info("PYNQ Python driver written into " + driver_dir) elif DataflowOutputType.CPP_DRIVER in cfg.generate_outputs: @@ -779,6 +811,7 @@ def step_synthesize_bitfile(model: ModelWrapper, cfg: DataflowBuildConfig): cfg.board, cfg.synth_clk_period_ns, cfg.enable_hw_debug, + cfg.enable_instrumentation, partition_model_dir=partition_model_dir, ) ) diff --git a/src/finn/qnn-data/templates/driver/driver_base.py b/src/finn/qnn-data/templates/driver/driver_base.py index a6ff29d608..af55ee13df 100644 --- a/src/finn/qnn-data/templates/driver/driver_base.py +++ b/src/finn/qnn-data/templates/driver/driver_base.py @@ -38,7 +38,7 @@ # Driver base class for FINN-generated dataflow accelerators. # The particulars of the generated accelerator are specified via the -# io_shape_dict (generated by the MakePYNQDriver transformation). +# io_shape_dict (generated by the MakePYNQDriverIODMA transformation). 
class FINNExampleOverlay(Overlay): diff --git a/src/finn/qnn-data/templates/driver/driver_instrumentation.py b/src/finn/qnn-data/templates/driver/driver_instrumentation.py new file mode 100644 index 0000000000..aa5225eab6 --- /dev/null +++ b/src/finn/qnn-data/templates/driver/driver_instrumentation.py @@ -0,0 +1,169 @@ +import argparse +import json +import time +from pynq import PL, Overlay +from pynq.pl_server.device import Device +from pynq.ps import Clocks + +# Instrumentation wrapper register map # +# ap_uint<32> cfg, // [0] - 0:hold, 1:lfsr; [31:16] - LFSR seed +# ap_uint<32> &status, // [0] - timestamp overflow; [1] - timestamp underflow +# ap_uint<32> &latency, +# ap_uint<32> &interval, +# ap_uint<32> &checksum, +# ap_uint<32> &min_latency + + +class FINNInstrumentationOverlay(Overlay): + def __init__( + self, + bitfile_name, + platform="zynq", + fclk_mhz=100.0, + device=None, + download=True, + seed=1, + ): + super().__init__(bitfile_name, download=download, device=device) + + self.platform = platform + self.fclk_mhz = fclk_mhz + self.seed = seed + + # configure clock (for ZYNQ platforms) + if self.platform == "zynq": + if self.fclk_mhz > 0: + Clocks.fclk0_mhz = self.fclk_mhz + self.fclk_mhz_actual = Clocks.fclk0_mhz + + def instrumentation_read(self, name): + return self.instrumentation_wrap_0.read( + offset=self.ip_dict["instrumentation_wrap_0"]["registers"][name]["address_offset"] + ) + + def instrumentation_write(self, name, value): + return self.instrumentation_wrap_0.write( + offset=self.ip_dict["instrumentation_wrap_0"]["registers"][name]["address_offset"], + value=value, + ) + + def reset_accelerator(self): + self.axi_gpio_0.write( + offset=self.ip_dict["axi_gpio_0"]["registers"]["GPIO_DATA"]["address_offset"], value=0 + ) + + def start_accelerator(self): + lfsr_seed = (self.seed << 16) & 0xFFFF0000 # upper 16 bits + self.instrumentation_write("cfg", lfsr_seed + 1) # start operation + + def observe_instrumentation(self, debug_print=True): + 
status_reg = self.instrumentation_read("status") + chksum_reg = self.instrumentation_read("checksum") + min_latency = self.instrumentation_read("min_latency") + latency = self.instrumentation_read("latency") + interval = self.instrumentation_read("interval") + + frame = (chksum_reg >> 24) & 0x000000FF + checksum = chksum_reg & 0x00FFFFFF + overflow_err = (status_reg & 0x00000001) != 0 + underflow_err = (status_reg & 0x00000002) != 0 + + if debug_print: + print("---INSTRUMENTATION_REPORT---") + if overflow_err or underflow_err: + print("Status ERROR") + print("Overflow error: %s" % overflow_err) + print("Underflow error: %s" % underflow_err) + else: + print("Status OK") + print("Frame number (8-bit): %d" % frame) + print("Checksum: 0x%06x" % checksum) + print("Min Latency (cycles): %d" % min_latency) + print("Latency (cycles): %d" % latency) + print("Interval (cycles): %d" % interval) + print("----------------------------") + + return (overflow_err, underflow_err, frame, checksum, min_latency, latency, interval) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser( + description="Profile FINN-generated accelerator using instrumentation wrapper" + ) + parser.add_argument("--runtime", help="Runtime in seconds", type=int, default=10) + parser.add_argument( + "--frequency", help="FPGA clock frequency in MHz", type=float, default=100.0 + ) + parser.add_argument("--seed", help="LFSR seed for input data generation", type=int, default=1) + parser.add_argument("--device", help="FPGA device to be used", type=int, default=0) + parser.add_argument("--bitfile", help="Name of bitfile", default="finn-accel.bit") + parser.add_argument( + "--reportfile", + help="Name of output .json report file", + type=str, + default="measured_performance.json", + ) + parser.add_argument( + "--settingsfile", help="Name of optional input .json settings file", type=str, default="" + ) + # parse arguments + args = parser.parse_args() + runtime = args.runtime + frequency = args.frequency 
+ seed = args.seed + bitfile = args.bitfile + reportfile = args.reportfile + settingsfile = args.settingsfile + devID = args.device + device = Device.devices[devID] + + # overwrite frequency if specified in settings file + if settingsfile != "": + with open(settingsfile, "r") as f: + settings = json.load(f) + if "fclk_mhz" in settings: + frequency = settings["fclk_mhz"] + + # instantiate FINN accelerator driver and pass batchsize and bitfile + print("Programming FPGA..") + PL.reset() # reset PYNQ cache + accel = FINNInstrumentationOverlay( + bitfile_name=bitfile, device=device, fclk_mhz=frequency, seed=seed + ) + + # start accelerator + print("Running accelerator..") + accel.start_accelerator() + + # let it run for specified runtime + time.sleep(runtime) + + # read measurement from instrumentation + ( + overflow_err, + underflow_err, + frame, + checksum, + min_latency, + latency, + interval, + ) = accel.observe_instrumentation() + + # write report to file + report = { + "error": overflow_err or underflow_err or interval == 0, + "checksum": checksum, + "min_latency_cycles": min_latency, + "latency_cycles": latency, + "interval_cycles": interval, + "frequency_mhz": round(accel.fclk_mhz_actual), + "min_latency_ms": round(min_latency * (1 / (accel.fclk_mhz_actual * 1e6)) * 1e3, 6), + "latency_ms": round(latency * (1 / (accel.fclk_mhz_actual * 1e6)) * 1e3, 6), + "throughput_fps": round(1 / (interval * (1 / (accel.fclk_mhz_actual * 1e6)))), + "min_pipeline_depth": round(min_latency / interval, 2), + "pipeline_depth": round(latency / interval, 2), + } + with open(reportfile, "w") as f: + json.dump(report, f, indent=2) + + print("Done.") diff --git a/src/finn/transformation/fpgadataflow/floorplan.py b/src/finn/transformation/fpgadataflow/floorplan.py index 38312ce1ee..ceff0b4f8a 100644 --- a/src/finn/transformation/fpgadataflow/floorplan.py +++ b/src/finn/transformation/fpgadataflow/floorplan.py @@ -99,9 +99,13 @@ def apply(self, model): # if we have SLR assignment 
already. use that if node_slr != -1: continue + # if available, use the SLR of the preceding node srcnode = model.find_producer(node.input[0]) - node_slr = getCustomOp(srcnode).get_nodeattr("slr") - node_inst.set_nodeattr("slr", node_slr) + if srcnode is not None: + node_slr = getCustomOp(srcnode).get_nodeattr("slr") + node_inst.set_nodeattr("slr", node_slr) + else: + node_inst.set_nodeattr("slr", default_slr) if unassigned_nodes > 0: warning_str = f"{unassigned_nodes} nodes have no entry in\ @@ -127,25 +131,27 @@ def apply(self, model): ) non_dma_nodes = list(filter(lambda x: x not in dyn_tlastmarker_nodes, non_dma_nodes)) + # assign every DMA node to its own partition for node in dma_nodes: node_inst = getCustomOp(node) node_inst.set_nodeattr("partition_id", partition_cnt) partition_cnt += 1 + # assign every dynamic tLastMarker node to its own partition for node in dyn_tlastmarker_nodes: node_inst = getCustomOp(node) node_inst.set_nodeattr("partition_id", partition_cnt) partition_cnt += 1 + # handle remaining nodes for node in non_dma_nodes: pre_node = model.find_producer(node.input[0]) node_inst = getCustomOp(node) if pre_node not in non_dma_nodes: - # input node + # input node -> start new partition node_inst.set_nodeattr("partition_id", partition_cnt) partition_cnt += 1 continue - elif not ( node.op_type.startswith("MVAU") and node_inst.get_nodeattr("mem_mode") is not None @@ -153,25 +159,36 @@ def apply(self, model): ): pre_nodes = model.find_direct_predecessors(node) else: + # exception for external weight MVAU: only consider primary input + # TODO: (why) is this necessary? should we consider such exceptions for other cases? 
pre_nodes = [pre_node] + axilite_intf_name = node_inst.get_verilog_top_module_intf_names()["axilite"] + if len(axilite_intf_name) != 0: + # This node has an AXI-Lite interface -> start new partition + node_inst.set_nodeattr("partition_id", partition_cnt) + partition_cnt += 1 + continue + + # examine all predecessor nodes to determine partition id for this node node_slr = node_inst.get_nodeattr("slr") + slr_mismatch_count = 0 for pre_node in pre_nodes: pre_inst = getCustomOp(pre_node) pre_slr = pre_inst.get_nodeattr("slr") if node_slr == pre_slr: - axilite_intf_name = pre_inst.get_verilog_top_module_intf_names()["axilite"] - if len(axilite_intf_name) != 0: - node_inst.set_nodeattr("partition_id", partition_cnt) - partition_cnt += 1 - else: - partition_id = pre_inst.get_nodeattr("partition_id") - node_inst.set_nodeattr("partition_id", partition_id) - + # Default case -> assign to same partition as predecessor + partition_id = pre_inst.get_nodeattr("partition_id") + node_inst.set_nodeattr("partition_id", partition_id) + break else: - # no matching, new partition - node_inst.set_nodeattr("partition_id", partition_cnt) - partition_cnt += 1 + # SLR mismatch with predecessor, can't assign same partition + slr_mismatch_count += 1 + + if slr_mismatch_count == len(pre_nodes): + # SLR mismatch with ALL predecessors -> start new partition + node_inst.set_nodeattr("partition_id", partition_cnt) + partition_cnt += 1 # save the updated floorplan floorplan = model.analysis(floorplan_params) diff --git a/src/finn/transformation/fpgadataflow/instrumentation.py b/src/finn/transformation/fpgadataflow/instrumentation.py new file mode 100644 index 0000000000..f2b3b21f6d --- /dev/null +++ b/src/finn/transformation/fpgadataflow/instrumentation.py @@ -0,0 +1,206 @@ +import numpy as np +import os +import subprocess +from qonnx.custom_op.registry import getCustomOp +from qonnx.transformation.base import Transformation + +from finn.custom_op.fpgadataflow.templates import ipgentcl_template 
+from finn.util.basic import make_build_dir +from finn.util.deps import get_deps_path +from finn.util.hls import CallHLS + + +# TODO: duplicate function from make_zynq_proj.py +def collect_ip_dirs(model, ipstitch_path): + # collect list of all IP dirs + ip_dirs = [] + need_memstreamer = False + for node in model.graph.node: + node_inst = getCustomOp(node) + ip_dir_value = node_inst.get_nodeattr("ip_path") + assert os.path.isdir( + ip_dir_value + ), """The directory that should + contain the generated ip blocks doesn't exist.""" + ip_dirs += [ip_dir_value] + if node.op_type.startswith("MVAU") or node.op_type == "Thresholding_hls": + if node_inst.get_nodeattr("mem_mode") == "internal_decoupled": + need_memstreamer = True + ip_dirs += [ipstitch_path + "/ip"] + if need_memstreamer: + # add RTL streamer IP + ip_dirs.append("$::env(FINN_RTLLIB)/memstream") + return ip_dirs + + +class GenerateInstrumentationIP(Transformation): + def __init__( + self, + fpga_part, + clk_period_ns, + format="ip", # "ip" for Vivado (Zynq) or "xo" for Vitis (Alveo/Versal) + ): + super().__init__() + self.fpga_part = fpga_part + self.clk_period_ns = clk_period_ns + self.format = format + + def apply(self, model): + # Create directory for code-gen and HLS of instrumentation IP + wrapper_output_dir = make_build_dir(prefix="code_gen_ipgen_Instrumentation_") + model.set_metadata_prop("instrumentation_ipgen", wrapper_output_dir) + + # conservative max for pending feature maps: number of layers + pending = len(model.graph.node) + # query the parallelism-dependent folded input shape from the + # node consuming the graph input + inp_name = model.graph.input[0].name + inp_node = getCustomOp(model.find_consumer(inp_name)) + inp_shape_folded = list(inp_node.get_folded_input_shape()) + inp_stream_width = inp_node.get_instream_width_padded() + # number of beats per input is given by product of folded input + # shape except the last dim (which is the stream width) + ilen = np.prod(inp_shape_folded[:-1]) + 
ti = "ap_uint<%d>" % inp_stream_width + # perform the same for the output + out_name = model.graph.output[0].name + out_node = getCustomOp(model.find_producer(out_name)) + out_shape_folded = list(out_node.get_folded_output_shape()) + out_stream_width = out_node.get_outstream_width_padded() + olen = np.prod(out_shape_folded[:-1]) + to = "ap_uint<%d>" % out_stream_width + ko = out_shape_folded[-1] + # fill out instrumentation wrapper template + with open( + os.path.join(os.environ["FINN_CUSTOM_HLS"], "instrumentation.template.cpp"), "r" + ) as f: + instrwrp_cpp = f.read() + instrwrp_cpp = instrwrp_cpp.replace("@PENDING@", str(pending)) + instrwrp_cpp = instrwrp_cpp.replace("@ILEN@", str(ilen)) + instrwrp_cpp = instrwrp_cpp.replace("@OLEN@", str(olen)) + instrwrp_cpp = instrwrp_cpp.replace("@TI@", str(ti)) + instrwrp_cpp = instrwrp_cpp.replace("@TO@", str(to)) + instrwrp_cpp = instrwrp_cpp.replace("@KO@", str(ko)) + with open(wrapper_output_dir + "/top_instrumentation_wrapper.cpp", "w") as f: + f.write(instrwrp_cpp) + # fill out HLS synthesis tcl template + prjname = "project_instrwrap" + ipgentcl = ipgentcl_template + ipgentcl = ipgentcl.replace("$PROJECTNAME$", prjname) + ipgentcl = ipgentcl.replace("$HWSRCDIR$", wrapper_output_dir) + ipgentcl = ipgentcl.replace("$FINNHLSLIB$", str(get_deps_path() / "finn-hlslib")) + ipgentcl = ipgentcl.replace("$ATTENTIONHLSLIB$", str(get_deps_path() / "attention-hlslib")) + ipgentcl = ipgentcl.replace("$TOPFXN$", "instrumentation_wrapper") + ipgentcl = ipgentcl.replace("$FPGAPART$", self.fpga_part) + ipgentcl = ipgentcl.replace("$CLKPERIOD$", str(self.clk_period_ns)) + ipgentcl = ipgentcl.replace("$DEFAULT_DIRECTIVES$", "") + if self.format == "xo": + # use Vitis RTL kernel (.xo) output instead of IP-XACT + ipgentcl = ipgentcl.replace("$EXTRA_DIRECTIVES$", "config_export -format xo") + ipgentcl = ipgentcl.replace( + "export_design -format ip_catalog", "export_design -format xo" + ) + else: + ipgentcl = 
ipgentcl.replace("$EXTRA_DIRECTIVES$", "") + with open(wrapper_output_dir + "/hls_syn.tcl", "w") as f: + f.write(ipgentcl) + # build bash script to launch HLS synth and call it + code_gen_dir = wrapper_output_dir + builder = CallHLS() + builder.append_tcl(code_gen_dir + "/hls_syn.tcl") + builder.set_ipgen_path(code_gen_dir + "/{}".format(prjname)) + builder.build(code_gen_dir) + ipgen_path = builder.ipgen_path + assert os.path.isdir(ipgen_path), "HLS IPGen failed: %s not found" % (ipgen_path) + ip_path = ipgen_path + "/sol1/impl/ip" + assert os.path.isdir(ip_path), "HLS IPGen failed: %s not found. Check log under %s" % ( + ip_path, + code_gen_dir, + ) + if self.format == "xo": + assert False, "Not implemented" + # TODO: export for use in VitisBuild or VersalBuild + # xo_dir = self.output_dir + "/xo" + # xo_dir = str(os.path.abspath(xo_dir)) + # os.makedirs(xo_dir, exist_ok=True) + # xo_path = code_gen_dir + "/{}/sol1/impl/export.xo".format(prjname) + # xo_instr_path = xo_dir + "/instrumentation_wrapper.xo" + # shutil.copy(xo_path, xo_instr_path) + else: + # shutil.move(ip_path, self.output_dir) + pass + + return (model, False) + + +class PrepareInstrumentationSim(Transformation): + def __init__(self, fpga_part): + super().__init__() + self.fpga_part = fpga_part + + def apply(self, model): + # Create directory for simulation of instrumentation IP + FINN IP + sim_output_dir = make_build_dir(prefix="sim_Instrumentation_") + model.set_metadata_prop("instrumentation_sim", sim_output_dir) + + # check if instrumentation IP was generated + instr_ip_dir = model.get_metadata_prop("instrumentation_ipgen") + if instr_ip_dir is None or (not os.path.isdir(instr_ip_dir)): + raise Exception( + "Instrumentation IP not generated, run GenerateInstrumentationIP first." 
+ ) + + # TODO: Support simulation with AXI-lite control interfaces (e.g., for dynamic pipelines) + # fill in testbench template + with open( + os.path.join(os.environ["FINN_CUSTOM_HLS"], "instrumentation_tb.template.sv"), + "r", + ) as f: + testbench_sv = f.read() + with open(sim_output_dir + "/instrwrap_testbench.sv", "w") as f: + f.write(testbench_sv) + # fill in testbench project creator template + with open( + os.path.join(os.environ["FINN_CUSTOM_HLS"], "instrumentation_sim.template.tcl"), + "r", + ) as f: + testbench_tcl = f.read() + + # collect ip repo paths for finn accelerator sub cores so Vivado can find them + ipstitch_path = model.get_metadata_prop("vivado_stitch_proj") + ip_dirs = ["list"] + ip_dirs += collect_ip_dirs(model, ipstitch_path) + ip_dirs += [instr_ip_dir] + ip_dirs_str = "[%s]" % (" ".join(ip_dirs)) + testbench_tcl = testbench_tcl.replace("@FPGA_PART@", self.fpga_part) + testbench_tcl = testbench_tcl.replace("@IP_DIRS_STR@", ip_dirs_str) + with open(sim_output_dir + "/make_instrwrap_sim_proj.tcl", "w") as f: + f.write(testbench_tcl) + + return (model, False) + + +class RunInstrumentationSim(Transformation): + def __init__(self): + super().__init__() + + def apply(self, model): + sim_output_dir = model.get_metadata_prop("instrumentation_sim") + if sim_output_dir is None or (not os.path.isdir(sim_output_dir)): + raise Exception( + "Instrumentation sim not prepared, run PrepareInstrumentationSim first." 
+ ) + + # Prepare bash script + bash_script = os.getcwd() + "/report_power.sh" + with open(bash_script, "w") as script: + script.write("#!/bin/bash\n") + script.write("cd %s\n" % (sim_output_dir)) + script.write("vivado -mode batch -source make_instrwrap_sim_proj.tcl\n") + + # Run script + print("Running Vivado simulation of instrumentation wrapper") + sub_proc = subprocess.Popen(["bash", bash_script]) + sub_proc.communicate() + + return (model, False) diff --git a/src/finn/transformation/fpgadataflow/make_driver.py b/src/finn/transformation/fpgadataflow/make_driver.py index 97fc97a4fd..1cea95f9c5 100644 --- a/src/finn/transformation/fpgadataflow/make_driver.py +++ b/src/finn/transformation/fpgadataflow/make_driver.py @@ -298,7 +298,7 @@ def formatKernelName(kname: str): return (model, False) -class MakePYNQDriver(Transformation): +class MakePYNQDriverIODMA(Transformation): """Create PYNQ Python code to correctly interface the generated accelerator, including data packing/unpacking. Should be called after conversion to HLS layers, folding and the creation of @@ -459,3 +459,35 @@ def apply(self, model): continue return (model, False) + + +class MakePYNQDriverInstrumentation(Transformation): + def __init__(self, platform, clk_period_ns): + super().__init__() + self.platform = platform + self.clk_period_ns = clk_period_ns + + def apply(self, model): + # TODO: support runtime-writable and external weights + # TODO: support Alveo and Versal platforms + + # create a temporary folder for the generated driver + pynq_driver_dir = make_build_dir(prefix="pynq_driver_") + model.set_metadata_prop("pynq_driver_dir", pynq_driver_dir) + + # create (copy) the static instrumentation driver + driver_template = ( + os.environ["FINN_QNN_DATA"] + "/templates/driver/driver_instrumentation.py" + ) + driver_py = pynq_driver_dir + "/driver.py" + shutil.copy(driver_template, driver_py) + + # write default settings to driver config file + settings = { + "fclk_mhz": (1.0 / self.clk_period_ns) 
* 1e3, + } + settingsfile = pynq_driver_dir + "/settings.json" + with open(settingsfile, "w") as f: + json.dump(settings, f, indent=2) + + return (model, False) diff --git a/src/finn/transformation/fpgadataflow/make_zynq_proj.py b/src/finn/transformation/fpgadataflow/make_zynq_proj.py index 8110d76461..59d4293323 100644 --- a/src/finn/transformation/fpgadataflow/make_zynq_proj.py +++ b/src/finn/transformation/fpgadataflow/make_zynq_proj.py @@ -27,6 +27,7 @@ # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +import math import os import subprocess from qonnx.core.modelwrapper import ModelWrapper @@ -43,6 +44,7 @@ from finn.transformation.fpgadataflow.insert_dwc import InsertDWC from finn.transformation.fpgadataflow.insert_fifo import InsertFIFO from finn.transformation.fpgadataflow.insert_iodma import InsertIODMA +from finn.transformation.fpgadataflow.instrumentation import GenerateInstrumentationIP from finn.transformation.fpgadataflow.prepare_ip import PrepareIP from finn.transformation.fpgadataflow.specialize_layers import SpecializeLayers from finn.util.basic import make_build_dir, pynq_native_port_width, pynq_part_map @@ -93,6 +95,7 @@ def __init__(self, platform, period_ns, enable_debug=False): self.platform = platform self.period_ns = period_ns self.enable_debug = 1 if enable_debug else 0 + self.enable_gpio_reset = 0 def apply(self, model): # create a config file and empty list of xo files @@ -100,8 +103,96 @@ def apply(self, model): idma_idx = 0 odma_idx = 0 aximm_idx = 0 + nested_interconnect_count = 0 + master_axilite_idx = 0 + axilite_interconnect_idx = 0 axilite_idx = 0 instance_names = {} + + # instantiate instrumentation IP if it was generated + instr_ip_dir = model.get_metadata_prop("instrumentation_ipgen") + if instr_ip_dir is not None and os.path.isdir(instr_ip_dir): + use_instrumentation = True + + # instantiate GPIO IP to trigger reset + 
self.enable_gpio_reset = 1 + # in the template this will connect to first port of interconnect_0 + master_axilite_idx += 1 + + # update IP repository + config.append( + "set_property ip_repo_paths " + "[concat [get_property ip_repo_paths [current_project]] [list %s]] " + "[current_project]" % instr_ip_dir + ) + config.append("update_ip_catalog -rebuild -scan_changes") + # create instance + config.append( + "create_bd_cell -type ip -vlnv %s %s" + % ("xilinx.com:hls:instrumentation_wrapper:1.0", "instrumentation_wrap_0") + ) + # connect clock & reset + config.append( + "connect_bd_net [get_bd_pins instrumentation_wrap_0/ap_clk] " + "[get_bd_pins smartconnect_0/aclk]" + ) + config.append( + "connect_bd_net [get_bd_pins instrumentation_wrap_0/ap_rst_n] " + "[get_bd_pins smartconnect_0/aresetn]" + ) + # connect AXI-lite control interface + config.append( + "connect_bd_intf_net [get_bd_intf_pins instrumentation_wrap_0/s_axi_ctrl] " + "[get_bd_intf_pins axi_interconnect_0/M%02d_AXI]" % (master_axilite_idx) + ) + config.append("assign_axi_addr_proc instrumentation_wrap_0/s_axi_ctrl") + master_axilite_idx += 1 + else: + use_instrumentation = False + + # instantiate nested AXI interconnects if required + # only the nested interconnects and all interfaces connected before this line + # will be connected to the original (master) interconnect + total_axilite_count = 0 + for node in model.graph.node: + sdp_node = getCustomOp(node) + dataflow_model_filename = sdp_node.get_nodeattr("model") + kernel_model = ModelWrapper(dataflow_model_filename) + ifnames = eval(kernel_model.get_metadata_prop("vivado_stitch_ifnames")) + total_axilite_count += len(ifnames["axilite"]) + if total_axilite_count > (64 - master_axilite_idx): + nested_interconnect_count = math.ceil(total_axilite_count / 64.0) + for i in range(1, nested_interconnect_count + 1): + # create instance + config.append( + "create_bd_cell -type ip -vlnv $interconnect_vlnv axi_interconnect_%d" % (i) + ) + # configure instance + 
config.append( + "set_property -dict [list CONFIG.NUM_MI %d] [get_bd_cells axi_interconnect_%d]" + % (min(64, total_axilite_count), i) + ) + # connect to master interconnect + config.append( + "connect_bd_intf_net [get_bd_intf_pins axi_interconnect_0/M%02d_AXI] " + "-boundary_type upper [get_bd_intf_pins axi_interconnect_%d/S00_AXI]" + % (master_axilite_idx, i) + ) + # connect clocks/reset + config.append( + "apply_bd_automation -rule xilinx.com:bd_rule:clkrst -config " + '"Clk /zynq_ps/$zynq_ps_clkname" [get_bd_pins axi_interconnect_%d/ACLK]' % (i) + ) + master_axilite_idx += 1 + total_axilite_count = max(0, total_axilite_count - 64) + + assert total_axilite_count == 0, "Not all AXI-lite interfaces connected!" + + # start populating the first nested interconnect + axilite_interconnect_idx = 1 + else: + axilite_idx = master_axilite_idx + for node in model.graph.node: assert node.op_type == "StreamingDataflowPartition", "Invalid link graph" sdp_node = getCustomOp(node) @@ -145,7 +236,8 @@ def apply(self, model): # define kernel instances # name kernels connected to graph inputs as idmaxx # name kernels connected to graph outputs as odmaxx - if (producer is None) or (consumer == []): + # do not expect IDMA/ODMA when instrumentation is enabled + if not use_instrumentation and ((producer is None) or (consumer == [])): # TODO not a good way of checking for external inp&out # should look at the list of top-level in/out instead if producer is None: @@ -168,8 +260,13 @@ def apply(self, model): assert axilite_intf_name is not None config.append( "connect_bd_intf_net [get_bd_intf_pins %s/%s] " - "[get_bd_intf_pins axi_interconnect_0/M%02d_AXI]" - % (instance_names[node.name], axilite_intf_name, axilite_idx) + "[get_bd_intf_pins axi_interconnect_%d/M%02d_AXI]" + % ( + instance_names[node.name], + axilite_intf_name, + axilite_interconnect_idx, + axilite_idx, + ) ) # assign_bd_address with appropriate range/offset config.append( @@ -178,6 +275,11 @@ def apply(self, model): 
aximm_idx += 1 axilite_idx += 1 + if axilite_idx == 64: + axilite_interconnect_idx += 1 + axilite_idx = 0 + if axilite_interconnect_idx == 0: + master_axilite_idx += 1 else: instance_names[node.name] = node.name config.append( @@ -187,8 +289,13 @@ def apply(self, model): for axilite_intf_name in ifnames["axilite"]: config.append( "connect_bd_intf_net [get_bd_intf_pins %s/%s] " - "[get_bd_intf_pins axi_interconnect_0/M%02d_AXI]" - % (instance_names[node.name], axilite_intf_name, axilite_idx) + "[get_bd_intf_pins axi_interconnect_%d/M%02d_AXI]" + % ( + instance_names[node.name], + axilite_intf_name, + axilite_interconnect_idx, + axilite_idx, + ) ) # assign_bd_address with appropriate range/offset config.append( @@ -196,6 +303,11 @@ def apply(self, model): % (instance_names[node.name], axilite_intf_name) ) axilite_idx += 1 + if axilite_idx == 64: + axilite_interconnect_idx += 1 + axilite_idx = 0 + if axilite_interconnect_idx == 0: + master_axilite_idx += 1 sdp_node.set_nodeattr("instance_name", instance_names[node.name]) config.append( @@ -223,6 +335,33 @@ def apply(self, model): ) ) + # connect first/last dataflow partition to instrumentation wrapper + if use_instrumentation: + if producer is None: + config.append( + "connect_bd_intf_net [get_bd_intf_pins %s/s_axis_0] " + "[get_bd_intf_pins instrumentation_wrap_0/finnix]" + % (instance_names[node.name]) + ) + if consumer == []: + config.append( + "connect_bd_intf_net [get_bd_intf_pins %s/m_axis_0] " + "[get_bd_intf_pins instrumentation_wrap_0/finnox]" + % (instance_names[node.name]) + ) + + # TODO: WORKAROUND, do not instantiate smartconnect when not needed! 
+ if use_instrumentation: + config.append("delete_bd_objs [get_bd_cells smartconnect_0]") + aximm_idx = 1 + + # finalize nested interconnect clock/reset + for i in range(1, nested_interconnect_count + 1): + config.append( + "apply_bd_automation -rule xilinx.com:bd_rule:clkrst -config " + '"Clk /zynq_ps/$zynq_ps_clkname" [get_bd_pins axi_interconnect_%d/M*_ACLK]' % (i) + ) + # create a temporary folder for the project vivado_pynq_proj_dir = make_build_dir(prefix="vivado_zynq_proj_") model.set_metadata_prop("vivado_pynq_proj", vivado_pynq_proj_dir) @@ -238,12 +377,13 @@ def apply(self, model): templates.custom_zynq_shell_template % ( fclk_mhz, - axilite_idx, + master_axilite_idx, aximm_idx, self.platform, pynq_part_map[self.platform], config, self.enable_debug, + self.enable_gpio_reset, ) ).replace("$BOARDFILES$", str(get_deps_path() / "board_files")) ) @@ -307,6 +447,7 @@ def __init__( platform, period_ns, enable_debug=False, + enable_instrumentation=False, partition_model_dir=None, ): super().__init__() @@ -315,19 +456,27 @@ def __init__( self.period_ns = period_ns self.platform = platform self.enable_debug = enable_debug + self.enable_instrumentation = enable_instrumentation self.partition_model_dir = partition_model_dir def apply(self, model): # first infer layouts model = model.transform(InferDataLayouts()) # prepare at global level, then break up into kernels - prep_transforms = [ - InsertIODMA(self.axi_port_width), - InsertDWC(), - SpecializeLayers(self.fpga_part), - Floorplan(), - CreateDataflowPartition(partition_model_dir=self.partition_model_dir), - ] + if self.enable_instrumentation: + prep_transforms = [ + GenerateInstrumentationIP(self.fpga_part, self.period_ns), + Floorplan(), + CreateDataflowPartition(partition_model_dir=self.partition_model_dir), + ] + else: + prep_transforms = [ + InsertIODMA(self.axi_port_width), + InsertDWC(), + SpecializeLayers(self.fpga_part), + Floorplan(), + 
CreateDataflowPartition(partition_model_dir=self.partition_model_dir), + ] for trn in prep_transforms: model = model.transform(trn) model = model.transform(GiveUniqueNodeNames()) @@ -339,7 +488,10 @@ def apply(self, model): sdp_node = getCustomOp(sdp_node) dataflow_model_filename = sdp_node.get_nodeattr("model") kernel_model = ModelWrapper(dataflow_model_filename) - kernel_model = kernel_model.transform(InsertFIFO()) + # InsertFIFO at this stage interferes with tLastMarker + # TODO: is this really needed here at all? + if not self.enable_instrumentation: + kernel_model = kernel_model.transform(InsertFIFO()) kernel_model = kernel_model.transform(SpecializeLayers(self.fpga_part)) kernel_model = kernel_model.transform(GiveUniqueNodeNames(prefix)) kernel_model.save(dataflow_model_filename) diff --git a/src/finn/transformation/fpgadataflow/templates.py b/src/finn/transformation/fpgadataflow/templates.py index 018e2c041c..63a6b00766 100644 --- a/src/finn/transformation/fpgadataflow/templates.py +++ b/src/finn/transformation/fpgadataflow/templates.py @@ -48,6 +48,10 @@ set FPGA_PART %s create_project finn_zynq_link ./ -part $FPGA_PART +# Prevent limitation on number of elements for string representations of Vivado collections of objects +# Otherwise we might run into the default limit of 500 if we have many IP_REPO_PATHS +set_param tcl.collectionResultDisplayLimit 0 + # set board part repo paths to find PYNQ-Z1/Z2 set paths_prop [get_property BOARD_PART_REPO_PATHS [current_project]] set paths_param [get_param board.repoPaths] @@ -90,6 +94,7 @@ create_bd_design "top" if {$ZYNQ_TYPE == "zynq_us+"} { set zynq_ps_vlnv [get_property VLNV [get_ipdefs "xilinx.com:ip:zynq_ultra_ps_e:*"]] + set zynq_ps_clkname "pl_clk0" create_bd_cell -type ip -vlnv $zynq_ps_vlnv zynq_ps apply_bd_automation -rule xilinx.com:bd_rule:zynq_ultra_ps_e -config {apply_board_preset "1" } [get_bd_cells zynq_ps] #activate one slave port, deactivate the second master port @@ -100,6 +105,7 @@ set_property 
-dict [list CONFIG.PSU__CRL_APB__PL0_REF_CTRL__FREQMHZ [expr int($FREQ_MHZ)]] [get_bd_cells zynq_ps] } elseif {$ZYNQ_TYPE == "zynq_7000"} { set zynq_ps_vlnv [get_property VLNV [get_ipdefs "xilinx.com:ip:processing_system7:*"]] + set zynq_ps_clkname "FCLK_CLK0" create_bd_cell -type ip -vlnv $zynq_ps_vlnv zynq_ps apply_bd_automation -rule xilinx.com:bd_rule:processing_system7 -config {make_external "FIXED_IO, DDR" apply_board_preset "1" Master "Disable" Slave "Disable" } [get_bd_cells zynq_ps] set_property -dict [list CONFIG.PCW_USE_S_AXI_HP0 {1}] [get_bd_cells zynq_ps] @@ -166,6 +172,17 @@ ] } +# set up GPIO to trigger reset +if {%d == 1} { + create_bd_cell -type ip -vlnv xilinx.com:ip:axi_gpio:2.0 axi_gpio_0 + set_property -dict [list CONFIG.C_ALL_OUTPUTS {1} CONFIG.C_DOUT_DEFAULT {0x00000001} CONFIG.C_GPIO_WIDTH {1}] [get_bd_cells axi_gpio_0] + connect_bd_intf_net [get_bd_intf_pins axi_gpio_0/S_AXI] -boundary_type upper [get_bd_intf_pins axi_interconnect_0/M00_AXI] + assign_axi_addr_proc axi_gpio_0/S_AXI + connect_bd_net [get_bd_pins axi_gpio_0/s_axi_aclk] [get_bd_pins axi_interconnect_0/ACLK] + connect_bd_net [get_bd_pins axi_gpio_0/s_axi_aresetn] [get_bd_pins axi_interconnect_0/ARESETN] + connect_bd_net [get_bd_pins axi_gpio_0/gpio_io_o] [get_bd_pins rst_zynq_ps_*/aux_reset_in] +} + #finalize clock and reset connections for interconnects if {$ZYNQ_TYPE == "zynq_us+"} { apply_bd_automation -rule xilinx.com:bd_rule:clkrst -config { Clk {/zynq_ps/pl_clk0} } [get_bd_pins axi_interconnect_0/M*_ACLK] diff --git a/tests/end2end/test_end2end_bnn_pynq.py b/tests/end2end/test_end2end_bnn_pynq.py index 9a2da7a45e..9d40b3ba93 100644 --- a/tests/end2end/test_end2end_bnn_pynq.py +++ b/tests/end2end/test_end2end_bnn_pynq.py @@ -73,7 +73,7 @@ from finn.transformation.fpgadataflow.create_stitched_ip import CreateStitchedIP from finn.transformation.fpgadataflow.hlssynth_ip import HLSSynthIP from finn.transformation.fpgadataflow.insert_dwc import InsertDWC -from 
finn.transformation.fpgadataflow.make_driver import MakePYNQDriver +from finn.transformation.fpgadataflow.make_driver import MakePYNQDriverIODMA from finn.transformation.fpgadataflow.minimize_accumulator_width import MinimizeAccumulatorWidth from finn.transformation.fpgadataflow.minimize_weight_bit_width import MinimizeWeightBitWidth from finn.transformation.fpgadataflow.prepare_cppsim import PrepareCppSim @@ -812,7 +812,7 @@ def test_make_pynq_driver(self, topology, wbits, abits, board): prev_chkpt_name = get_checkpoint_name(board, topology, wbits, abits, "build") model = load_test_checkpoint_or_skip(prev_chkpt_name) board_to_driver_platform = "alveo" if build_data["kind"] == "alveo" else "zynq-iodma" - model = model.transform(MakePYNQDriver(board_to_driver_platform)) + model = model.transform(MakePYNQDriverIODMA(board_to_driver_platform)) model.save(get_checkpoint_name(board, topology, wbits, abits, "driver")) def test_deploy(self, topology, wbits, abits, board):