diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index ebdad54bee..a2f9527976 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -93,11 +93,11 @@ Sync finn-dev: .setup_venv_from_whl: &setup_venv_from_whl # Move everything to working directory (e.g., RAMdisk) - - cp -dfR .. $PATH_WORKDIR + - cp -dfR . $PATH_WORKDIR - cd $PATH_WORKDIR # Create fresh virtual environment and install finn-plus from .whl (artifact) - python3 -m venv finn-plus-venv - - finn-plus-venv/bin/pip install ./finn-plus/dist/*.whl + - finn-plus-venv/bin/pip install dist/*.whl Build: id_tokens: @@ -171,8 +171,8 @@ FINN Test Suite 2022.2: - $JOB_MONITORING_DIR/monitor.sh $JOB_MONITORING_DIR/$CI_PIPELINE_ID/$HOSTNAME.log & # Launch FINN via test command, includes preparation of (cached) dependencies - | - source ./finn-plus-venv/bin/activate - finn test --variant $TEST_SUITE --dependency-path ./finn-plus/deps --build-path $FINN_BUILD_DIR --num-workers 1 --num-test-workers $PYTEST_PARALLEL + source finn-plus-venv/bin/activate + finn test --variant $TEST_SUITE --dependency-path ./deps --build-path $FINN_BUILD_DIR --num-workers 1 --num-test-workers $PYTEST_PARALLEL artifacts: name: "test_reports" when: always diff --git a/custom_hls/instrumentation.template.cpp b/custom_hls/instrumentation.template.cpp new file mode 100644 index 0000000000..bf15d77a87 --- /dev/null +++ b/custom_hls/instrumentation.template.cpp @@ -0,0 +1,307 @@ +/****************************************************************************** + * Copyright (c) 2023, Xilinx, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, + * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION). HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, + * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR + * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF + * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + ******************************************************************************* + * @brief Instrumentation wrapper module for FINN IP characterization. + * @author Thomas B. Preusser + * @details + * Instrumentation wrapper intercepting the feature map input to and + * the feature map output from a FINN IP to measure processing latency and + * initiation interval in terms of clock cycles. The most recent readings + * are exposed via AXI-light. + * This wrapper can run the FINN IP detached from an external data source + * and sink by feeding LFSR-generated data and sinking the output without + * backpressure. + * This module is currently not integrated with the FINN compiler. 
It must + * be instantiated and integrated with the rest of the system in a manual + * process. + * + * @param PENDING maximum number of feature maps in the FINN dataflow pipeline + * @param ILEN number of input transactions per IFM + * @param OLEN number of output transactions per OFM + * @param KO number of subwords within output payload vector + * @param TI type of input payload vector + * @param TO type of output payload vector + *******************************************************************************/ + + #include + #include + #include + #include + + // Module Configuration + constexpr unsigned PENDING = @PENDING@; // Max. feature maps in flight + constexpr unsigned ILEN = @ILEN@; // Input words per IFM + constexpr unsigned OLEN = @OLEN@; // Output words per OFM + constexpr unsigned KO = @KO@; // Subwords within OFM transaction word + using TI = @TI@; // IFM transaction word + using TO = @TO@; // OFM transaction word + + //--------------------------------------------------------------------------- + // Utility Functions + static constexpr unsigned clog2 (unsigned x) { return x<2? 0 : 1+clog2((x+1)/2); } + static constexpr unsigned clog2nz(unsigned x) { return std::max(1u, clog2(x)); } + + template + static void move( + hls::stream &src, + hls::stream &dst + ) { + #pragma HLS pipeline II=1 style=flp + dst.write(src.read()); + } + + template + static void move( + hls::stream> &src, + hls::stream &dst + ) { + #pragma HLS pipeline II=1 style=flp + dst.write(src.read().data); + } + + template + class Payload { + public: + using type = T; + }; + template + class Payload> { + public: + using type = T; + }; + + /** + * Computes a checksum over a forwarded stream assumed to carry frames of + * N words further subdivided into K subwords. + * - Subword slicing can be customized typically by using a lambda. + * The provided DefaultSubwordSlicer assumes an `ap_(u)int`-like word + * type with a member `width` and a range-based slicing operator. 
It + * further assumes a little-endian arrangement of subwords within words + * for the canonical subword stream order. + * - Subwords wider than 23 bits are folded using bitwise XOR across + * slices of 23 bits starting from the LSB. + * - The folded subword values are weighted according to their position + * in the stream relative to the start of frame by a periodic weight + * sequence 1, 2, 3, ... + * - The weighted folded subword values are reduced to a checksum by an + * accumulation modulo 2^24. + * - A checksum is emitted for each completed frame. It is the concatenation + * of an 8-bit (modulo 256) frame counter and the 24-bit frame checksum. + */ + template + class DefaultSubwordSlicer { + static_assert(T::width%K == 0, "Word size must be subword multiple."); + static constexpr unsigned W = T::width/K; + public: + ap_uint operator()(T const &x, unsigned const j) const { + #pragma HLS inline + return x((j+1)*W-1, j*W); + } + }; + + //--------------------------------------------------------------------------- + // Instrumentation Core + template< + unsigned PENDING, + unsigned ILEN, + unsigned OLEN, + unsigned KO, + typename TI, + typename TO + > + void instrument( + hls::stream &finnix, + hls::stream &finnox, + ap_uint<32> cfg, // [0] - 0:hold, 1:lfsr; [31:16] - LFSR seed + ap_uint<32> &status, // [0] - timestamp overflow; [1] - timestamp underflow + ap_uint<32> &latency, + ap_uint<32> &interval, + ap_uint<32> &checksum, + ap_uint<32> &min_latency + ) { + #pragma HLS pipeline II=1 style=flp + + // Timestamp Management State + using clock_t = ap_uint<32>; + static clock_t cnt_clk = 0; + #pragma HLS reset variable=cnt_clk + hls::stream timestamps; + #pragma HLS stream variable=timestamps depth=PENDING + static bool timestamp_ovf = false; + static bool timestamp_unf = false; + #pragma HLS reset variable=timestamp_ovf + #pragma HLS reset variable=timestamp_unf + + // Input Feed & Generation + constexpr unsigned LFSR_WIDTH = (TI::width+15)/16 * 16; + static 
ap_uint icnt = 0; + static ap_uint lfsr; + #pragma HLS reset variable=icnt + #pragma HLS reset variable=lfsr off + if(!finnix.full()) { + + bool const first = icnt == 0; + bool wr; + if(first) { + // Start of new feature map + wr = cfg[0]; + for(unsigned i = 0; i < LFSR_WIDTH; i += 16) { + #pragma HLS unroll + lfsr(15+i, i) = cfg(31, 16) ^ (i>>4)*33331; + } + } + else { + // Advance LFSR + wr = true; + for(unsigned i = 0; i < LFSR_WIDTH; i += 16) { + #pragma HLS unroll + lfsr(15+i, i) = (lfsr(15+i, i) >> 1) ^ ap_uint<16>(lfsr[i]? 0 : 0x8805); + } + } + + if(wr) { + finnix.write_nb(lfsr); + if(first) timestamp_ovf |= !timestamps.write_nb(cnt_clk); + icnt = icnt == ILEN-1? decltype(icnt)(0) : decltype(icnt)(icnt + 1); + } + } + + // Output Tracking + static ap_uint ocnt = 0; + #pragma HLS reset variable=ocnt + static clock_t ts1 = 0; // last output timestamp + static clock_t last_latency = 0; + static clock_t last_interval = 0; + static clock_t cur_min_latency = ~0; + #pragma HLS reset variable=ts1 + #pragma HLS reset variable=last_latency + #pragma HLS reset variable=last_interval + #pragma HLS reset variable=cur_min_latency + + static ap_uint<8> pkts = 0; + #pragma HLS reset variable=pkts + static ap_uint< 2> coeff[3]; + static ap_uint<24> psum; + static ap_uint<32> last_checksum = 0; + #pragma HLS reset variable=coeff off + #pragma HLS reset variable=psum off + #pragma HLS reset variable=last_checksum + + TO oval; + if(finnox.read_nb(oval)) { + // Start of new output feature map + if(ocnt == 0) { + for(unsigned i = 0; i < 3; i++) coeff[i] = i+1; + psum = 0; + } + + // Update checksum + for(unsigned j = 0; j < KO; j++) { + #pragma HLS unroll + auto const v0 = DefaultSubwordSlicer()(oval, j); + constexpr unsigned W = 1 + (decltype(v0)::width-1)/23; + ap_uint v = v0; + ap_uint< 23> w = 0; + for(unsigned k = 0; k < W; k++) w ^= v(23*k+22, 23*k); + psum += (coeff[j%3][1]? (w, ap_uint<1>(0)) : ap_uint<24>(0)) + (coeff[j%3][0]? 
w : ap_uint<23>(0)); + } + + // Re-align coefficients + for(unsigned j = 0; j < 3; j++) { + #pragma HLS unroll + ap_uint<3> const cc = coeff[j] + ap_uint<3>(KO%3); + coeff[j] = cc(1, 0) + cc[2]; + } + + // Track frame position + if(ocnt != OLEN-1) ocnt++; + else { + clock_t ts0; + if(!timestamps.read_nb(ts0)) timestamp_unf = true; + else { + last_latency = cnt_clk - ts0; // completion - start + last_interval = cnt_clk - ts1; // completion - previous completion + cur_min_latency = std::min(cur_min_latency, last_latency); + ts1 = cnt_clk; // mark completion ^ + } + ocnt = 0; + + last_checksum = (pkts++, psum); + } + } + + // Advance Timestamp Counter + cnt_clk++; + + // Copy Status Outputs + status = timestamp_ovf | (timestamp_unf << 1); + latency = last_latency; + interval = last_interval; + checksum = last_checksum; + min_latency = cur_min_latency; + + } // instrument() + + void instrumentation_wrapper( + hls::stream &finnix, + hls::stream &finnox, + ap_uint<32> cfg, + ap_uint<32> &status, + ap_uint<32> &latency, + ap_uint<32> &interval, + ap_uint<32> &checksum, + ap_uint<32> &min_latency + ) { + #pragma HLS interface axis port=finnix + #pragma HLS interface axis port=finnox + #pragma HLS interface s_axilite bundle=ctrl port=cfg + #pragma HLS interface s_axilite bundle=ctrl port=status + #pragma HLS interface s_axilite bundle=ctrl port=latency + #pragma HLS interface s_axilite bundle=ctrl port=interval + #pragma HLS interface s_axilite bundle=ctrl port=checksum + #pragma HLS interface s_axilite bundle=ctrl port=min_latency + #pragma HLS interface ap_ctrl_none port=return + + #pragma HLS dataflow disable_start_propagation + static hls::stream finnix0; + static hls::stream::type> finnox0; + #pragma HLS stream variable=finnix0 depth=2 + #pragma HLS stream variable=finnox0 depth=2 + + // AXI-Stream -> FIFO + move(finnox, finnox0); + + // Main + instrument(finnix0, finnox0, cfg, status, latency, interval, checksum, min_latency); + + // FIFO -> AXI-Stream + move(finnix0, 
finnix); + + } // instrumentation_wrapper diff --git a/custom_hls/instrumentation_sim.template.tcl b/custom_hls/instrumentation_sim.template.tcl new file mode 100644 index 0000000000..4875d799e2 --- /dev/null +++ b/custom_hls/instrumentation_sim.template.tcl @@ -0,0 +1,67 @@ +# Copyright (c) 2023 Advanced Micro Devices, Inc. +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# * Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# * Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# * Neither the name of AMD nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +set fpga_part @FPGA_PART@ +#set output_root ".." 
+# path to IP folder for instrumentation wrapper, change as needed +#set instrwrp_ip_dir "$output_root/instrumentation_wrapper/project_instrwrap/sol1/impl/ip" +# path to IP folder for FINN IP, change as needed +#set finn_ip_dir "$output_root/stitched_ip/ip" + +create_project -force instr_sim_proj instr_sim_proj/ -part $fpga_part +create_bd_design "dut" +update_compile_order -fileset sources_1 +#set_property ip_repo_paths [list $instrwrp_ip_dir] [current_project] +set_property ip_repo_paths [concat [get_property ip_repo_paths [current_project]] @IP_DIRS_STR@] [current_project] +update_ip_catalog + + +create_bd_cell -type ip -vlnv xilinx_finn:finn:finn_design:1.0 finn_design_0 +create_bd_cell -type ip -vlnv xilinx.com:hls:instrumentation_wrapper:1.0 instrumentation_wrap_0 +connect_bd_intf_net [get_bd_intf_pins instrumentation_wrap_0/finnix] [get_bd_intf_pins finn_design_0/s_axis_0] +connect_bd_intf_net [get_bd_intf_pins finn_design_0/m_axis_0] [get_bd_intf_pins instrumentation_wrap_0/finnox] +make_bd_intf_pins_external [get_bd_intf_pins instrumentation_wrap_0/s_axi_ctrl] +make_bd_pins_external [get_bd_pins instrumentation_wrap_0/ap_clk] +make_bd_pins_external [get_bd_pins instrumentation_wrap_0/ap_rst_n] +connect_bd_net [get_bd_ports ap_clk_0] [get_bd_pins finn_design_0/ap_clk] +connect_bd_net [get_bd_ports ap_rst_n_0] [get_bd_pins finn_design_0/ap_rst_n] + +save_bd_design + +update_compile_order -fileset sources_1 +make_wrapper -files [get_files instr_sim_proj/instr_sim_proj.srcs/sources_1/bd/dut/dut.bd] -top +add_files -norecurse instr_sim_proj/instr_sim_proj.gen/sources_1/bd/dut/hdl/dut_wrapper.v + +set_property SOURCE_SET sources_1 [get_filesets sim_1] +add_files -fileset sim_1 ./instrwrap_testbench.sv +update_compile_order -fileset sim_1 + +set_property synth_checkpoint_mode None [get_files instr_sim_proj/instr_sim_proj.srcs/sources_1/bd/dut/dut.bd] +generate_target Simulation [get_files instr_sim_proj/instr_sim_proj.srcs/sources_1/bd/dut/dut.bd] 
+launch_simulation -simset sim_1 -mode behavioral +run all diff --git a/custom_hls/instrumentation_tb.template.sv b/custom_hls/instrumentation_tb.template.sv new file mode 100644 index 0000000000..933104c623 --- /dev/null +++ b/custom_hls/instrumentation_tb.template.sv @@ -0,0 +1,172 @@ +// Copyright (c) 2023 Advanced Micro Devices, Inc. +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// +// * Redistributions of source code must retain the above copyright notice, this +// list of conditions and the following disclaimer. +// +// * Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// +// * Neither the name of AMD nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +// DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +// FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +// DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +// SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ + +module tb #( + // sampling period (in cycles) for reading instrumentation wrapper registers + // TODO: make configurable or adjust automatically? + int unsigned INSTR_READ_PERIOD = 10000, + // 16-bit LFSR seed for generating fixed random data + int unsigned LFSR_SEED = 1 +)(); + + +// Clock & Reset +logic ap_clk = 0; +always #5ns ap_clk = !ap_clk; +logic ap_rst_n = 0; +uwire ap_rst = !ap_rst_n; + +// wires for instrumentation wrapper AXI lite interface +logic [31:0] axilite_ctrl_araddr = 'x; +uwire axilite_ctrl_arready; +logic axilite_ctrl_arvalid = 0; +logic [31:0] axilite_ctrl_awaddr = 'x; +uwire axilite_ctrl_awready; +logic axilite_ctrl_awvalid = 0; +uwire axilite_ctrl_bready = 1; +uwire [1:0]axilite_ctrl_bresp; +uwire axilite_ctrl_bvalid; +uwire [31:0]axilite_ctrl_rdata; +logic axilite_ctrl_rready = 1; +uwire [1:0]axilite_ctrl_rresp; +uwire axilite_ctrl_rvalid; +logic [31:0] axilite_ctrl_wdata = 'x; +uwire axilite_ctrl_wready; +uwire [3:0]axilite_ctrl_wstrb = 4'b1111; +logic axilite_ctrl_wvalid = 0; + + + + +dut_wrapper dut_wrapper_inst ( + .ap_clk_0(ap_clk), .ap_rst_n_0(ap_rst_n), + .s_axi_ctrl_0_araddr(axilite_ctrl_araddr), + .s_axi_ctrl_0_arready(axilite_ctrl_arready), + .s_axi_ctrl_0_arvalid(axilite_ctrl_arvalid), + .s_axi_ctrl_0_awaddr(axilite_ctrl_awaddr), + .s_axi_ctrl_0_awready(axilite_ctrl_awready), + .s_axi_ctrl_0_awvalid(axilite_ctrl_awvalid), + .s_axi_ctrl_0_bready(axilite_ctrl_bready), + .s_axi_ctrl_0_bresp(axilite_ctrl_bresp), + .s_axi_ctrl_0_bvalid(axilite_ctrl_bvalid), + .s_axi_ctrl_0_rdata(axilite_ctrl_rdata), + .s_axi_ctrl_0_rready(axilite_ctrl_rready), + .s_axi_ctrl_0_rresp(axilite_ctrl_rresp), + .s_axi_ctrl_0_rvalid(axilite_ctrl_rvalid), + .s_axi_ctrl_0_wdata(axilite_ctrl_wdata), + .s_axi_ctrl_0_wready(axilite_ctrl_wready), + .s_axi_ctrl_0_wstrb(axilite_ctrl_wstrb), + .s_axi_ctrl_0_wvalid(axilite_ctrl_wvalid) +); + +//--------------------------------------------------------------------------- + +initial begin + $timeformat(-9, 2, " 
ns"); + // perform reset + repeat(100) @(posedge ap_clk); + ap_rst_n <= 1; + $display("Reset complete"); + repeat(100) @(posedge ap_clk); + // instrumentation wrapper configuration: + // set up LFSR seed + start data generation + output sink + axilite_ctrl_awaddr <= 'h10; + axilite_ctrl_awvalid <= 1; + axilite_ctrl_wdata <= (LFSR_SEED << 16) | 'b11; + axilite_ctrl_wvalid <= 1; + repeat(8) begin + @(posedge ap_clk); + if(axilite_ctrl_wready && axilite_ctrl_awready) break; + end + axilite_ctrl_wvalid <= 0; + axilite_ctrl_awvalid <= 0; + axilite_ctrl_awaddr <= 'x; + axilite_ctrl_wdata <= 'x; + while(1) begin + axilite_ctrl_araddr <= 'h18; + axilite_ctrl_arvalid <= 1; + repeat(8) begin + @(posedge ap_clk); + if(axilite_ctrl_rvalid) begin + $display("[t=%0t] STATUS_I = %0d", $time, axilite_ctrl_rdata); + break; + end + end + axilite_ctrl_araddr <= 'h20; + axilite_ctrl_arvalid <= 1; + repeat(8) begin + @(posedge ap_clk); + if(axilite_ctrl_rvalid) begin + $display("[t=%0t] STATUS_O = %0d", $time, axilite_ctrl_rdata); + break; + end + end + axilite_ctrl_araddr <= 'h28; + axilite_ctrl_arvalid <= 1; + repeat(8) begin + @(posedge ap_clk); + if(axilite_ctrl_rvalid) begin + $display("[t=%0t] LATENCY = %0d", $time, axilite_ctrl_rdata); + break; + end + end + axilite_ctrl_araddr <= 'h38; + axilite_ctrl_arvalid <= 1; + repeat(8) begin + @(posedge ap_clk); + if(axilite_ctrl_rvalid) begin + $display("[t=%0t] INTERVAL = %0d", $time, axilite_ctrl_rdata); + break; + end + end + axilite_ctrl_araddr <= 'h48; + axilite_ctrl_arvalid <= 1; + repeat(8) begin + @(posedge ap_clk); + if(axilite_ctrl_rvalid) begin + $display("[t=%0t] CHECKSUM = %8x", $time, axilite_ctrl_rdata); + if(axilite_ctrl_rdata) begin + $display("Nonzero checksum detected, stopping simulation"); + $finish; + // TODO: simulate for configurable number of frames, like this: + // if(axilite_ctrl_rdata[31:24] == 47) begin + // $display("Frame number 48 detected, stopping simulation"); + // $finish; + // end + end + break; + 
end + end + axilite_ctrl_arvalid <= 0; + repeat(INSTR_READ_PERIOD) @(posedge ap_clk); + end +end + + +endmodule : tb diff --git a/notebooks/end2end_example/bnn-pynq/cnv_end2end_example.ipynb b/notebooks/end2end_example/bnn-pynq/cnv_end2end_example.ipynb index 2b01f24557..014a13db27 100644 --- a/notebooks/end2end_example/bnn-pynq/cnv_end2end_example.ipynb +++ b/notebooks/end2end_example/bnn-pynq/cnv_end2end_example.ipynb @@ -456,8 +456,8 @@ "metadata": {}, "outputs": [], "source": [ - "from finn.transformation.fpgadataflow.make_driver import MakePYNQDriver\n", - "model = model.transform(MakePYNQDriver(\"zynq-iodma\"))" + "from finn.transformation.fpgadataflow.make_driver import MakePYNQDriverIODMA\n", + "model = model.transform(MakePYNQDriverIODMA(\"zynq-iodma\"))" ] }, { diff --git a/notebooks/end2end_example/bnn-pynq/tfc_end2end_example.ipynb b/notebooks/end2end_example/bnn-pynq/tfc_end2end_example.ipynb index b0510b0fdb..de6de23d3f 100644 --- a/notebooks/end2end_example/bnn-pynq/tfc_end2end_example.ipynb +++ b/notebooks/end2end_example/bnn-pynq/tfc_end2end_example.ipynb @@ -751,8 +751,8 @@ "metadata": {}, "outputs": [], "source": [ - "from finn.transformation.fpgadataflow.make_driver import MakePYNQDriver\n", - "model = model.transform(MakePYNQDriver(\"zynq-iodma\"))" + "from finn.transformation.fpgadataflow.make_driver import MakePYNQDriverIODMA\n", + "model = model.transform(MakePYNQDriverIODMA(\"zynq-iodma\"))" ] }, { diff --git a/src/finn/builder/build_dataflow_config.py b/src/finn/builder/build_dataflow_config.py index 42e015226d..c124b213ac 100644 --- a/src/finn/builder/build_dataflow_config.py +++ b/src/finn/builder/build_dataflow_config.py @@ -340,6 +340,10 @@ class DataflowBuildConfig(DataClassJSONMixin, DataClassYAMLMixin): #: debug signals in the generated hardware) enable_hw_debug: Optional[bool] = False + #: Whether the accelerator will be simulated and synthesized with an + #: instrumentation wrapper attached to accurately measure performance. 
+ enable_instrumentation: Optional[bool] = False + #: Whether pdb postmortem debuggig will be launched when the build fails enable_build_pdb_debug: Optional[bool] = False diff --git a/src/finn/builder/build_dataflow_steps.py b/src/finn/builder/build_dataflow_steps.py index b8d421b5bc..bf7ae19feb 100644 --- a/src/finn/builder/build_dataflow_steps.py +++ b/src/finn/builder/build_dataflow_steps.py @@ -79,7 +79,12 @@ from finn.transformation.fpgadataflow.hlssynth_ip import HLSSynthIP from finn.transformation.fpgadataflow.insert_dwc import InsertDWC from finn.transformation.fpgadataflow.insert_fifo import InsertFIFO -from finn.transformation.fpgadataflow.make_driver import MakeCPPDriver, MakePYNQDriver +from finn.transformation.fpgadataflow.insert_tlastmarker import InsertTLastMarker +from finn.transformation.fpgadataflow.make_driver import ( + MakeCPPDriver, + MakePYNQDriverInstrumentation, + MakePYNQDriverIODMA, +) from finn.transformation.fpgadataflow.make_zynq_proj import ZynqBuild from finn.transformation.fpgadataflow.minimize_accumulator_width import MinimizeAccumulatorWidth from finn.transformation.fpgadataflow.minimize_weight_bit_width import MinimizeWeightBitWidth @@ -617,6 +622,26 @@ def step_create_stitched_ip(model: ModelWrapper, cfg: DataflowBuildConfig): """Create stitched IP for a graph after all HLS IP blocks have been generated. Depends on the DataflowOutputType.STITCHED_IP output product.""" + # introduce tLAST marker, required for instrumentation + if cfg.enable_instrumentation: + model = model.transform( + InsertTLastMarker( + # only insert marker on output (input TLAST is ignored for these use-cases anyway) + both=False, + # use ap_axiu instead of qdma_axis + external=False, + # static number of iterations (based on what the compiler/folding sets up) + dynamic=False, + ) + ) + # give a proper name to the inserted node, important for codegen + # TODO: deal with multi-I/O accelerators? 
+ model.graph.node[-1].name = "TLastMarker_0" + # re-run codegen and HLS IP gen, will affect only the new TLastMarker layer assuming + # all other IPs have been generated already + model = model.transform(PrepareIP(cfg._resolve_fpga_part(), cfg._resolve_hls_clk_period())) + model = model.transform(HLSSynthIP()) + if DataflowOutputType.STITCHED_IP in cfg.generate_outputs: stitched_ip_dir = cfg.output_dir + "/stitched_ip" model = model.transform( @@ -717,7 +742,14 @@ def step_make_driver(model: ModelWrapper, cfg: DataflowBuildConfig): driver_dir = os.path.join(cfg.output_dir, "driver") if DataflowOutputType.PYNQ_DRIVER in cfg.generate_outputs: # generate PYNQ driver - model = model.transform(MakePYNQDriver(cfg._resolve_driver_platform())) + if cfg.enable_instrumentation: + model = model.transform( + MakePYNQDriverInstrumentation( + cfg._resolve_driver_platform(), cfg.synth_clk_period_ns + ) + ) + else: + model = model.transform(MakePYNQDriverIODMA(cfg._resolve_driver_platform())) shutil.copytree(model.get_metadata_prop("pynq_driver_dir"), driver_dir, dirs_exist_ok=True) log.info("PYNQ Python driver written into " + driver_dir) elif DataflowOutputType.CPP_DRIVER in cfg.generate_outputs: @@ -779,6 +811,7 @@ def step_synthesize_bitfile(model: ModelWrapper, cfg: DataflowBuildConfig): cfg.board, cfg.synth_clk_period_ns, cfg.enable_hw_debug, + cfg.enable_instrumentation, partition_model_dir=partition_model_dir, ) ) diff --git a/src/finn/qnn-data/templates/driver/driver_base.py b/src/finn/qnn-data/templates/driver/driver_base.py index a6ff29d608..af55ee13df 100644 --- a/src/finn/qnn-data/templates/driver/driver_base.py +++ b/src/finn/qnn-data/templates/driver/driver_base.py @@ -38,7 +38,7 @@ # Driver base class for FINN-generated dataflow accelerators. # The particulars of the generated accelerator are specified via the -# io_shape_dict (generated by the MakePYNQDriver transformation). +# io_shape_dict (generated by the MakePYNQDriverIODMA transformation). 
class FINNExampleOverlay(Overlay): diff --git a/src/finn/qnn-data/templates/driver/driver_instrumentation.py b/src/finn/qnn-data/templates/driver/driver_instrumentation.py new file mode 100644 index 0000000000..aa5225eab6 --- /dev/null +++ b/src/finn/qnn-data/templates/driver/driver_instrumentation.py @@ -0,0 +1,169 @@ +import argparse +import json +import time +from pynq import PL, Overlay +from pynq.pl_server.device import Device +from pynq.ps import Clocks + +# Instrumentation wrapper register map # +# ap_uint<32> cfg, // [0] - 0:hold, 1:lfsr; [31:16] - LFSR seed +# ap_uint<32> &status, // [0] - timestamp overflow; [1] - timestamp underflow +# ap_uint<32> &latency, +# ap_uint<32> &interval, +# ap_uint<32> &checksum, +# ap_uint<32> &min_latency + + +class FINNInstrumentationOverlay(Overlay): + def __init__( + self, + bitfile_name, + platform="zynq", + fclk_mhz=100.0, + device=None, + download=True, + seed=1, + ): + super().__init__(bitfile_name, download=download, device=device) + + self.platform = platform + self.fclk_mhz = fclk_mhz + self.seed = seed + + # configure clock (for ZYNQ platforms) + if self.platform == "zynq": + if self.fclk_mhz > 0: + Clocks.fclk0_mhz = self.fclk_mhz + self.fclk_mhz_actual = Clocks.fclk0_mhz + + def instrumentation_read(self, name): + return self.instrumentation_wrap_0.read( + offset=self.ip_dict["instrumentation_wrap_0"]["registers"][name]["address_offset"] + ) + + def instrumentation_write(self, name, value): + return self.instrumentation_wrap_0.write( + offset=self.ip_dict["instrumentation_wrap_0"]["registers"][name]["address_offset"], + value=value, + ) + + def reset_accelerator(self): + self.axi_gpio_0.write( + offset=self.ip_dict["axi_gpio_0"]["registers"]["GPIO_DATA"]["address_offset"], value=0 + ) + + def start_accelerator(self): + lfsr_seed = (self.seed << 16) & 0xFFFF0000 # upper 16 bits + self.instrumentation_write("cfg", lfsr_seed + 1) # start operation + + def observe_instrumentation(self, debug_print=True): + 
status_reg = self.instrumentation_read("status") + chksum_reg = self.instrumentation_read("checksum") + min_latency = self.instrumentation_read("min_latency") + latency = self.instrumentation_read("latency") + interval = self.instrumentation_read("interval") + + frame = (chksum_reg >> 24) & 0x000000FF + checksum = chksum_reg & 0x00FFFFFF + overflow_err = (status_reg & 0x00000001) != 0 + underflow_err = (status_reg & 0x00000002) != 0 + + if debug_print: + print("---INSTRUMENTATION_REPORT---") + if overflow_err or underflow_err: + print("Status ERROR") + print("Overflow error: %s" % overflow_err) + print("Underflow error: %s" % underflow_err) + else: + print("Status OK") + print("Frame number (8-bit): %d" % frame) + print("Checksum: 0x%06x" % checksum) + print("Min Latency (cycles): %d" % min_latency) + print("Latency (cycles): %d" % latency) + print("Interval (cycles): %d" % interval) + print("----------------------------") + + return (overflow_err, underflow_err, frame, checksum, min_latency, latency, interval) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser( + description="Profile FINN-generated accelerator using instrumentation wrapper" + ) + parser.add_argument("--runtime", help="Runtime in seconds", type=int, default=10) + parser.add_argument( + "--frequency", help="FPGA clock frequency in MHz", type=float, default=100.0 + ) + parser.add_argument("--seed", help="LFSR seed for input data generation", type=int, default=1) + parser.add_argument("--device", help="FPGA device to be used", type=int, default=0) + parser.add_argument("--bitfile", help="Name of bitfile", default="finn-accel.bit") + parser.add_argument( + "--reportfile", + help="Name of output .json report file", + type=str, + default="measured_performance.json", + ) + parser.add_argument( + "--settingsfile", help="Name of optional input .json settings file", type=str, default="" + ) + # parse arguments + args = parser.parse_args() + runtime = args.runtime + frequency = args.frequency 
+ seed = args.seed + bitfile = args.bitfile + reportfile = args.reportfile + settingsfile = args.settingsfile + devID = args.device + device = Device.devices[devID] + + # overwrite frequency if specified in settings file + if settingsfile != "": + with open(settingsfile, "r") as f: + settings = json.load(f) + if "fclk_mhz" in settings: + frequency = settings["fclk_mhz"] + + # instantiate FINN accelerator driver and pass batchsize and bitfile + print("Programming FPGA..") + PL.reset() # reset PYNQ cache + accel = FINNInstrumentationOverlay( + bitfile_name=bitfile, device=device, fclk_mhz=frequency, seed=seed + ) + + # start accelerator + print("Running accelerator..") + accel.start_accelerator() + + # let it run for specified runtime + time.sleep(runtime) + + # read measurement from instrumentation + ( + overflow_err, + underflow_err, + frame, + checksum, + min_latency, + latency, + interval, + ) = accel.observe_instrumentation() + + # write report to file + report = { + "error": overflow_err or underflow_err or interval == 0, + "checksum": checksum, + "min_latency_cycles": min_latency, + "latency_cycles": latency, + "interval_cycles": interval, + "frequency_mhz": round(accel.fclk_mhz_actual), + "min_latency_ms": round(min_latency * (1 / (accel.fclk_mhz_actual * 1e6)) * 1e3, 6), + "latency_ms": round(latency * (1 / (accel.fclk_mhz_actual * 1e6)) * 1e3, 6), + "throughput_fps": round(1 / (interval * (1 / (accel.fclk_mhz_actual * 1e6)))), + "min_pipeline_depth": round(min_latency / interval, 2), + "pipeline_depth": round(latency / interval, 2), + } + with open(reportfile, "w") as f: + json.dump(report, f, indent=2) + + print("Done.") diff --git a/src/finn/transformation/fpgadataflow/floorplan.py b/src/finn/transformation/fpgadataflow/floorplan.py index 38312ce1ee..ceff0b4f8a 100644 --- a/src/finn/transformation/fpgadataflow/floorplan.py +++ b/src/finn/transformation/fpgadataflow/floorplan.py @@ -99,9 +99,13 @@ def apply(self, model): # if we have SLR assignment 
already. use that if node_slr != -1: continue + # if available, use the SLR of the preceding node srcnode = model.find_producer(node.input[0]) - node_slr = getCustomOp(srcnode).get_nodeattr("slr") - node_inst.set_nodeattr("slr", node_slr) + if srcnode is not None: + node_slr = getCustomOp(srcnode).get_nodeattr("slr") + node_inst.set_nodeattr("slr", node_slr) + else: + node_inst.set_nodeattr("slr", default_slr) if unassigned_nodes > 0: warning_str = f"{unassigned_nodes} nodes have no entry in\ @@ -127,25 +131,27 @@ def apply(self, model): ) non_dma_nodes = list(filter(lambda x: x not in dyn_tlastmarker_nodes, non_dma_nodes)) + # assign every DMA node to its own partition for node in dma_nodes: node_inst = getCustomOp(node) node_inst.set_nodeattr("partition_id", partition_cnt) partition_cnt += 1 + # assign every dynamic tLastMarker node to its own partition for node in dyn_tlastmarker_nodes: node_inst = getCustomOp(node) node_inst.set_nodeattr("partition_id", partition_cnt) partition_cnt += 1 + # handle remaining nodes for node in non_dma_nodes: pre_node = model.find_producer(node.input[0]) node_inst = getCustomOp(node) if pre_node not in non_dma_nodes: - # input node + # input node -> start new partition node_inst.set_nodeattr("partition_id", partition_cnt) partition_cnt += 1 continue - elif not ( node.op_type.startswith("MVAU") and node_inst.get_nodeattr("mem_mode") is not None @@ -153,25 +159,36 @@ def apply(self, model): ): pre_nodes = model.find_direct_predecessors(node) else: + # exception for external weight MVAU: only consider primary input + # TODO: (why) is this necessary? should we consider such exceptions for other cases? 
pre_nodes = [pre_node] + axilite_intf_name = node_inst.get_verilog_top_module_intf_names()["axilite"] + if len(axilite_intf_name) != 0: + # This node has an AXI-Lite interface -> start new partition + node_inst.set_nodeattr("partition_id", partition_cnt) + partition_cnt += 1 + continue + + # examine all predecessor nodes to determine partition id for this node node_slr = node_inst.get_nodeattr("slr") + slr_mismatch_count = 0 for pre_node in pre_nodes: pre_inst = getCustomOp(pre_node) pre_slr = pre_inst.get_nodeattr("slr") if node_slr == pre_slr: - axilite_intf_name = pre_inst.get_verilog_top_module_intf_names()["axilite"] - if len(axilite_intf_name) != 0: - node_inst.set_nodeattr("partition_id", partition_cnt) - partition_cnt += 1 - else: - partition_id = pre_inst.get_nodeattr("partition_id") - node_inst.set_nodeattr("partition_id", partition_id) - + # Default case -> assign to same partition as predecessor + partition_id = pre_inst.get_nodeattr("partition_id") + node_inst.set_nodeattr("partition_id", partition_id) + break else: - # no matching, new partition - node_inst.set_nodeattr("partition_id", partition_cnt) - partition_cnt += 1 + # SLR mismatch with predecessor, can't assign same partition + slr_mismatch_count += 1 + + if slr_mismatch_count == len(pre_nodes): + # SLR mismatch with ALL predecessors -> start new partition + node_inst.set_nodeattr("partition_id", partition_cnt) + partition_cnt += 1 # save the updated floorplan floorplan = model.analysis(floorplan_params) diff --git a/src/finn/transformation/fpgadataflow/instrumentation.py b/src/finn/transformation/fpgadataflow/instrumentation.py new file mode 100644 index 0000000000..f2b3b21f6d --- /dev/null +++ b/src/finn/transformation/fpgadataflow/instrumentation.py @@ -0,0 +1,206 @@ +import numpy as np +import os +import subprocess +from qonnx.custom_op.registry import getCustomOp +from qonnx.transformation.base import Transformation + +from finn.custom_op.fpgadataflow.templates import ipgentcl_template 
+from finn.util.basic import make_build_dir +from finn.util.deps import get_deps_path +from finn.util.hls import CallHLS + + +# TODO: duplicate function from make_zynq_proj.py +def collect_ip_dirs(model, ipstitch_path): + # collect list of all IP dirs + ip_dirs = [] + need_memstreamer = False + for node in model.graph.node: + node_inst = getCustomOp(node) + ip_dir_value = node_inst.get_nodeattr("ip_path") + assert os.path.isdir( + ip_dir_value + ), """The directory that should + contain the generated ip blocks doesn't exist.""" + ip_dirs += [ip_dir_value] + if node.op_type.startswith("MVAU") or node.op_type == "Thresholding_hls": + if node_inst.get_nodeattr("mem_mode") == "internal_decoupled": + need_memstreamer = True + ip_dirs += [ipstitch_path + "/ip"] + if need_memstreamer: + # add RTL streamer IP + ip_dirs.append("$::env(FINN_RTLLIB)/memstream") + return ip_dirs + + +class GenerateInstrumentationIP(Transformation): + def __init__( + self, + fpga_part, + clk_period_ns, + format="ip", # "ip" for Vivado (Zynq) or "xo" for Vitis (Alveo/Versal) + ): + super().__init__() + self.fpga_part = fpga_part + self.clk_period_ns = clk_period_ns + self.format = format + + def apply(self, model): + # Create directory for code-gen and HLS of instrumentation IP + wrapper_output_dir = make_build_dir(prefix="code_gen_ipgen_Instrumentation_") + model.set_metadata_prop("instrumentation_ipgen", wrapper_output_dir) + + # conservative max for pending feature maps: number of layers + pending = len(model.graph.node) + # query the parallelism-dependent folded input shape from the + # node consuming the graph input + inp_name = model.graph.input[0].name + inp_node = getCustomOp(model.find_consumer(inp_name)) + inp_shape_folded = list(inp_node.get_folded_input_shape()) + inp_stream_width = inp_node.get_instream_width_padded() + # number of beats per input is given by product of folded input + # shape except the last dim (which is the stream width) + ilen = np.prod(inp_shape_folded[:-1]) + 
ti = "ap_uint<%d>" % inp_stream_width + # perform the same for the output + out_name = model.graph.output[0].name + out_node = getCustomOp(model.find_producer(out_name)) + out_shape_folded = list(out_node.get_folded_output_shape()) + out_stream_width = out_node.get_outstream_width_padded() + olen = np.prod(out_shape_folded[:-1]) + to = "ap_uint<%d>" % out_stream_width + ko = out_shape_folded[-1] + # fill out instrumentation wrapper template + with open( + os.path.join(os.environ["FINN_CUSTOM_HLS"], "instrumentation.template.cpp"), "r" + ) as f: + instrwrp_cpp = f.read() + instrwrp_cpp = instrwrp_cpp.replace("@PENDING@", str(pending)) + instrwrp_cpp = instrwrp_cpp.replace("@ILEN@", str(ilen)) + instrwrp_cpp = instrwrp_cpp.replace("@OLEN@", str(olen)) + instrwrp_cpp = instrwrp_cpp.replace("@TI@", str(ti)) + instrwrp_cpp = instrwrp_cpp.replace("@TO@", str(to)) + instrwrp_cpp = instrwrp_cpp.replace("@KO@", str(ko)) + with open(wrapper_output_dir + "/top_instrumentation_wrapper.cpp", "w") as f: + f.write(instrwrp_cpp) + # fill out HLS synthesis tcl template + prjname = "project_instrwrap" + ipgentcl = ipgentcl_template + ipgentcl = ipgentcl.replace("$PROJECTNAME$", prjname) + ipgentcl = ipgentcl.replace("$HWSRCDIR$", wrapper_output_dir) + ipgentcl = ipgentcl.replace("$FINNHLSLIB$", str(get_deps_path() / "finn-hlslib")) + ipgentcl = ipgentcl.replace("$ATTENTIONHLSLIB$", str(get_deps_path() / "attention-hlslib")) + ipgentcl = ipgentcl.replace("$TOPFXN$", "instrumentation_wrapper") + ipgentcl = ipgentcl.replace("$FPGAPART$", self.fpga_part) + ipgentcl = ipgentcl.replace("$CLKPERIOD$", str(self.clk_period_ns)) + ipgentcl = ipgentcl.replace("$DEFAULT_DIRECTIVES$", "") + if self.format == "xo": + # use Vitis RTL kernel (.xo) output instead of IP-XACT + ipgentcl = ipgentcl.replace("$EXTRA_DIRECTIVES$", "config_export -format xo") + ipgentcl = ipgentcl.replace( + "export_design -format ip_catalog", "export_design -format xo" + ) + else: + ipgentcl = 
ipgentcl.replace("$EXTRA_DIRECTIVES$", "") + with open(wrapper_output_dir + "/hls_syn.tcl", "w") as f: + f.write(ipgentcl) + # build bash script to launch HLS synth and call it + code_gen_dir = wrapper_output_dir + builder = CallHLS() + builder.append_tcl(code_gen_dir + "/hls_syn.tcl") + builder.set_ipgen_path(code_gen_dir + "/{}".format(prjname)) + builder.build(code_gen_dir) + ipgen_path = builder.ipgen_path + assert os.path.isdir(ipgen_path), "HLS IPGen failed: %s not found" % (ipgen_path) + ip_path = ipgen_path + "/sol1/impl/ip" + assert os.path.isdir(ip_path), "HLS IPGen failed: %s not found. Check log under %s" % ( + ip_path, + code_gen_dir, + ) + if self.format == "xo": + assert False, "Not implemented" + # TODO: export for use in VitisBuild or VersalBuild + # xo_dir = self.output_dir + "/xo" + # xo_dir = str(os.path.abspath(xo_dir)) + # os.makedirs(xo_dir, exist_ok=True) + # xo_path = code_gen_dir + "/{}/sol1/impl/export.xo".format(prjname) + # xo_instr_path = xo_dir + "/instrumentation_wrapper.xo" + # shutil.copy(xo_path, xo_instr_path) + else: + # shutil.move(ip_path, self.output_dir) + pass + + return (model, False) + + +class PrepareInstrumentationSim(Transformation): + def __init__(self, fpga_part): + super().__init__() + self.fpga_part = fpga_part + + def apply(self, model): + # Create directory for simulation of instrumentation IP + FINN IP + sim_output_dir = make_build_dir(prefix="sim_Instrumentation_") + model.set_metadata_prop("instrumentation_sim", sim_output_dir) + + # check if instrumentation IP was generated + instr_ip_dir = model.get_metadata_prop("instrumentation_ipgen") + if instr_ip_dir is None or (not os.path.isdir(instr_ip_dir)): + raise Exception( + "Instrumentation IP not generated, run GenerateInstrumentationIP first." 
+ ) + + # TODO: Support simulation with AXI-lite control interfaces (e.g., for dynamic pipelines) + # fill in testbench template + with open( + os.path.join(os.environ["FINN_CUSTOM_HLS"], "instrumentation_tb.template.sv"), + "r", + ) as f: + testbench_sv = f.read() + with open(sim_output_dir + "/instrwrap_testbench.sv", "w") as f: + f.write(testbench_sv) + # fill in testbench project creator template + with open( + os.path.join(os.environ["FINN_CUSTOM_HLS"], "instrumentation_sim.template.tcl"), + "r", + ) as f: + testbench_tcl = f.read() + + # collect ip repo paths for finn accelerator sub cores so Vivado can find them + ipstitch_path = model.get_metadata_prop("vivado_stitch_proj") + ip_dirs = ["list"] + ip_dirs += collect_ip_dirs(model, ipstitch_path) + ip_dirs += [instr_ip_dir] + ip_dirs_str = "[%s]" % (" ".join(ip_dirs)) + testbench_tcl = testbench_tcl.replace("@FPGA_PART@", self.fpga_part) + testbench_tcl = testbench_tcl.replace("@IP_DIRS_STR@", ip_dirs_str) + with open(sim_output_dir + "/make_instrwrap_sim_proj.tcl", "w") as f: + f.write(testbench_tcl) + + return (model, False) + + +class RunInstrumentationSim(Transformation): + def __init__(self): + super().__init__() + + def apply(self, model): + sim_output_dir = model.get_metadata_prop("instrumentation_sim") + if sim_output_dir is None or (not os.path.isdir(sim_output_dir)): + raise Exception( + "Instrumentation sim not prepared, run PrepareInstrumentationSim first." 
+ ) + + # Prepare bash script + bash_script = os.getcwd() + "/report_power.sh" + with open(bash_script, "w") as script: + script.write("#!/bin/bash\n") + script.write("cd %s\n" % (sim_output_dir)) + script.write("vivado -mode batch -source make_instrwrap_sim_proj.tcl\n") + + # Run script + print("Running Vivado simulation of instrumentation wrapper") + sub_proc = subprocess.Popen(["bash", bash_script]) + sub_proc.communicate() + + return (model, False) diff --git a/src/finn/transformation/fpgadataflow/make_driver.py b/src/finn/transformation/fpgadataflow/make_driver.py index 97fc97a4fd..1cea95f9c5 100644 --- a/src/finn/transformation/fpgadataflow/make_driver.py +++ b/src/finn/transformation/fpgadataflow/make_driver.py @@ -298,7 +298,7 @@ def formatKernelName(kname: str): return (model, False) -class MakePYNQDriver(Transformation): +class MakePYNQDriverIODMA(Transformation): """Create PYNQ Python code to correctly interface the generated accelerator, including data packing/unpacking. Should be called after conversion to HLS layers, folding and the creation of @@ -459,3 +459,35 @@ def apply(self, model): continue return (model, False) + + +class MakePYNQDriverInstrumentation(Transformation): + def __init__(self, platform, clk_period_ns): + super().__init__() + self.platform = platform + self.clk_period_ns = clk_period_ns + + def apply(self, model): + # TODO: support runtime-writable and external weights + # TODO: support Alveo and Versal platforms + + # create a temporary folder for the generated driver + pynq_driver_dir = make_build_dir(prefix="pynq_driver_") + model.set_metadata_prop("pynq_driver_dir", pynq_driver_dir) + + # create (copy) the static instrumentation driver + driver_template = ( + os.environ["FINN_QNN_DATA"] + "/templates/driver/driver_instrumentation.py" + ) + driver_py = pynq_driver_dir + "/driver.py" + shutil.copy(driver_template, driver_py) + + # write default settings to driver config file + settings = { + "fclk_mhz": (1.0 / self.clk_period_ns) 
* 1e3, + } + settingsfile = pynq_driver_dir + "/settings.json" + with open(settingsfile, "w") as f: + json.dump(settings, f, indent=2) + + return (model, False) diff --git a/src/finn/transformation/fpgadataflow/make_zynq_proj.py b/src/finn/transformation/fpgadataflow/make_zynq_proj.py index 8110d76461..59d4293323 100644 --- a/src/finn/transformation/fpgadataflow/make_zynq_proj.py +++ b/src/finn/transformation/fpgadataflow/make_zynq_proj.py @@ -27,6 +27,7 @@ # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +import math import os import subprocess from qonnx.core.modelwrapper import ModelWrapper @@ -43,6 +44,7 @@ from finn.transformation.fpgadataflow.insert_dwc import InsertDWC from finn.transformation.fpgadataflow.insert_fifo import InsertFIFO from finn.transformation.fpgadataflow.insert_iodma import InsertIODMA +from finn.transformation.fpgadataflow.instrumentation import GenerateInstrumentationIP from finn.transformation.fpgadataflow.prepare_ip import PrepareIP from finn.transformation.fpgadataflow.specialize_layers import SpecializeLayers from finn.util.basic import make_build_dir, pynq_native_port_width, pynq_part_map @@ -93,6 +95,7 @@ def __init__(self, platform, period_ns, enable_debug=False): self.platform = platform self.period_ns = period_ns self.enable_debug = 1 if enable_debug else 0 + self.enable_gpio_reset = 0 def apply(self, model): # create a config file and empty list of xo files @@ -100,8 +103,96 @@ def apply(self, model): idma_idx = 0 odma_idx = 0 aximm_idx = 0 + nested_interconnect_count = 0 + master_axilite_idx = 0 + axilite_interconnect_idx = 0 axilite_idx = 0 instance_names = {} + + # instantiate instrumentation IP if it was generated + instr_ip_dir = model.get_metadata_prop("instrumentation_ipgen") + if instr_ip_dir is not None and os.path.isdir(instr_ip_dir): + use_instrumentation = True + + # instantiate GPIO IP to trigger reset + 
self.enable_gpio_reset = 1 + # in the template this will connect to first port of interconnect_0 + master_axilite_idx += 1 + + # update IP repository + config.append( + "set_property ip_repo_paths " + "[concat [get_property ip_repo_paths [current_project]] [list %s]] " + "[current_project]" % instr_ip_dir + ) + config.append("update_ip_catalog -rebuild -scan_changes") + # create instance + config.append( + "create_bd_cell -type ip -vlnv %s %s" + % ("xilinx.com:hls:instrumentation_wrapper:1.0", "instrumentation_wrap_0") + ) + # connect clock & reset + config.append( + "connect_bd_net [get_bd_pins instrumentation_wrap_0/ap_clk] " + "[get_bd_pins smartconnect_0/aclk]" + ) + config.append( + "connect_bd_net [get_bd_pins instrumentation_wrap_0/ap_rst_n] " + "[get_bd_pins smartconnect_0/aresetn]" + ) + # connect AXI-lite control interface + config.append( + "connect_bd_intf_net [get_bd_intf_pins instrumentation_wrap_0/s_axi_ctrl] " + "[get_bd_intf_pins axi_interconnect_0/M%02d_AXI]" % (master_axilite_idx) + ) + config.append("assign_axi_addr_proc instrumentation_wrap_0/s_axi_ctrl") + master_axilite_idx += 1 + else: + use_instrumentation = False + + # instantiate nested AXI interconnects if required + # only the nested interconnects and all interfaces connected before this line + # will be connected to the original (master) interconnect + total_axilite_count = 0 + for node in model.graph.node: + sdp_node = getCustomOp(node) + dataflow_model_filename = sdp_node.get_nodeattr("model") + kernel_model = ModelWrapper(dataflow_model_filename) + ifnames = eval(kernel_model.get_metadata_prop("vivado_stitch_ifnames")) + total_axilite_count += len(ifnames["axilite"]) + if total_axilite_count > (64 - master_axilite_idx): + nested_interconnect_count = math.ceil(total_axilite_count / 64.0) + for i in range(1, nested_interconnect_count + 1): + # create instance + config.append( + "create_bd_cell -type ip -vlnv $interconnect_vlnv axi_interconnect_%d" % (i) + ) + # configure instance + 
config.append( + "set_property -dict [list CONFIG.NUM_MI %d] [get_bd_cells axi_interconnect_%d]" + % (min(64, total_axilite_count), i) + ) + # connect to master interconnect + config.append( + "connect_bd_intf_net [get_bd_intf_pins axi_interconnect_0/M%02d_AXI] " + "-boundary_type upper [get_bd_intf_pins axi_interconnect_%d/S00_AXI]" + % (master_axilite_idx, i) + ) + # connect clocks/reset + config.append( + "apply_bd_automation -rule xilinx.com:bd_rule:clkrst -config " + '"Clk /zynq_ps/$zynq_ps_clkname" [get_bd_pins axi_interconnect_%d/ACLK]' % (i) + ) + master_axilite_idx += 1 + total_axilite_count = max(0, total_axilite_count - 64) + + assert total_axilite_count == 0, "Not all AXI-lite interfaces connected!" + + # start populating the first nested interconnect + axilite_interconnect_idx = 1 + else: + axilite_idx = master_axilite_idx + for node in model.graph.node: assert node.op_type == "StreamingDataflowPartition", "Invalid link graph" sdp_node = getCustomOp(node) @@ -145,7 +236,8 @@ def apply(self, model): # define kernel instances # name kernels connected to graph inputs as idmaxx # name kernels connected to graph outputs as odmaxx - if (producer is None) or (consumer == []): + # do not expect IDMA/ODMA when instrumentation is enabled + if not use_instrumentation and ((producer is None) or (consumer == [])): # TODO not a good way of checking for external inp&out # should look at the list of top-level in/out instead if producer is None: @@ -168,8 +260,13 @@ def apply(self, model): assert axilite_intf_name is not None config.append( "connect_bd_intf_net [get_bd_intf_pins %s/%s] " - "[get_bd_intf_pins axi_interconnect_0/M%02d_AXI]" - % (instance_names[node.name], axilite_intf_name, axilite_idx) + "[get_bd_intf_pins axi_interconnect_%d/M%02d_AXI]" + % ( + instance_names[node.name], + axilite_intf_name, + axilite_interconnect_idx, + axilite_idx, + ) ) # assign_bd_address with appropriate range/offset config.append( @@ -178,6 +275,11 @@ def apply(self, model): 
aximm_idx += 1 axilite_idx += 1 + if axilite_idx == 64: + axilite_interconnect_idx += 1 + axilite_idx = 0 + if axilite_interconnect_idx == 0: + master_axilite_idx += 1 else: instance_names[node.name] = node.name config.append( @@ -187,8 +289,13 @@ def apply(self, model): for axilite_intf_name in ifnames["axilite"]: config.append( "connect_bd_intf_net [get_bd_intf_pins %s/%s] " - "[get_bd_intf_pins axi_interconnect_0/M%02d_AXI]" - % (instance_names[node.name], axilite_intf_name, axilite_idx) + "[get_bd_intf_pins axi_interconnect_%d/M%02d_AXI]" + % ( + instance_names[node.name], + axilite_intf_name, + axilite_interconnect_idx, + axilite_idx, + ) ) # assign_bd_address with appropriate range/offset config.append( @@ -196,6 +303,11 @@ def apply(self, model): % (instance_names[node.name], axilite_intf_name) ) axilite_idx += 1 + if axilite_idx == 64: + axilite_interconnect_idx += 1 + axilite_idx = 0 + if axilite_interconnect_idx == 0: + master_axilite_idx += 1 sdp_node.set_nodeattr("instance_name", instance_names[node.name]) config.append( @@ -223,6 +335,33 @@ def apply(self, model): ) ) + # connect first/last dataflow partition to instrumentation wrapper + if use_instrumentation: + if producer is None: + config.append( + "connect_bd_intf_net [get_bd_intf_pins %s/s_axis_0] " + "[get_bd_intf_pins instrumentation_wrap_0/finnix]" + % (instance_names[node.name]) + ) + if consumer == []: + config.append( + "connect_bd_intf_net [get_bd_intf_pins %s/m_axis_0] " + "[get_bd_intf_pins instrumentation_wrap_0/finnox]" + % (instance_names[node.name]) + ) + + # TODO: WORKAROUND, do not instantiate smartconnect when not needed! 
+ if use_instrumentation: + config.append("delete_bd_objs [get_bd_cells smartconnect_0]") + aximm_idx = 1 + + # finalize nested interconnect clock/reset + for i in range(1, nested_interconnect_count + 1): + config.append( + "apply_bd_automation -rule xilinx.com:bd_rule:clkrst -config " + '"Clk /zynq_ps/$zynq_ps_clkname" [get_bd_pins axi_interconnect_%d/M*_ACLK]' % (i) + ) + # create a temporary folder for the project vivado_pynq_proj_dir = make_build_dir(prefix="vivado_zynq_proj_") model.set_metadata_prop("vivado_pynq_proj", vivado_pynq_proj_dir) @@ -238,12 +377,13 @@ def apply(self, model): templates.custom_zynq_shell_template % ( fclk_mhz, - axilite_idx, + master_axilite_idx, aximm_idx, self.platform, pynq_part_map[self.platform], config, self.enable_debug, + self.enable_gpio_reset, ) ).replace("$BOARDFILES$", str(get_deps_path() / "board_files")) ) @@ -307,6 +447,7 @@ def __init__( platform, period_ns, enable_debug=False, + enable_instrumentation=False, partition_model_dir=None, ): super().__init__() @@ -315,19 +456,27 @@ def __init__( self.period_ns = period_ns self.platform = platform self.enable_debug = enable_debug + self.enable_instrumentation = enable_instrumentation self.partition_model_dir = partition_model_dir def apply(self, model): # first infer layouts model = model.transform(InferDataLayouts()) # prepare at global level, then break up into kernels - prep_transforms = [ - InsertIODMA(self.axi_port_width), - InsertDWC(), - SpecializeLayers(self.fpga_part), - Floorplan(), - CreateDataflowPartition(partition_model_dir=self.partition_model_dir), - ] + if self.enable_instrumentation: + prep_transforms = [ + GenerateInstrumentationIP(self.fpga_part, self.period_ns), + Floorplan(), + CreateDataflowPartition(partition_model_dir=self.partition_model_dir), + ] + else: + prep_transforms = [ + InsertIODMA(self.axi_port_width), + InsertDWC(), + SpecializeLayers(self.fpga_part), + Floorplan(), + 
CreateDataflowPartition(partition_model_dir=self.partition_model_dir), + ] for trn in prep_transforms: model = model.transform(trn) model = model.transform(GiveUniqueNodeNames()) @@ -339,7 +488,10 @@ def apply(self, model): sdp_node = getCustomOp(sdp_node) dataflow_model_filename = sdp_node.get_nodeattr("model") kernel_model = ModelWrapper(dataflow_model_filename) - kernel_model = kernel_model.transform(InsertFIFO()) + # InsertFIFO at this stage interferes with tLastMarker + # TODO: is this really needed here at all? + if not self.enable_instrumentation: + kernel_model = kernel_model.transform(InsertFIFO()) kernel_model = kernel_model.transform(SpecializeLayers(self.fpga_part)) kernel_model = kernel_model.transform(GiveUniqueNodeNames(prefix)) kernel_model.save(dataflow_model_filename) diff --git a/src/finn/transformation/fpgadataflow/templates.py b/src/finn/transformation/fpgadataflow/templates.py index 018e2c041c..63a6b00766 100644 --- a/src/finn/transformation/fpgadataflow/templates.py +++ b/src/finn/transformation/fpgadataflow/templates.py @@ -48,6 +48,10 @@ set FPGA_PART %s create_project finn_zynq_link ./ -part $FPGA_PART +# Prevent limitation on number of elements for string representations of Vivado collections of objects +# Otherwise we might run into the default limit of 500 if we have many IP_REPO_PATHS +set_param tcl.collectionResultDisplayLimit 0 + # set board part repo paths to find PYNQ-Z1/Z2 set paths_prop [get_property BOARD_PART_REPO_PATHS [current_project]] set paths_param [get_param board.repoPaths] @@ -90,6 +94,7 @@ create_bd_design "top" if {$ZYNQ_TYPE == "zynq_us+"} { set zynq_ps_vlnv [get_property VLNV [get_ipdefs "xilinx.com:ip:zynq_ultra_ps_e:*"]] + set zynq_ps_clkname "pl_clk0" create_bd_cell -type ip -vlnv $zynq_ps_vlnv zynq_ps apply_bd_automation -rule xilinx.com:bd_rule:zynq_ultra_ps_e -config {apply_board_preset "1" } [get_bd_cells zynq_ps] #activate one slave port, deactivate the second master port @@ -100,6 +105,7 @@ set_property 
-dict [list CONFIG.PSU__CRL_APB__PL0_REF_CTRL__FREQMHZ [expr int($FREQ_MHZ)]] [get_bd_cells zynq_ps] } elseif {$ZYNQ_TYPE == "zynq_7000"} { set zynq_ps_vlnv [get_property VLNV [get_ipdefs "xilinx.com:ip:processing_system7:*"]] + set zynq_ps_clkname "FCLK_CLK0" create_bd_cell -type ip -vlnv $zynq_ps_vlnv zynq_ps apply_bd_automation -rule xilinx.com:bd_rule:processing_system7 -config {make_external "FIXED_IO, DDR" apply_board_preset "1" Master "Disable" Slave "Disable" } [get_bd_cells zynq_ps] set_property -dict [list CONFIG.PCW_USE_S_AXI_HP0 {1}] [get_bd_cells zynq_ps] @@ -166,6 +172,17 @@ ] } +# set up GPIO to trigger reset +if {%d == 1} { + create_bd_cell -type ip -vlnv xilinx.com:ip:axi_gpio:2.0 axi_gpio_0 + set_property -dict [list CONFIG.C_ALL_OUTPUTS {1} CONFIG.C_DOUT_DEFAULT {0x00000001} CONFIG.C_GPIO_WIDTH {1}] [get_bd_cells axi_gpio_0] + connect_bd_intf_net [get_bd_intf_pins axi_gpio_0/S_AXI] -boundary_type upper [get_bd_intf_pins axi_interconnect_0/M00_AXI] + assign_axi_addr_proc axi_gpio_0/S_AXI + connect_bd_net [get_bd_pins axi_gpio_0/s_axi_aclk] [get_bd_pins axi_interconnect_0/ACLK] + connect_bd_net [get_bd_pins axi_gpio_0/s_axi_aresetn] [get_bd_pins axi_interconnect_0/ARESETN] + connect_bd_net [get_bd_pins axi_gpio_0/gpio_io_o] [get_bd_pins rst_zynq_ps_*/aux_reset_in] +} + #finalize clock and reset connections for interconnects if {$ZYNQ_TYPE == "zynq_us+"} { apply_bd_automation -rule xilinx.com:bd_rule:clkrst -config { Clk {/zynq_ps/pl_clk0} } [get_bd_pins axi_interconnect_0/M*_ACLK] diff --git a/tests/end2end/test_end2end_bnn_pynq.py b/tests/end2end/test_end2end_bnn_pynq.py index 9a2da7a45e..9d40b3ba93 100644 --- a/tests/end2end/test_end2end_bnn_pynq.py +++ b/tests/end2end/test_end2end_bnn_pynq.py @@ -73,7 +73,7 @@ from finn.transformation.fpgadataflow.create_stitched_ip import CreateStitchedIP from finn.transformation.fpgadataflow.hlssynth_ip import HLSSynthIP from finn.transformation.fpgadataflow.insert_dwc import InsertDWC -from 
finn.transformation.fpgadataflow.make_driver import MakePYNQDriver +from finn.transformation.fpgadataflow.make_driver import MakePYNQDriverIODMA from finn.transformation.fpgadataflow.minimize_accumulator_width import MinimizeAccumulatorWidth from finn.transformation.fpgadataflow.minimize_weight_bit_width import MinimizeWeightBitWidth from finn.transformation.fpgadataflow.prepare_cppsim import PrepareCppSim @@ -812,7 +812,7 @@ def test_make_pynq_driver(self, topology, wbits, abits, board): prev_chkpt_name = get_checkpoint_name(board, topology, wbits, abits, "build") model = load_test_checkpoint_or_skip(prev_chkpt_name) board_to_driver_platform = "alveo" if build_data["kind"] == "alveo" else "zynq-iodma" - model = model.transform(MakePYNQDriver(board_to_driver_platform)) + model = model.transform(MakePYNQDriverIODMA(board_to_driver_platform)) model.save(get_checkpoint_name(board, topology, wbits, abits, "driver")) def test_deploy(self, topology, wbits, abits, board):