diff --git a/src/finn/analysis/fpgadataflow/dataflow_performance.py b/src/finn/analysis/fpgadataflow/dataflow_performance.py index a4bf40760e..819782184d 100644 --- a/src/finn/analysis/fpgadataflow/dataflow_performance.py +++ b/src/finn/analysis/fpgadataflow/dataflow_performance.py @@ -29,6 +29,7 @@ from qonnx.custom_op.registry import getCustomOp +from finn.util.basic import decompress_string_to_numpy from finn.util.fpgadataflow import is_hls_node, is_rtl_node @@ -76,3 +77,84 @@ def dataflow_performance(model): "max_cycles": int(max_cycles), "max_cycles_node_name": max_node_name, } + + +def max_period(model): + """Extract maximum period among all nodes in the graph + + Preconditions: + - model consists of HLS/RTL nodes + - model has cycle estimates annotated (see AnnotateCycles transformation) + - nodes have unique names (see GiveUniqueNodeNames) + - model has been characteristically derived and contains specific chr periods + + Returns: + - max_cycles : number of cycles for slowest node + - max_cycles_node_name : name of slowest node + - critical_path_cycles : pessimistic expected latency from input to output + """ + max_cycles = 0 + + for node in model.graph.node: + if node is not None and node.op_type not in [ + "AddStreams_hls", + "DuplicateStreams_hls", + "StreamingFIFO_hls", + "StreamingFIFO_rtl", + ]: + if is_hls_node(node) or is_rtl_node(node): + inst = getCustomOp(node) + node_cycles_in = ( + len(decompress_string_to_numpy(inst.get_nodeattr("io_chrc_in"))[0]) // 2 + ) + node_cycles_out = ( + len(decompress_string_to_numpy(inst.get_nodeattr("io_chrc_out"))[0]) // 2 + ) + node_cycles = max(node_cycles_in, node_cycles_out) + + if node_cycles > max_cycles: + max_cycles = node_cycles + + return { + "max_cycles": int(max_cycles), + } + + +def max_remaining_period(model, node): + """Extract maximum period among all nodes in the graph + + Preconditions: + - model consists of HLS/RTL nodes + - model has cycle estimates annotated (see AnnotateCycles transformation) 
+ - nodes have unique names (see GiveUniqueNodeNames) + - model has been characteristically derived and contains specific chr periods + + Returns: + - max_cycles : number of cycles for slowest node + - max_cycles_node_name : name of slowest node + - critical_path_cycles : pessimistic expected latency from input to output + """ + max_cycles = 0 + node_index = list(model.graph.node).index(node) + for node in model.graph.node[node_index:]: + if node is not None and node.op_type not in [ + "AddStreams_hls", + "DuplicateStreams_hls", + "StreamingFIFO_hls", + "StreamingFIFO_rtl", + ]: + if is_hls_node(node) or is_rtl_node(node): + inst = getCustomOp(node) + node_cycles = int(inst.get_nodeattr("io_chrc_period")) + node_cycles_in = ( + len(decompress_string_to_numpy(inst.get_nodeattr("io_chrc_in"))[0]) // 2 + ) + node_cycles_out = ( + len(decompress_string_to_numpy(inst.get_nodeattr("io_chrc_out"))[0]) // 2 + ) + node_cycles = max(node_cycles_in, node_cycles_out) + if node_cycles > max_cycles: + max_cycles = node_cycles + return { + "max_cycles": int(max_cycles), + } diff --git a/src/finn/builder/build_dataflow_config.py b/src/finn/builder/build_dataflow_config.py index a08fc3a04c..d0c42c8e44 100644 --- a/src/finn/builder/build_dataflow_config.py +++ b/src/finn/builder/build_dataflow_config.py @@ -40,11 +40,30 @@ class AutoFIFOSizingMethod(str, Enum): "Select the type of automatic FIFO sizing strategy." - - CHARACTERIZE = "characterize" + ANALYTIC = "analytical" LARGEFIFO_RTLSIM = "largefifo_rtlsim" +class TAVGenerationMethod(str, Enum): + "Select the strategy for constructing token access vectors of an operator." 
+ RTLSIM = "rtlsim" + TREE_MODEL = "tree_model" + + +class TAVUtilizationMethod(str, Enum): + """Select the strategy for utilizing token access vectors of an operator + for buffer sizing.""" + + # worst-case ratio of data rates between a consumer and producer + CONSERVATIVE_RELAXATION = "conservative_relaxation" + + # average-case ratio of data rates between a consumer and producer + AGGRESSIVE_RELAXATION = "aggressive_relaxation" + + # no relaxation, use the token access vectors as-is + NO_RELAXATION = "no_relaxation" + + class ShellFlowType(str, Enum): """For builds that produce a bitfile, select the shell flow that will integrate the FINN-generated accelerator.""" @@ -278,6 +297,31 @@ class DataflowBuildConfig: #: setting the FIFO sizes. auto_fifo_strategy: Optional[AutoFIFOSizingMethod] = AutoFIFOSizingMethod.LARGEFIFO_RTLSIM + #: Which strategy will be used for token access vector generation for FIFO sizing. + #: RTLSIM will result in performing RTLSIM for each node + #: to deduce the token access vectors empirically + #: TREE_MODEL will use the tree mode of an operator if available, avoiding the generation + #: of IP cores. + tav_generation_strategy: Optional[TAVGenerationMethod] = TAVGenerationMethod.RTLSIM + + #: Which strategy will be used for token access vector generation for FIFO sizing. + #: RTLSIM will result in performing RTLSIM for each node + #: to deduce the token access vectors empirically + #: TREE_MODEL will use the tree mode of an operator if available, avoiding the generation + #: of IP cores. + tav_utilization_strategy: Optional[ + TAVUtilizationMethod + ] = TAVUtilizationMethod.CONSERVATIVE_RELAXATION + + #: When True, skips the resynthesis steps after fifo sizing. This makes it + #: possible to run the step for rapid fifo size analysis during + #: automatic folding optimizations or as a first approximation. 
+ skip_resynth_during_fifo_sizing: Optional[bool] = False + + #: Avoid using C++ rtlsim for auto FIFO sizing and rtlsim throughput test + #: if set to True, always using Python instead + force_python_rtlsim: Optional[bool] = False + #: Memory resource type for large FIFOs #: Only relevant when `auto_fifo_depths = True` large_fifo_mem_style: Optional[LargeFIFOMemStyle] = LargeFIFOMemStyle.AUTO diff --git a/src/finn/builder/build_dataflow_steps.py b/src/finn/builder/build_dataflow_steps.py index e81d7d09f7..afa35f0a4b 100644 --- a/src/finn/builder/build_dataflow_steps.py +++ b/src/finn/builder/build_dataflow_steps.py @@ -53,7 +53,10 @@ import finn.transformation.fpgadataflow.convert_to_hw_layers as to_hw import finn.transformation.streamline.absorb as absorb -from finn.analysis.fpgadataflow.dataflow_performance import dataflow_performance +from finn.analysis.fpgadataflow.dataflow_performance import ( + dataflow_performance, + max_period, +) from finn.analysis.fpgadataflow.exp_cycles_per_layer import exp_cycles_per_layer from finn.analysis.fpgadataflow.hls_synth_res_estimation import hls_synth_res_estimation from finn.analysis.fpgadataflow.op_and_param_counts import ( @@ -80,8 +83,13 @@ ) from finn.transformation.fpgadataflow.create_stitched_ip import CreateStitchedIP from finn.transformation.fpgadataflow.derive_characteristic import ( - DeriveCharacteristic, + DelayCharacteristicFunctions, DeriveFIFOSizes, + DeriveTokenAccessVectors, + HandleBranches, + JustInTimeSynthesize, + LocalStretchCharacteristicFunctions, + ProducerDelayCharacteristicFunctions, ) from finn.transformation.fpgadataflow.hlssynth_ip import HLSSynthIP from finn.transformation.fpgadataflow.insert_dwc import InsertDWC @@ -102,6 +110,7 @@ ) from finn.transformation.fpgadataflow.set_exec_mode import SetExecMode from finn.transformation.fpgadataflow.set_fifo_depths import ( + CapConvolutionFIFODepths, InsertAndSetFIFODepths, RemoveShallowFIFOs, SplitLargeFIFOs, @@ -573,19 +582,79 @@ def 
step_set_fifo_depths(model: ModelWrapper, cfg: DataflowBuildConfig): """ if cfg.auto_fifo_depths: - if cfg.auto_fifo_strategy == "characterize": - model = model.transform(InsertDWC()) - model = model.transform(SpecializeLayers(cfg._resolve_fpga_part())) - model = model.transform(GiveUniqueNodeNames()) + model = model.transform(InsertDWC()) + model = model.transform(SpecializeLayers(cfg._resolve_fpga_part())) + model = model.transform(GiveUniqueNodeNames()) + model = model.transform(AnnotateCycles()) + + if cfg.auto_fifo_strategy == "analytical": + if cfg.tav_generation_strategy == "tree_model": + # if we have tree models, only rtlsim nodes for which we dont + only_jit_nodes_without_tree = True + else: + # rtlsim everything by force if not using trees + only_jit_nodes_without_tree = False model = model.transform( - PrepareIP(cfg._resolve_fpga_part(), cfg._resolve_hls_clk_period()) + JustInTimeSynthesize( + cfg._resolve_fpga_part(), + cfg._resolve_hls_clk_period(), + only_jit_nodes_without_tree, + ) + ) + period = int(model.analysis(dataflow_performance)["max_cycles"]) + model = model.transform( + DeriveTokenAccessVectors( + model, + period, + cfg.tav_generation_strategy, + cfg._resolve_fpga_part(), + cfg._resolve_hls_clk_period(), + ) + ) + + period = int(model.analysis(dataflow_performance)["max_cycles"]) + model = model.transform( + LocalStretchCharacteristicFunctions( + 1, + period, + nodes_to_ignore=[], + ) ) - model = model.transform(HLSSynthIP()) - model = model.transform(PrepareRTLSim(behav=True)) - model = model.transform(AnnotateCycles()) - period = model.analysis(dataflow_performance)["max_cycles"] + 10 - model = model.transform(DeriveCharacteristic(period)) - model = model.transform(DeriveFIFOSizes()) + + period = int(model.analysis(dataflow_performance)["max_cycles"]) + + model = model.transform(HandleBranches(model, period)) + + period = int(model.analysis(dataflow_performance)["max_cycles"]) + model = model.transform( + DelayCharacteristicFunctions( + 
1, + period, + nodes_to_ignore=[], + ) + ) + + period = int(model.analysis(dataflow_performance)["max_cycles"]) + + model = model.transform( + ProducerDelayCharacteristicFunctions( + 1, + period, + nodes_to_ignore=[], + ) + ) + + period = int(model.analysis(max_period)["max_cycles"]) + + model = model.transform( + DeriveFIFOSizes( + period=period, + nodes_to_ignore=[], + global_offset_correction=True, + tav_utilization_strategy=cfg.tav_utilization_strategy, + ) + ) + model = model.transform( InsertFIFO( vivado_ram_style=cfg.large_fifo_mem_style, @@ -593,9 +662,13 @@ def step_set_fifo_depths(model: ModelWrapper, cfg: DataflowBuildConfig): create_shallow_fifos=True, ) ) + model = model.transform(SpecializeLayers(cfg._resolve_fpga_part())) model = model.transform(GiveUniqueNodeNames()) model = model.transform(GiveReadableTensorNames()) + if cfg.default_swg_exception: + model = model.transform(CapConvolutionFIFODepths(max_qsrl_depth=256)) + elif cfg.auto_fifo_strategy == "largefifo_rtlsim": if cfg.fifosim_save_waveform: report_dir = cfg.output_dir + "/report" @@ -665,8 +738,10 @@ def step_set_fifo_depths(model: ModelWrapper, cfg: DataflowBuildConfig): # after FIFOs are ready to go, call PrepareIP and HLSSynthIP again # this will only run for the new nodes (e.g. 
FIFOs and DWCs) - model = model.transform(PrepareIP(cfg._resolve_fpga_part(), cfg._resolve_hls_clk_period())) - model = model.transform(HLSSynthIP()) + if not cfg.skip_resynth_during_fifo_sizing: + model = model.transform(PrepareIP(cfg._resolve_fpga_part(), cfg._resolve_hls_clk_period())) + model = model.transform(HLSSynthIP()) + return model diff --git a/src/finn/custom_op/fpgadataflow/addstreams.py b/src/finn/custom_op/fpgadataflow/addstreams.py index c11fb3db3e..a049ea8dcc 100644 --- a/src/finn/custom_op/fpgadataflow/addstreams.py +++ b/src/finn/custom_op/fpgadataflow/addstreams.py @@ -32,6 +32,7 @@ from qonnx.core.datatype import DataType from finn.custom_op.fpgadataflow.hwcustomop import HWCustomOp +from finn.util.basic import Characteristic_Node class AddStreams(HWCustomOp): @@ -149,7 +150,17 @@ def execute_node(self, context, graph): result = inp0_values + inp1_values context[node.output[0]] = np.asarray(result, dtype=np.float32).reshape(oshape) - def derive_characteristic_fxns(self, period): + def prepare_tree_model(self): + dim = np.prod(self.get_folded_output_shape()[1:-1]) + + read_write = Characteristic_Node("passing addstreams layer", [(dim, [1, 1])], True) + addstreams_top = Characteristic_Node("compute addstreams", [(1, read_write)], False) + + return addstreams_top # top level phase of this node + + def derive_token_access_vectors( + self, model, period, strategy, fpga_part, clk_period, op_type, override_dict=None + ): n_inps = np.prod(self.get_folded_input_shape()[:-1]) io_dict = { "inputs": { @@ -158,4 +169,7 @@ def derive_characteristic_fxns(self, period): }, "outputs": {"out0": []}, } - super().derive_characteristic_fxns(period, override_rtlsim_dict=io_dict) + + super().derive_token_access_vectors( + model, period, strategy, fpga_part, clk_period, op_type, override_dict=io_dict + ) diff --git a/src/finn/custom_op/fpgadataflow/channelwise_op.py b/src/finn/custom_op/fpgadataflow/channelwise_op.py index abb1adc1fb..083dac17ce 100644 --- 
a/src/finn/custom_op/fpgadataflow/channelwise_op.py +++ b/src/finn/custom_op/fpgadataflow/channelwise_op.py @@ -34,6 +34,7 @@ from qonnx.util.basic import qonnx_make_model from finn.custom_op.fpgadataflow.hwcustomop import HWCustomOp +from finn.util.basic import Characteristic_Node # ONNX i/o tensor shape assumptions for channelwise ops: # input 0 is the input tensor, shape (..., NumChannels) @@ -240,3 +241,13 @@ def execute_node(self, context, graph): sess = rt.InferenceSession(model_func.SerializeToString()) result = sess.run(None, idict) context[node.output[0]] = np.asarray(result, dtype=np.float32).reshape(oshape) + + def get_tree_model(self): + # key parameters + + dim = np.prod(self.get_folded_output_shape()[1:-1]) + + pass_channelwise = Characteristic_Node("passing channelwise layer", [(dim, [1, 1])], True) + channelwise_top = Characteristic_Node("compute pool", [(1, pass_channelwise)], False) + + return channelwise_top # top level phase of this node diff --git a/src/finn/custom_op/fpgadataflow/convolutioninputgenerator.py b/src/finn/custom_op/fpgadataflow/convolutioninputgenerator.py index 8c1a36232f..7504ca6e4e 100644 --- a/src/finn/custom_op/fpgadataflow/convolutioninputgenerator.py +++ b/src/finn/custom_op/fpgadataflow/convolutioninputgenerator.py @@ -26,6 +26,7 @@ # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+import numpy as np import warnings from onnx import TensorProto, helper from qonnx.core.datatype import DataType @@ -35,6 +36,7 @@ from qonnx.util.basic import qonnx_make_model from finn.custom_op.fpgadataflow.hwcustomop import HWCustomOp +from finn.util.basic import Characteristic_Node # ONNX i/o tensor shape assumptions for ConvolutionInputGenerator: # input 0 is the input tensor, shape NHWC = (1, IFMDim, IFMDim, IFMChannels) @@ -259,3 +261,606 @@ def execute_node(self, context, graph): # this automatically updates the execution context inst = getCustomOp(im2col_node) inst.execute_node(context, model_im2col.graph) + + def get_tree_model_uniform_distribution_based(self): + def distribute_outputs_uniform( + out_total, in_total, stride_y=1, stride_x=1, feature_map_x=1, kernel_x=1, kernel_y=1 + ): + if in_total == 0: + return [out_total] + + # if kernel_y > 1: + # stride_y = stride_y - (kernel_y-1) // 2 + # if kernel_x > 1: + # stride_x = stride_x - (kernel_x-1) // 2 + + spacing_y = max(feature_map_x * (stride_y - 1), 1) + spacing_x = max((stride_x - 1 + (kernel_x - 1) // 2), 1) + + weights = [] + for i in range(in_total): + weight = 1 + if stride_y > 1: + if i % spacing_y == 0: + weight += spacing_y + if stride_x > 1: + if i % spacing_x == 0: + weight += spacing_x + weights.append(weight) + + # Normalize weights to match out_total + total_weight = sum(weights) + raw_counts = [w * out_total / total_weight for w in weights] + + # Round to nearest integers + int_counts = [int(round(x)) for x in raw_counts] + + # Adjust rounding error + diff = sum(int_counts) - out_total + if diff != 0: + adjustments = sorted( + enumerate(raw_counts), key=lambda x: x[1] - int_counts[x[0]], reverse=(diff > 0) + ) + for i, _ in adjustments: + if diff == 0: + break + int_counts[i] -= int(diff / abs(diff)) + diff -= int(diff / abs(diff)) + + return int_counts + + IMPL_STYLE = "rtl" if "_rtl" in (self.__class__.__name__) else "hls" + assert IMPL_STYLE in ["rtl", "hls"], "Implementation 
style must be 'rtl' or 'hls'" + + # Extract node attributes + ifm_dim_y, ifm_dim_x = self.get_nodeattr("IFMDim") + ifm_ch = self.get_nodeattr("IFMChannels") + simd = self.get_nodeattr("SIMD") + k_h, k_w = self.get_nodeattr("ConvKernelDim") + stride_y, stride_x = self.get_nodeattr("Stride") + dilation_y, dilation_x = self.get_nodeattr("Dilation") + is1d = self.get_nodeattr("is1D") + parallel_window = self.get_nodeattr("parallel_window") + # numReps = 1 + + assert ifm_ch % simd == 0 + factor = ifm_ch // simd + ofm_dim_y = compute_conv_output_dim(ifm_dim_y, k_h, stride_y, 0, dilation_y) + ofm_dim_x = compute_conv_output_dim(ifm_dim_x, k_w, stride_x, 0, dilation_x) + total_outputs = ofm_dim_y * ofm_dim_x + total_inputs = ifm_dim_y * ifm_dim_x + if parallel_window: + k_h = 1 + k_w = 1 + # if not is1d: + # # 2D convolution + # output_tokens = total_outputs * (k_h * k_w) + # else: + # # 1D convolution + # output_tokens = total_outputs * (k_h) + + # key parameters + # IFMDim_x = self.get_nodeattr("IFMDim")[0] + # OFMDim_x = self.get_nodeattr("OFMDim")[0] + ConvKernelDim_x = self.get_nodeattr("ConvKernelDim")[0] + # Stride_x = self.get_nodeattr("Stride")[0] + + # OFMDim_y = self.get_nodeattr("OFMDim")[1] + ConvKernelDim_y = self.get_nodeattr("ConvKernelDim")[1] + # Stride_y = self.get_nodeattr("Stride")[1] + + # SIMD = self.get_nodeattr("SIMD") + + # IFMChannels = self.get_nodeattr("IFMChannels") + + DEPTHWISE = self.get_nodeattr("depthwise") + is1d = self.get_nodeattr("is1D") + + # SF = IFMChannels // SIMD + # OUTPUT_SIZE = OFMDim_x * ConvKernelDim_x * SF + # INPUT_SIZE = IFMDim_x * SF + # WINDOW_SIZE = ConvKernelDim_x * SF + # if DEPTHWISE: + # BUFFER_SIZE = ConvKernelDim_x * SF + # READ_CYCLES = SF * (ConvKernelDim_x - 1) - (ConvKernelDim_x - 1) + # FINISH = IFMDim_x - ConvKernelDim_x - 2 + # else: + # BUFFER_SIZE = (ConvKernelDim_x - 1) * SF + # READ_CYCLES = 0 + # FINISH = 0 + + assert ifm_ch % simd == 0 + factor = ifm_ch // simd + + # OCNT_INITIAL = BUFFER_SIZE + 
(Stride_x - 1) + + # DEFAULT_FIFO_DEPTH = 2 + + ofm_dim_y = compute_conv_output_dim(ifm_dim_y, k_h, stride_y, 0, dilation_y) + ofm_dim_x = compute_conv_output_dim(ifm_dim_x, k_w, stride_x, 0, dilation_x) + + if DEPTHWISE: + ofm_dim_y = ofm_dim_y * ConvKernelDim_y + ofm_dim_x = ofm_dim_x * ConvKernelDim_x + + if DEPTHWISE: + flip_factor = factor + else: + flip_factor = 1 + + total_outputs = ofm_dim_y * ofm_dim_x * flip_factor + total_inputs = ifm_dim_y * ifm_dim_x * flip_factor + if parallel_window: + k_h = 1 + k_w = 1 + # if not is1d: + # # 2D convolution + # output_tokens = total_outputs * (k_h * k_w) + # else: + # # 1D convolution + # output_tokens = total_outputs * (k_h) + + ch_write = Characteristic_Node("Output Write", [(factor // flip_factor, [0, 1])], True) + ch_read = Characteristic_Node("Streamed Read", [(factor // flip_factor, [1, 0])], True) + ch_both = Characteristic_Node("Streamed Read", [(factor // flip_factor, [1, 1])], True) + + out_total = np.prod(self.get_folded_output_shape()[:-1]) // factor * flip_factor + in_total = np.prod(self.get_folded_input_shape()[:-1]) // factor * flip_factor + + # Calculate startup and steady reads + if not is1d: + startup_reads = (k_h - 1) * ifm_dim_x + k_w # - (ifm_dim_x-k_w) + # startup_writes = ofm_dim_x - (ofm_dim_x-k_w) // (stride_x * stride_y)# * + # factor # we can only write the middle in this section!!! 
+ if not DEPTHWISE: + if k_h > 1: + startup_writes = ofm_dim_x # k_w*stride_x # // (stride_x) + else: + startup_writes = ofm_dim_x # // (stride_x * stride_y) + else: + if k_h > 1: + startup_writes = 0 + else: + startup_writes = 0 + else: + startup_reads = ifm_dim_x + startup_writes = ofm_dim_x // stride_x + + startup_reads = startup_reads * flip_factor + startup_writes = startup_writes * flip_factor + + # startup_reads = 0 + steady_reads = total_inputs - startup_reads + steady_writes = total_outputs - startup_writes + + total_inputs = total_inputs - startup_reads + total_outputs = total_outputs - startup_writes + # inputs_read = startup_reads + + if startup_writes == 0: + offset_writing = 1 + else: + offset_writing = 0 + + # Steady-state reads > 0, normal case + # Spread steady reads evenly across output_tokens cycles + in_total = in_total - startup_reads + out_total = out_total - startup_writes + + if startup_writes > startup_reads: + schedule = distribute_outputs_uniform( + startup_writes, startup_reads, stride_x, stride_y, k_w, k_h, ifm_dim_x + ) + per_cycle_nodes = [] + + for tokens_this_cycle in schedule: + cycle = Characteristic_Node( + "Cycle", + [ + (1 - offset_writing, ch_both), + ( + 1, + Characteristic_Node( + "Output Write", + [(tokens_this_cycle - 1 + offset_writing, ch_write)], + False, + ), + ), + ], + False, + ) + per_cycle_nodes.append((1, cycle)) + + startup = Characteristic_Node("Processing Loop", per_cycle_nodes, False) + else: + schedule = distribute_outputs_uniform( + startup_reads, startup_writes, stride_x, stride_y, k_w, k_h, ifm_dim_x + ) + per_cycle_nodes = [] + + for tokens_this_cycle in schedule: + cycle = Characteristic_Node( + "Cycle", + [ + (1 - offset_writing, ch_both), + ( + 1, + Characteristic_Node( + "Input Read", + [(tokens_this_cycle - 1 + offset_writing, ch_read)], + False, + ), + ), + ], + False, + ) + per_cycle_nodes.append((1, cycle)) + + startup = Characteristic_Node("Processing Loop", per_cycle_nodes, False) + + if 
out_total > in_total: + if steady_reads <= 0: + return Characteristic_Node( + "SlidingWindow_2D", [(1, startup), (steady_writes, ch_write)], False + ) + + schedule = distribute_outputs_uniform( + out_total, in_total, stride_x, stride_y, k_w, k_h, ifm_dim_x + ) + per_cycle_nodes = [] + + for tokens_this_cycle in schedule: + cycle = Characteristic_Node( + "Cycle", + [ + (1, ch_both), + ( + 1, + Characteristic_Node( + "Output Write", [(tokens_this_cycle - 1, ch_write)], False + ), + ), + ], + False, + ) + per_cycle_nodes.append((1, cycle)) + + steady = Characteristic_Node("Processing Loop", per_cycle_nodes, False) + + return Characteristic_Node("SlidingWindow_2D", [(1, startup), (1, steady)], False) + + else: + if steady_reads <= 0: + return Characteristic_Node( + "SlidingWindow_2D", [(1, startup), (steady_writes, ch_write)], False + ) + + schedule = distribute_outputs_uniform( + in_total, out_total, stride_x, stride_y, k_w, k_h, ifm_dim_x + ) + per_cycle_nodes = [] + + for tokens_this_cycle in schedule: + cycle = Characteristic_Node( + "Cycle", + [ + (1, ch_both), + ( + 1, + Characteristic_Node( + "Output Write", [(tokens_this_cycle - 1, ch_read)], False + ), + ), + ], + False, + ) + per_cycle_nodes.append((1, cycle)) + + steady = Characteristic_Node("Processing Loop", per_cycle_nodes, False) + + return Characteristic_Node("SlidingWindow_2D", [(1, startup), (1, steady)], False) + + def get_tree_model(self): + # Extract node attributes + ifm_dim_y, ifm_dim_x = self.get_nodeattr("IFMDim") + ifm_ch = self.get_nodeattr("IFMChannels") + simd = self.get_nodeattr("SIMD") + k_y, k_x = self.get_nodeattr("ConvKernelDim") + stride_y, stride_x = self.get_nodeattr("Stride") + dilation_y, dilation_x = self.get_nodeattr("Dilation") + parallel_window = self.get_nodeattr("parallel_window") + depthwise = self.get_nodeattr("depthwise") + SF = ifm_ch // simd + + # hyper parameter for when we stop merging + buffering_threshold = 1024 + # + # print("simd: ", simd) + # print("ifm y, x: ", 
ifm_dim_y, ifm_dim_x) + # print("K: ", k_y, k_x) + # print("stride: ", stride_y, stride_x) + # print("dilation: ", dilation_y, dilation_x) + # print("parallel_window: ", parallel_window) + # print("dw: ", depthwise) + # print("buffer depth: ", self.get_buffer_depth()) + # print("buffering threshold: ", buffering_threshold) + + stride_y_skips = (stride_y - 1) * ifm_dim_x + + import math + + kernels_in_line = math.ceil( + (ifm_dim_x - (k_x - 1 + (k_x - 1) * (dilation_x - 1))) / stride_x + ) + kernel_lines = math.ceil( + (ifm_dim_y - ((k_y - 1) + (k_y - 1) * (dilation_y - 1))) / stride_y + ) + + # compute tail end of a kernel line which has to be read + shifts_x = (kernels_in_line - 1) * stride_x + starting_index_x = k_x + (k_x - 1) * (dilation_x - 1) + remainder_x = ifm_dim_x - (starting_index_x + shifts_x) + + # compute tail end rows of the full feature map which have to be read + shifts_y = (kernel_lines - 1) * stride_y + starting_index_y = k_y + (k_y - 1) * (dilation_y - 1) + remainder_y = (ifm_dim_y - (starting_index_y + shifts_y)) * ifm_dim_x + + reads_to_prepare_line = (k_x - 1) + (k_x - 1) * (dilation_x - 1) + reads_to_prepare_first_line = ((k_y - 1) + (k_y - 1) * (dilation_y - 1)) * ifm_dim_x + total_kernel_y = k_y + (k_y - 1) * (dilation_y - 1) + first_line_kernel_buffer = k_x + (k_x - 1) * (dilation_x - 1) + first_line_buffer = (total_kernel_y - 1) * ifm_dim_x + + if parallel_window == 1: + writes_per_kernel = 1 + else: + writes_per_kernel = k_y * k_x + + # inner line first buffer fill + inner_line_buffer_reads = (stride_y - 1) * ifm_dim_x + + # handling of a kernel shift on x axis + single_move_dif = writes_per_kernel - stride_x + if single_move_dif > 0: + # more writes than reads, dif both, write rest + do_both = stride_x + writes_only = single_move_dif + reads_only = 0 + else: + # more reads than writes + do_both = writes_per_kernel + reads_only = -single_move_dif + writes_only = 0 + + first_do_both = 0 + first_writes_only = writes_per_kernel + 
first_reads_only = first_line_kernel_buffer + + # absorb some remaining reads into writes if possible + absorbing_kernels = 0 + + # only allow absorbing up to kernels_in_line-1 as the first kernel is an exception + remaining_buffer_reads = inner_line_buffer_reads + if inner_line_buffer_reads > 0 and ((kernels_in_line - 1) * writes_only) > 0: + # determine how many lines can absorb them + absorbing_kernels = min( + math.floor((inner_line_buffer_reads) // writes_only), kernels_in_line - 1 + ) + absorbed_reads = absorbing_kernels * writes_only + + # print("absorbing krn: ", absorbing_kernels) + # print("absorved reads: ", absorbed_reads) + # print("remaining hanging reads: ", (inner_line_buffer_reads) - absorbed_reads) + # print("remaining old kernels: ", (kernels_in_line - 2) - absorbing_kernels) + inner_line_buffer_reads -= absorbed_reads + remaining_buffer_reads -= absorbed_reads + + # first kernel is a special case, we absorb the buffer reads into it as well + first_reads = first_line_kernel_buffer + remaining_buffer_reads + first_single_move_dif = writes_per_kernel - first_reads + if first_single_move_dif > 0: + # more writes than reads, dif both, write rest + first_do_both = first_reads + first_writes_only = first_single_move_dif + first_reads_only = 0 + else: + # more reads than writes + first_do_both = writes_per_kernel + first_reads_only = -first_single_move_dif + first_writes_only = 0 + + # first kernel is a special case, we absorb the buffer reads into it as well + absolute_first_reads = first_line_kernel_buffer + first_line_buffer + absolute_first_single_move_dif = writes_per_kernel - absolute_first_reads + + absolute_first_do_both = 0 + absolute_first_writes_only = writes_per_kernel + absolute_first_reads_only = absolute_first_reads + + if depthwise == 0: + if absolute_first_single_move_dif > 0: + # more writes than reads, dif both, write rest + absolute_first_do_both = absolute_first_reads + absolute_first_writes_only = absolute_first_single_move_dif + 
absolute_first_reads_only = 0 + else: + # more reads than writes + absolute_first_do_both = writes_per_kernel + absolute_first_reads_only = -absolute_first_single_move_dif + absolute_first_writes_only = 0 + + ch_idle = Characteristic_Node("Output Write", [(SF, [0, 0])], True) + ch_write = Characteristic_Node("Output Write", [(SF, [0, 1])], True) + + ch_read = Characteristic_Node("Streamed Read", [(SF, [1, 0])], True) + ch_both = Characteristic_Node("Streamed Read+Write", [(SF, [1, 1])], True) + + if parallel_window == 2: + # parallel window path works reliably, but should + # eventually be using paralle window 0's structure + # however currently is still inaccurate for some + # configs with parallel window=0 + ch_handle = Characteristic_Node("write out", [(1, ch_both)], False) + + handle_kernel = Characteristic_Node( + "handle one kernel", [(1, ch_handle), (stride_x - 1, ch_read)], False + ) + + handle_last_kernel = Characteristic_Node( + "handle last kernel", + [ + (1, ch_handle), + (remainder_x, ch_read), + ], + False, + ) + + handle_line = Characteristic_Node( + "write_one_line", + [ + (reads_to_prepare_line, ch_read), + (kernels_in_line - 1, handle_kernel), + (1, handle_last_kernel), + (stride_y_skips, ch_read), + ], + False, + ) + handle_last_line = Characteristic_Node( + "write line without stride at end", + [ + (reads_to_prepare_line, ch_read), + (kernels_in_line, handle_kernel), + (remainder_y, ch_read), + ], + False, + ) + swg = Characteristic_Node( + "SlidingWindowGenerator", + [ + (1, ch_idle), + (reads_to_prepare_first_line, ch_read), + (kernel_lines - 1, handle_line), + (1, handle_last_line), + ], + False, + ) + + else: + # --- handle_first_kernel --- + # print("\n\nhandle first kernel") + # print(f"do_both: {first_do_both}\n") + # print(f"reads_only: {first_reads_only}\n") + # print(f"writes_only: {first_writes_only}\n") + + handle_absolute_kernel = Characteristic_Node( + "handle one kernel", + [ + (absolute_first_do_both, ch_both), + 
(absolute_first_reads_only, ch_read), + (absolute_first_writes_only, ch_write), + ], + False, + ) + + # --- handle_first_kernel --- + # print("\n\nhandle first kernel") + # print(f"do_both: {first_do_both}\n") + # print(f"reads_only: {first_reads_only}\n") + # print(f"writes_only: {first_writes_only}\n") + + handle_first_kernel = Characteristic_Node( + "handle one kernel", + [ + (first_do_both, ch_both), + (first_reads_only, ch_read), + (first_writes_only, ch_write), + ], + False, + ) + + # --- handle_kernel --- + # print("\n\nhandle kernel") + # print(f"do_both: {do_both}\n") + # print(f"reads_only: {reads_only}\n") + # print(f"writes_only: {writes_only}\n") + + handle_kernel = Characteristic_Node( + "handle one kernel", + [ + (do_both, ch_both), + (reads_only, ch_read), + (writes_only, ch_write), + ], + False, + ) + + # --- handle_kernel_absorbed --- + # print("\n\nhandle absorbed kernel") + # print(f"do_both: {do_both+writes_only}\n") + # print(f"reads_only: {reads_only}\n") + # + handle_kernel_absorbed = Characteristic_Node( + "handle one kernel with fused writes", + [ + (do_both + writes_only, ch_both), + (reads_only, ch_read), + ], + False, + ) + + # --- handle_first_line --- + # print("\n\nhandle first line") + # print(f"first_line_buffer: {first_line_buffer}\n") + # print(f"first line kernelbuffer: {first_line_kernel_buffer}\n") + # print(f"kernels_in_line: {kernels_in_line}\n") + # print(f"remainder_x: {remainder_x}\n") + + handle_first_line = Characteristic_Node( + "write first line", + [ + # (first_line_buffer, ch_read), + (1, handle_absolute_kernel), + (kernels_in_line - 1, handle_kernel), + (remainder_x, ch_read), + ], + False, + ) + + # --- handle_line --- + # print("\n\nhandle regular line") + # print(f"inner_line_buffer_reads: {inner_line_buffer_reads}\n") + # print(f"absorbing_kernels: {absorbing_kernels}\n") + # print("kernels_in_line - absorbing_kernels: ") + # print(f"{kernels_in_line - absorbing_kernels}\n") + # print(f"remainder_x: 
{remainder_x}\n") + + handle_line = Characteristic_Node( + "write one inner line", + [ + # (remaining_buffer_reads, ch_read), + (1, handle_first_kernel), + (absorbing_kernels, handle_kernel_absorbed), + (kernels_in_line - 1 - absorbing_kernels, handle_kernel), + (remainder_x, ch_read), + ], + False, + ) + + # --- swg --- + # print("\n\nswg") + # print(f"kernel_lines - 1: {kernel_lines - 1}\n") + # print(f"remainder_y: {remainder_y}\n") + + swg = Characteristic_Node( + "SlidingWindowGenerator", + [ + (1, handle_first_line), + (kernel_lines - 1, handle_line), + (remainder_y, ch_read), + ], + False, + ) + + return swg + diff --git a/src/finn/custom_op/fpgadataflow/duplicatestreams.py b/src/finn/custom_op/fpgadataflow/duplicatestreams.py index 4a52a36006..ac095fa9af 100644 --- a/src/finn/custom_op/fpgadataflow/duplicatestreams.py +++ b/src/finn/custom_op/fpgadataflow/duplicatestreams.py @@ -31,6 +31,7 @@ from qonnx.core.datatype import DataType from finn.custom_op.fpgadataflow.hwcustomop import HWCustomOp +from finn.util.basic import Characteristic_Node class DuplicateStreams(HWCustomOp): @@ -148,12 +149,27 @@ def execute_node(self, context, graph): for outp in node.output: context[outp] = output - def derive_characteristic_fxns(self, period): + def get_tree_model(self): + # key parameters + + dim = np.prod(self.get_folded_output_shape()[1:-1]) + + read_write = Characteristic_Node("passing duplicate layer", [(dim, [1, 1])], True) + duplicatestreams_top = Characteristic_Node("compute duplicate", [(1, read_write)], False) + + return duplicatestreams_top # top level phase of this node + + def derive_token_access_vectors( + self, model, period, strategy, fpga_part, clk_period, op_type, override_dict=None + ): n_inps = np.prod(self.get_folded_input_shape()[:-1]) io_dict = { "inputs": { "in0": [0 for i in range(n_inps)], }, - "outputs": {"out0": [], "out1": []}, + "outputs": {*[f"out{x}" for x in range(self.get_num_output_streams())]}, } - 
super().derive_characteristic_fxns(period, override_rtlsim_dict=io_dict) + + super().derive_token_access_vectors( + model, period, strategy, fpga_part, clk_period, op_type, override_dict=io_dict + ) diff --git a/src/finn/custom_op/fpgadataflow/fmpadding.py b/src/finn/custom_op/fpgadataflow/fmpadding.py index 2ff9bb13b7..322d12c9de 100644 --- a/src/finn/custom_op/fpgadataflow/fmpadding.py +++ b/src/finn/custom_op/fpgadataflow/fmpadding.py @@ -31,6 +31,7 @@ from qonnx.core.datatype import DataType from finn.custom_op.fpgadataflow.hwcustomop import HWCustomOp +from finn.util.basic import Characteristic_Node class FMPadding(HWCustomOp): @@ -111,6 +112,13 @@ def get_folded_output_shape(self, ind=0): folded_oshape = normal_oshape[:-1] + [fold, simd] return tuple(folded_oshape) + def make_shape_compatible_op(self, model): + exp_ishape = self.get_normal_input_shape() + oshape = self.get_normal_output_shape() + ishape = tuple(model.get_tensor_shape(self.onnx_node.input[0])) + assert ishape == exp_ishape, "Unexpect input shape for FMPadding." 
+ return super().make_const_shape_op(oshape) + def infer_node_datatype(self, model): node = self.onnx_node idt = model.get_tensor_datatype(node.input[0]) @@ -124,6 +132,9 @@ def infer_node_datatype(self, model): self.set_nodeattr("inputDataType", idt.name) model.set_tensor_datatype(node.output[0], idt) + def verify_node(self): + pass + def get_input_datatype(self, ind=0): """Returns FINN DataType of input.""" ret = DataType[self.get_nodeattr("inputDataType")] @@ -146,6 +157,10 @@ def get_outstream_width(self, ind=0): simd = self.get_nodeattr("SIMD") return obits * simd + def get_number_output_values(self): + folded_oshape = self.get_folded_output_shape() + return np.prod(folded_oshape[:-1]) + def execute_node(self, context, graph): # simulate behavior with Python functionality node = self.onnx_node @@ -156,3 +171,61 @@ def execute_node(self, context, graph): inp_values, ((0, 0), (pad[0], pad[2]), (pad[1], pad[3]), (0, 0)), "constant" ) context[node.output[0]] = np.asarray(result, dtype=np.float32).reshape(oshape) + + def get_tree_model(self): + # key parameters + # this depends on the kernel type, hls or rtl etc + + # extract node attr + IMGDIM = self.get_nodeattr("ImgDim") + PADDING = self.get_nodeattr("Padding") + NUMCHANNELS = self.get_nodeattr("NumChannels") + SIMD = self.get_nodeattr("SIMD") + batch_size = self.get_nodeattr("numInputVectors") + IMPL_STYLE = "rtl" if "_rtl" in (self.__class__.__name__) else "hls" + assert IMPL_STYLE in ["rtl", "hls"], "Implementation style must be 'rtl' or 'hls'" + + # compute new parameters + NF = int(NUMCHANNELS / SIMD) + y_padding_top, x_padding_left, y_padding_bottom, x_padding_right = PADDING + y_dim = IMGDIM[0] + x_dim = IMGDIM[1] + + if IMPL_STYLE == "hls" and NF == 1: + loop_overhead = 1 + else: + loop_overhead = 0 + + ch_pad = Characteristic_Node("Channel_Pad", [(NF, [0, 1]), (loop_overhead, [0, 0])], True) + + ch_pass = Characteristic_Node("Channel_Pass", [(NF, [1, 1]), (loop_overhead, [0, 0])], True) + + x_inner_line 
= Characteristic_Node( + "Fill X full inner line", + [(x_padding_left, ch_pad), (x_dim, ch_pass), (x_padding_right, ch_pad)], + False, + ) + + x_outer_line = Characteristic_Node( + "Pad X outer line", [(x_padding_left + x_dim + x_padding_right, ch_pad)], False + ) + + fmpadding = Characteristic_Node( + "FMPadding FM", + [ + (y_padding_top, x_outer_line), + (y_dim, x_inner_line), + (y_padding_bottom, x_outer_line), + ], + False, + ) + + fmpadding_top = Characteristic_Node( + "FMPadding FM", + [ + (batch_size, fmpadding), + ], + False, + ) + + return fmpadding_top # top level phase of this node diff --git a/src/finn/custom_op/fpgadataflow/hwcustomop.py b/src/finn/custom_op/fpgadataflow/hwcustomop.py index f8f7a73c54..61ffe66579 100644 --- a/src/finn/custom_op/fpgadataflow/hwcustomop.py +++ b/src/finn/custom_op/fpgadataflow/hwcustomop.py @@ -33,12 +33,20 @@ import numpy as np import os -import warnings from abc import abstractmethod from qonnx.custom_op.base import CustomOp from qonnx.util.basic import roundup_to_integer_multiple -from finn.util.basic import get_liveness_threshold_cycles, is_versal +from finn.util.basic import ( + compress_numpy_to_string, + get_liveness_threshold_cycles, + is_versal, +) + +try: + import pyxsi_utils +except ModuleNotFoundError: + pyxsi_utils = None class HWCustomOp(CustomOp): @@ -87,14 +95,19 @@ def get_nodeattr_types(self): "inFIFODepths": ("ints", False, [2]), "outFIFODepths": ("ints", False, [2]), "output_hook": ("s", False, ""), - # accumulated characteristic function over two periods - "io_chrc_in": ("t", False, np.asarray([], dtype=np.int32)), - "io_chrc_out": ("t", False, np.asarray([], dtype=np.int32)), + # token access vectors used for analytical FIFO sizing + "io_chrc_in": ("s", False, ""), + "io_chrc_out": ("s", False, ""), + "io_chrc_in_stretch": ("s", False, ""), + "io_chrc_out_stretch": ("s", False, ""), + "io_chrc_in_original": ("s", False, ""), + "io_chrc_out_original": ("s", False, ""), # the period for which the 
characterization was run "io_chrc_period": ("i", False, 0), - # amount of zero padding inserted during chrc. - "io_chrc_pads_in": ("ints", False, []), - "io_chrc_pads_out": ("ints", False, []), + # extra buffers added to a branch, needed for coupling + # token access vectors at the end of + # branches during analytical FIFO sizing + "extra_branch_fifos": ("ints", False, [0, 0]), } def make_shape_compatible_op(self, model): @@ -219,6 +232,19 @@ def reset_rtlsim(self, sim): back to one""" finnxsi.reset_rtlsim(sim) + def rtlsim_multi_io_custom(self, sim, io_dict, sname="_V", batch_size=1): + "Run rtlsim for this node, supports multiple i/o streams." + num_out_values = self.get_number_output_values() * batch_size + total_cycle_count = finnxsi.rtlsim_multi_io( + sim, + io_dict, + num_out_values, + sname=sname, + liveness_threshold=get_liveness_threshold_cycles(), + ) + + self.set_nodeattr("cycles_rtlsim", total_cycle_count) + def rtlsim_multi_io(self, sim, io_dict, sname="_V"): "Run rtlsim for this node, supports multiple i/o streams." num_out_values = self.get_number_output_values() @@ -297,11 +323,166 @@ def get_outstream_width_padded(self, ind=0): out_width = self.get_outstream_width(ind=ind) return roundup_to_integer_multiple(out_width, 8) + def get_tree_model(self): + """Returns the characteristic function of a node, default is None and forces + to skip the analytical characterization of the node and fallback to rtlsim. 
+ Implemented in each node, potentially overriding between rtl and hls""" + return None + + def derive_token_access_vectors( + self, model, period, strategy, fpga_part, clk_period, op_type, override_dict=None + ): + if override_dict is None: + n_inps = np.prod(self.get_folded_input_shape()[:-1]) + io_dict = { + "inputs": { + "in0": [i for i in range(n_inps)], + }, + "outputs": {"out0": []}, + } + else: + io_dict = override_dict + if strategy == "tree_model": + # check for override function + if self.get_tree_model() is not None: + print(f"using tree model for node {self}") + self.derive_token_access_vectors_using_tree_model(period, io_dict=io_dict) + return + print(f"using rtlsim for node {self}") + # RTL-based flow + # there is a 20 clock marging added for when get_exp_cycles() + # is underestimating the real operator runtime. + period = self.get_exp_cycles() + 20 + self.derive_token_access_vectors_using_rtlsim(model, period, fpga_part, clk_period, io_dict) + + def derive_token_access_vectors_using_tree_model(self, period, io_dict): + # Analytical flow + txns_in = {key: [] for (key, value) in io_dict["inputs"].items() if "in0" in key} + txns_out = {key: [] for (key, value) in io_dict["outputs"].items() if "out0" in key} + + chr_node = self.get_tree_model() + period, in_clocks, _ = chr_node.get_total_cycles(0) + + self.set_nodeattr("io_chrc_period", period) + + txn_in = [] + txn_out = [] + counter = 0 + + top_level_phase = self.get_tree_model() + # first period + cycles = 0 + + counter, cycles, txn_in = top_level_phase.traverse_phase_tree(0, counter, cycles, txn_in) + + def apply_micro_buffer_correction(start, txn_in, period): + """There are cases where a node can buffer up the very first 1-2 inputs + immediately, even if it has not started properly consuming inputs yet + This behavior is extremely difficult to model in a characterization tree + and so we perform a manual correction by incrementing the number of + inputs read by 1 and detracting 1 read from the 
tail of the period + + Which node types & configurations this applies for is yet to be + fully determined, but the corrections should happen here. + This correction is not critical for buffer sizing, as it will only + lead to two extra fifos in the absolute worst case, which should be very + rare regardless. However it is necessary if attempting to perfectly model + the rtlsim result.""" + + buffer = 0 + + if "FMPadding" in self.onnx_node.name: + if "_rtl" in (self.__class__.__name__): + buffer = 1 + else: + buffer = 2 + + if "StreamingDataWidthConverter" in self.onnx_node.name: + if "_rtl" in (self.__class__.__name__): + buffer = 1 + else: + buffer = 2 + + if "Pool" in self.onnx_node.name: + if "_rtl" in (self.__class__.__name__): + buffer = 1 + else: + buffer = 2 + + if "MVAU" in self.onnx_node.name: + if "_rtl" in (self.__class__.__name__): + buffer = 1 + else: + buffer = 2 + + if buffer > 0: + # buffering does not happen in nodes with short wind-ups + if period < 14: + return txn_in + + # main routine + if buffer == 2: + if txn_in[start + 1] - txn_in[start] >= 1: + buffer = 1 + else: + txn_in[start + 1] += 1 + + idx = start + buffer + while idx < len(txn_in): + if txn_in[idx] - txn_in[idx - 1] < buffer: + txn_in[idx] += buffer + idx += 1 + + idx = len(txn_in) - 1 + last = txn_in[idx] + + # deduct 1 read from the tail + while last == txn_in[idx]: + txn_in[idx] -= buffer + idx -= 1 + + # one extra element to deduct in case of 2 buffers + if buffer == 2: + txn_in[idx] -= 1 + + return txn_in + + txn_in = apply_micro_buffer_correction(0, txn_in, period) + + # second period + cycles = len(txn_in) + + counter, cycles, txn_in = top_level_phase.traverse_phase_tree(0, counter, cycles, txn_in) + txn_in = apply_micro_buffer_correction(period, txn_in, period) + + # final assignments + + all_txns_in = np.empty((len(txns_in.keys()), cycles), dtype=np.int32) + all_txns_in[0, :] = np.array(txn_in[:]) + compressed_np_array = compress_numpy_to_string(all_txns_in) + 
self.set_nodeattr("io_chrc_in", compressed_np_array) + self.set_nodeattr("io_chrc_in_original", compressed_np_array) + + counter = 0 + cycles = 0 + + counter, cycles, txn_out = top_level_phase.traverse_phase_tree(1, counter, cycles, txn_out) + + cycles = period + + counter, cycles, txn_out = top_level_phase.traverse_phase_tree(1, counter, cycles, txn_out) + + all_txns_out = np.empty((len(txns_out.keys()), cycles), dtype=np.int32) + all_txns_out[0, :] = np.array(txn_out[:]) + compressed_np_array = compress_numpy_to_string(all_txns_out) + self.set_nodeattr("io_chrc_out", compressed_np_array) + self.set_nodeattr("io_chrc_out_original", compressed_np_array) + def generate_hdl_memstream(self, fpgapart, pumped_memory=0): """Helper function to generate verilog code for memstream component. Currently utilized by MVAU, VVAU and HLS Thresholding layer.""" ops = ["MVAU_hls", "MVAU_rtl", "VVAU_hls", "VVAU_rtl", "Thresholding_hls"] - if self.onnx_node.op_type in ops or self.onnx_node.op_type.startswith("Elementwise"): + if self.onnx_node.op_type in ops: template_path = ( os.environ["FINN_ROOT"] + "/finn-rtllib/memstream/hdl/memstream_wrapper_template.v" ) @@ -374,21 +555,28 @@ def generate_hdl_dynload(self): ) as f: f.write(template_wrapper) - def derive_characteristic_fxns(self, period, override_rtlsim_dict=None): - """Return the unconstrained characteristic functions for this node.""" + def derive_token_access_vectors_using_rtlsim( + self, model, period, fpga_part, clk_period, override_rtlsim_dict=None + ): + """Return the token access vectors for this node using rtlsim.""" # ensure rtlsim is ready + + periods_to_simulate = 5 + periods_to_store = 2 + + if self.get_nodeattr("rtlsim_so") == "": + self.prepare_rtlsim() + assert self.get_nodeattr("rtlsim_so") != "", "rtlsim not ready for " + self.onnx_node.name - if self.get_nodeattr("io_chrc_period") > 0: - warnings.warn("Skipping node %s: already has FIFO characteristic" % self.onnx_node.name) - return - exp_cycles = 
self.get_exp_cycles() - n_inps = np.prod(self.get_folded_input_shape()[:-1]) - n_outs = np.prod(self.get_folded_output_shape()[:-1]) + + exp_cycles = (self.get_exp_cycles() + 20) * periods_to_simulate + n_inps = np.prod(self.get_folded_input_shape()[:-1]) * periods_to_simulate + n_outs = np.prod(self.get_folded_output_shape()[:-1]) * periods_to_simulate if exp_cycles == 0: # try to come up with an optimistic estimate exp_cycles = min(n_inps, n_outs) assert ( - exp_cycles <= period + exp_cycles <= period * periods_to_simulate ), "Period %d too short to characterize %s : expects min %d cycles" % ( period, self.onnx_node.name, @@ -397,6 +585,10 @@ def derive_characteristic_fxns(self, period, override_rtlsim_dict=None): sim = self.get_rtlsim() if override_rtlsim_dict is not None: io_dict = override_rtlsim_dict + + for input_key in io_dict["inputs"]: + io_dict["inputs"][input_key] = io_dict["inputs"][input_key] * periods_to_simulate + else: io_dict = { "inputs": { @@ -407,25 +599,23 @@ def derive_characteristic_fxns(self, period, override_rtlsim_dict=None): # extra dicts to keep track of cycle-by-cycle transaction behavior # note that we restrict key names to filter out weight streams etc - txns_in = {key: [] for (key, value) in io_dict["inputs"].items() if "in" in key} - txns_out = {key: [] for (key, value) in io_dict["outputs"].items() if "out" in key} + txns_in = {key: [] for (key, value) in io_dict["inputs"].items() if "in0" in key} + txns_out = {key: [] for (key, value) in io_dict["outputs"].items() if "out0" in key} # signal name, note no underscore at the end (new finnxsi behavior) sname = "_V" self.reset_rtlsim(sim) + # create stream tracers for all input and output streams for k in txns_in.keys(): txns_in[k] = sim.trace_stream(k + sname) for k in txns_out.keys(): txns_out[k] = sim.trace_stream(k + sname) - self.rtlsim_multi_io(sim, io_dict) + + self.rtlsim_multi_io_custom(sim, io_dict, sname="_V", batch_size=periods_to_simulate) + total_cycle_count = 
self.get_nodeattr("cycles_rtlsim") - assert ( - total_cycle_count <= period - ), """Total cycle count from rtl simulation is higher than - specified period, please set the period higher than {}""".format( - total_cycle_count - ) - self.set_nodeattr("io_chrc_period", period) + + self.set_nodeattr("io_chrc_period", total_cycle_count) # call str() on stream tracers to get their outputs, and convert # to list of ints for k in txns_in.keys(): @@ -433,27 +623,33 @@ def derive_characteristic_fxns(self, period, override_rtlsim_dict=None): for k in txns_out.keys(): txns_out[k] = [int(c) for c in str(txns_out[k])] - def accumulate_char_fxn(chrc): - p = len(chrc) + period = total_cycle_count // periods_to_simulate + + def accumulate_char_fxn(chrc, period_to_simulate, periods_to_store, period): + mid_point = period * 2 ret = [] - for t in range(2 * p): - if t == 0: - ret.append(chrc[0]) + for t in range( + mid_point, mid_point + period * 2 + ): # *2 when running 1 sim and replicating + if t == mid_point: + ret.append(chrc[t]) else: - ret.append(ret[-1] + chrc[t % p]) + ret.append(ret[-1] + chrc[t]) return np.asarray(ret, dtype=np.int32) - all_txns_in = np.empty((len(txns_in.keys()), 2 * period), dtype=np.int32) - all_txns_out = np.empty((len(txns_out.keys()), 2 * period), dtype=np.int32) + all_txns_in = np.empty((len(txns_in.keys()), period * periods_to_store), dtype=np.int32) + all_txns_out = np.empty((len(txns_out.keys()), period * periods_to_store), dtype=np.int32) all_pad_in = [] all_pad_out = [] + pad_in = 0 + pad_out = 0 for in_idx, in_strm_nm in enumerate(txns_in.keys()): txn_in = txns_in[in_strm_nm] pad_in = 0 if len(txn_in) < period: pad_in = period - len(txn_in) txn_in += [0 for x in range(pad_in)] - txn_in = accumulate_char_fxn(txn_in) + txn_in = accumulate_char_fxn(txn_in, periods_to_simulate, periods_to_store, period) all_txns_in[in_idx, :] = txn_in all_pad_in.append(pad_in) @@ -463,11 +659,14 @@ def accumulate_char_fxn(chrc): if len(txn_out) < period: pad_out = 
period - len(txn_out) txn_out += [0 for x in range(pad_out)] - txn_out = accumulate_char_fxn(txn_out) + txn_out = accumulate_char_fxn(txn_out, periods_to_simulate, periods_to_store, period) all_txns_out[out_idx, :] = txn_out all_pad_out.append(pad_out) - self.set_nodeattr("io_chrc_in", all_txns_in) - self.set_nodeattr("io_chrc_out", all_txns_out) - self.set_nodeattr("io_chrc_pads_in", all_pad_in) - self.set_nodeattr("io_chrc_pads_out", all_pad_out) + compressed_np_array_in = compress_numpy_to_string(all_txns_in) + self.set_nodeattr("io_chrc_in", compressed_np_array_in) + self.set_nodeattr("io_chrc_in_original", compressed_np_array_in) + + compressed_np_array_out = compress_numpy_to_string(all_txns_out) + self.set_nodeattr("io_chrc_out", compressed_np_array_out) + self.set_nodeattr("io_chrc_out_original", compressed_np_array_out) diff --git a/src/finn/custom_op/fpgadataflow/labelselect.py b/src/finn/custom_op/fpgadataflow/labelselect.py index f925b51652..cb9339b78c 100644 --- a/src/finn/custom_op/fpgadataflow/labelselect.py +++ b/src/finn/custom_op/fpgadataflow/labelselect.py @@ -32,6 +32,7 @@ from qonnx.util.basic import qonnx_make_model, roundup_to_integer_multiple from finn.custom_op.fpgadataflow.hwcustomop import HWCustomOp +from finn.util.basic import Characteristic_Node class LabelSelect(HWCustomOp): @@ -95,6 +96,21 @@ def get_folded_output_shape(self, ind=0): oshape = tuple(vecs + [k, 1]) return oshape + def make_shape_compatible_op(self, model): + exp_ishape = self.get_normal_input_shape() + oshape = self.get_normal_output_shape() + ishape = tuple(model.get_tensor_shape(self.onnx_node.input[0])) + assert ishape == exp_ishape, "Unexpected input shape." 
+ return helper.make_node( + "RandomNormal", + inputs=[], + outputs=[self.onnx_node.output[0]], + mean=0.0, + scale=1.0, + dtype=TensorProto.INT64, + shape=list(oshape), + ) + def infer_node_datatype(self, model): node = self.onnx_node # check input datatype against property @@ -104,6 +120,9 @@ def infer_node_datatype(self, model): odt = self.get_output_datatype() model.set_tensor_datatype(self.onnx_node.output[0], odt) + def verify_node(self): + pass + def get_input_datatype(self, ind=0): """Returns FINN DataType of input.""" ret = DataType[self.get_nodeattr("inputDataType")] @@ -161,5 +180,37 @@ def execute_node(self, context, graph): def get_exp_cycles(self): nlabels = self.get_nodeattr("Labels") pe = self.get_nodeattr("PE") - exp_cycles = nlabels / pe + K = self.get_nodeattr("K") + exp_cycles = nlabels // pe + K return int(exp_cycles) + + def get_tree_model(self): + # key parameters + # this depends on the kernel type, hls or rtl etc + + # extract node attr + num_in_words = self.get_nodeattr("Labels") + PE = self.get_nodeattr("PE") + # PE = 1 + K = self.get_nodeattr("K") + + NF = num_in_words // PE + + output_delay = int(np.log2(num_in_words)) + 1 + # output_delay = NF + + print("num_in_words,PE,K,NF,output_delay") + print(num_in_words, PE, K, NF, output_delay) + print(f"exp cycles: {self.get_exp_cycles()}") + + read_k = Characteristic_Node("read only", [(NF, [1, 0])], True) + + compute_k = Characteristic_Node("compute k", [(output_delay, [0, 0])], True) + + write_k = Characteristic_Node("write k", [(K, [0, 1])], True) + + labelselect_top = Characteristic_Node( + "Fill feature map", [(1, read_k), (1, compute_k), (1, write_k)], False + ) + + return labelselect_top # top level phase of this node diff --git a/src/finn/custom_op/fpgadataflow/matrixvectoractivation.py b/src/finn/custom_op/fpgadataflow/matrixvectoractivation.py index 08d88ac069..2ece81c4a3 100644 --- a/src/finn/custom_op/fpgadataflow/matrixvectoractivation.py +++ 
b/src/finn/custom_op/fpgadataflow/matrixvectoractivation.py @@ -41,6 +41,7 @@ ) from finn.custom_op.fpgadataflow.hwcustomop import HWCustomOp +from finn.util.basic import Characteristic_Node from finn.util.data_packing import numpy_to_hls_code, pack_innermost_dim_as_hex_string # ONNX i/o tensor shape assumptions for MatrixVectorActivation: @@ -467,6 +468,7 @@ def get_exp_cycles(self): mw = self.get_nodeattr("MW") # since mmv != 1 is not supported yet, we set mmv for now to 1 mmv = 1 + exp_cycles = (mh / pe) * (mw / simd) * np.prod(num_inp_vec) / mmv return int(exp_cycles) @@ -882,21 +884,6 @@ def get_op_and_param_counts(self): ret_dict[thres_param_type] = thres_count return ret_dict - def derive_characteristic_fxns(self, period): - n_inps = np.prod(self.get_folded_input_shape()[:-1]) - io_dict = { - "inputs": { - "in0": [0 for i in range(n_inps)], - }, - "outputs": {"out0": []}, - } - mem_mode = self.get_nodeattr("mem_mode") - if mem_mode in ["internal_decoupled", "external"]: - n_weight_inps = self.calc_wmem() - num_w_reps = np.prod(self.get_nodeattr("numInputVectors")) - io_dict["inputs"]["in1"] = [0 for i in range(num_w_reps * n_weight_inps)] - super().derive_characteristic_fxns(period, override_rtlsim_dict=io_dict) - def get_verilog_top_module_intf_names(self): intf_names = super().get_verilog_top_module_intf_names() try: @@ -1107,3 +1094,73 @@ def code_generation_ipi(self): else: raise Exception("Unrecognized mem_mode for MatrixVectorActivation") return cmd + + def get_tree_model(self): + MW = self.get_nodeattr("MW") + MH = self.get_nodeattr("MH") + + SIMD = self.get_nodeattr("SIMD") + PE = self.get_nodeattr("PE") + numVectors = np.prod(self.get_nodeattr("numInputVectors")) + SF = int(MW / SIMD) + NF = int(MH / PE) + + IMPL_STYLE = "rtl" if "_rtl" in (self.__class__.__name__) else "hls" + assert IMPL_STYLE in ["rtl", "hls"], "Implementation style must be 'rtl' or 'hls'" + + # additional precision which is typically unnecessary for FIFO size modelling + # if 
IMPL_STYLE == "hls": + # output_delay = 0 # cycles before output starts + # writing when input is read. Typically 2 + # wind_up = 0 # about 3 cycles of wind-up for HLS MVAU + # else: + # # RTL implementation + # output_delay = 0 + wind_up = 0 + + idle = Characteristic_Node("idle cycles", [(1, [0, 0])], True) + read = Characteristic_Node("Read a burst of input", [(1, [1, 0])], True) + write = Characteristic_Node("update output", [(1, [0, 1])], True) + read_and_write = Characteristic_Node("update output", [(1, [1, 1])], True) + + write_PE = Characteristic_Node( + "iterate MW/SIMD and update an output", + [ + (SF - 1, idle), + (1, write), + ], + False, + ) + + feature_map = Characteristic_Node( + "Compute single feature map", + [(wind_up, idle), (SF - 1, read), (0, idle), (1, read_and_write), (NF - 1, write_PE)], + False, + ) + + all_feature_maps = Characteristic_Node( + "compute set of feature maps", [(1, idle), (numVectors, feature_map)], False + ) + + return all_feature_maps + + def derive_token_access_vectors( + self, model, period, strategy, fpga_part, clk_period, op_type, override_dict=None + ): + n_inps = np.prod(self.get_folded_input_shape()[:-1]) + io_dict = { + "inputs": { + "in0": [i for i in range(n_inps)], + }, + "outputs": {"out0": []}, + } + + mem_mode = self.get_nodeattr("mem_mode") + if mem_mode in ["internal_decoupled", "external"]: + n_weight_inps = self.calc_wmem() + num_w_reps = int(np.prod(self.get_nodeattr("numInputVectors"))) + io_dict["inputs"]["in1"] = [i for i in range(num_w_reps * n_weight_inps)] + + super().derive_token_access_vectors( + model, period, strategy, fpga_part, clk_period, op_type, override_dict=io_dict + ) diff --git a/src/finn/custom_op/fpgadataflow/pool.py b/src/finn/custom_op/fpgadataflow/pool.py index 4a1013af05..a72a55c2b4 100644 --- a/src/finn/custom_op/fpgadataflow/pool.py +++ b/src/finn/custom_op/fpgadataflow/pool.py @@ -30,6 +30,7 @@ from qonnx.core.datatype import DataType from finn.custom_op.fpgadataflow.hwcustomop 
import HWCustomOp +from finn.util.basic import Characteristic_Node class Pool(HWCustomOp): @@ -211,3 +212,50 @@ def execute_node(self, context, graph): result = np.right_shift(result.astype(int), shift_bits) oshape = context[node.output[0]].shape context[node.output[0]] = np.asarray(result, dtype=np.float32).reshape(oshape) + + def get_tree_model(self): + # extract node attr + + PE = self.get_nodeattr("PE") + Channels = self.get_nodeattr("Channels") + KernelSize = self.get_nodeattr("KernelSize") + OutImgDims = self.get_nodeattr("OutImgDims") + BatchSize = self.get_nodeattr("BatchSize") + + # Derived parameters + NF = Channels // PE # neuron folding + func = self.get_nodeattr("Function") + if func == "MaxPool": + SF = KernelSize[1] ** 2 # spatial folding per pooling window + if KernelSize[0] == 1 or KernelSize[1] == 1: + if KernelSize[0] == 1: + SF = KernelSize[1] ** 2 + else: + SF = KernelSize[0] ** 2 + SF = np.prod(KernelSize) + reps = BatchSize * np.prod(OutImgDims) # number of pooling windows to process + else: + SF = np.prod(KernelSize) # spatial folding per pooling window + reps = BatchSize * np.prod(OutImgDims) # number of pooling windows to process + + # One input read per SF iteration + read_pooling_input = Characteristic_Node("Read Pool Input", [(1, [1, 0])], True) + + readwrite_pooling_input = Characteristic_Node("Read Write Pool Input", [(1, [1, 1])], True) + + # SF - 1 reads + 1 read that overlaps with write + compute_pool_window = Characteristic_Node( + "Compute Pool Window", + [(SF - 1, read_pooling_input), (1, readwrite_pooling_input)], # overlap with output + False, + ) + + # For each NF tile per pooling window + compute_all_tiles = Characteristic_Node( + "Compute All Tiles", [(NF, compute_pool_window)], False + ) + + # For each image region (spatial + batch) + pool_top = Characteristic_Node("Top Pool Loop", [(reps, compute_all_tiles)], False) + + return pool_top diff --git a/src/finn/custom_op/fpgadataflow/streamingdatawidthconverter.py 
b/src/finn/custom_op/fpgadataflow/streamingdatawidthconverter.py index 8fcbae5fcc..4582cb22cf 100644 --- a/src/finn/custom_op/fpgadataflow/streamingdatawidthconverter.py +++ b/src/finn/custom_op/fpgadataflow/streamingdatawidthconverter.py @@ -32,6 +32,7 @@ from qonnx.core.datatype import DataType from finn.custom_op.fpgadataflow.hwcustomop import HWCustomOp +from finn.util.basic import Characteristic_Node # does not do anything at the ONNX node-by-node level, and input-output # tensor shapes are the same. performs data width conversion at the rtlsim level @@ -125,6 +126,14 @@ def get_folded_output_shape(self, ind=0): return dummy_t.shape + def get_number_input_values(self): + folded_ishape = self.get_folded_input_shape() + return np.prod(folded_ishape[:-1]) + + def get_number_output_values(self): + folded_oshape = self.get_folded_output_shape() + return np.prod(folded_oshape[:-1]) + def get_instream_width(self, ind=0): in_width = self.get_nodeattr("inWidth") return in_width @@ -175,6 +184,9 @@ def execute_node(self, context, graph): output = np.asarray([output], dtype=np.float32).reshape(*exp_shape) context[node.output[0]] = output + def get_exp_cycles(self): + return np.prod(self.get_folded_input_shape()) + np.prod(self.get_folded_output_shape()) + def lut_estimation(self): """Calculates resource estimations for LUTs""" inw = self.get_instream_width() @@ -203,3 +215,78 @@ def lut_estimation(self): cset_luts += outw return int(cnt_luts + cset_luts) + + def get_tree_model(self): + inWidth = self.get_nodeattr("inWidth") + outWidth = self.get_nodeattr("outWidth") + + wind_up = 0 + + idle = Characteristic_Node("idle", [(1, [0, 0])], True) + + if inWidth > outWidth: + numReps = self.get_number_input_values() + # down-conversion + if inWidth % outWidth != 0: + return None # no support for gcd partial conversion yet + + writes_per_read = inWidth // outWidth + # read 1, write many, repeats for in-word count + + read_input = Characteristic_Node("read 1 word", [(1, [1, 1])], 
True) + + write_output = Characteristic_Node("write words", [(writes_per_read - 1, [0, 1])], True) + + down_convert_word = Characteristic_Node( + "down convert all words in a single transaction", + [(1, read_input), (1, write_output)], + False, + ) + + dwc_top = Characteristic_Node( + "compute a set of DWCs with down conversion", + [(wind_up, idle), (numReps, down_convert_word)], + False, + ) + + elif inWidth < outWidth: + numReps = self.get_number_output_values() + # up-conversion + + if outWidth % inWidth != 0: + return None # no support for gcd partial conversion yet + + reads_per_write = outWidth // inWidth + # read 1, write many, repeats for in-word count + + read_input = Characteristic_Node( + "read first N-1 words", [(reads_per_write - 1, [1, 0])], True + ) + + write_output = Characteristic_Node( + "read Nth word and write output word", [(1, [1, 1])], True + ) + + up_convert_word = Characteristic_Node( + "down convert all words in a single transaction", + [(1, read_input), (1, write_output)], + False, + ) + + dwc_top = Characteristic_Node( + "compute a set of DWCs with up conversion", + [(wind_up, idle), (numReps, up_convert_word)], + False, + ) + + else: + # pass-through + numReps = self.get_number_input_values() + + pass_through = Characteristic_Node("pass-through", [(1, [1, 1])], True) + + dwc_top = Characteristic_Node( + "DWC pass-through, no conversion", [(wind_up, idle), (numReps, pass_through)], False + ) + + return dwc_top diff --git a/src/finn/custom_op/fpgadataflow/thresholding.py b/src/finn/custom_op/fpgadataflow/thresholding.py index 93871b4e11..70bd1a81cb 100644 --- a/src/finn/custom_op/fpgadataflow/thresholding.py +++ b/src/finn/custom_op/fpgadataflow/thresholding.py @@ -33,6 +33,7 @@ from qonnx.util.basic import interleave_matrix_outer_dim_from_partitions from finn.custom_op.fpgadataflow.hwcustomop import HWCustomOp +from finn.util.basic import Characteristic_Node class Thresholding(HWCustomOp): @@ -271,3 +272,50 @@ def calc_tmem(self): 
num_channels = self.get_nodeattr("NumChannels") pe = self.get_nodeattr("PE") return num_channels // pe + + def get_tree_model(self): + reps = list(self.get_nodeattr("numInputVectors"))[0] + + NumChannels = self.get_nodeattr("NumChannels") + PE = self.get_nodeattr("PE") + ImgDim = np.prod(list(self.get_nodeattr("numInputVectors"))) // reps + + act = DataType[self.get_nodeattr("outputDataType")] + IMPL_STYLE = "rtl" if "_rtl" in (self.__class__.__name__) else "hls" + assert IMPL_STYLE in ["rtl", "hls"], "Implementation style must be 'rtl' or 'hls'" + + NF = NumChannels // PE + total_iterations = ImgDim * NF + + if IMPL_STYLE == "hls": + output_delay = 0 # 4 if 2023.1 vivado + else: + if act == DataType["BIPOLAR"]: + output_delay = 0 # 4 if 2023.1 vivado + else: + output_delay = 0 + + if total_iterations > output_delay: + read = Characteristic_Node("read", [(output_delay, [1, 0])], True) + + read_write = Characteristic_Node( + "Compute", [(total_iterations - output_delay, [1, 1])], True + ) + + write = Characteristic_Node("write", [(output_delay, [0, 1])], True) + + threshold_top = Characteristic_Node( + "Thresholding Top", [(1, read), (1, read_write), (1, write)], False + ) + + else: + read = Characteristic_Node("Rush-in", [(total_iterations, [1, 0])], True) + idle = Characteristic_Node("Idle", [(output_delay - total_iterations, [0, 0])], True) + + write = Characteristic_Node("Compute", [(total_iterations, [0, 1])], True) + + threshold_top = Characteristic_Node( + "Thresholding Top", [(1, read), (1, idle), (1, write)], False + ) + + return threshold_top # top level phase of this node diff --git a/src/finn/custom_op/fpgadataflow/vectorvectoractivation.py b/src/finn/custom_op/fpgadataflow/vectorvectoractivation.py index 965fad66e1..5799ba49b2 100644 --- a/src/finn/custom_op/fpgadataflow/vectorvectoractivation.py +++ b/src/finn/custom_op/fpgadataflow/vectorvectoractivation.py @@ -41,6 +41,7 @@ ) from finn.custom_op.fpgadataflow.hwcustomop import HWCustomOp +from 
    def get_tree_model(self):
        """Build the characteristic tree for the VVAU: numReps transactions,
        each of NF folds of SF-1 reads followed by one combined read+write.

        Returns the top-level Characteristic_Node.
        """
        # key parameters
        IMPL_STYLE = "rtl" if "_rtl" in (self.__class__.__name__) else "hls"
        assert IMPL_STYLE in ["rtl", "hls"], "Implementation style must be 'rtl' or 'hls'"

        SIMD = self.get_nodeattr("SIMD")
        PE = self.get_nodeattr("PE")
        Channels = self.get_nodeattr("Channels")
        Kernel_2 = np.prod(self.get_nodeattr("Kernel"))
        NF = int(Channels / PE)  # neuron fold
        numReps = np.prod(self.get_nodeattr("Dim"))
        # NOTE(review): dim_h/dim_w are currently unused
        dim_h, dim_w = self.get_nodeattr("Dim")

        # NOTE(review): both branches compute the same SF today; they are kept
        # separate because the (commented) wind_up values differ per style
        if IMPL_STYLE == "rtl":
            SF = Kernel_2 // SIMD
            # wind_up = 5
        else:
            SF = Kernel_2 // SIMD
            # wind_up = 7

        # INNER = TOTAL_FOLD // SF

        # wind_up_stage = Characteristic_Node(
        #     "write only",
        #     [(wind_up, [1,0])],
        #     True)

        # the windup stage should also exist and delay the outputs
        # this requires the same pattern of limiting SF and is probably best done as a correction
        # after the feature map?
        # alternative is to construct a split of first, middle and last sf,
        # with the first having a longer read phase (sf+windup-1) and the last (sf-windup-1)

        write_out = Characteristic_Node("write out simd (1 for hls)", [(1, [1, 1])], True)

        compute_one_sf = Characteristic_Node("read one SF input", [(1, [1, 0])], True)

        compute_sf = Characteristic_Node(
            "process SF-1 inputs", [(SF - 1, compute_one_sf), (1, write_out)], False
        )

        compute_transaction = Characteristic_Node(
            "Compute VVAU one transaction",
            [
                (NF, compute_sf),
            ],
            False,
        )

        vvau_top = Characteristic_Node(
            "Compute VVAU input set", [(numReps, compute_transaction)], False
        )

        return vvau_top  # top level phase of this node

    def derive_token_access_vectors(
        self, model, period, strategy, fpga_part, clk_period, op_type, override_dict=None
    ):
        """Derive token access vectors for this node via the superclass, feeding
        it an io_dict describing the input/weight streams.

        NOTE(review): "in0" is an index ramp here whereas the removed legacy
        derive_characteristic_fxns used all-zeros — TODO confirm intentional.
        NOTE(review): the weight stream length uses 1 * n_inps; the legacy
        num_w_reps * n_weight_inps computation is kept commented for reference.
        """
        n_inps = np.prod(self.get_folded_input_shape()[:-1])
        io_dict = {
            "inputs": {
                "in0": [i for i in range(n_inps)],
            },
            "outputs": {"out0": []},
        }

        mem_mode = self.get_nodeattr("mem_mode")
        if mem_mode in ["internal_decoupled", "external"]:
            # n_weight_inps = self.calc_wmem()
            # num_w_reps = np.prod(self.get_nodeattr("numInputVectors"))
            io_dict["inputs"]["in1"] = [0 for i in range(1 * n_inps)]

        super().derive_token_access_vectors(
            model, period, strategy, fpga_part, clk_period, op_type, override_dict=io_dict
        )
class JustInTimeSynthesize(Transformation):
    """Synthesize IP and prepare rtlsim only for nodes that still need it.

    Pass 1: for each HLS/RTL node with an empty "io_chrc_in" (and, when
    only_without_tree_model is set, no analytic tree model), run single-node
    codegen and HLS synthesis. Pass 2: prepare rtlsim executables for the same
    selection (excluding branch/FIFO ops), then set exec mode to "rtlsim".
    """

    def __init__(self, part, clk_period, only_without_tree_model=False):
        super().__init__()
        self.part = part                # FPGA part for codegen/synthesis
        self.clk_period = clk_period    # target clock period for codegen
        # if True, only nodes lacking an analytic tree model are synthesized
        self.only_without_tree_model = only_without_tree_model

    def apply(self, model):
        for node in model.graph.node:
            inst = registry.getCustomOp(node)
            # select: HLS/RTL node, not yet characterized, and (optionally)
            # lacking an analytic tree model
            if (is_hls_node(node) or is_rtl_node(node)) and (
                (
                    (inst.get_tree_model() is None and self.only_without_tree_model)
                    or not self.only_without_tree_model
                )
                and (inst.get_nodeattr("io_chrc_in") == "")
            ):
                _codegen_single_node(
                    node,
                    model,
                    self.part,
                    self.clk_period,
                )

                op_type = node.op_type
                if is_hls_node(node):
                    try:
                        # ensure that code is generated
                        assert (
                            inst.get_nodeattr("code_gen_dir_ipgen") != ""
                        ), """Node
                        attribute "code_gen_dir_ipgen" is empty. Please run
                        transformation PrepareIP first."""
                        if not os.path.isdir(
                            inst.get_nodeattr("ipgen_path")
                        ) or not inst.get_nodeattr("code_gen_dir_ipgen") in inst.get_nodeattr(
                            "ipgen_path"
                        ):
                            # call the compilation function for this node
                            inst.ipgen_singlenode_code()
                        else:
                            warnings.warn("Using pre-existing IP for %s" % node.name)
                        # ensure that executable path is now set
                        assert (
                            inst.get_nodeattr("ipgen_path") != ""
                        ), """Transformation
                        HLSSynthIP was not successful. Node attribute "ipgen_path"
                        is empty."""
                    except KeyError:
                        raise Exception("Custom op_type %s is currently not supported." % op_type)

        model = model.transform(ReplaceVerilogRelPaths())
        for node in model.graph.node:
            inst = registry.getCustomOp(node)
            if (
                (is_hls_node(node) or is_rtl_node(node))
                and (
                    (inst.get_tree_model() is None and self.only_without_tree_model)
                    or not self.only_without_tree_model
                )
                and (
                    # branch/FIFO ops are handled elsewhere (see HandleBranches)
                    node.op_type
                    not in [
                        "AddStreams_hls",
                        "DuplicateStreams_hls",
                        "StreamingFIFO_hls",
                        "StreamingFIFO_rtl",
                    ]
                )
                and (inst.get_nodeattr("rtlsim_so") == "")
            ):
                try:
                    inst.prepare_rtlsim()
                    # ensure that executable path is now set
                    assert (
                        inst.get_nodeattr("rtlsim_so") != ""
                    ), "Failed to prepare RTLSim, no rtlsim_so attribute found."
                except KeyError:
                    # NOTE(review): op_type here is left over from the first
                    # loop — if pass 1 selected no node this raises NameError;
                    # confirm and consider binding node.op_type locally
                    raise Exception("Custom op_type %s is currently not supported." % op_type)

        model = model.transform(SetExecMode("rtlsim"))

        return (model, False)
class DeriveTokenAccessVectors(NodeLocalTransformation):
    """For each HLS/RTL node in the graph, run rtlsim to obtain the i/o
    characteristic function (token access vectors) for FIFO sizing and store
    it as node attributes. Assumes PrepareRTLSim has already been run and
    node names are unique (GiveUniqueNodeNames).

    model/period/strategy/fpga_part/clk_period are forwarded to each node's
    derive_token_access_vectors(); nodes_to_ignore lists node names to skip;
    num_workers / manual_bypass as in NodeLocalTransformation.
    """

    def __init__(
        self,
        model,
        period,
        strategy,
        fpga_part,
        clk_period,
        num_workers=None,
        manual_bypass=False,
        nodes_to_ignore=(),  # fixed: was a mutable [] default; () has identical semantics
    ):
        super().__init__(num_workers=num_workers)
        self.model = model
        self.period = period
        self.strategy = strategy
        self.fpga_part = fpga_part
        self.clk_period = clk_period
        self.manual_bypass = manual_bypass
        self.nodes_to_ignore = set(nodes_to_ignore)

    def applyNodeLocal(self, node):
        op_type = node.op_type
        if is_hls_node(node) or is_rtl_node(node):
            try:
                # lookup op_type in registry of CustomOps
                print("deriving: ", node.name)
                inst = registry.getCustomOp(node)
                if node.name in self.nodes_to_ignore:
                    print(f"ignoring derivation of node {node.name}")
                    return (node, False)

                # branch/FIFO ops are characterized later (see HandleBranches)
                if op_type not in [
                    "AddStreams_hls",
                    "DuplicateStreams_hls",
                    "StreamingFIFO_hls",
                    "StreamingFIFO_rtl",
                ]:
                    inst.derive_token_access_vectors(
                        model=self.model,
                        period=self.period,
                        strategy=self.strategy,
                        fpga_part=self.fpga_part,
                        clk_period=self.clk_period,
                        op_type=op_type,
                    )
            except KeyError:
                # exception if op_type is not supported
                raise Exception("Custom op_type %s is currently not supported." % op_type)
        return (node, False)

    def apply(self, model: ModelWrapper):
        # fixed: both the manual_bypass and default paths returned the same
        # tuple, so the dead conditional is collapsed (behavior unchanged)
        (model, run_again) = super().apply(model)
        return (model, run_again)
""" - def __init__(self, period, num_workers=None, manual_bypass=False): + def __init__( + self, + model, + period, + strategy, + fpga_part, + clk_period, + num_workers=None, + manual_bypass=False, + nodes_to_ignore=[], + ): super().__init__(num_workers=num_workers) + self.model = model self.period = period + self.strategy = strategy + self.fpga_part = fpga_part + self.clk_period = clk_period self.manual_bypass = manual_bypass + self.nodes_to_ignore = set(nodes_to_ignore) def applyNodeLocal(self, node): op_type = node.op_type if is_hls_node(node) or is_rtl_node(node): try: # lookup op_type in registry of CustomOps + print("deriving: ", node.name) inst = registry.getCustomOp(node) - inst.derive_characteristic_fxns(period=self.period) + if node.name in self.nodes_to_ignore: + print(f"ignoring derivation of node {node.name}") + return (node, False) + + if op_type not in [ + "AddStreams_hls", + "DuplicateStreams_hls", + "StreamingFIFO_hls", + "StreamingFIFO_rtl", + ]: + inst.derive_token_access_vectors( + model=self.model, + period=self.period, + strategy=self.strategy, + fpga_part=self.fpga_part, + clk_period=self.clk_period, + op_type=op_type, + ) except KeyError: # exception if op_type is not supported raise Exception("Custom op_type %s is currently not supported." % op_type) @@ -73,114 +203,1053 @@ def apply(self, model: ModelWrapper): (model, run_again) = super().apply(model) if not self.manual_bypass: return (model, run_again) - # apply manual fix for DuplicateStreams and AddStreams for - # simple residual reconvergent paths with bypass + + return (model, run_again) + + +class LocalStretchCharacteristicFunctions(NodeLocalTransformation): + """Prerequisite: DeriveTokenAccessVectors already called on graph. + For each node in the graph, use the accumulated I/O characteristic function + and stretch it if there is a difference in periods between the producer and consumer. 
+ + * num_workers (int or None) number of parallel workers, see documentation in + NodeLocalTransformation for more details. + period (int or None) the period to stretch the individual node chr function dumps to. + """ + + def __init__(self, num_workers=None, period=None, nodes_to_ignore=[]): + super().__init__(num_workers=num_workers) + self.period = period + self.nodes_to_ignore = set(nodes_to_ignore) + + def applyNodeLocal(self, node): + op_type = node.op_type + if is_hls_node(node) or is_rtl_node(node): + try: + if node.name in self.nodes_to_ignore or node.op_type in [ + "AddStreams_hls", + "DuplicateStreams_hls", + "StreamingFIFO_hls", + "StreamingFIFO_rtl", + ]: + return (node, False) + + # model = self.ref_input_model + + # lookup op_type in registry of CustomOps + prod = registry.getCustomOp(node) + + prod_chrc_out_original = decompress_string_to_numpy( + prod.get_nodeattr("io_chrc_out") + )[0] + prod_chrc_in_original = decompress_string_to_numpy(prod.get_nodeattr("io_chrc_in"))[ + 0 + ] + + prod_chrc_out = prod_chrc_out_original + prod_chrc_in = prod_chrc_in_original + + compressed_prod_chrc_out = compress_numpy_to_string(np.array([prod_chrc_out])) + compressed_prod_chrc_in = compress_numpy_to_string(np.array([prod_chrc_in])) + + period = max(len(prod_chrc_in), len(prod_chrc_out)) + + #period = self.period + + # perform stretching if necessary + prod_chrc_in = stretch(prod_chrc_in, period) + prod_chrc_out = stretch(prod_chrc_out, period) + + compressed_prod_chrc_in = compress_numpy_to_string(np.array([prod_chrc_in])) + compressed_prod_chrc_out = compress_numpy_to_string(np.array([prod_chrc_out])) + + # prod.set_nodeattr("io_chrc_in", compressed_prod_chrc_in) + # prod.set_nodeattr("io_chrc_out", compressed_prod_chrc_out) + except KeyError: + # exception if op_type is not supported + raise Exception("Custom op_type %s is currently not supported." 
def get_top_producer_period(node, model):
    """Return (period, producer) for *node*'s upstream non-DWC producer.

    Period is max(len(io_chrc_out), len(io_chrc_in)) // 2 of the producer's
    decompressed characteristic traces, 0 if there is no producer.
    Fixed: the original looped over node.input but recomputed the identical
    find_non_dwc_producer(model, node) every iteration (it only inspects
    input[0]); the lookup is hoisted — results unchanged. Also guards the
    zero-input case, which previously raised NameError at the return.
    """
    if len(node.input) == 0:
        return 0, None
    # prod_node = model.find_producer(input_name)  # per-edge variant, for reference
    prod_node = find_non_dwc_producer(model, node)
    highest_period = 0
    if prod_node is not None:
        inst = registry.getCustomOp(prod_node)
        prod_chrc = decompress_string_to_numpy(inst.get_nodeattr("io_chrc_out"))[0]
        cons_chrc = decompress_string_to_numpy(inst.get_nodeattr("io_chrc_in"))[0]
        highest_period = max(len(prod_chrc) // 2, len(cons_chrc) // 2)
    return highest_period, prod_node


def get_top_consumer_period(node, model):
    """Return (period, consumer) for *node*'s downstream non-DWC consumer.

    Mirror of get_top_producer_period; same loop-invariant hoist applied
    (find_non_dwc_consumer only inspects output[0]).
    """
    if len(node.output) == 0:
        return 0, None
    # cons_node = model.find_consumer(output_name)  # per-edge variant, for reference
    cons_node = find_non_dwc_consumer(model, node)
    highest_period = 0
    if cons_node is not None:
        inst = registry.getCustomOp(cons_node)
        prod_chrc = decompress_string_to_numpy(inst.get_nodeattr("io_chrc_out"))[0]
        cons_chrc = decompress_string_to_numpy(inst.get_nodeattr("io_chrc_in"))[0]
        highest_period = max(len(prod_chrc) // 2, len(cons_chrc) // 2)
    return highest_period, cons_node


def max_throughput(trace, max_depth=10, min_size=10):
    """
    Recursively find the maximum throughput (delta / time) from a cumulative trace.

    Parameters:
        trace (np.ndarray): 1D cumulative access trace.
        max_depth (int): maximum depth of recursive splitting.
        min_size (int): minimum size of segment allowed for consideration.

    Returns:
        float: maximum throughput found in any segment (0.0 when the trace is
        too short or flat).
    """
    segments = [(0, len(trace) - 1)]
    best_throughput = 0.0

    for _ in range(max_depth):
        max_local_throughput = 0
        max_segment = None

        # evaluate current segments; keep the steepest sufficiently-long one
        for start, end in segments:
            duration = end - start
            if duration < min_size:
                continue
            delta = trace[end] - trace[start]
            throughput = delta / duration
            if throughput > max_local_throughput:
                max_local_throughput = throughput
                max_segment = (start, end)

        if max_segment is None:
            break

        best_throughput = max(best_throughput, max_local_throughput)

        # subdivide the fastest segment if both halves stay large enough
        start, end = max_segment
        mid = (start + end) // 2
        if (mid - start) < min_size or (end - mid) < min_size:
            break

        segments = [s for s in segments if s != max_segment]
        segments += [(start, mid), (mid, end)]

    return best_throughput


def get_nodes_until_converging(node, model):
    """Count producers walked upstream from *node* until a DuplicateStreams
    node is reached; returns the count reached if the chain ends at a graph
    input without finding one."""
    # init_node = node
    count = 0
    while node is not None:
        if node.name.startswith("DuplicateStreams"):
            return count
        node = model.find_producer(node.input[0])
        count += 1
    return count


def get_throughput(node, dir="in"):
    """Tokens-per-cycle estimate for *node*'s given direction ("in"/"out").

    Prefers the stretched characteristic trace, falls back to the raw one;
    returns 0 when neither is set. NOTE(review): `dir` shadows the builtin but
    is part of the public signature, so it is kept.
    """
    trace = None
    throughput = 0
    inst = registry.getCustomOp(node)
    if inst.get_nodeattr(f"io_chrc_{dir}_stretch") != "":
        trace = decompress_string_to_numpy(inst.get_nodeattr(f"io_chrc_{dir}_stretch"))[0]
        period = len(trace) // 2
    else:
        if inst.get_nodeattr(f"io_chrc_{dir}") != "":
            trace = decompress_string_to_numpy(inst.get_nodeattr(f"io_chrc_{dir}"))[0]
            period = len(trace) // 2
        else:
            period = 0
    if period != 0:
        # throughput = max_throughput(trace, min_size=int(np.sqrt(period)))
        # average rate over the node's declared period
        throughput = trace[-1] / inst.get_nodeattr("io_chrc_period")
        # throughput = max_throughput(trace, min_size=1000)
    return throughput


def get_parent_throughput(node, model):
    """Max output throughput over *node*'s direct producers (0 for absent ones)."""
    throughputs = []
    for indx, input_name in enumerate(node.input):
        prod_node = model.find_producer(input_name)
        if prod_node is not None:
            throughputs.append(get_throughput(prod_node, "out"))
        else:
            throughputs.append(0)
    return max(throughputs)
def get_parent(node, model):
    """Return the producer of *node*'s first input, or None.

    NOTE(review): the else-branch returns None on the first iteration, so only
    input[0] is ever inspected and the trailing return covers zero inputs.
    """
    for indx, input_name in enumerate(node.input):
        prod_node = model.find_producer(input_name)
        if prod_node is not None:
            return prod_node
        else:
            return None
    return None


def get_consumer(node, model):
    """Return the consumer of *node*'s first output (None when node has no
    outputs, via implicit return)."""
    for indx, output_name in enumerate(node.output):
        cons = model.find_consumer(output_name)
        return cons


def get_consumer_throughput(node, model):
    """Max input throughput over *node*'s direct consumers (0 for absent ones).

    NOTE(review): raises ValueError via max([]) when node has no outputs.
    """
    throughputs = []
    for indx, output_name in enumerate(node.output):
        prod_node = model.find_consumer(output_name)
        if prod_node is not None:
            throughputs.append(get_throughput(prod_node, "in"))
        else:
            throughputs.append(0)
    return max(throughputs)


def get_true_period(node):
    """Half the longer of the node's decompressed in/out characteristic traces
    (traces store 2*period entries)."""
    in_chrc = decompress_string_to_numpy(node.get_nodeattr("io_chrc_in"))[0]
    out_chrc = decompress_string_to_numpy(node.get_nodeattr("io_chrc_out"))[0]

    return max(len(in_chrc) // 2, len(out_chrc) // 2)


def get_branch_nodes(last_node, model):
    """Walk producers upstream from *last_node* until the enclosing
    DuplicateStreams_hls node; return (nodes walked, that DuplicateStreams
    node). Assumes the branch does converge — otherwise this loops into a
    graph input and raises."""
    branch_nodes = []
    while last_node.op_type != "DuplicateStreams_hls":
        branch_nodes.append(last_node)
        last_node = model.find_producer(last_node.input[0])
    return branch_nodes, last_node


def get_branch_volume(as_node, indx, model):
    """Collect one branch of an AddStreams node (input index *indx*) back to
    its DuplicateStreams origin.

    Returns (volume, branch, max_i + 1, latency, max_period) where branch is
    [as_node, ...intermediate nodes..., ds_node], max_period/max_i locate the
    slowest node, and volume/latency are placeholders (volume just counts
    nodes; latency is always 0 for now — see inline comments).
    """
    last_node = model.find_producer(as_node.input[indx])
    branch_nodes, ds_node = get_branch_nodes(last_node, model)
    branch = [as_node, *branch_nodes, ds_node]

    # now perform volume calculation based on characteristic functions
    # note that the nodes are reversed, we start at addstreams node
    volume = 0
    max_i = 0
    max_period = 0
    latency = 0
    for i, node in enumerate(branch[1:]):
        volume += 1  # placeholder
        period = registry.getCustomOp(node).get_nodeattr("io_chrc_period")
        if period > max_period:
            max_period = period
            max_i = i

    # actual calculation has to consider the exp cycles and total nr of elements.
    # maybe maximum amount of values per period?
    # we can do this sort of calc by comparing the first consumed token to the
    # last produced token in some form.
    print("returning vol,max_i,lat: ", volume, max_i, latency)

    return volume, branch, max_i + 1, latency, max_period
def find_non_dwc_producer(model, node):
    """Producer of *node*'s first input, skipping over a single interposed
    StreamingDataWidthConverter (identified by node name)."""
    producer = model.find_producer(node.input[0])
    if producer is None:
        return None
    if "StreamingDataWidthConverter" in producer.name:
        producer = model.find_producer(producer.input[0])
    return producer


def find_non_dwc_consumer(model, node):
    """Consumer of *node*'s first output, skipping over a single interposed
    StreamingDataWidthConverter (identified by node name)."""
    consumer = model.find_consumer(node.output[0])
    if consumer is None:
        return None
    if "StreamingDataWidthConverter" in consumer.name:
        consumer = model.find_consumer(consumer.output[0])
    return consumer


def calculate_peak_volume_delta(b0_lat, node_0, b1_lat, node_1, period_0, period_1, global_period):
    """Compare the stretched output traces of the two branch-end nodes and
    derive per-branch extra FIFO depths.

    The traces are stretched to global_period, left-padded by each branch's
    latency, right-padded to equal length, then scanned for the peak positive
    and negative token-count deltas. Returns [depth_for_branch0_side,
    depth_for_branch1_side] = [b1_lat + peak_neg, b0_lat + peak_pos].
    NOTE(review): peak_b0/peak_b1 are computed but unused; period_0/period_1
    parameters are unused — confirm before removing.
    """
    n0 = registry.getCustomOp(node_0)
    n1 = registry.getCustomOp(node_1)
    p0_v = decompress_string_to_numpy(n0.get_nodeattr("io_chrc_out"))[0]
    p1_v = decompress_string_to_numpy(n1.get_nodeattr("io_chrc_out"))[0]

    p0_v = stretch(p0_v, global_period)
    p1_v = stretch(p1_v, global_period)

    # pad vectors with latency (zeros before the first token appears)
    p0_v = np.concatenate((np.zeros(b0_lat, dtype=p0_v.dtype), p0_v))
    p1_v = np.concatenate((np.zeros(b1_lat, dtype=p1_v.dtype), p1_v))

    if len(p0_v) > len(p1_v):
        # pad p1_v end with its final cumulative value
        last = p1_v[-1]
        p1_v = np.concatenate((p1_v, np.array([last] * (len(p0_v) - len(p1_v)), dtype=p1_v.dtype)))
    else:
        # pad p0_v end with its final cumulative value
        last = p0_v[-1]
        p0_v = np.concatenate((p0_v, np.array([last] * (len(p1_v) - len(p0_v)), dtype=p0_v.dtype)))

    p = max(len(p0_v), len(p1_v))

    max_positive_delta = 0
    max_negative_delta = 0
    peak_b0 = 0
    peak_b1 = 0
    peak_deltas = [0, 0]

    # scan cycle-by-cycle for the widest divergence between the branches
    for i in range(p):
        delta = p0_v[i] - p1_v[i]
        if delta > max_positive_delta:
            max_positive_delta = delta
            peak_deltas[0] = delta
        if delta < max_negative_delta:
            max_negative_delta = delta
            peak_deltas[1] = delta * -1

        peak_b0 = max(p0_v[i], peak_b0)
        peak_b1 = max(p1_v[i], peak_b1)

    final_fifos = [int(max(0, (b1_lat)) + peak_deltas[1]), int(max(0, (b0_lat)) + peak_deltas[0])]
    return final_fifos
def compute_node_latency_init_periods(node, branch_max):
    """Distance (in cycles) between the points where the node's stretched
    input and output traces reach their final values — used as a per-node
    latency proxy. *node* is a CustomOp instance, *branch_max* the stretch
    target period."""
    cons_chrc = decompress_string_to_numpy(node.get_nodeattr("io_chrc_in"))[0]
    prod_chrc = decompress_string_to_numpy(node.get_nodeattr("io_chrc_out"))[0]

    cons_chrc = stretch(cons_chrc, branch_max)
    prod_chrc = stretch(prod_chrc, branch_max)

    def max_dist(a, b):
        # index of the first occurrence of each trace's final value
        a_last = a[-1]
        b_last = b[-1]

        idx_a = np.argmax(a == a_last)
        idx_b = np.argmax(b == b_last)

        return abs(idx_a - idx_b)

    max_distance = max_dist(cons_chrc, prod_chrc)
    return max_distance


def get_full_branch_latency(nodes, branch_max):
    """Sum of compute_node_latency_init_periods over *nodes* (ONNX nodes)."""
    total_latency = 0
    for node in nodes:
        total_latency += compute_node_latency_init_periods(registry.getCustomOp(node), branch_max)
    return total_latency


def assign_extra_fifo_volume(as_node, model, global_period):
    """Characterize a reconvergent AddStreams/DuplicateStreams pair and size
    the extra FIFO depth on the DuplicateStreams outputs.

    The DuplicateStreams node inherits its producer's characteristic
    attributes; the AddStreams node inherits its consumer's. The per-branch
    depths come from calculate_peak_volume_delta over both branch latencies.
    Returns the total extra depth added. NOTE(review): this is a stated
    over-estimation to be refined (see inline comments).
    """
    assert len(as_node.input) > 1

    _, branch_0, _, _, period_0 = get_branch_volume(as_node, 0, model)
    _, branch_1, _, _, period_1 = get_branch_volume(as_node, 1, model)

    # propagate the producer to duplicatestreams node
    ds_node = registry.getCustomOp(branch_0[-1])
    prod_node = model.find_producer(branch_0[-1].input[0])

    period_ds = get_true_period(registry.getCustomOp(prod_node))

    tav_ds = registry.getCustomOp(prod_node).get_nodeattr("io_chrc_out")
    tav_stretched_ds = registry.getCustomOp(prod_node).get_nodeattr("io_chrc_out_stretch")
    tav_pad_ds = registry.getCustomOp(prod_node).get_nodeattr("io_chrc_out_original")
    ds_node.set_nodeattr("io_chrc_in", tav_ds)
    ds_node.set_nodeattr("io_chrc_out", tav_ds)

    ds_node.set_nodeattr("io_chrc_in_original", tav_pad_ds)
    ds_node.set_nodeattr("io_chrc_out_original", tav_pad_ds)

    ds_node.set_nodeattr("io_chrc_in_stretch", tav_stretched_ds)
    ds_node.set_nodeattr("io_chrc_out_stretch", tav_stretched_ds)

    ds_node.set_nodeattr("io_chrc_period", period_ds)

    # last node with latencies version
    latency_to_first_output_0 = get_full_branch_latency(branch_0[1:], period_0)
    latency_to_first_output_1 = get_full_branch_latency(branch_1[1:], period_1)
    peak_deltas = calculate_peak_volume_delta(
        latency_to_first_output_0,
        branch_0[1],
        latency_to_first_output_1,
        branch_1[1],
        period_0,
        period_1,
        global_period,
    )

    # latency_delta = max(latency_0, latency_1) - min(latency_0, latency_1)
    # peak delta should also contain additional fifos
    # for any latency differences between nodes
    # here we take the sum input to output latency
    # of each node in a branch and take the
    # last node's volume at that clock
    # This is a severe over-estimation to improve in the future

    addstrm_node_inst = registry.getCustomOp(as_node)

    add_strm_child = get_consumer(as_node, model)
    volumes = [0, 0]

    volumes[0] = peak_deltas[1]
    volumes[1] = peak_deltas[0]

    print([volumes[0], volumes[1]])
    ds_node.set_nodeattr("extra_branch_fifos", volumes)

    old_sizes = ds_node.get_nodeattr("outFIFODepths")
    old_sizes[0] += volumes[0]
    old_sizes[1] += volumes[1]
    ds_node.set_nodeattr("outFIFODepths", old_sizes)

    tav = registry.getCustomOp(add_strm_child).get_nodeattr("io_chrc_in")
    tav_pad = registry.getCustomOp(add_strm_child).get_nodeattr("io_chrc_in_original")

    period_add = get_true_period(registry.getCustomOp(add_strm_child))

    addstrm_node_inst.set_nodeattr("io_chrc_in", tav)
    addstrm_node_inst.set_nodeattr("io_chrc_out", tav)

    addstrm_node_inst.set_nodeattr("io_chrc_out_original", tav_pad)
    addstrm_node_inst.set_nodeattr("io_chrc_in_original", tav_pad)

    addstrm_node_inst.set_nodeattr("io_chrc_period", period_add)
    return sum(volumes)


class HandleBranches(Transformation):
    """Given a characterized model, additionally generate the token
    access vectors for DuplicateStreams and AddStreams such that no
    deadlocks occur. These nodes were not characterized in the
    DeriveTokenAccessVectors step and must inherit the edge node
    token access vectors of the faster of the two branches.
    The inherited token access vector is also further padded in this
    case to simulate additional stalling on the faster branch.
    We expect the stretching operation afterwards to stretch the
    faster branch 'less' due to this padding, thus introducing FIFO
    depth during the DeriveFIFOSizes transform.
    """

    def __init__(self, model, period):
        super().__init__()
        self.model = model
        self.period = period  # global period passed to the volume calculation

    def apply(self, model: ModelWrapper):
        # NOTE(review): depth_added is accumulated but never used/returned
        depth_added = 0
        addstrm_nodes = model.get_nodes_by_op_type("AddStreams_hls")
        if len(addstrm_nodes) == 0:
            warnings.warn("No AddStreams nodes found, skipping")
            return (model, False)
        for addstrm_node in addstrm_nodes:
            depth_added += assign_extra_fifo_volume(addstrm_node, model, self.period)

        return (model, False)
class ProducerDelayCharacteristicFunctions(NodeLocalTransformation):
    """Prerequisite: DeriveTokenAccessVectors already called on graph.
    For each node in the graph, use the accumulated I/O characteristic function
    and delay it if there is a difference in periods between the producer and
    consumer. This step adjusts for a delayed consumer and a fast producer so
    that additional depth is not introduced by stretching the consumer too much
    in the next step. The consumer is 'faster' than what an immediate stretch
    might produce if we don't adjust for the latency of the producer's output
    starting to arrive.

    * num_workers (int or None): number of parallel workers, see documentation
      in NodeLocalTransformation for more details.
    * period (int or None): the period to stretch the individual node chr
      function dumps to.
    """

    def __init__(self, num_workers=None, period=None, nodes_to_ignore=[]):
        super().__init__(num_workers=num_workers)
        self.period = period
        # node names to exclude from this adjustment
        self.nodes_to_ignore = set(nodes_to_ignore)

    def applyNodeLocal(self, node):
        op_type = node.op_type
        if is_hls_node(node) or is_rtl_node(node):
            print(f"PRODUCER delaying {node.name}")
            try:
                # lookup op_type in registry of CustomOps
                prod = registry.getCustomOp(node)

                # branch/FIFO ops are handled elsewhere
                if node.op_type in [
                    "DuplicateStreams_hls",
                    "StreamingFIFO_hls",
                    "StreamingFIFO_rtl",
                ]:
                    return (node, False)
                if node.name in self.nodes_to_ignore:
                    return (node, False)

                prod_chrc_out = decompress_string_to_numpy(prod.get_nodeattr("io_chrc_out"))[0]
                # traces hold 2*period entries
                period = len(prod_chrc_out) // 2
                prod.set_nodeattr("io_chrc_period", period)

                model = self.ref_input_model
                for output_name in node.output:
                    #cons = model.find_consumer(output_name)
                    cons = find_non_dwc_consumer(model, node)
                    if cons is None:
                        print("first node, skip")
                        continue

                    cons = registry.getCustomOp(cons)
                    cons_chrc_in = decompress_string_to_numpy(cons.get_nodeattr("io_chrc_in"))[0]

                    diff = len(cons_chrc_in) - len(prod_chrc_out)

                    # consumer trace longer -> stretch this node's output trace
                    if diff > 0:
                        # stretching
                        prod_chrc_out_stretch = stretch(prod_chrc_out, len(cons_chrc_in))

                        # padding (alternative approach, kept for reference)
                        # prod_chrc_out_stretch = np.concatenate(
                        #     [prod_chrc_out, np.array([prod_chrc_out[-1]] * diff)]
                        # )

                        prod.set_nodeattr(
                            "io_chrc_out_stretch",
                            compress_numpy_to_string(np.array([prod_chrc_out_stretch])),
                        )
            except KeyError:
                # exception if op_type is not supported
                raise Exception("Custom op_type %s is currently not supported." % op_type)
        return (node, False)
class DelayCharacteristicFunctions(NodeLocalTransformation):
    """Prerequisite: DeriveTokenAccessVectors already called on graph.
    For each node in the graph, use the accumulated I/O characteristic function
    and delay it if there is a difference in periods between the producer and
    consumer. This step adjusts for a delayed consumer and a fast producer so
    that additional depth is not introduced by stretching the consumer too much
    in the next step. The consumer is 'faster' than what an immediate stretch
    might produce if we don't adjust for the latency of the producer's output
    starting to arrive.

    * num_workers (int or None): number of parallel workers, see documentation
      in NodeLocalTransformation for more details.
    * period (int or None): the period to stretch the individual node chr
      function dumps to.
    """

    def __init__(self, num_workers=None, period=None, nodes_to_ignore=[]):
        super().__init__(num_workers=num_workers)
        self.period = period
        # node names to exclude from this adjustment
        self.nodes_to_ignore = set(nodes_to_ignore)

    def applyNodeLocal(self, node):
        op_type = node.op_type
        if is_hls_node(node) or is_rtl_node(node):
            print(f"delaying {node.name}'s consumer")
            try:
                # lookup op_type in registry of CustomOps
                # prod = registry.getCustomOp(node)

                if node.op_type in [
                    "DuplicateStreams_hls",
                    "StreamingFIFO_hls",
                    "StreamingFIFO_rtl",
                ]:
                    return (node, False)
                # assert not (op_type.startswith("StreamingFIFO")), "Found existing FIFOs"
                # we allow a FIFO, it will get removed in the next transform and is used to
                # fill in a bypass branch
                if node.name in self.nodes_to_ignore:
                    print(f"ignoring delaying of node {node.name} consumers")
                    return (node, False)

                # perform stretching if necessary
                # prod_period = prod.get_nodeattr("io_chrc_period")

                model = self.ref_input_model
                for input_name in node.input:
                    #prod = model.find_producer(input_name)
                    prod = find_non_dwc_producer(model, node)
                    if prod is None:
                        print("last node, skip")
                        continue

                    prod = registry.getCustomOp(prod)

                    prod_chrc_out = decompress_string_to_numpy(prod.get_nodeattr("io_chrc_out"))[0]
                    # period = len(prod_chrc_out) // 2

                    cons = registry.getCustomOp(node)
                    cons_chrc_in = decompress_string_to_numpy(cons.get_nodeattr("io_chrc_in"))[0]

                    cons_period = len(cons_chrc_in) // 2

                    cons.set_nodeattr("io_chrc_period", cons_period)

                    # NOTE(review): debug aid — mutates global numpy print
                    # options as a side effect; consider removing
                    import sys

                    np.set_printoptions(threshold=sys.maxsize)

                    diff = len(prod_chrc_out) - len(cons_chrc_in)

                    # producer trace longer -> stretch this node's input trace
                    if diff > 0:
                        print("padding cons input")

                        # stretch
                        cons_chrc_in_stretch = stretch(cons_chrc_in, len(prod_chrc_out))

                        # padding (alternative approach, kept for reference)
                        # cons_chrc_in_stretch = np.concatenate(
                        #     [np.array([cons_chrc_in[-1]] * diff), cons_chrc_in]
                        # )
                        #
                        cons.set_nodeattr(
                            "io_chrc_in_stretch",
                            compress_numpy_to_string(np.array([cons_chrc_in_stretch])),
                        )

                    compressed_cons_chrc_in = compress_numpy_to_string(np.array([cons_chrc_in]))
                    # compressed_cons_chrc_out = compress_numpy_to_string(np.array([cons_chrc_out]))

                    # setting these parameters here will make final
                    # characterization func comparisons impossible!
                    cons.set_nodeattr("io_chrc_in", compressed_cons_chrc_in)
                    print(f"updated {cons.onnx_node.name} period to {len(cons_chrc_in)}")

            except KeyError:
                # exception if op_type is not supported
                raise Exception("Custom op_type %s is currently not supported." % op_type)
        return (node, False)
% op_type) + return (node, False) + + +def inter_token_gaps(tav): + if tav is None or tav.size == 0: + return np.array([1]), np.array([0]) # reasonable defaults + + # Find indices where tokens are added (nonzero diff indicates a new token) + token_times = np.flatnonzero(np.diff(tav) > 0) + 1 # +1 to align with time index + + if token_times.size < 2: + # Not enough token events to compute gaps + # Default gap of 1 between tokens (or 0 if no tokens) + return np.array([1]), token_times + + # Compute gaps between token emissions + # median = np.median + gaps = np.diff(token_times) + # median_gap = np.array([int(np.median(gaps))]) + return gaps, token_times # ,gaps_min + + +def remove_trailing_duplicates_keep_one(arr): + arr = np.asarray(arr) + if arr.size == 0: + return arr + + last_val = arr[-1] + # Find index where values stop being the same as the last value (from the end) + i = len(arr) - 1 + while i > 0 and arr[i - 1] == last_val: + i -= 1 + + # Keep everything before the trailing duplicates + one final instance + return np.concatenate((arr[:i], [last_val])) + + +def remove_leading_duplicates_keep_one(arr): + arr = np.asarray(arr) + if arr.size == 0: + return arr + + first_val = arr[0] + # Find index where values stop being the same as the first value (from the start) + i = 0 + while i < len(arr) - 1 and arr[i + 1] == first_val: + i += 1 + + # Keep one leading instance, then the rest + return np.concatenate(([first_val], arr[i + 1 :])) + +class DeriveFIFOSizes(Transformation): + """Prerequisite: DeriveTokenAccessVectors, ProducerDelayCharacteristic + # and DelayCharacteristic already called on graph. + For each node in the graph, use the accumulated Token Access Vectors + to perform FIFO sizing, setting the in/outFIFODepths attributes of HLSCustomOp + nodes. 
+ """ + + def __init__( + self, + num_workers=None, + io_fifo_depth=5, + period=None, + nodes_to_ignore=[], + global_offset_correction=False, + tav_utilization_strategy="conservative_relaxation", + ): + super().__init__() + self.io_fifo_depth = io_fifo_depth + self.period = period + self.minimum_size = 2 + self.nodes_to_ignore = set(nodes_to_ignore) + self.global_budgets = [] + self.slowdown_so_far = [0, 0] + self.fifos_removed = 0 + self.max_delay_so_far = 0 + self.nodes_parsed = 0 + self.global_offset_correction = global_offset_correction + self.tav_utilization_strategy = tav_utilization_strategy + self.delta_total_fifo_size = 0 + self.delta_adjusted_fifo_size = 0 + self.hybrid_fifo_size_rate = 0 + self.data_rate_total_fifo_size = 0 + self.data_rate_adjusted_fifo_size = 0 + self.hybrid_fifo_size = 0 + + def apply(self, model): + nodes = [node for node in model.graph.node] + + for node in nodes: + op_type = node.op_type + if is_hls_node(node) or is_rtl_node(node): + try: + # lookup op_type in registry of CustomOps + self.nodes_parsed += 1 + + if node.name in self.nodes_to_ignore: + continue + + if "StreamingDataWidthConverter" in node.name: + continue + + assert not (op_type.startswith("StreamingFIFO")), "Found existing FIFOs" + + prod = registry.getCustomOp(node) + out_fifo_depths = [] + for indx, output_name in enumerate(node.output): + #cons_node = model.find_consumer(output_name) + cons_node = find_non_dwc_consumer(model,node) + if cons_node is None: + # could be final node, will be overridden if so + # need an entry in the list anyway + out_fifo_depths.append(self.io_fifo_depth) + continue + + cons = registry.getCustomOp(cons_node) + + if node.op_type != "AddStreams_hls": + # determine which of prod and cons TAVs to compare + # based on which one was stretched + chr_pairs = [] + + if prod.get_nodeattr("io_chrc_out_stretch") != "": + chr_pairs.append(["io_chrc_out_stretch", "io_chrc_in"]) + + if cons.get_nodeattr("io_chrc_in_stretch") != "": + 
chr_pairs.append(["io_chrc_out", "io_chrc_in_stretch"]) + + if len(chr_pairs) == 0: + chr_pairs = [["io_chrc_out", "io_chrc_in"]] + + + depth_attempts = [] + # currently only testing the first (main) pair + + if (prod.get_nodeattr(chr_pairs[0][0])) == "": + out_fifo_depths.append(2) + continue + + if (cons.get_nodeattr(chr_pairs[0][1])) == "": + out_fifo_depths.append(2) + continue + + for pair in chr_pairs[:1]: + if (prod.get_nodeattr(pair[0])) != "": + prod_chrc = decompress_string_to_numpy( + prod.get_nodeattr(pair[0]) + )[0] + else: + out_fifo_depths.append(2) + continue + + if (cons.get_nodeattr(pair[1])) != "": + cons_chrc = decompress_string_to_numpy( + cons.get_nodeattr(pair[1]) + )[0] + else: + out_fifo_depths.append(2) + continue + + if len(cons_chrc) != len(prod_chrc): + period_prod = max(len(prod_chrc) // 2, len(cons_chrc) // 2) + cons_chrc = stretch(cons_chrc, period_prod * 2) + prod_chrc = stretch(prod_chrc, period_prod * 2) + else: + period_prod = len(prod_chrc) // 2 + + global_period = self.period + + prod_original_chr = decompress_string_to_numpy( + prod.get_nodeattr("io_chrc_out") + )[0] + cons_original_chr = decompress_string_to_numpy( + cons.get_nodeattr("io_chrc_in") + )[0] + + prod_chr_original = decompress_string_to_numpy( + prod.get_nodeattr("io_chrc_out_original") + )[0] + cons_chr_original = decompress_string_to_numpy( + cons.get_nodeattr("io_chrc_in_original") + )[0] + + period_true = len(prod_original_chr) // 2 + + period_cons = len(cons_original_chr) // 2 + + # Step 1: Compute un-relaxed initial FIFO size guess - a conservative estimate to further + # decrease in size using relaxation strategies + + # find phase shift + pshift_min = 0 + + for pshift_cand in range(period_prod): + prod_chrc_part = prod_chrc[pshift_cand:period_prod] + cons_chrc_part = cons_chrc[: period_prod - pshift_cand] + if (prod_chrc_part >= cons_chrc_part).all(): + pshift_min = pshift_cand + break + + # shift TAVs by that amount + pshift_min = max(0, pshift_min - 
max(0, period_true - period_cons)) + prod_chrc_part = prod_chrc[pshift_min : (pshift_min + period_prod)] + cons_chrc_part = cons_chrc[:period_prod] + diff = prod_chrc_part - cons_chrc_part + + # find peak delta between the two TAVs and use as initial FIFO guess + max_pos = np.argmax(diff) + fifo_depth_maximum = max(0, int(diff[max_pos])) + + # Step 2: Compute relaxation factors to refine the fifo size computed in Step 1 + # using the original tav for determining data rates + + parent_period, producer_node = get_top_producer_period(node, model) + consumer_period, consumer_node = get_top_consumer_period( + node, model + ) + + gaps, token_times = inter_token_gaps(prod_chr_original) + gaps_cons, token_times_cons = inter_token_gaps(cons_chr_original) + + local_max_delay_prod_list = sorted(gaps, reverse=True) + local_max_delay_cons_list = sorted(gaps_cons, reverse=True) + + local_max_delay_prod = local_max_delay_prod_list[-1] + local_max_delay_cons = local_max_delay_cons_list[ + min(0, len(local_max_delay_cons_list) - 1) + ] + print("prod del: ",local_max_delay_prod_list) + print("cons:delay: ",local_max_delay_cons_list) + + min_gap = min( + len(local_max_delay_prod_list), len(local_max_delay_cons_list) + ) + + gap_ratios = np.array( + local_max_delay_cons_list[:min_gap] + ) / np.array(local_max_delay_prod_list[:min_gap]) + + self.max_delay_so_far = max( + self.max_delay_so_far, local_max_delay_prod + ) + + # Compute the slowdown numerator using the new logic + effective_depth = min(len(gap_ratios), fifo_depth_maximum) + remainder = fifo_depth_maximum - effective_depth + + if len(gap_ratios) > 0: + last_value = gap_ratios[-1] + else: + last_value = 0 + # or raise an error if gap_ratios is + # expected to have at least one element + + slowdown_numerator = ( + sum(gap_ratios[:effective_depth]) + remainder * last_value + ) + + fifo_slowdown = slowdown_numerator / period_true + fifo_slowdown = sum(gap_ratios) / period_true + + minimum_fifos_true = int( + 
(local_max_delay_prod + local_max_delay_cons) + / local_max_delay_prod + ) + minimum_fifos = minimum_fifos_true + + fifo_slowdown_rate = ( + minimum_fifos_true * local_max_delay_prod + ) / period_true + + cycle_loss_of_fifo = max( + 1, local_max_delay_cons - local_max_delay_prod + ) + parent_period = min(parent_period, global_period) + + # ======= TOLERABLE SLOWDOWN CALCULATION ========================= + tolerable_slowdown_parent = max( + 0, + 1 + - ( + parent_period / (global_period - self.slowdown_so_far[indx]) + ), + ) + tolerable_slowdown_prod = max( + 0, + 1 + - (period_prod / (global_period - self.slowdown_so_far[indx])), + ) + tolerable_slowdown = min( + [tolerable_slowdown_parent, tolerable_slowdown_prod] + ) + + prod_loss = (global_period - period_true) // cycle_loss_of_fifo + cons_loss = (global_period - period_cons) // cycle_loss_of_fifo + pred_loss = (global_period - parent_period) // cycle_loss_of_fifo + # print("node: ",node.name) + # print("pred, prod, cons periods and losses:") + # print(parent_period, period_true, period_cons) + # print(pred_loss, prod_loss, cons_loss) + #ignorable_fifos = int(max(0,min(prod_loss, cons_loss, pred_loss))) + ignorable_fifos = int(max(0,min([prod_loss]))) + + if producer_node is not None: + if producer_node.op_type.startswith("DuplicateStreams"): + ignorable_fifos = 0 + if consumer_node is not None: + if consumer_node.op_type.startswith("AddStreams"): + ignorable_fifos = 0 + + minimized_depth = max(2, fifo_depth_maximum - ignorable_fifos) + minimum_fifos = max(1, minimum_fifos - ignorable_fifos) + + if fifo_slowdown > tolerable_slowdown: + fifos_to_remove = int( + fifo_depth_maximum * tolerable_slowdown / fifo_slowdown + ) + else: + fifos_to_remove = fifo_depth_maximum + + if fifo_slowdown_rate > tolerable_slowdown: + fifos_to_remove_rate = int( + minimum_fifos_true * tolerable_slowdown / fifo_slowdown_rate + ) + else: + fifos_to_remove_rate = minimum_fifos_true + + + delta_fifo_size_post_adjustment = max( + 0, 
fifo_depth_maximum - max(fifos_to_remove, ignorable_fifos ) + ) + #print("fifos to remove: ", fifos_to_remove) + delta_fifo_size_post_adjustment_rate = max( + 0, minimum_fifos_true - fifos_to_remove_rate + ) + + hybrid_size = max(minimum_fifos, delta_fifo_size_post_adjustment) + hybrid_size_rate = max( + delta_fifo_size_post_adjustment, + delta_fifo_size_post_adjustment_rate, + ) + + self.delta_total_fifo_size += fifo_depth_maximum + self.delta_adjusted_fifo_size += delta_fifo_size_post_adjustment + + self.data_rate_total_fifo_size += minimum_fifos_true + self.data_rate_adjusted_fifo_size += minimum_fifos + self.hybrid_fifo_size += hybrid_size + self.hybrid_fifo_size_rate += hybrid_size_rate + + if self.tav_utilization_strategy == "conservative_relaxation": + # minimized TAV different + fifo_depth = minimized_depth + elif self.tav_utilization_strategy == "aggressive_relaxation": + # minimized delta based, uses slowdown tracking + fifo_depth = delta_fifo_size_post_adjustment + elif self.tav_utilization_strategy == "no_relaxation": + # maximum from TAV comparisons + fifo_depth = fifo_depth_maximum + + # print( + # f"initial size, new sizes: " + # f"{fifo_depth_maximum}, " + # f"{minimized_depth}, " + # f"{self.delta_adjusted_fifo_size}, " + # f"{self.hybrid_fifo_size}, " + # f"{self.hybrid_fifo_size_rate}, " + # f"{self.data_rate_adjusted_fifo_size}" + # ) + + + # override for testing: + #fifo_depth = delta_fifo_size_post_adjustment + + #print(f"sized {node.name} with {fifo_depth} ") + depth_attempts.append(fifo_depth) + fifo_depth = min(depth_attempts) + else: + fifo_depth = 0 + + if node.op_type == "DuplicateStreams_hls": + # propagate slowdown + if indx == 0: + self.slowdown_so_far[1] = self.slowdown_so_far[0] + + extra_volume = prod.get_nodeattr("extra_branch_fifos")[indx] + fifo_depth += extra_volume + else: + extra_volume = prod.get_nodeattr("extra_branch_fifos")[0] + fifo_depth += extra_volume + + out_fifo_depths.append(max(fifo_depth, self.minimum_size)) + + 
prod.set_nodeattr("outFIFODepths", out_fifo_depths) + + in_fifo_depths = prod.get_nodeattr("inFIFODepths") + for i, input_name in enumerate(node.input): + if input_name in [x.name for x in model.graph.input]: + in_fifo_depths[i] = max(self.io_fifo_depth, in_fifo_depths[i]) + prod.set_nodeattr("inFIFODepths", in_fifo_depths) + + if node.op_type == "AddStreams_hls": + self.slowdown_so_far[0] = max(self.slowdown_so_far) + + except KeyError: + raise Exception("Custom op_type %s is currently not supported." % op_type) + + #print("final sizes for each strategy: ",self.delta_total_fifo_size, self.delta_adjusted_fifo_size, self.data_rate_total_fifo_size,self.data_rate_adjusted_fifo_size,self.hybrid_fifo_size, self.hybrid_fifo_size_rate) + return (model, False) + + diff --git a/src/finn/util/basic.py b/src/finn/util/basic.py index 164971f0f8..4010f3fc26 100644 --- a/src/finn/util/basic.py +++ b/src/finn/util/basic.py @@ -26,10 +26,15 @@ # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+import base64 +import gzip +import json +import numpy as np import os import subprocess import sys import tempfile +from qonnx.custom_op.registry import getCustomOp from qonnx.util.basic import roundup_to_integer_multiple # test boards used for bnn pynq tests @@ -311,3 +316,122 @@ def get_dsp_block(fpgapart): return "DSP48E1" else: return "DSP48E2" + + +def stretch(a, new_length): + n = len(a) + x_old = np.arange(n) + x_new = np.linspace(0, n - 1, new_length) + stretched = np.interp(x_new, x_old, a).round().astype(a.dtype) + return stretched + + +class Characteristic_Node: + def __init__(self, name, sub_phases, leaf): + self.name = name + self.sub_phases = sub_phases + self.cycles_eval = None + self.cycles_inputs = None + self.cycles_outputs = None + self.leaf = leaf + self.debug = False + + def sum(self, op): + if self.leaf: + if op == 2: + return sum([x[0] for x in self.sub_phases]) + else: + return sum([x[0] * x[1][op] for x in self.sub_phases]) + else: + return sum([x[0] * x[1].sum(op) for x in self.sub_phases]) + + def traverse_phase_tree(self, op, counter, cycles, ch_fnc): + """ + The tree traversal function to get the token access vector. + We call it multiple times to get input, output and cycle count vectors. 
+ + + op: 0 input, 1 output, 2 cycle count + counter: current count of op + cycles: current cycle count + ch_fnc: list of counter values at each cycle (the token access vector) + """ + + if ( + self.leaf + ): # immediate write out of the counter state to the array due to being a leaf node + for phase in self.sub_phases: + for _ in range(phase[0]): + if op == 2: + counter += 1 + else: + counter += phase[1][op] + cycles += 1 + ch_fnc.append(counter) + return counter, cycles, ch_fnc + else: # recursive call to the next sub-node + for phase in self.sub_phases: + for _ in range(phase[0]): + counter, cycles, ch_fnc = phase[1].traverse_phase_tree( + op, counter, cycles, ch_fnc + ) + return counter, cycles, ch_fnc + + def get_total_cycles(self, op): + """ + Returns the total length of a characterized node period with the final + timesample being either the final input our output transaction. + op ["in", "out"] + """ + counter = 0 + cycles = 0 + ch_fnc = [] + counter, cycles, ch_fnc = self.traverse_phase_tree(op, counter, cycles, ch_fnc) + last_update = 0 + last_val = ch_fnc[op] + for i in range(1, len(ch_fnc[1:]) + 1): + if ch_fnc[i] > last_val: + last_update = i + last_val = ch_fnc[i] + + return cycles, last_update, ch_fnc + + +def compress_numpy_to_string(arr): + metadata = { + "dtype": str(arr.dtype), # Store dtype as string + "shape": arr.shape, # Store shape as a tuple + } + metadata_str = json.dumps(metadata) # Convert metadata to JSON string + metadata_bytes = metadata_str.encode("utf-8") # Convert metadata to bytes + + compressed_data = gzip.compress(arr.tobytes()) # Compress array data + combined_data = ( + metadata_bytes + b"||" + compressed_data + ) # Concatenate metadata & compressed data + s = base64.b64encode(combined_data).decode("utf-8") + return s # Encode to string + + +def decompress_string_to_numpy(s): + combined_data = base64.b64decode(s.encode("utf-8")) # Decode from base64 + metadata_bytes, compressed_data = combined_data.split(b"||", 1) # Split 
metadata & data + + metadata = json.loads(metadata_bytes.decode("utf-8")) # Decode metadata + dtype = np.dtype(metadata["dtype"]) # Convert dtype back + shape = tuple(metadata["shape"]) # Convert shape back + + decompressed_data = gzip.decompress(compressed_data) # Decompress data + return np.frombuffer(decompressed_data, dtype=dtype).reshape(shape) # Reshape into array + + +def compute_total_model_fifo_size(model): + size = 0 + total_depth = 0 + for node in model.graph.node: + if node.op_type in ["StreamingFIFO", "StreamingFIFO_hls", "StreamingFIFO_rtl"]: + depth = getCustomOp(node).get_nodeattr("depth") + width = getCustomOp(node).get_instream_width() + size += width * depth + total_depth += depth + return size, total_depth diff --git a/src/finn/util/test.py b/src/finn/util/test.py index 2115e058a8..53869fbee8 100644 --- a/src/finn/util/test.py +++ b/src/finn/util/test.py @@ -28,22 +28,45 @@ import pytest +import copy import importlib_resources as importlib +import matplotlib.pyplot as plt import numpy as np import onnx import onnx.numpy_helper as nph import os +import qonnx.custom_op.registry as registry + +# import time import torchvision.transforms.functional as torchvision_util import warnings from brevitas_examples import bnn_pynq, imagenet_classification from pkgutil import get_data from qonnx.core.modelwrapper import ModelWrapper from qonnx.custom_op.registry import getCustomOp +from qonnx.transformation.general import GiveUniqueNodeNames +from finn.analysis.fpgadataflow.dataflow_performance import dataflow_performance from finn.core.onnx_exec import execute_onnx +from finn.transformation.fpgadataflow.annotate_cycles import AnnotateCycles +from finn.transformation.fpgadataflow.derive_characteristic import ( + DeriveTokenAccessVectors, +) from finn.transformation.fpgadataflow.make_zynq_proj import ZynqBuild +from finn.transformation.fpgadataflow.prepare_ip import _codegen_single_node +from finn.transformation.fpgadataflow.replace_verilog_relpaths import ( + 
ReplaceVerilogRelPaths, +) +from finn.transformation.fpgadataflow.specialize_layers import SpecializeLayers from finn.transformation.fpgadataflow.vitis_build import VitisBuild, VitisOptStrategy -from finn.util.basic import alveo_default_platform, alveo_part_map, pynq_part_map +from finn.util.basic import ( + alveo_default_platform, + alveo_part_map, + decompress_string_to_numpy, + make_build_dir, + pynq_part_map, +) +from finn.util.fpgadataflow import is_hls_node, is_rtl_node # map of (wbits,abits) -> model example_map = { @@ -184,3 +207,391 @@ def resize_smaller_side(target_pixels, img): def crop_center(size, img): """Crop central size*size window out of a PIL image.""" return torchvision_util.center_crop(img, size) + + +def compare_two_chr_funcs(a, b, max_allowed_volume_delta, max_allowed_length_delta): + # relaxation determines how much leeway we allow for the + # analytical implementation to be off from RTL ground truth + # this leeway may produce larger fifos. + # Output delays due to long pipelines generally do not effect + # fifo sizes and so large relaxation factors for them are expected. + + lower_len = min(len(a), len(b)) + if len(a) != len(b): + len_dif = abs(len(a) - len(b)) + print(f"TAV length delta: {len_dif}") + if len_dif > max_allowed_length_delta: + return False + + peak_volume_delta = np.max(np.abs(a[:lower_len] - b[:lower_len])) + print(f"TAV peak volume delta: {peak_volume_delta}") + if peak_volume_delta > max_allowed_volume_delta: + return False + return True + + +def get_characteristic_fnc(model, node0, part, target_clk_ns, strategy, caching=False): + """ + This helper performs FINN node characterization using either rtlsim + or characteristic functions. If chacteristic function strategy is + requested, but the node does not support it, a fallback to rtlsim + is performed. The primary purpose of this helper is for testing purposes + to evaluate characteristic function final dump equivalence between rtlsim + and characteristic functions. 
+ The CACHING flag controls storing the .onnx model in the build dir to reuse, + which is useful for vastly speeding up debugging of characterization trees""" + + model_cache = None + if caching: + # search for prepared model + build_dir = os.environ["FINN_BUILD_DIR"] + for x in os.listdir(build_dir): + if x.startswith(str(node0)): + model_cache = f"{build_dir}/{x}/model_{strategy}.onnx" + if os.path.exists(model_cache): + model = ModelWrapper(model_cache) + else: + model_cache = None + + if model_cache is None: + model = model.transform(SpecializeLayers(part)) + model = model.transform(GiveUniqueNodeNames()) + + node = model.graph.node[0] + inst = registry.getCustomOp(node) + if (is_hls_node(node) or is_rtl_node(node)) and ( + inst.get_tree_model() is None or strategy == "rtlsim" + ): + _codegen_single_node(node, model, part, target_clk_ns) + + op_type = node.op_type + if is_hls_node(node): + try: + # lookup op_type in registry of CustomOps + + # ensure that code is generated + assert ( + inst.get_nodeattr("code_gen_dir_ipgen") != "" + ), """Node + attribute "code_gen_dir_ipgen" is empty. Please run + transformation PrepareIP first.""" + if not os.path.isdir(inst.get_nodeattr("ipgen_path")) or not inst.get_nodeattr( + "code_gen_dir_ipgen" + ) in inst.get_nodeattr("ipgen_path"): + # call the compilation function for this node + inst.ipgen_singlenode_code() + else: + warnings.warn("Using pre-existing IP for %s" % node.name) + # ensure that executable path is now set + assert ( + inst.get_nodeattr("ipgen_path") != "" + ), """Transformation + HLSSynthIP was not successful. Node attribute "ipgen_path" + is empty.""" + except KeyError: + # exception if op_type is not supported + raise Exception("Custom op_type %s is currently not supported." 
% op_type) + + model = model.transform(ReplaceVerilogRelPaths()) + + node = model.graph.node[0] + inst = registry.getCustomOp(node) + if (is_hls_node(node) or is_rtl_node(node)) and ( + inst.get_tree_model() is None or strategy == "rtlsim" + ): + try: + # lookup op_type in registry of CustomOps + # inst = registry.getCustomOp(node) + inst.prepare_rtlsim() + # ensure that executable path is now set + assert ( + inst.get_nodeattr("rtlsim_so") != "" + ), "Failed to prepare RTLSim, no rtlsim_so attribute found." + except KeyError: + # exception if op_type is not supported + raise Exception("Custom op_type %s is currently not supported." % op_type) + + model = model.transform(AnnotateCycles()) + + period = int(model.analysis(dataflow_performance)["max_cycles"] + 12) + + model = model.transform( + DeriveTokenAccessVectors( + model, + period, + strategy, + part, + target_clk_ns, + ) + ) + if caching: + tmp_caching_output_dir = make_build_dir(str(node0)) + model.save(tmp_caching_output_dir + f"/model_{strategy}.onnx") + + return getCustomOp(model.graph.node[0]) + + +def debug_chr_funcs(chr_in, chr_out, rtlsim_in, rtlsim_out, printout_limit=100): + """This helper prints out characteristic functions for a clean comparison + between the rtlsim-based and characteristic-function-based flows to find bugs + """ + + DEBUG_RAW_FUNCS = True + DEBUG_CONCAT_FUNCS = True + + if DEBUG_RAW_FUNCS or DEBUG_CONCAT_FUNCS: + + def concat_list(a): + b = [] + current = a[0] + b.append(1) + for i in a[1:]: + if i == current: + b[-1] += 1 + else: + b.append(1) + current = i + return b + + chr_in_concat = concat_list(chr_in[0]) + chr_out_concat = concat_list(chr_out[0]) + rtlsim_in_concat = concat_list(rtlsim_in[0]) + rtlsim_out_concat = concat_list(rtlsim_out[0]) + + np.set_printoptions(threshold=np.inf) + + # input port + if DEBUG_RAW_FUNCS: + print(f"\nchr IN: {chr_in[0][:printout_limit]}, {len(chr_in[0])}") + print(f"rtlsim IN: {rtlsim_in[0][:printout_limit]}, {len(rtlsim_in[0])}") + + if 
DEBUG_CONCAT_FUNCS: + print(f"chr IN CONCAT: {chr_in_concat[:printout_limit]}, {len(chr_in_concat)}") + print(f"rtlsim IN CONCAT: {rtlsim_in_concat[:printout_limit]}, {len(rtlsim_in_concat)}") + + # output port + if DEBUG_RAW_FUNCS: + print(f"\nchr OUT: {chr_out[0][:printout_limit]}, {len(chr_out[0])}") + print(f"rtlsim OUT: {rtlsim_out[0][:printout_limit]}, {len(rtlsim_out[0])}") + + if DEBUG_CONCAT_FUNCS: + print(f"chr OUT CONCAT: {chr_out_concat[:printout_limit]}, {len(chr_out_concat)}") + print( + f"rtlsim OUT CONCAT: {rtlsim_out_concat[:printout_limit]}, {len(rtlsim_out_concat)}" + ) + else: + return True + + +def tree_model_test( + model, + node_details, + part, + target_clk_ns, + max_allowed_volume_delta, + max_allowed_length_delta, + CACHING=False, + DEBUGGING=False, +): + # caching means to run RTLSIM only once and store the model + # so we can reuse the token access vector whenever we + # update the tree model and want to test correctness + # CACHING = True + + # should the token access vectors and + # concatenated token access vectors be printed out? 
+ # useful for debugging + # DEBUGING = False + + # ground truth model to rtlsim + model_rtl = copy.deepcopy(model) + + # t0 = time.time() + node_analytical = get_characteristic_fnc( + model, + (*node_details, "tree_model"), + part, + target_clk_ns, + "tree_model", + False, + ) + + node_rtlsim = get_characteristic_fnc( + model_rtl, + (*node_details, "rtlsim"), + part, + target_clk_ns, + "rtlsim", + CACHING, + ) + + chr_in = decompress_string_to_numpy(node_analytical.get_nodeattr("io_chrc_in")) + chr_out = decompress_string_to_numpy(node_analytical.get_nodeattr("io_chrc_out")) + + rtlsim_in = decompress_string_to_numpy(node_rtlsim.get_nodeattr("io_chrc_in")) + rtlsim_out = decompress_string_to_numpy(node_rtlsim.get_nodeattr("io_chrc_out")) + + if DEBUGGING: + debug_chr_funcs(chr_in, chr_out, rtlsim_in, rtlsim_out) + res = compare_nodes( + node_details, + node_analytical, + node_rtlsim, + subsample=1, + start_cycle=0, + max_cycle=None, + compare_deltas_only=False, + ) + print(res) + # test input port + input_check = compare_two_chr_funcs( + chr_in[0], + rtlsim_in[0], + max_allowed_volume_delta, + max_allowed_length_delta, + ) + + # test output port + output_check = compare_two_chr_funcs( + chr_out[0], + rtlsim_out[0], + max_allowed_volume_delta, + max_allowed_length_delta, + ) + + return input_check and output_check + + +def node_id_finder(m_model, node_id_to_find): + i = 0 + found = False + final_id = 0 + for i in range(len(m_model.graph.node)): + if m_model.graph.node[i].name == node_id_to_find: + final_id = i + found = True + break + if found: + return final_id + else: + return -1 + + +def inter_token_gaps(tav): + if tav is None or tav.size == 0: + return np.array([1]), np.array([0]) # reasonable defaults + + # Find indices where tokens are added (nonzero diff indicates a new token) + token_times = np.flatnonzero(np.diff(tav) > 0) + 1 # +1 to align with time index + + if token_times.size < 2: + # Not enough token events to compute gaps + return np.array([1]), 
token_times # Default gap of 1 between tokens (or 0 if no tokens) + + # Compute gaps between token emissions + gaps = np.diff(token_times) + return gaps, token_times # ,gaps_min + + +def compare_nodes( + node_details, + model_node, + ref_node, + subsample=1, + start_cycle=0, + max_cycle=None, + compare_deltas_only=False, +): + # Extract and decompress the input/output trace arrays + tav_ref_in = decompress_string_to_numpy(ref_node.get_nodeattr("io_chrc_in"))[0] + tav_ref_out = decompress_string_to_numpy(ref_node.get_nodeattr("io_chrc_out"))[0] + tav_model_in = decompress_string_to_numpy(model_node.get_nodeattr("io_chrc_in"))[0] + tav_model_out = decompress_string_to_numpy(model_node.get_nodeattr("io_chrc_out"))[0] + + # gaps_prod, _ = inter_token_gaps(tav_model_out) + # gaps_cons, _ = inter_token_gaps(tav_model_in) + + # local_max_delay_cons_list = sorted(gaps_cons, reverse=True) + # local_max_delay_prod_list = sorted(gaps_prod, reverse=True) + + # print("top 10 consumption and production data rates of the node:") + # print("tree-model consumption: ", local_max_delay_cons_list[:10]) + # print("tree-model production: ", local_max_delay_prod_list[:10]) + + # gaps_prod, _ = inter_token_gaps(tav_ref_out) + # gaps_cons, _ = inter_token_gaps(tav_ref_in) + + # local_max_delay_prod_list = sorted(gaps_prod, reverse=True) + # local_max_delay_cons_list = sorted(gaps_cons, reverse=True) + + # print("reference consumption: ", local_max_delay_cons_list[:10]) + # print("reference production: ", local_max_delay_prod_list[:10]) + + # Determine max length for slicing + max_len = max(len(tav_ref_in), len(tav_model_in), len(tav_ref_out), len(tav_model_out)) + if max_cycle is None or max_cycle > max_len: + max_cycle = max_len + + # Slice without padding + y_ref_in = tav_ref_in[start_cycle:max_cycle] + y_model_in = tav_model_in[start_cycle:max_cycle] + y_ref_out = tav_ref_out[start_cycle:max_cycle] + y_model_out = tav_model_out[start_cycle:max_cycle] + + # Compute differences over 
common lengths only + def max_diff(a, b): + common_len = min(len(a), len(b)) + if common_len == 0: + return float("nan") + return np.max(np.abs(a[:common_len] - b[:common_len])) + + in_diff = max_diff(y_ref_in, y_model_in) + out_diff = max_diff(y_ref_out, y_model_out) + if compare_deltas_only: + return {"max_in_diff": in_diff, "max_out_diff": out_diff} + + # Plotting + plt.figure(figsize=(12, 6)) + + def plot_with_subsample(y, label, color, linestyle="-"): + y_slice = y[start_cycle:max_cycle] + y_sub = y_slice[::subsample] + x_sub = np.arange(start_cycle, start_cycle + len(y_sub) * subsample, subsample) + plt.plot(x_sub, y_sub, label=label, color=color, linestyle=linestyle) + if "ref" in label: + y_offset = int(y_sub[-1] * 0.1) + else: + y_offset = 0 + if len(x_sub) > 0: + plt.text( + x_sub[-1], + y_sub[-1] + y_offset, + f" {label} {y_sub[-1]:.2f}", + color=color, + va="center", + fontsize=9, + ) + + plot_with_subsample(tav_ref_in, "in: ref", "blue") + plot_with_subsample(tav_model_in, "in: tree model", "blue", linestyle="--") + plot_with_subsample(tav_ref_out, "out: ref", "red") + plot_with_subsample(tav_model_out, "out: tree model", "red", linestyle="--") + + metrics_ref = f"ref in: {tav_ref_in[-1]}, out: {tav_ref_out[-1]}" + metrics_model = f"model in: {tav_model_in[-1]}, out: {tav_model_out[-1]}" + + plt.legend() + plt.xlabel("Cycle") + plt.ylabel("Accumulated Tokens") + plt.title( + f"Node {node_details} \n max_in_diff:" + f"{in_diff} max_out_diff: {out_diff}\n (Cycles " + f"{start_cycle}:{max_cycle})\n{metrics_ref}\n{metrics_model}" + ) + plt.grid(True) + plt.tight_layout() + plt.show() + folder_path = "tree_modeling_plots" + if not os.path.exists(folder_path): + os.makedirs(folder_path) + plt.savefig(f"{folder_path}/{node_details}.png") diff --git a/tests/fpgadataflow/output.txt b/tests/fpgadataflow/output.txt new file mode 100644 index 0000000000..ab91cba5b8 --- /dev/null +++ b/tests/fpgadataflow/output.txt @@ -0,0 +1,9205 @@ +============================= 
test session starts ============================== +platform linux -- Python 3.10.12, pytest-6.2.5, py-1.11.0, pluggy-1.6.0 -- /usr/bin/python3 +cachedir: .pytest_cache +metadata: {'Python': '3.10.12', 'Platform': 'Linux-5.4.0-216-generic-x86_64-with-glibc2.35', 'Packages': {'pytest': '6.2.5', 'py': '1.11.0', 'pluggy': '1.6.0'}, 'Plugins': {'cov': '4.1.0', 'html': '3.0.0', 'metadata': '1.7.0', 'parallel': '0.1.1', 'xdist': '3.2.0', 'dependency': '0.5.1', 'anyio': '4.11.0', 'forked': '1.6.0'}} +rootdir: /home/lstasytis/finn_prs/finn, configfile: setup.cfg +plugins: cov-4.1.0, html-3.0.0, metadata-1.7.0, parallel-0.1.1, xdist-3.2.0, dependency-0.5.1, anyio-4.11.0, forked-1.6.0 +collecting ... collected 384 items + +test_fpgadataflow_convinputgenerator.py::test_fpgadataflow_analytical_characterization_slidingwindow[False-1-0-0-1-dilation0-stride0-2-ifm_dim0-k0-idt0] TAV length delta: 16 +TAV peak volume delta: 11 +TAV length delta: 16 +TAV peak volume delta: 16 +PASSED +test_fpgadataflow_convinputgenerator.py::test_fpgadataflow_analytical_characterization_slidingwindow[False-1-0-0-1-dilation0-stride0-2-ifm_dim0-k1-idt0] TAV length delta: 8 +TAV peak volume delta: 8 +TAV length delta: 8 +TAV peak volume delta: 8 +PASSED +test_fpgadataflow_convinputgenerator.py::test_fpgadataflow_analytical_characterization_slidingwindow[False-1-0-0-1-dilation0-stride0-2-ifm_dim0-k2-idt0] TAV peak volume delta: 11 +TAV peak volume delta: 1 +PASSED +test_fpgadataflow_convinputgenerator.py::test_fpgadataflow_analytical_characterization_slidingwindow[False-1-0-0-1-dilation0-stride0-2-ifm_dim1-k0-idt0] SKIPPED +test_fpgadataflow_convinputgenerator.py::test_fpgadataflow_analytical_characterization_slidingwindow[False-1-0-0-1-dilation0-stride0-2-ifm_dim1-k1-idt0] SKIPPED +test_fpgadataflow_convinputgenerator.py::test_fpgadataflow_analytical_characterization_slidingwindow[False-1-0-0-1-dilation0-stride0-2-ifm_dim1-k2-idt0] TAV length delta: 260 +TAV length delta: 260 +FAILED 
+test_fpgadataflow_convinputgenerator.py::test_fpgadataflow_analytical_characterization_slidingwindow[False-1-0-0-1-dilation0-stride0-4-ifm_dim0-k0-idt0] TAV length delta: 28 +TAV peak volume delta: 20 +TAV length delta: 28 +TAV peak volume delta: 28 +PASSED +test_fpgadataflow_convinputgenerator.py::test_fpgadataflow_analytical_characterization_slidingwindow[False-1-0-0-1-dilation0-stride0-4-ifm_dim0-k1-idt0] TAV length delta: 20 +TAV peak volume delta: 16 +TAV length delta: 20 +TAV peak volume delta: 20 +PASSED +test_fpgadataflow_convinputgenerator.py::test_fpgadataflow_analytical_characterization_slidingwindow[False-1-0-0-1-dilation0-stride0-4-ifm_dim0-k2-idt0] TAV length delta: 4 +TAV peak volume delta: 20 +TAV length delta: 4 +TAV peak volume delta: 4 +PASSED +test_fpgadataflow_convinputgenerator.py::test_fpgadataflow_analytical_characterization_slidingwindow[False-1-0-0-1-dilation0-stride0-4-ifm_dim1-k0-idt0] SKIPPED +test_fpgadataflow_convinputgenerator.py::test_fpgadataflow_analytical_characterization_slidingwindow[False-1-0-0-1-dilation0-stride0-4-ifm_dim1-k1-idt0] SKIPPED +test_fpgadataflow_convinputgenerator.py::test_fpgadataflow_analytical_characterization_slidingwindow[False-1-0-0-1-dilation0-stride0-4-ifm_dim1-k2-idt0] TAV length delta: 516 +TAV length delta: 516 +FAILED +test_fpgadataflow_convinputgenerator.py::test_fpgadataflow_analytical_characterization_slidingwindow[False-1-0-0-1-dilation0-stride1-2-ifm_dim0-k0-idt0] TAV length delta: 4 +TAV peak volume delta: 11 +TAV length delta: 4 +TAV peak volume delta: 4 +PASSED +test_fpgadataflow_convinputgenerator.py::test_fpgadataflow_analytical_characterization_slidingwindow[False-1-0-0-1-dilation0-stride1-2-ifm_dim0-k1-idt0] TAV length delta: 20 +TAV peak volume delta: 14 +TAV length delta: 20 +TAV peak volume delta: 20 +PASSED +test_fpgadataflow_convinputgenerator.py::test_fpgadataflow_analytical_characterization_slidingwindow[False-1-0-0-1-dilation0-stride1-2-ifm_dim0-k2-idt0] SKIPPED 
+test_fpgadataflow_convinputgenerator.py::test_fpgadataflow_analytical_characterization_slidingwindow[False-1-0-0-1-dilation0-stride1-2-ifm_dim1-k0-idt0] SKIPPED +test_fpgadataflow_convinputgenerator.py::test_fpgadataflow_analytical_characterization_slidingwindow[False-1-0-0-1-dilation0-stride1-2-ifm_dim1-k1-idt0] SKIPPED +test_fpgadataflow_convinputgenerator.py::test_fpgadataflow_analytical_characterization_slidingwindow[False-1-0-0-1-dilation0-stride1-2-ifm_dim1-k2-idt0] SKIPPED +test_fpgadataflow_convinputgenerator.py::test_fpgadataflow_analytical_characterization_slidingwindow[False-1-0-0-1-dilation0-stride1-4-ifm_dim0-k0-idt0] TAV length delta: 4 +TAV peak volume delta: 23 +TAV length delta: 4 +TAV peak volume delta: 7 +PASSED +test_fpgadataflow_convinputgenerator.py::test_fpgadataflow_analytical_characterization_slidingwindow[False-1-0-0-1-dilation0-stride1-4-ifm_dim0-k1-idt0] TAV length delta: 44 +TAV length delta: 44 +FAILED +test_fpgadataflow_convinputgenerator.py::test_fpgadataflow_analytical_characterization_slidingwindow[False-1-0-0-1-dilation0-stride1-4-ifm_dim0-k2-idt0] SKIPPED +test_fpgadataflow_convinputgenerator.py::test_fpgadataflow_analytical_characterization_slidingwindow[False-1-0-0-1-dilation0-stride1-4-ifm_dim1-k0-idt0] SKIPPED +test_fpgadataflow_convinputgenerator.py::test_fpgadataflow_analytical_characterization_slidingwindow[False-1-0-0-1-dilation0-stride1-4-ifm_dim1-k1-idt0] SKIPPED +test_fpgadataflow_convinputgenerator.py::test_fpgadataflow_analytical_characterization_slidingwindow[False-1-0-0-1-dilation0-stride1-4-ifm_dim1-k2-idt0] SKIPPED +test_fpgadataflow_convinputgenerator.py::test_fpgadataflow_analytical_characterization_slidingwindow[False-1-0-0-1-dilation1-stride0-2-ifm_dim0-k0-idt0] TAV length delta: 48 +TAV length delta: 48 +FAILED +test_fpgadataflow_convinputgenerator.py::test_fpgadataflow_analytical_characterization_slidingwindow[False-1-0-0-1-dilation1-stride0-2-ifm_dim0-k1-idt0] TAV length delta: 56 +TAV length delta: 56 
+FAILED +test_fpgadataflow_convinputgenerator.py::test_fpgadataflow_analytical_characterization_slidingwindow[False-1-0-0-1-dilation1-stride0-2-ifm_dim0-k2-idt0] SKIPPED +test_fpgadataflow_convinputgenerator.py::test_fpgadataflow_analytical_characterization_slidingwindow[False-1-0-0-1-dilation1-stride0-2-ifm_dim1-k0-idt0] SKIPPED +test_fpgadataflow_convinputgenerator.py::test_fpgadataflow_analytical_characterization_slidingwindow[False-1-0-0-1-dilation1-stride0-2-ifm_dim1-k1-idt0] SKIPPED +test_fpgadataflow_convinputgenerator.py::test_fpgadataflow_analytical_characterization_slidingwindow[False-1-0-0-1-dilation1-stride0-2-ifm_dim1-k2-idt0] SKIPPED +test_fpgadataflow_convinputgenerator.py::test_fpgadataflow_analytical_characterization_slidingwindow[False-1-0-0-1-dilation1-stride0-4-ifm_dim0-k0-idt0] TAV length delta: 92 +TAV length delta: 92 +FAILED +test_fpgadataflow_convinputgenerator.py::test_fpgadataflow_analytical_characterization_slidingwindow[False-1-0-0-1-dilation1-stride0-4-ifm_dim0-k1-idt0] TAV length delta: 108 +TAV length delta: 108 +FAILED +test_fpgadataflow_convinputgenerator.py::test_fpgadataflow_analytical_characterization_slidingwindow[False-1-0-0-1-dilation1-stride0-4-ifm_dim0-k2-idt0] SKIPPED +test_fpgadataflow_convinputgenerator.py::test_fpgadataflow_analytical_characterization_slidingwindow[False-1-0-0-1-dilation1-stride0-4-ifm_dim1-k0-idt0] SKIPPED +test_fpgadataflow_convinputgenerator.py::test_fpgadataflow_analytical_characterization_slidingwindow[False-1-0-0-1-dilation1-stride0-4-ifm_dim1-k1-idt0] SKIPPED +test_fpgadataflow_convinputgenerator.py::test_fpgadataflow_analytical_characterization_slidingwindow[False-1-0-0-1-dilation1-stride0-4-ifm_dim1-k2-idt0] SKIPPED +test_fpgadataflow_convinputgenerator.py::test_fpgadataflow_analytical_characterization_slidingwindow[False-1-0-0-1-dilation1-stride1-2-ifm_dim0-k0-idt0] TAV peak volume delta: 0 +TAV peak volume delta: 10 +PASSED 
+test_fpgadataflow_convinputgenerator.py::test_fpgadataflow_analytical_characterization_slidingwindow[False-1-0-0-1-dilation1-stride1-2-ifm_dim0-k1-idt0] TAV length delta: 40 +TAV peak volume delta: 4 +TAV length delta: 40 +TAV peak volume delta: 26 +PASSED +test_fpgadataflow_convinputgenerator.py::test_fpgadataflow_analytical_characterization_slidingwindow[False-1-0-0-1-dilation1-stride1-2-ifm_dim0-k2-idt0] SKIPPED +test_fpgadataflow_convinputgenerator.py::test_fpgadataflow_analytical_characterization_slidingwindow[False-1-0-0-1-dilation1-stride1-2-ifm_dim1-k0-idt0] SKIPPED +test_fpgadataflow_convinputgenerator.py::test_fpgadataflow_analytical_characterization_slidingwindow[False-1-0-0-1-dilation1-stride1-2-ifm_dim1-k1-idt0] SKIPPED +test_fpgadataflow_convinputgenerator.py::test_fpgadataflow_analytical_characterization_slidingwindow[False-1-0-0-1-dilation1-stride1-2-ifm_dim1-k2-idt0] SKIPPED +test_fpgadataflow_convinputgenerator.py::test_fpgadataflow_analytical_characterization_slidingwindow[False-1-0-0-1-dilation1-stride1-4-ifm_dim0-k0-idt0] TAV peak volume delta: 0 +TAV peak volume delta: 19 +PASSED +test_fpgadataflow_convinputgenerator.py::test_fpgadataflow_analytical_characterization_slidingwindow[False-1-0-0-1-dilation1-stride1-4-ifm_dim0-k1-idt0] TAV length delta: 84 +TAV length delta: 84 +FAILED +test_fpgadataflow_convinputgenerator.py::test_fpgadataflow_analytical_characterization_slidingwindow[False-1-0-0-1-dilation1-stride1-4-ifm_dim0-k2-idt0] SKIPPED +test_fpgadataflow_convinputgenerator.py::test_fpgadataflow_analytical_characterization_slidingwindow[False-1-0-0-1-dilation1-stride1-4-ifm_dim1-k0-idt0] SKIPPED +test_fpgadataflow_convinputgenerator.py::test_fpgadataflow_analytical_characterization_slidingwindow[False-1-0-0-1-dilation1-stride1-4-ifm_dim1-k1-idt0] SKIPPED +test_fpgadataflow_convinputgenerator.py::test_fpgadataflow_analytical_characterization_slidingwindow[False-1-0-0-1-dilation1-stride1-4-ifm_dim1-k2-idt0] SKIPPED 
+test_fpgadataflow_convinputgenerator.py::test_fpgadataflow_analytical_characterization_slidingwindow[False-1-0-0-2-dilation0-stride0-2-ifm_dim0-k0-idt0] TAV length delta: 16 +TAV peak volume delta: 11 +TAV length delta: 16 +TAV peak volume delta: 16 +PASSED +test_fpgadataflow_convinputgenerator.py::test_fpgadataflow_analytical_characterization_slidingwindow[False-1-0-0-2-dilation0-stride0-2-ifm_dim0-k1-idt0] TAV length delta: 8 +TAV peak volume delta: 8 +TAV length delta: 8 +TAV peak volume delta: 8 +PASSED +test_fpgadataflow_convinputgenerator.py::test_fpgadataflow_analytical_characterization_slidingwindow[False-1-0-0-2-dilation0-stride0-2-ifm_dim0-k2-idt0] TAV peak volume delta: 11 +TAV peak volume delta: 1 +PASSED +test_fpgadataflow_convinputgenerator.py::test_fpgadataflow_analytical_characterization_slidingwindow[False-1-0-0-2-dilation0-stride0-2-ifm_dim1-k0-idt0] SKIPPED +test_fpgadataflow_convinputgenerator.py::test_fpgadataflow_analytical_characterization_slidingwindow[False-1-0-0-2-dilation0-stride0-2-ifm_dim1-k1-idt0] SKIPPED +test_fpgadataflow_convinputgenerator.py::test_fpgadataflow_analytical_characterization_slidingwindow[False-1-0-0-2-dilation0-stride0-2-ifm_dim1-k2-idt0] TAV length delta: 260 +TAV length delta: 260 +FAILED +test_fpgadataflow_convinputgenerator.py::test_fpgadataflow_analytical_characterization_slidingwindow[False-1-0-0-2-dilation0-stride0-4-ifm_dim0-k0-idt0] TAV length delta: 28 +TAV peak volume delta: 20 +TAV length delta: 28 +TAV peak volume delta: 28 +PASSED +test_fpgadataflow_convinputgenerator.py::test_fpgadataflow_analytical_characterization_slidingwindow[False-1-0-0-2-dilation0-stride0-4-ifm_dim0-k1-idt0] TAV length delta: 20 +TAV peak volume delta: 16 +TAV length delta: 20 +TAV peak volume delta: 20 +PASSED +test_fpgadataflow_convinputgenerator.py::test_fpgadataflow_analytical_characterization_slidingwindow[False-1-0-0-2-dilation0-stride0-4-ifm_dim0-k2-idt0] TAV length delta: 4 +TAV peak volume delta: 20 +TAV length delta: 4 
+TAV peak volume delta: 4 +PASSED +test_fpgadataflow_convinputgenerator.py::test_fpgadataflow_analytical_characterization_slidingwindow[False-1-0-0-2-dilation0-stride0-4-ifm_dim1-k0-idt0] SKIPPED +test_fpgadataflow_convinputgenerator.py::test_fpgadataflow_analytical_characterization_slidingwindow[False-1-0-0-2-dilation0-stride0-4-ifm_dim1-k1-idt0] SKIPPED +test_fpgadataflow_convinputgenerator.py::test_fpgadataflow_analytical_characterization_slidingwindow[False-1-0-0-2-dilation0-stride0-4-ifm_dim1-k2-idt0] TAV length delta: 516 +TAV length delta: 516 +FAILED +test_fpgadataflow_convinputgenerator.py::test_fpgadataflow_analytical_characterization_slidingwindow[False-1-0-0-2-dilation0-stride1-2-ifm_dim0-k0-idt0] TAV length delta: 4 +TAV peak volume delta: 11 +TAV length delta: 4 +TAV peak volume delta: 4 +PASSED +test_fpgadataflow_convinputgenerator.py::test_fpgadataflow_analytical_characterization_slidingwindow[False-1-0-0-2-dilation0-stride1-2-ifm_dim0-k1-idt0] TAV length delta: 20 +TAV peak volume delta: 14 +TAV length delta: 20 +TAV peak volume delta: 20 +PASSED +test_fpgadataflow_convinputgenerator.py::test_fpgadataflow_analytical_characterization_slidingwindow[False-1-0-0-2-dilation0-stride1-2-ifm_dim0-k2-idt0] SKIPPED +test_fpgadataflow_convinputgenerator.py::test_fpgadataflow_analytical_characterization_slidingwindow[False-1-0-0-2-dilation0-stride1-2-ifm_dim1-k0-idt0] SKIPPED +test_fpgadataflow_convinputgenerator.py::test_fpgadataflow_analytical_characterization_slidingwindow[False-1-0-0-2-dilation0-stride1-2-ifm_dim1-k1-idt0] SKIPPED +test_fpgadataflow_convinputgenerator.py::test_fpgadataflow_analytical_characterization_slidingwindow[False-1-0-0-2-dilation0-stride1-2-ifm_dim1-k2-idt0] SKIPPED +test_fpgadataflow_convinputgenerator.py::test_fpgadataflow_analytical_characterization_slidingwindow[False-1-0-0-2-dilation0-stride1-4-ifm_dim0-k0-idt0] TAV length delta: 4 +TAV peak volume delta: 23 +TAV length delta: 4 +TAV peak volume delta: 7 +PASSED 
+test_fpgadataflow_convinputgenerator.py::test_fpgadataflow_analytical_characterization_slidingwindow[False-1-0-0-2-dilation0-stride1-4-ifm_dim0-k1-idt0] TAV length delta: 44 +TAV length delta: 44 +FAILED +test_fpgadataflow_convinputgenerator.py::test_fpgadataflow_analytical_characterization_slidingwindow[False-1-0-0-2-dilation0-stride1-4-ifm_dim0-k2-idt0] SKIPPED +test_fpgadataflow_convinputgenerator.py::test_fpgadataflow_analytical_characterization_slidingwindow[False-1-0-0-2-dilation0-stride1-4-ifm_dim1-k0-idt0] SKIPPED +test_fpgadataflow_convinputgenerator.py::test_fpgadataflow_analytical_characterization_slidingwindow[False-1-0-0-2-dilation0-stride1-4-ifm_dim1-k1-idt0] SKIPPED +test_fpgadataflow_convinputgenerator.py::test_fpgadataflow_analytical_characterization_slidingwindow[False-1-0-0-2-dilation0-stride1-4-ifm_dim1-k2-idt0] SKIPPED +test_fpgadataflow_convinputgenerator.py::test_fpgadataflow_analytical_characterization_slidingwindow[False-1-0-0-2-dilation1-stride0-2-ifm_dim0-k0-idt0] TAV length delta: 48 +TAV length delta: 48 +FAILED +test_fpgadataflow_convinputgenerator.py::test_fpgadataflow_analytical_characterization_slidingwindow[False-1-0-0-2-dilation1-stride0-2-ifm_dim0-k1-idt0] TAV length delta: 56 +TAV length delta: 56 +FAILED +test_fpgadataflow_convinputgenerator.py::test_fpgadataflow_analytical_characterization_slidingwindow[False-1-0-0-2-dilation1-stride0-2-ifm_dim0-k2-idt0] SKIPPED +test_fpgadataflow_convinputgenerator.py::test_fpgadataflow_analytical_characterization_slidingwindow[False-1-0-0-2-dilation1-stride0-2-ifm_dim1-k0-idt0] SKIPPED +test_fpgadataflow_convinputgenerator.py::test_fpgadataflow_analytical_characterization_slidingwindow[False-1-0-0-2-dilation1-stride0-2-ifm_dim1-k1-idt0] SKIPPED +test_fpgadataflow_convinputgenerator.py::test_fpgadataflow_analytical_characterization_slidingwindow[False-1-0-0-2-dilation1-stride0-2-ifm_dim1-k2-idt0] SKIPPED 
+test_fpgadataflow_convinputgenerator.py::test_fpgadataflow_analytical_characterization_slidingwindow[False-1-0-0-2-dilation1-stride0-4-ifm_dim0-k0-idt0] TAV length delta: 92 +TAV length delta: 92 +FAILED +test_fpgadataflow_convinputgenerator.py::test_fpgadataflow_analytical_characterization_slidingwindow[False-1-0-0-2-dilation1-stride0-4-ifm_dim0-k1-idt0] TAV length delta: 108 +TAV length delta: 108 +FAILED +test_fpgadataflow_convinputgenerator.py::test_fpgadataflow_analytical_characterization_slidingwindow[False-1-0-0-2-dilation1-stride0-4-ifm_dim0-k2-idt0] SKIPPED +test_fpgadataflow_convinputgenerator.py::test_fpgadataflow_analytical_characterization_slidingwindow[False-1-0-0-2-dilation1-stride0-4-ifm_dim1-k0-idt0] SKIPPED +test_fpgadataflow_convinputgenerator.py::test_fpgadataflow_analytical_characterization_slidingwindow[False-1-0-0-2-dilation1-stride0-4-ifm_dim1-k1-idt0] SKIPPED +test_fpgadataflow_convinputgenerator.py::test_fpgadataflow_analytical_characterization_slidingwindow[False-1-0-0-2-dilation1-stride0-4-ifm_dim1-k2-idt0] SKIPPED +test_fpgadataflow_convinputgenerator.py::test_fpgadataflow_analytical_characterization_slidingwindow[False-1-0-0-2-dilation1-stride1-2-ifm_dim0-k0-idt0] TAV peak volume delta: 0 +TAV peak volume delta: 10 +PASSED +test_fpgadataflow_convinputgenerator.py::test_fpgadataflow_analytical_characterization_slidingwindow[False-1-0-0-2-dilation1-stride1-2-ifm_dim0-k1-idt0] TAV length delta: 40 +TAV peak volume delta: 4 +TAV length delta: 40 +TAV peak volume delta: 26 +PASSED +test_fpgadataflow_convinputgenerator.py::test_fpgadataflow_analytical_characterization_slidingwindow[False-1-0-0-2-dilation1-stride1-2-ifm_dim0-k2-idt0] SKIPPED +test_fpgadataflow_convinputgenerator.py::test_fpgadataflow_analytical_characterization_slidingwindow[False-1-0-0-2-dilation1-stride1-2-ifm_dim1-k0-idt0] SKIPPED 
+test_fpgadataflow_convinputgenerator.py::test_fpgadataflow_analytical_characterization_slidingwindow[False-1-0-0-2-dilation1-stride1-2-ifm_dim1-k1-idt0] SKIPPED +test_fpgadataflow_convinputgenerator.py::test_fpgadataflow_analytical_characterization_slidingwindow[False-1-0-0-2-dilation1-stride1-2-ifm_dim1-k2-idt0] SKIPPED +test_fpgadataflow_convinputgenerator.py::test_fpgadataflow_analytical_characterization_slidingwindow[False-1-0-0-2-dilation1-stride1-4-ifm_dim0-k0-idt0] TAV peak volume delta: 0 +TAV peak volume delta: 19 +PASSED +test_fpgadataflow_convinputgenerator.py::test_fpgadataflow_analytical_characterization_slidingwindow[False-1-0-0-2-dilation1-stride1-4-ifm_dim0-k1-idt0] TAV length delta: 84 +TAV length delta: 84 +FAILED +test_fpgadataflow_convinputgenerator.py::test_fpgadataflow_analytical_characterization_slidingwindow[False-1-0-0-2-dilation1-stride1-4-ifm_dim0-k2-idt0] SKIPPED +test_fpgadataflow_convinputgenerator.py::test_fpgadataflow_analytical_characterization_slidingwindow[False-1-0-0-2-dilation1-stride1-4-ifm_dim1-k0-idt0] SKIPPED +test_fpgadataflow_convinputgenerator.py::test_fpgadataflow_analytical_characterization_slidingwindow[False-1-0-0-2-dilation1-stride1-4-ifm_dim1-k1-idt0] SKIPPED +test_fpgadataflow_convinputgenerator.py::test_fpgadataflow_analytical_characterization_slidingwindow[False-1-0-0-2-dilation1-stride1-4-ifm_dim1-k2-idt0] SKIPPED +test_fpgadataflow_convinputgenerator.py::test_fpgadataflow_analytical_characterization_slidingwindow[False-1-0-1-1-dilation0-stride0-2-ifm_dim0-k0-idt0] TAV length delta: 6 +TAV peak volume delta: 7 +TAV length delta: 6 +TAV peak volume delta: 6 +PASSED +test_fpgadataflow_convinputgenerator.py::test_fpgadataflow_analytical_characterization_slidingwindow[False-1-0-1-1-dilation0-stride0-2-ifm_dim0-k1-idt0] TAV length delta: 16 +TAV peak volume delta: 8 +TAV length delta: 16 +TAV peak volume delta: 16 +PASSED 
+test_fpgadataflow_convinputgenerator.py::test_fpgadataflow_analytical_characterization_slidingwindow[False-1-0-1-1-dilation0-stride0-2-ifm_dim0-k2-idt0] TAV length delta: 8 +TAV peak volume delta: 13 +TAV length delta: 8 +TAV peak volume delta: 8 +PASSED +test_fpgadataflow_convinputgenerator.py::test_fpgadataflow_analytical_characterization_slidingwindow[False-1-0-1-1-dilation0-stride0-2-ifm_dim1-k0-idt0] SKIPPED +test_fpgadataflow_convinputgenerator.py::test_fpgadataflow_analytical_characterization_slidingwindow[False-1-0-1-1-dilation0-stride0-2-ifm_dim1-k1-idt0] SKIPPED +test_fpgadataflow_convinputgenerator.py::test_fpgadataflow_analytical_characterization_slidingwindow[False-1-0-1-1-dilation0-stride0-2-ifm_dim1-k2-idt0] TAV length delta: 4 +TAV peak volume delta: 25 +TAV length delta: 4 +TAV peak volume delta: 4 +PASSED +test_fpgadataflow_convinputgenerator.py::test_fpgadataflow_analytical_characterization_slidingwindow[False-1-0-1-1-dilation0-stride0-4-ifm_dim0-k0-idt0] TAV length delta: 10 +TAV peak volume delta: 13 +TAV length delta: 10 +TAV peak volume delta: 10 +PASSED +test_fpgadataflow_convinputgenerator.py::test_fpgadataflow_analytical_characterization_slidingwindow[False-1-0-1-1-dilation0-stride0-4-ifm_dim0-k1-idt0] TAV length delta: 20 +TAV peak volume delta: 14 +TAV length delta: 20 +TAV peak volume delta: 20 +PASSED +test_fpgadataflow_convinputgenerator.py::test_fpgadataflow_analytical_characterization_slidingwindow[False-1-0-1-1-dilation0-stride0-4-ifm_dim0-k2-idt0] TAV length delta: 72 +TAV length delta: 72 +FAILED +test_fpgadataflow_convinputgenerator.py::test_fpgadataflow_analytical_characterization_slidingwindow[False-1-0-1-1-dilation0-stride0-4-ifm_dim1-k0-idt0] SKIPPED +test_fpgadataflow_convinputgenerator.py::test_fpgadataflow_analytical_characterization_slidingwindow[False-1-0-1-1-dilation0-stride0-4-ifm_dim1-k1-idt0] SKIPPED 
+test_fpgadataflow_convinputgenerator.py::test_fpgadataflow_analytical_characterization_slidingwindow[False-1-0-1-1-dilation0-stride0-4-ifm_dim1-k2-idt0] TAV length delta: 4 +TAV peak volume delta: 50 +TAV length delta: 4 +TAV peak volume delta: 5 +FAILED +test_fpgadataflow_convinputgenerator.py::test_fpgadataflow_analytical_characterization_slidingwindow[False-1-0-1-1-dilation0-stride1-2-ifm_dim0-k0-idt0] TAV length delta: 50 +TAV length delta: 50 +FAILED +test_fpgadataflow_convinputgenerator.py::test_fpgadataflow_analytical_characterization_slidingwindow[False-1-0-1-1-dilation0-stride1-2-ifm_dim0-k1-idt0] TAV length delta: 24 +TAV peak volume delta: 19 +TAV length delta: 24 +TAV peak volume delta: 17 +PASSED +test_fpgadataflow_convinputgenerator.py::test_fpgadataflow_analytical_characterization_slidingwindow[False-1-0-1-1-dilation0-stride1-2-ifm_dim0-k2-idt0] SKIPPED +test_fpgadataflow_convinputgenerator.py::test_fpgadataflow_analytical_characterization_slidingwindow[False-1-0-1-1-dilation0-stride1-2-ifm_dim1-k0-idt0] SKIPPED +test_fpgadataflow_convinputgenerator.py::test_fpgadataflow_analytical_characterization_slidingwindow[False-1-0-1-1-dilation0-stride1-2-ifm_dim1-k1-idt0] SKIPPED +test_fpgadataflow_convinputgenerator.py::test_fpgadataflow_analytical_characterization_slidingwindow[False-1-0-1-1-dilation0-stride1-2-ifm_dim1-k2-idt0] SKIPPED +test_fpgadataflow_convinputgenerator.py::test_fpgadataflow_analytical_characterization_slidingwindow[False-1-0-1-1-dilation0-stride1-4-ifm_dim0-k0-idt0] TAV length delta: 74 +TAV length delta: 74 +FAILED +test_fpgadataflow_convinputgenerator.py::test_fpgadataflow_analytical_characterization_slidingwindow[False-1-0-1-1-dilation0-stride1-4-ifm_dim0-k1-idt0] TAV length delta: 132 +TAV length delta: 132 +FAILED +test_fpgadataflow_convinputgenerator.py::test_fpgadataflow_analytical_characterization_slidingwindow[False-1-0-1-1-dilation0-stride1-4-ifm_dim0-k2-idt0] SKIPPED 
+test_fpgadataflow_convinputgenerator.py::test_fpgadataflow_analytical_characterization_slidingwindow[False-1-0-1-1-dilation0-stride1-4-ifm_dim1-k0-idt0] SKIPPED +test_fpgadataflow_convinputgenerator.py::test_fpgadataflow_analytical_characterization_slidingwindow[False-1-0-1-1-dilation0-stride1-4-ifm_dim1-k1-idt0] SKIPPED +test_fpgadataflow_convinputgenerator.py::test_fpgadataflow_analytical_characterization_slidingwindow[False-1-0-1-1-dilation0-stride1-4-ifm_dim1-k2-idt0] SKIPPED +test_fpgadataflow_convinputgenerator.py::test_fpgadataflow_analytical_characterization_slidingwindow[False-1-0-1-1-dilation1-stride0-2-ifm_dim0-k0-idt0] TAV length delta: 30 +TAV peak volume delta: 16 +TAV length delta: 30 +TAV peak volume delta: 30 +PASSED +test_fpgadataflow_convinputgenerator.py::test_fpgadataflow_analytical_characterization_slidingwindow[False-1-0-1-1-dilation1-stride0-2-ifm_dim0-k1-idt0] TAV length delta: 56 +TAV length delta: 56 +FAILED +test_fpgadataflow_convinputgenerator.py::test_fpgadataflow_analytical_characterization_slidingwindow[False-1-0-1-1-dilation1-stride0-2-ifm_dim0-k2-idt0] SKIPPED +test_fpgadataflow_convinputgenerator.py::test_fpgadataflow_analytical_characterization_slidingwindow[False-1-0-1-1-dilation1-stride0-2-ifm_dim1-k0-idt0] SKIPPED +test_fpgadataflow_convinputgenerator.py::test_fpgadataflow_analytical_characterization_slidingwindow[False-1-0-1-1-dilation1-stride0-2-ifm_dim1-k1-idt0] SKIPPED +test_fpgadataflow_convinputgenerator.py::test_fpgadataflow_analytical_characterization_slidingwindow[False-1-0-1-1-dilation1-stride0-2-ifm_dim1-k2-idt0] SKIPPED +test_fpgadataflow_convinputgenerator.py::test_fpgadataflow_analytical_characterization_slidingwindow[False-1-0-1-1-dilation1-stride0-4-ifm_dim0-k0-idt0] TAV length delta: 62 +TAV length delta: 62 +FAILED +test_fpgadataflow_convinputgenerator.py::test_fpgadataflow_analytical_characterization_slidingwindow[False-1-0-1-1-dilation1-stride0-4-ifm_dim0-k1-idt0] TAV length delta: 124 +TAV length delta: 
124 +FAILED +test_fpgadataflow_convinputgenerator.py::test_fpgadataflow_analytical_characterization_slidingwindow[False-1-0-1-1-dilation1-stride0-4-ifm_dim0-k2-idt0] SKIPPED +test_fpgadataflow_convinputgenerator.py::test_fpgadataflow_analytical_characterization_slidingwindow[False-1-0-1-1-dilation1-stride0-4-ifm_dim1-k0-idt0] SKIPPED +test_fpgadataflow_convinputgenerator.py::test_fpgadataflow_analytical_characterization_slidingwindow[False-1-0-1-1-dilation1-stride0-4-ifm_dim1-k1-idt0] SKIPPED +test_fpgadataflow_convinputgenerator.py::test_fpgadataflow_analytical_characterization_slidingwindow[False-1-0-1-1-dilation1-stride0-4-ifm_dim1-k2-idt0] SKIPPED +test_fpgadataflow_convinputgenerator.py::test_fpgadataflow_analytical_characterization_slidingwindow[False-1-0-1-1-dilation1-stride1-2-ifm_dim0-k0-idt0] TAV peak volume delta: 0 +TAV peak volume delta: 10 +PASSED +test_fpgadataflow_convinputgenerator.py::test_fpgadataflow_analytical_characterization_slidingwindow[False-1-0-1-1-dilation1-stride1-2-ifm_dim0-k1-idt0] TAV length delta: 8 +TAV peak volume delta: 56 +TAV length delta: 8 +TAV peak volume delta: 15 +FAILED +test_fpgadataflow_convinputgenerator.py::test_fpgadataflow_analytical_characterization_slidingwindow[False-1-0-1-1-dilation1-stride1-2-ifm_dim0-k2-idt0] SKIPPED +test_fpgadataflow_convinputgenerator.py::test_fpgadataflow_analytical_characterization_slidingwindow[False-1-0-1-1-dilation1-stride1-2-ifm_dim1-k0-idt0] SKIPPED +test_fpgadataflow_convinputgenerator.py::test_fpgadataflow_analytical_characterization_slidingwindow[False-1-0-1-1-dilation1-stride1-2-ifm_dim1-k1-idt0] SKIPPED +test_fpgadataflow_convinputgenerator.py::test_fpgadataflow_analytical_characterization_slidingwindow[False-1-0-1-1-dilation1-stride1-2-ifm_dim1-k2-idt0] SKIPPED +test_fpgadataflow_convinputgenerator.py::test_fpgadataflow_analytical_characterization_slidingwindow[False-1-0-1-1-dilation1-stride1-4-ifm_dim0-k0-idt0] TAV length delta: 24 +TAV peak volume delta: 20 +TAV length delta: 
24 +TAV peak volume delta: 28 +PASSED +test_fpgadataflow_convinputgenerator.py::test_fpgadataflow_analytical_characterization_slidingwindow[False-1-0-1-1-dilation1-stride1-4-ifm_dim0-k1-idt0] TAV length delta: 72 +TAV length delta: 72 +FAILED +test_fpgadataflow_convinputgenerator.py::test_fpgadataflow_analytical_characterization_slidingwindow[False-1-0-1-1-dilation1-stride1-4-ifm_dim0-k2-idt0] SKIPPED +test_fpgadataflow_convinputgenerator.py::test_fpgadataflow_analytical_characterization_slidingwindow[False-1-0-1-1-dilation1-stride1-4-ifm_dim1-k0-idt0] SKIPPED +test_fpgadataflow_convinputgenerator.py::test_fpgadataflow_analytical_characterization_slidingwindow[False-1-0-1-1-dilation1-stride1-4-ifm_dim1-k1-idt0] SKIPPED +test_fpgadataflow_convinputgenerator.py::test_fpgadataflow_analytical_characterization_slidingwindow[False-1-0-1-1-dilation1-stride1-4-ifm_dim1-k2-idt0] SKIPPED +test_fpgadataflow_convinputgenerator.py::test_fpgadataflow_analytical_characterization_slidingwindow[False-1-0-1-2-dilation0-stride0-2-ifm_dim0-k0-idt0] TAV length delta: 6 +TAV peak volume delta: 7 +TAV length delta: 6 +TAV peak volume delta: 6 +PASSED +test_fpgadataflow_convinputgenerator.py::test_fpgadataflow_analytical_characterization_slidingwindow[False-1-0-1-2-dilation0-stride0-2-ifm_dim0-k1-idt0] TAV length delta: 16 +TAV peak volume delta: 8 +TAV length delta: 16 +TAV peak volume delta: 16 +PASSED +test_fpgadataflow_convinputgenerator.py::test_fpgadataflow_analytical_characterization_slidingwindow[False-1-0-1-2-dilation0-stride0-2-ifm_dim0-k2-idt0] TAV length delta: 8 +TAV peak volume delta: 13 +TAV length delta: 8 +TAV peak volume delta: 8 +PASSED +test_fpgadataflow_convinputgenerator.py::test_fpgadataflow_analytical_characterization_slidingwindow[False-1-0-1-2-dilation0-stride0-2-ifm_dim1-k0-idt0] SKIPPED +test_fpgadataflow_convinputgenerator.py::test_fpgadataflow_analytical_characterization_slidingwindow[False-1-0-1-2-dilation0-stride0-2-ifm_dim1-k1-idt0] SKIPPED 
+test_fpgadataflow_convinputgenerator.py::test_fpgadataflow_analytical_characterization_slidingwindow[False-1-0-1-2-dilation0-stride0-2-ifm_dim1-k2-idt0] TAV length delta: 4 +TAV peak volume delta: 25 +TAV length delta: 4 +TAV peak volume delta: 4 +PASSED +test_fpgadataflow_convinputgenerator.py::test_fpgadataflow_analytical_characterization_slidingwindow[False-1-0-1-2-dilation0-stride0-4-ifm_dim0-k0-idt0] TAV length delta: 10 +TAV peak volume delta: 13 +TAV length delta: 10 +TAV peak volume delta: 10 +PASSED +test_fpgadataflow_convinputgenerator.py::test_fpgadataflow_analytical_characterization_slidingwindow[False-1-0-1-2-dilation0-stride0-4-ifm_dim0-k1-idt0] TAV length delta: 20 +TAV peak volume delta: 14 +TAV length delta: 20 +TAV peak volume delta: 20 +PASSED +test_fpgadataflow_convinputgenerator.py::test_fpgadataflow_analytical_characterization_slidingwindow[False-1-0-1-2-dilation0-stride0-4-ifm_dim0-k2-idt0] TAV length delta: 72 +TAV length delta: 72 +FAILED +test_fpgadataflow_convinputgenerator.py::test_fpgadataflow_analytical_characterization_slidingwindow[False-1-0-1-2-dilation0-stride0-4-ifm_dim1-k0-idt0] SKIPPED +test_fpgadataflow_convinputgenerator.py::test_fpgadataflow_analytical_characterization_slidingwindow[False-1-0-1-2-dilation0-stride0-4-ifm_dim1-k1-idt0] SKIPPED +test_fpgadataflow_convinputgenerator.py::test_fpgadataflow_analytical_characterization_slidingwindow[False-1-0-1-2-dilation0-stride0-4-ifm_dim1-k2-idt0] TAV length delta: 4 +TAV peak volume delta: 50 +TAV length delta: 4 +TAV peak volume delta: 5 +FAILED +test_fpgadataflow_convinputgenerator.py::test_fpgadataflow_analytical_characterization_slidingwindow[False-1-0-1-2-dilation0-stride1-2-ifm_dim0-k0-idt0] TAV length delta: 50 +TAV length delta: 50 +FAILED +test_fpgadataflow_convinputgenerator.py::test_fpgadataflow_analytical_characterization_slidingwindow[False-1-0-1-2-dilation0-stride1-2-ifm_dim0-k1-idt0] TAV length delta: 24 +TAV peak volume delta: 19 +TAV length delta: 24 +TAV peak 
volume delta: 17 +PASSED +test_fpgadataflow_convinputgenerator.py::test_fpgadataflow_analytical_characterization_slidingwindow[False-1-0-1-2-dilation0-stride1-2-ifm_dim0-k2-idt0] SKIPPED +test_fpgadataflow_convinputgenerator.py::test_fpgadataflow_analytical_characterization_slidingwindow[False-1-0-1-2-dilation0-stride1-2-ifm_dim1-k0-idt0] SKIPPED +test_fpgadataflow_convinputgenerator.py::test_fpgadataflow_analytical_characterization_slidingwindow[False-1-0-1-2-dilation0-stride1-2-ifm_dim1-k1-idt0] SKIPPED +test_fpgadataflow_convinputgenerator.py::test_fpgadataflow_analytical_characterization_slidingwindow[False-1-0-1-2-dilation0-stride1-2-ifm_dim1-k2-idt0] SKIPPED +test_fpgadataflow_convinputgenerator.py::test_fpgadataflow_analytical_characterization_slidingwindow[False-1-0-1-2-dilation0-stride1-4-ifm_dim0-k0-idt0] TAV length delta: 74 +TAV length delta: 74 +FAILED +test_fpgadataflow_convinputgenerator.py::test_fpgadataflow_analytical_characterization_slidingwindow[False-1-0-1-2-dilation0-stride1-4-ifm_dim0-k1-idt0] TAV length delta: 132 +TAV length delta: 132 +FAILED +test_fpgadataflow_convinputgenerator.py::test_fpgadataflow_analytical_characterization_slidingwindow[False-1-0-1-2-dilation0-stride1-4-ifm_dim0-k2-idt0] SKIPPED +test_fpgadataflow_convinputgenerator.py::test_fpgadataflow_analytical_characterization_slidingwindow[False-1-0-1-2-dilation0-stride1-4-ifm_dim1-k0-idt0] SKIPPED +test_fpgadataflow_convinputgenerator.py::test_fpgadataflow_analytical_characterization_slidingwindow[False-1-0-1-2-dilation0-stride1-4-ifm_dim1-k1-idt0] SKIPPED +test_fpgadataflow_convinputgenerator.py::test_fpgadataflow_analytical_characterization_slidingwindow[False-1-0-1-2-dilation0-stride1-4-ifm_dim1-k2-idt0] SKIPPED +test_fpgadataflow_convinputgenerator.py::test_fpgadataflow_analytical_characterization_slidingwindow[False-1-0-1-2-dilation1-stride0-2-ifm_dim0-k0-idt0] TAV length delta: 30 +TAV peak volume delta: 16 +TAV length delta: 30 +TAV peak volume delta: 30 +PASSED 
+test_fpgadataflow_convinputgenerator.py::test_fpgadataflow_analytical_characterization_slidingwindow[False-1-0-1-2-dilation1-stride0-2-ifm_dim0-k1-idt0] TAV length delta: 56 +TAV length delta: 56 +FAILED +test_fpgadataflow_convinputgenerator.py::test_fpgadataflow_analytical_characterization_slidingwindow[False-1-0-1-2-dilation1-stride0-2-ifm_dim0-k2-idt0] SKIPPED +test_fpgadataflow_convinputgenerator.py::test_fpgadataflow_analytical_characterization_slidingwindow[False-1-0-1-2-dilation1-stride0-2-ifm_dim1-k0-idt0] SKIPPED +test_fpgadataflow_convinputgenerator.py::test_fpgadataflow_analytical_characterization_slidingwindow[False-1-0-1-2-dilation1-stride0-2-ifm_dim1-k1-idt0] SKIPPED +test_fpgadataflow_convinputgenerator.py::test_fpgadataflow_analytical_characterization_slidingwindow[False-1-0-1-2-dilation1-stride0-2-ifm_dim1-k2-idt0] SKIPPED +test_fpgadataflow_convinputgenerator.py::test_fpgadataflow_analytical_characterization_slidingwindow[False-1-0-1-2-dilation1-stride0-4-ifm_dim0-k0-idt0] TAV length delta: 62 +TAV length delta: 62 +FAILED +test_fpgadataflow_convinputgenerator.py::test_fpgadataflow_analytical_characterization_slidingwindow[False-1-0-1-2-dilation1-stride0-4-ifm_dim0-k1-idt0] TAV length delta: 124 +TAV length delta: 124 +FAILED +test_fpgadataflow_convinputgenerator.py::test_fpgadataflow_analytical_characterization_slidingwindow[False-1-0-1-2-dilation1-stride0-4-ifm_dim0-k2-idt0] SKIPPED +test_fpgadataflow_convinputgenerator.py::test_fpgadataflow_analytical_characterization_slidingwindow[False-1-0-1-2-dilation1-stride0-4-ifm_dim1-k0-idt0] SKIPPED +test_fpgadataflow_convinputgenerator.py::test_fpgadataflow_analytical_characterization_slidingwindow[False-1-0-1-2-dilation1-stride0-4-ifm_dim1-k1-idt0] SKIPPED +test_fpgadataflow_convinputgenerator.py::test_fpgadataflow_analytical_characterization_slidingwindow[False-1-0-1-2-dilation1-stride0-4-ifm_dim1-k2-idt0] SKIPPED 
+test_fpgadataflow_convinputgenerator.py::test_fpgadataflow_analytical_characterization_slidingwindow[False-1-0-1-2-dilation1-stride1-2-ifm_dim0-k0-idt0] TAV peak volume delta: 0 +TAV peak volume delta: 10 +PASSED +test_fpgadataflow_convinputgenerator.py::test_fpgadataflow_analytical_characterization_slidingwindow[False-1-0-1-2-dilation1-stride1-2-ifm_dim0-k1-idt0] TAV length delta: 8 +TAV peak volume delta: 56 +TAV length delta: 8 +TAV peak volume delta: 15 +FAILED +test_fpgadataflow_convinputgenerator.py::test_fpgadataflow_analytical_characterization_slidingwindow[False-1-0-1-2-dilation1-stride1-2-ifm_dim0-k2-idt0] SKIPPED +test_fpgadataflow_convinputgenerator.py::test_fpgadataflow_analytical_characterization_slidingwindow[False-1-0-1-2-dilation1-stride1-2-ifm_dim1-k0-idt0] SKIPPED +test_fpgadataflow_convinputgenerator.py::test_fpgadataflow_analytical_characterization_slidingwindow[False-1-0-1-2-dilation1-stride1-2-ifm_dim1-k1-idt0] SKIPPED +test_fpgadataflow_convinputgenerator.py::test_fpgadataflow_analytical_characterization_slidingwindow[False-1-0-1-2-dilation1-stride1-2-ifm_dim1-k2-idt0] SKIPPED +test_fpgadataflow_convinputgenerator.py::test_fpgadataflow_analytical_characterization_slidingwindow[False-1-0-1-2-dilation1-stride1-4-ifm_dim0-k0-idt0] TAV length delta: 24 +TAV peak volume delta: 20 +TAV length delta: 24 +TAV peak volume delta: 28 +PASSED +test_fpgadataflow_convinputgenerator.py::test_fpgadataflow_analytical_characterization_slidingwindow[False-1-0-1-2-dilation1-stride1-4-ifm_dim0-k1-idt0] TAV length delta: 72 +TAV length delta: 72 +FAILED +test_fpgadataflow_convinputgenerator.py::test_fpgadataflow_analytical_characterization_slidingwindow[False-1-0-1-2-dilation1-stride1-4-ifm_dim0-k2-idt0] SKIPPED +test_fpgadataflow_convinputgenerator.py::test_fpgadataflow_analytical_characterization_slidingwindow[False-1-0-1-2-dilation1-stride1-4-ifm_dim1-k0-idt0] SKIPPED 
+test_fpgadataflow_convinputgenerator.py::test_fpgadataflow_analytical_characterization_slidingwindow[False-1-0-1-2-dilation1-stride1-4-ifm_dim1-k1-idt0] SKIPPED +test_fpgadataflow_convinputgenerator.py::test_fpgadataflow_analytical_characterization_slidingwindow[False-1-0-1-2-dilation1-stride1-4-ifm_dim1-k2-idt0] SKIPPED +test_fpgadataflow_convinputgenerator.py::test_fpgadataflow_analytical_characterization_slidingwindow[False-1-1-0-1-dilation0-stride0-2-ifm_dim0-k0-idt0] SKIPPED +test_fpgadataflow_convinputgenerator.py::test_fpgadataflow_analytical_characterization_slidingwindow[False-1-1-0-1-dilation0-stride0-2-ifm_dim0-k1-idt0] SKIPPED +test_fpgadataflow_convinputgenerator.py::test_fpgadataflow_analytical_characterization_slidingwindow[False-1-1-0-1-dilation0-stride0-2-ifm_dim0-k2-idt0] SKIPPED +test_fpgadataflow_convinputgenerator.py::test_fpgadataflow_analytical_characterization_slidingwindow[False-1-1-0-1-dilation0-stride0-2-ifm_dim1-k0-idt0] SKIPPED +test_fpgadataflow_convinputgenerator.py::test_fpgadataflow_analytical_characterization_slidingwindow[False-1-1-0-1-dilation0-stride0-2-ifm_dim1-k1-idt0] SKIPPED +test_fpgadataflow_convinputgenerator.py::test_fpgadataflow_analytical_characterization_slidingwindow[False-1-1-0-1-dilation0-stride0-2-ifm_dim1-k2-idt0] SKIPPED +test_fpgadataflow_convinputgenerator.py::test_fpgadataflow_analytical_characterization_slidingwindow[False-1-1-0-1-dilation0-stride0-4-ifm_dim0-k0-idt0] SKIPPED +test_fpgadataflow_convinputgenerator.py::test_fpgadataflow_analytical_characterization_slidingwindow[False-1-1-0-1-dilation0-stride0-4-ifm_dim0-k1-idt0] SKIPPED +test_fpgadataflow_convinputgenerator.py::test_fpgadataflow_analytical_characterization_slidingwindow[False-1-1-0-1-dilation0-stride0-4-ifm_dim0-k2-idt0] SKIPPED +test_fpgadataflow_convinputgenerator.py::test_fpgadataflow_analytical_characterization_slidingwindow[False-1-1-0-1-dilation0-stride0-4-ifm_dim1-k0-idt0] SKIPPED 
+test_fpgadataflow_convinputgenerator.py::test_fpgadataflow_analytical_characterization_slidingwindow[False-1-1-0-1-dilation0-stride0-4-ifm_dim1-k1-idt0] SKIPPED +test_fpgadataflow_convinputgenerator.py::test_fpgadataflow_analytical_characterization_slidingwindow[False-1-1-0-1-dilation0-stride0-4-ifm_dim1-k2-idt0] SKIPPED +test_fpgadataflow_convinputgenerator.py::test_fpgadataflow_analytical_characterization_slidingwindow[False-1-1-0-1-dilation0-stride1-2-ifm_dim0-k0-idt0] SKIPPED +test_fpgadataflow_convinputgenerator.py::test_fpgadataflow_analytical_characterization_slidingwindow[False-1-1-0-1-dilation0-stride1-2-ifm_dim0-k1-idt0] SKIPPED +test_fpgadataflow_convinputgenerator.py::test_fpgadataflow_analytical_characterization_slidingwindow[False-1-1-0-1-dilation0-stride1-2-ifm_dim0-k2-idt0] SKIPPED +test_fpgadataflow_convinputgenerator.py::test_fpgadataflow_analytical_characterization_slidingwindow[False-1-1-0-1-dilation0-stride1-2-ifm_dim1-k0-idt0] SKIPPED +test_fpgadataflow_convinputgenerator.py::test_fpgadataflow_analytical_characterization_slidingwindow[False-1-1-0-1-dilation0-stride1-2-ifm_dim1-k1-idt0] SKIPPED +test_fpgadataflow_convinputgenerator.py::test_fpgadataflow_analytical_characterization_slidingwindow[False-1-1-0-1-dilation0-stride1-2-ifm_dim1-k2-idt0] SKIPPED +test_fpgadataflow_convinputgenerator.py::test_fpgadataflow_analytical_characterization_slidingwindow[False-1-1-0-1-dilation0-stride1-4-ifm_dim0-k0-idt0] SKIPPED +test_fpgadataflow_convinputgenerator.py::test_fpgadataflow_analytical_characterization_slidingwindow[False-1-1-0-1-dilation0-stride1-4-ifm_dim0-k1-idt0] SKIPPED +test_fpgadataflow_convinputgenerator.py::test_fpgadataflow_analytical_characterization_slidingwindow[False-1-1-0-1-dilation0-stride1-4-ifm_dim0-k2-idt0] SKIPPED +test_fpgadataflow_convinputgenerator.py::test_fpgadataflow_analytical_characterization_slidingwindow[False-1-1-0-1-dilation0-stride1-4-ifm_dim1-k0-idt0] SKIPPED 
+test_fpgadataflow_convinputgenerator.py::test_fpgadataflow_analytical_characterization_slidingwindow[False-1-1-0-1-dilation0-stride1-4-ifm_dim1-k1-idt0] SKIPPED +test_fpgadataflow_convinputgenerator.py::test_fpgadataflow_analytical_characterization_slidingwindow[False-1-1-0-1-dilation0-stride1-4-ifm_dim1-k2-idt0] SKIPPED +test_fpgadataflow_convinputgenerator.py::test_fpgadataflow_analytical_characterization_slidingwindow[False-1-1-0-1-dilation1-stride0-2-ifm_dim0-k0-idt0] SKIPPED +test_fpgadataflow_convinputgenerator.py::test_fpgadataflow_analytical_characterization_slidingwindow[False-1-1-0-1-dilation1-stride0-2-ifm_dim0-k1-idt0] SKIPPED +test_fpgadataflow_convinputgenerator.py::test_fpgadataflow_analytical_characterization_slidingwindow[False-1-1-0-1-dilation1-stride0-2-ifm_dim0-k2-idt0] SKIPPED +test_fpgadataflow_convinputgenerator.py::test_fpgadataflow_analytical_characterization_slidingwindow[False-1-1-0-1-dilation1-stride0-2-ifm_dim1-k0-idt0] SKIPPED +test_fpgadataflow_convinputgenerator.py::test_fpgadataflow_analytical_characterization_slidingwindow[False-1-1-0-1-dilation1-stride0-2-ifm_dim1-k1-idt0] SKIPPED +test_fpgadataflow_convinputgenerator.py::test_fpgadataflow_analytical_characterization_slidingwindow[False-1-1-0-1-dilation1-stride0-2-ifm_dim1-k2-idt0] SKIPPED +test_fpgadataflow_convinputgenerator.py::test_fpgadataflow_analytical_characterization_slidingwindow[False-1-1-0-1-dilation1-stride0-4-ifm_dim0-k0-idt0] SKIPPED +test_fpgadataflow_convinputgenerator.py::test_fpgadataflow_analytical_characterization_slidingwindow[False-1-1-0-1-dilation1-stride0-4-ifm_dim0-k1-idt0] SKIPPED +test_fpgadataflow_convinputgenerator.py::test_fpgadataflow_analytical_characterization_slidingwindow[False-1-1-0-1-dilation1-stride0-4-ifm_dim0-k2-idt0] SKIPPED +test_fpgadataflow_convinputgenerator.py::test_fpgadataflow_analytical_characterization_slidingwindow[False-1-1-0-1-dilation1-stride0-4-ifm_dim1-k0-idt0] SKIPPED 
+test_fpgadataflow_convinputgenerator.py::test_fpgadataflow_analytical_characterization_slidingwindow[False-1-1-0-1-dilation1-stride0-4-ifm_dim1-k1-idt0] SKIPPED +test_fpgadataflow_convinputgenerator.py::test_fpgadataflow_analytical_characterization_slidingwindow[False-1-1-0-1-dilation1-stride0-4-ifm_dim1-k2-idt0] SKIPPED +test_fpgadataflow_convinputgenerator.py::test_fpgadataflow_analytical_characterization_slidingwindow[False-1-1-0-1-dilation1-stride1-2-ifm_dim0-k0-idt0] SKIPPED +test_fpgadataflow_convinputgenerator.py::test_fpgadataflow_analytical_characterization_slidingwindow[False-1-1-0-1-dilation1-stride1-2-ifm_dim0-k1-idt0] SKIPPED +test_fpgadataflow_convinputgenerator.py::test_fpgadataflow_analytical_characterization_slidingwindow[False-1-1-0-1-dilation1-stride1-2-ifm_dim0-k2-idt0] SKIPPED +test_fpgadataflow_convinputgenerator.py::test_fpgadataflow_analytical_characterization_slidingwindow[False-1-1-0-1-dilation1-stride1-2-ifm_dim1-k0-idt0] SKIPPED +test_fpgadataflow_convinputgenerator.py::test_fpgadataflow_analytical_characterization_slidingwindow[False-1-1-0-1-dilation1-stride1-2-ifm_dim1-k1-idt0] SKIPPED +test_fpgadataflow_convinputgenerator.py::test_fpgadataflow_analytical_characterization_slidingwindow[False-1-1-0-1-dilation1-stride1-2-ifm_dim1-k2-idt0] SKIPPED +test_fpgadataflow_convinputgenerator.py::test_fpgadataflow_analytical_characterization_slidingwindow[False-1-1-0-1-dilation1-stride1-4-ifm_dim0-k0-idt0] SKIPPED +test_fpgadataflow_convinputgenerator.py::test_fpgadataflow_analytical_characterization_slidingwindow[False-1-1-0-1-dilation1-stride1-4-ifm_dim0-k1-idt0] SKIPPED +test_fpgadataflow_convinputgenerator.py::test_fpgadataflow_analytical_characterization_slidingwindow[False-1-1-0-1-dilation1-stride1-4-ifm_dim0-k2-idt0] SKIPPED +test_fpgadataflow_convinputgenerator.py::test_fpgadataflow_analytical_characterization_slidingwindow[False-1-1-0-1-dilation1-stride1-4-ifm_dim1-k0-idt0] SKIPPED 
+test_fpgadataflow_convinputgenerator.py::test_fpgadataflow_analytical_characterization_slidingwindow[False-1-1-0-1-dilation1-stride1-4-ifm_dim1-k1-idt0] SKIPPED +test_fpgadataflow_convinputgenerator.py::test_fpgadataflow_analytical_characterization_slidingwindow[False-1-1-0-1-dilation1-stride1-4-ifm_dim1-k2-idt0] SKIPPED +test_fpgadataflow_convinputgenerator.py::test_fpgadataflow_analytical_characterization_slidingwindow[False-1-1-0-2-dilation0-stride0-2-ifm_dim0-k0-idt0] TAV length delta: 16 +TAV peak volume delta: 11 +TAV length delta: 16 +TAV peak volume delta: 16 +PASSED +test_fpgadataflow_convinputgenerator.py::test_fpgadataflow_analytical_characterization_slidingwindow[False-1-1-0-2-dilation0-stride0-2-ifm_dim0-k1-idt0] TAV length delta: 8 +TAV peak volume delta: 8 +TAV length delta: 8 +TAV peak volume delta: 8 +PASSED +test_fpgadataflow_convinputgenerator.py::test_fpgadataflow_analytical_characterization_slidingwindow[False-1-1-0-2-dilation0-stride0-2-ifm_dim0-k2-idt0] TAV peak volume delta: 11 +TAV peak volume delta: 1 +PASSED +test_fpgadataflow_convinputgenerator.py::test_fpgadataflow_analytical_characterization_slidingwindow[False-1-1-0-2-dilation0-stride0-2-ifm_dim1-k0-idt0] SKIPPED +test_fpgadataflow_convinputgenerator.py::test_fpgadataflow_analytical_characterization_slidingwindow[False-1-1-0-2-dilation0-stride0-2-ifm_dim1-k1-idt0] SKIPPED +test_fpgadataflow_convinputgenerator.py::test_fpgadataflow_analytical_characterization_slidingwindow[False-1-1-0-2-dilation0-stride0-2-ifm_dim1-k2-idt0] TAV length delta: 260 +TAV length delta: 260 +FAILED +test_fpgadataflow_convinputgenerator.py::test_fpgadataflow_analytical_characterization_slidingwindow[False-1-1-0-2-dilation0-stride0-4-ifm_dim0-k0-idt0] SKIPPED +test_fpgadataflow_convinputgenerator.py::test_fpgadataflow_analytical_characterization_slidingwindow[False-1-1-0-2-dilation0-stride0-4-ifm_dim0-k1-idt0] SKIPPED 
+test_fpgadataflow_convinputgenerator.py::test_fpgadataflow_analytical_characterization_slidingwindow[False-1-1-0-2-dilation0-stride0-4-ifm_dim0-k2-idt0] SKIPPED +test_fpgadataflow_convinputgenerator.py::test_fpgadataflow_analytical_characterization_slidingwindow[False-1-1-0-2-dilation0-stride0-4-ifm_dim1-k0-idt0] SKIPPED +test_fpgadataflow_convinputgenerator.py::test_fpgadataflow_analytical_characterization_slidingwindow[False-1-1-0-2-dilation0-stride0-4-ifm_dim1-k1-idt0] SKIPPED +test_fpgadataflow_convinputgenerator.py::test_fpgadataflow_analytical_characterization_slidingwindow[False-1-1-0-2-dilation0-stride0-4-ifm_dim1-k2-idt0] SKIPPED +test_fpgadataflow_convinputgenerator.py::test_fpgadataflow_analytical_characterization_slidingwindow[False-1-1-0-2-dilation0-stride1-2-ifm_dim0-k0-idt0] TAV length delta: 4 +TAV peak volume delta: 11 +TAV length delta: 4 +TAV peak volume delta: 4 +PASSED +test_fpgadataflow_convinputgenerator.py::test_fpgadataflow_analytical_characterization_slidingwindow[False-1-1-0-2-dilation0-stride1-2-ifm_dim0-k1-idt0] TAV length delta: 20 +TAV peak volume delta: 14 +TAV length delta: 20 +TAV peak volume delta: 20 +PASSED +test_fpgadataflow_convinputgenerator.py::test_fpgadataflow_analytical_characterization_slidingwindow[False-1-1-0-2-dilation0-stride1-2-ifm_dim0-k2-idt0] FAILED +test_fpgadataflow_convinputgenerator.py::test_fpgadataflow_analytical_characterization_slidingwindow[False-1-1-0-2-dilation0-stride1-2-ifm_dim1-k0-idt0] SKIPPED +test_fpgadataflow_convinputgenerator.py::test_fpgadataflow_analytical_characterization_slidingwindow[False-1-1-0-2-dilation0-stride1-2-ifm_dim1-k1-idt0] SKIPPED +test_fpgadataflow_convinputgenerator.py::test_fpgadataflow_analytical_characterization_slidingwindow[False-1-1-0-2-dilation0-stride1-2-ifm_dim1-k2-idt0] SKIPPED +test_fpgadataflow_convinputgenerator.py::test_fpgadataflow_analytical_characterization_slidingwindow[False-1-1-0-2-dilation0-stride1-4-ifm_dim0-k0-idt0] SKIPPED 
+test_fpgadataflow_convinputgenerator.py::test_fpgadataflow_analytical_characterization_slidingwindow[False-1-1-0-2-dilation0-stride1-4-ifm_dim0-k1-idt0] SKIPPED +test_fpgadataflow_convinputgenerator.py::test_fpgadataflow_analytical_characterization_slidingwindow[False-1-1-0-2-dilation0-stride1-4-ifm_dim0-k2-idt0] SKIPPED +test_fpgadataflow_convinputgenerator.py::test_fpgadataflow_analytical_characterization_slidingwindow[False-1-1-0-2-dilation0-stride1-4-ifm_dim1-k0-idt0] SKIPPED +test_fpgadataflow_convinputgenerator.py::test_fpgadataflow_analytical_characterization_slidingwindow[False-1-1-0-2-dilation0-stride1-4-ifm_dim1-k1-idt0] SKIPPED +test_fpgadataflow_convinputgenerator.py::test_fpgadataflow_analytical_characterization_slidingwindow[False-1-1-0-2-dilation0-stride1-4-ifm_dim1-k2-idt0] SKIPPED +test_fpgadataflow_convinputgenerator.py::test_fpgadataflow_analytical_characterization_slidingwindow[False-1-1-0-2-dilation1-stride0-2-ifm_dim0-k0-idt0] TAV length delta: 48 +TAV length delta: 48 +FAILED +test_fpgadataflow_convinputgenerator.py::test_fpgadataflow_analytical_characterization_slidingwindow[False-1-1-0-2-dilation1-stride0-2-ifm_dim0-k1-idt0] TAV length delta: 56 +TAV length delta: 56 +FAILED +test_fpgadataflow_convinputgenerator.py::test_fpgadataflow_analytical_characterization_slidingwindow[False-1-1-0-2-dilation1-stride0-2-ifm_dim0-k2-idt0] SKIPPED +test_fpgadataflow_convinputgenerator.py::test_fpgadataflow_analytical_characterization_slidingwindow[False-1-1-0-2-dilation1-stride0-2-ifm_dim1-k0-idt0] SKIPPED +test_fpgadataflow_convinputgenerator.py::test_fpgadataflow_analytical_characterization_slidingwindow[False-1-1-0-2-dilation1-stride0-2-ifm_dim1-k1-idt0] SKIPPED +test_fpgadataflow_convinputgenerator.py::test_fpgadataflow_analytical_characterization_slidingwindow[False-1-1-0-2-dilation1-stride0-2-ifm_dim1-k2-idt0] SKIPPED 
+test_fpgadataflow_convinputgenerator.py::test_fpgadataflow_analytical_characterization_slidingwindow[False-1-1-0-2-dilation1-stride0-4-ifm_dim0-k0-idt0] SKIPPED +test_fpgadataflow_convinputgenerator.py::test_fpgadataflow_analytical_characterization_slidingwindow[False-1-1-0-2-dilation1-stride0-4-ifm_dim0-k1-idt0] SKIPPED +test_fpgadataflow_convinputgenerator.py::test_fpgadataflow_analytical_characterization_slidingwindow[False-1-1-0-2-dilation1-stride0-4-ifm_dim0-k2-idt0] SKIPPED +test_fpgadataflow_convinputgenerator.py::test_fpgadataflow_analytical_characterization_slidingwindow[False-1-1-0-2-dilation1-stride0-4-ifm_dim1-k0-idt0] SKIPPED +test_fpgadataflow_convinputgenerator.py::test_fpgadataflow_analytical_characterization_slidingwindow[False-1-1-0-2-dilation1-stride0-4-ifm_dim1-k1-idt0] SKIPPED +test_fpgadataflow_convinputgenerator.py::test_fpgadataflow_analytical_characterization_slidingwindow[False-1-1-0-2-dilation1-stride0-4-ifm_dim1-k2-idt0] SKIPPED +test_fpgadataflow_convinputgenerator.py::test_fpgadataflow_analytical_characterization_slidingwindow[False-1-1-0-2-dilation1-stride1-2-ifm_dim0-k0-idt0] TAV peak volume delta: 0 +TAV peak volume delta: 10 +PASSED +test_fpgadataflow_convinputgenerator.py::test_fpgadataflow_analytical_characterization_slidingwindow[False-1-1-0-2-dilation1-stride1-2-ifm_dim0-k1-idt0] TAV length delta: 40 +TAV peak volume delta: 4 +TAV length delta: 40 +TAV peak volume delta: 26 +PASSED +test_fpgadataflow_convinputgenerator.py::test_fpgadataflow_analytical_characterization_slidingwindow[False-1-1-0-2-dilation1-stride1-2-ifm_dim0-k2-idt0] SKIPPED +test_fpgadataflow_convinputgenerator.py::test_fpgadataflow_analytical_characterization_slidingwindow[False-1-1-0-2-dilation1-stride1-2-ifm_dim1-k0-idt0] SKIPPED +test_fpgadataflow_convinputgenerator.py::test_fpgadataflow_analytical_characterization_slidingwindow[False-1-1-0-2-dilation1-stride1-2-ifm_dim1-k1-idt0] SKIPPED 
+test_fpgadataflow_convinputgenerator.py::test_fpgadataflow_analytical_characterization_slidingwindow[False-1-1-0-2-dilation1-stride1-2-ifm_dim1-k2-idt0] SKIPPED +test_fpgadataflow_convinputgenerator.py::test_fpgadataflow_analytical_characterization_slidingwindow[False-1-1-0-2-dilation1-stride1-4-ifm_dim0-k0-idt0] SKIPPED +test_fpgadataflow_convinputgenerator.py::test_fpgadataflow_analytical_characterization_slidingwindow[False-1-1-0-2-dilation1-stride1-4-ifm_dim0-k1-idt0] SKIPPED +test_fpgadataflow_convinputgenerator.py::test_fpgadataflow_analytical_characterization_slidingwindow[False-1-1-0-2-dilation1-stride1-4-ifm_dim0-k2-idt0] SKIPPED +test_fpgadataflow_convinputgenerator.py::test_fpgadataflow_analytical_characterization_slidingwindow[False-1-1-0-2-dilation1-stride1-4-ifm_dim1-k0-idt0] SKIPPED +test_fpgadataflow_convinputgenerator.py::test_fpgadataflow_analytical_characterization_slidingwindow[False-1-1-0-2-dilation1-stride1-4-ifm_dim1-k1-idt0] SKIPPED +test_fpgadataflow_convinputgenerator.py::test_fpgadataflow_analytical_characterization_slidingwindow[False-1-1-0-2-dilation1-stride1-4-ifm_dim1-k2-idt0] SKIPPED +test_fpgadataflow_convinputgenerator.py::test_fpgadataflow_analytical_characterization_slidingwindow[False-1-1-1-1-dilation0-stride0-2-ifm_dim0-k0-idt0] TAV length delta: 6 +TAV peak volume delta: 7 +TAV length delta: 6 +TAV peak volume delta: 6 +PASSED +test_fpgadataflow_convinputgenerator.py::test_fpgadataflow_analytical_characterization_slidingwindow[False-1-1-1-1-dilation0-stride0-2-ifm_dim0-k1-idt0] TAV length delta: 16 +TAV peak volume delta: 8 +TAV length delta: 16 +TAV peak volume delta: 16 +PASSED +test_fpgadataflow_convinputgenerator.py::test_fpgadataflow_analytical_characterization_slidingwindow[False-1-1-1-1-dilation0-stride0-2-ifm_dim0-k2-idt0] TAV length delta: 8 +TAV peak volume delta: 13 +TAV length delta: 8 +TAV peak volume delta: 8 +PASSED 
+test_fpgadataflow_convinputgenerator.py::test_fpgadataflow_analytical_characterization_slidingwindow[False-1-1-1-1-dilation0-stride0-2-ifm_dim1-k0-idt0] SKIPPED +test_fpgadataflow_convinputgenerator.py::test_fpgadataflow_analytical_characterization_slidingwindow[False-1-1-1-1-dilation0-stride0-2-ifm_dim1-k1-idt0] SKIPPED +test_fpgadataflow_convinputgenerator.py::test_fpgadataflow_analytical_characterization_slidingwindow[False-1-1-1-1-dilation0-stride0-2-ifm_dim1-k2-idt0] TAV length delta: 4 +TAV peak volume delta: 25 +TAV length delta: 4 +TAV peak volume delta: 4 +PASSED +test_fpgadataflow_convinputgenerator.py::test_fpgadataflow_analytical_characterization_slidingwindow[False-1-1-1-1-dilation0-stride0-4-ifm_dim0-k0-idt0] TAV length delta: 10 +TAV peak volume delta: 13 +TAV length delta: 10 +TAV peak volume delta: 10 +PASSED +test_fpgadataflow_convinputgenerator.py::test_fpgadataflow_analytical_characterization_slidingwindow[False-1-1-1-1-dilation0-stride0-4-ifm_dim0-k1-idt0] TAV length delta: 20 +TAV peak volume delta: 14 +TAV length delta: 20 +TAV peak volume delta: 20 +PASSED +test_fpgadataflow_convinputgenerator.py::test_fpgadataflow_analytical_characterization_slidingwindow[False-1-1-1-1-dilation0-stride0-4-ifm_dim0-k2-idt0] TAV length delta: 72 +TAV length delta: 72 +FAILED +test_fpgadataflow_convinputgenerator.py::test_fpgadataflow_analytical_characterization_slidingwindow[False-1-1-1-1-dilation0-stride0-4-ifm_dim1-k0-idt0] SKIPPED +test_fpgadataflow_convinputgenerator.py::test_fpgadataflow_analytical_characterization_slidingwindow[False-1-1-1-1-dilation0-stride0-4-ifm_dim1-k1-idt0] SKIPPED +test_fpgadataflow_convinputgenerator.py::test_fpgadataflow_analytical_characterization_slidingwindow[False-1-1-1-1-dilation0-stride0-4-ifm_dim1-k2-idt0] TAV length delta: 4 +TAV peak volume delta: 50 +TAV length delta: 4 +TAV peak volume delta: 5 +FAILED 
+test_fpgadataflow_convinputgenerator.py::test_fpgadataflow_analytical_characterization_slidingwindow[False-1-1-1-1-dilation0-stride1-2-ifm_dim0-k0-idt0] TAV length delta: 50 +TAV length delta: 50 +FAILED +test_fpgadataflow_convinputgenerator.py::test_fpgadataflow_analytical_characterization_slidingwindow[False-1-1-1-1-dilation0-stride1-2-ifm_dim0-k1-idt0] TAV length delta: 24 +TAV peak volume delta: 19 +TAV length delta: 24 +TAV peak volume delta: 17 +PASSED +test_fpgadataflow_convinputgenerator.py::test_fpgadataflow_analytical_characterization_slidingwindow[False-1-1-1-1-dilation0-stride1-2-ifm_dim0-k2-idt0] FAILED +test_fpgadataflow_convinputgenerator.py::test_fpgadataflow_analytical_characterization_slidingwindow[False-1-1-1-1-dilation0-stride1-2-ifm_dim1-k0-idt0] SKIPPED +test_fpgadataflow_convinputgenerator.py::test_fpgadataflow_analytical_characterization_slidingwindow[False-1-1-1-1-dilation0-stride1-2-ifm_dim1-k1-idt0] SKIPPED +test_fpgadataflow_convinputgenerator.py::test_fpgadataflow_analytical_characterization_slidingwindow[False-1-1-1-1-dilation0-stride1-2-ifm_dim1-k2-idt0] SKIPPED +test_fpgadataflow_convinputgenerator.py::test_fpgadataflow_analytical_characterization_slidingwindow[False-1-1-1-1-dilation0-stride1-4-ifm_dim0-k0-idt0] TAV length delta: 74 +TAV length delta: 74 +FAILED +test_fpgadataflow_convinputgenerator.py::test_fpgadataflow_analytical_characterization_slidingwindow[False-1-1-1-1-dilation0-stride1-4-ifm_dim0-k1-idt0] TAV length delta: 132 +TAV length delta: 132 +FAILED +test_fpgadataflow_convinputgenerator.py::test_fpgadataflow_analytical_characterization_slidingwindow[False-1-1-1-1-dilation0-stride1-4-ifm_dim0-k2-idt0] FAILED +test_fpgadataflow_convinputgenerator.py::test_fpgadataflow_analytical_characterization_slidingwindow[False-1-1-1-1-dilation0-stride1-4-ifm_dim1-k0-idt0] SKIPPED +test_fpgadataflow_convinputgenerator.py::test_fpgadataflow_analytical_characterization_slidingwindow[False-1-1-1-1-dilation0-stride1-4-ifm_dim1-k1-idt0] 
SKIPPED +test_fpgadataflow_convinputgenerator.py::test_fpgadataflow_analytical_characterization_slidingwindow[False-1-1-1-1-dilation0-stride1-4-ifm_dim1-k2-idt0] SKIPPED +test_fpgadataflow_convinputgenerator.py::test_fpgadataflow_analytical_characterization_slidingwindow[False-1-1-1-1-dilation1-stride0-2-ifm_dim0-k0-idt0] TAV length delta: 30 +TAV peak volume delta: 16 +TAV length delta: 30 +TAV peak volume delta: 30 +PASSED +test_fpgadataflow_convinputgenerator.py::test_fpgadataflow_analytical_characterization_slidingwindow[False-1-1-1-1-dilation1-stride0-2-ifm_dim0-k1-idt0] TAV length delta: 56 +TAV length delta: 56 +FAILED +test_fpgadataflow_convinputgenerator.py::test_fpgadataflow_analytical_characterization_slidingwindow[False-1-1-1-1-dilation1-stride0-2-ifm_dim0-k2-idt0] SKIPPED +test_fpgadataflow_convinputgenerator.py::test_fpgadataflow_analytical_characterization_slidingwindow[False-1-1-1-1-dilation1-stride0-2-ifm_dim1-k0-idt0] SKIPPED +test_fpgadataflow_convinputgenerator.py::test_fpgadataflow_analytical_characterization_slidingwindow[False-1-1-1-1-dilation1-stride0-2-ifm_dim1-k1-idt0] SKIPPED +test_fpgadataflow_convinputgenerator.py::test_fpgadataflow_analytical_characterization_slidingwindow[False-1-1-1-1-dilation1-stride0-2-ifm_dim1-k2-idt0] SKIPPED +test_fpgadataflow_convinputgenerator.py::test_fpgadataflow_analytical_characterization_slidingwindow[False-1-1-1-1-dilation1-stride0-4-ifm_dim0-k0-idt0] TAV length delta: 62 +TAV length delta: 62 +FAILED +test_fpgadataflow_convinputgenerator.py::test_fpgadataflow_analytical_characterization_slidingwindow[False-1-1-1-1-dilation1-stride0-4-ifm_dim0-k1-idt0] TAV length delta: 124 +TAV length delta: 124 +FAILED +test_fpgadataflow_convinputgenerator.py::test_fpgadataflow_analytical_characterization_slidingwindow[False-1-1-1-1-dilation1-stride0-4-ifm_dim0-k2-idt0] SKIPPED 
+test_fpgadataflow_convinputgenerator.py::test_fpgadataflow_analytical_characterization_slidingwindow[False-1-1-1-1-dilation1-stride0-4-ifm_dim1-k0-idt0] SKIPPED +test_fpgadataflow_convinputgenerator.py::test_fpgadataflow_analytical_characterization_slidingwindow[False-1-1-1-1-dilation1-stride0-4-ifm_dim1-k1-idt0] SKIPPED +test_fpgadataflow_convinputgenerator.py::test_fpgadataflow_analytical_characterization_slidingwindow[False-1-1-1-1-dilation1-stride0-4-ifm_dim1-k2-idt0] SKIPPED +test_fpgadataflow_convinputgenerator.py::test_fpgadataflow_analytical_characterization_slidingwindow[False-1-1-1-1-dilation1-stride1-2-ifm_dim0-k0-idt0] TAV peak volume delta: 0 +TAV peak volume delta: 10 +PASSED +test_fpgadataflow_convinputgenerator.py::test_fpgadataflow_analytical_characterization_slidingwindow[False-1-1-1-1-dilation1-stride1-2-ifm_dim0-k1-idt0] TAV length delta: 8 +TAV peak volume delta: 56 +TAV length delta: 8 +TAV peak volume delta: 15 +FAILED +test_fpgadataflow_convinputgenerator.py::test_fpgadataflow_analytical_characterization_slidingwindow[False-1-1-1-1-dilation1-stride1-2-ifm_dim0-k2-idt0] SKIPPED +test_fpgadataflow_convinputgenerator.py::test_fpgadataflow_analytical_characterization_slidingwindow[False-1-1-1-1-dilation1-stride1-2-ifm_dim1-k0-idt0] SKIPPED +test_fpgadataflow_convinputgenerator.py::test_fpgadataflow_analytical_characterization_slidingwindow[False-1-1-1-1-dilation1-stride1-2-ifm_dim1-k1-idt0] SKIPPED +test_fpgadataflow_convinputgenerator.py::test_fpgadataflow_analytical_characterization_slidingwindow[False-1-1-1-1-dilation1-stride1-2-ifm_dim1-k2-idt0] SKIPPED +test_fpgadataflow_convinputgenerator.py::test_fpgadataflow_analytical_characterization_slidingwindow[False-1-1-1-1-dilation1-stride1-4-ifm_dim0-k0-idt0] TAV length delta: 24 +TAV peak volume delta: 20 +TAV length delta: 24 +TAV peak volume delta: 28 +PASSED 
+test_fpgadataflow_convinputgenerator.py::test_fpgadataflow_analytical_characterization_slidingwindow[False-1-1-1-1-dilation1-stride1-4-ifm_dim0-k1-idt0] TAV length delta: 72 +TAV length delta: 72 +FAILED +test_fpgadataflow_convinputgenerator.py::test_fpgadataflow_analytical_characterization_slidingwindow[False-1-1-1-1-dilation1-stride1-4-ifm_dim0-k2-idt0] SKIPPED +test_fpgadataflow_convinputgenerator.py::test_fpgadataflow_analytical_characterization_slidingwindow[False-1-1-1-1-dilation1-stride1-4-ifm_dim1-k0-idt0] SKIPPED +test_fpgadataflow_convinputgenerator.py::test_fpgadataflow_analytical_characterization_slidingwindow[False-1-1-1-1-dilation1-stride1-4-ifm_dim1-k1-idt0] SKIPPED +test_fpgadataflow_convinputgenerator.py::test_fpgadataflow_analytical_characterization_slidingwindow[False-1-1-1-1-dilation1-stride1-4-ifm_dim1-k2-idt0] SKIPPED +test_fpgadataflow_convinputgenerator.py::test_fpgadataflow_analytical_characterization_slidingwindow[False-1-1-1-2-dilation0-stride0-2-ifm_dim0-k0-idt0] TAV length delta: 6 +TAV peak volume delta: 7 +TAV length delta: 6 +TAV peak volume delta: 6 +PASSED +test_fpgadataflow_convinputgenerator.py::test_fpgadataflow_analytical_characterization_slidingwindow[False-1-1-1-2-dilation0-stride0-2-ifm_dim0-k1-idt0] TAV length delta: 16 +TAV peak volume delta: 8 +TAV length delta: 16 +TAV peak volume delta: 16 +PASSED +test_fpgadataflow_convinputgenerator.py::test_fpgadataflow_analytical_characterization_slidingwindow[False-1-1-1-2-dilation0-stride0-2-ifm_dim0-k2-idt0] TAV length delta: 8 +TAV peak volume delta: 13 +TAV length delta: 8 +TAV peak volume delta: 8 +PASSED +test_fpgadataflow_convinputgenerator.py::test_fpgadataflow_analytical_characterization_slidingwindow[False-1-1-1-2-dilation0-stride0-2-ifm_dim1-k0-idt0] SKIPPED +test_fpgadataflow_convinputgenerator.py::test_fpgadataflow_analytical_characterization_slidingwindow[False-1-1-1-2-dilation0-stride0-2-ifm_dim1-k1-idt0] SKIPPED 
+test_fpgadataflow_convinputgenerator.py::test_fpgadataflow_analytical_characterization_slidingwindow[False-1-1-1-2-dilation0-stride0-2-ifm_dim1-k2-idt0] TAV length delta: 4 +TAV peak volume delta: 25 +TAV length delta: 4 +TAV peak volume delta: 4 +PASSED +test_fpgadataflow_convinputgenerator.py::test_fpgadataflow_analytical_characterization_slidingwindow[False-1-1-1-2-dilation0-stride0-4-ifm_dim0-k0-idt0] TAV length delta: 10 +TAV peak volume delta: 13 +TAV length delta: 10 +TAV peak volume delta: 10 +PASSED +test_fpgadataflow_convinputgenerator.py::test_fpgadataflow_analytical_characterization_slidingwindow[False-1-1-1-2-dilation0-stride0-4-ifm_dim0-k1-idt0] TAV length delta: 20 +TAV peak volume delta: 14 +TAV length delta: 20 +TAV peak volume delta: 20 +PASSED +test_fpgadataflow_convinputgenerator.py::test_fpgadataflow_analytical_characterization_slidingwindow[False-1-1-1-2-dilation0-stride0-4-ifm_dim0-k2-idt0] TAV length delta: 72 +TAV length delta: 72 +FAILED +test_fpgadataflow_convinputgenerator.py::test_fpgadataflow_analytical_characterization_slidingwindow[False-1-1-1-2-dilation0-stride0-4-ifm_dim1-k0-idt0] SKIPPED +test_fpgadataflow_convinputgenerator.py::test_fpgadataflow_analytical_characterization_slidingwindow[False-1-1-1-2-dilation0-stride0-4-ifm_dim1-k1-idt0] SKIPPED +test_fpgadataflow_convinputgenerator.py::test_fpgadataflow_analytical_characterization_slidingwindow[False-1-1-1-2-dilation0-stride0-4-ifm_dim1-k2-idt0] TAV length delta: 4 +TAV peak volume delta: 50 +TAV length delta: 4 +TAV peak volume delta: 5 +FAILED +test_fpgadataflow_convinputgenerator.py::test_fpgadataflow_analytical_characterization_slidingwindow[False-1-1-1-2-dilation0-stride1-2-ifm_dim0-k0-idt0] TAV length delta: 50 +TAV length delta: 50 +FAILED +test_fpgadataflow_convinputgenerator.py::test_fpgadataflow_analytical_characterization_slidingwindow[False-1-1-1-2-dilation0-stride1-2-ifm_dim0-k1-idt0] TAV length delta: 24 +TAV peak volume delta: 19 +TAV length delta: 24 +TAV peak 
volume delta: 17 +PASSED +test_fpgadataflow_convinputgenerator.py::test_fpgadataflow_analytical_characterization_slidingwindow[False-1-1-1-2-dilation0-stride1-2-ifm_dim0-k2-idt0] FAILED +test_fpgadataflow_convinputgenerator.py::test_fpgadataflow_analytical_characterization_slidingwindow[False-1-1-1-2-dilation0-stride1-2-ifm_dim1-k0-idt0] SKIPPED +test_fpgadataflow_convinputgenerator.py::test_fpgadataflow_analytical_characterization_slidingwindow[False-1-1-1-2-dilation0-stride1-2-ifm_dim1-k1-idt0] SKIPPED +test_fpgadataflow_convinputgenerator.py::test_fpgadataflow_analytical_characterization_slidingwindow[False-1-1-1-2-dilation0-stride1-2-ifm_dim1-k2-idt0] SKIPPED +test_fpgadataflow_convinputgenerator.py::test_fpgadataflow_analytical_characterization_slidingwindow[False-1-1-1-2-dilation0-stride1-4-ifm_dim0-k0-idt0] TAV length delta: 74 +TAV length delta: 74 +FAILED +test_fpgadataflow_convinputgenerator.py::test_fpgadataflow_analytical_characterization_slidingwindow[False-1-1-1-2-dilation0-stride1-4-ifm_dim0-k1-idt0] TAV length delta: 132 +TAV length delta: 132 +FAILED +test_fpgadataflow_convinputgenerator.py::test_fpgadataflow_analytical_characterization_slidingwindow[False-1-1-1-2-dilation0-stride1-4-ifm_dim0-k2-idt0] FAILED +test_fpgadataflow_convinputgenerator.py::test_fpgadataflow_analytical_characterization_slidingwindow[False-1-1-1-2-dilation0-stride1-4-ifm_dim1-k0-idt0] SKIPPED +test_fpgadataflow_convinputgenerator.py::test_fpgadataflow_analytical_characterization_slidingwindow[False-1-1-1-2-dilation0-stride1-4-ifm_dim1-k1-idt0] SKIPPED +test_fpgadataflow_convinputgenerator.py::test_fpgadataflow_analytical_characterization_slidingwindow[False-1-1-1-2-dilation0-stride1-4-ifm_dim1-k2-idt0] SKIPPED +test_fpgadataflow_convinputgenerator.py::test_fpgadataflow_analytical_characterization_slidingwindow[False-1-1-1-2-dilation1-stride0-2-ifm_dim0-k0-idt0] TAV length delta: 30 +TAV peak volume delta: 16 +TAV length delta: 30 +TAV peak volume delta: 30 +PASSED 
+test_fpgadataflow_convinputgenerator.py::test_fpgadataflow_analytical_characterization_slidingwindow[False-1-1-1-2-dilation1-stride0-2-ifm_dim0-k1-idt0] TAV length delta: 56 +TAV length delta: 56 +FAILED +test_fpgadataflow_convinputgenerator.py::test_fpgadataflow_analytical_characterization_slidingwindow[False-1-1-1-2-dilation1-stride0-2-ifm_dim0-k2-idt0] SKIPPED +test_fpgadataflow_convinputgenerator.py::test_fpgadataflow_analytical_characterization_slidingwindow[False-1-1-1-2-dilation1-stride0-2-ifm_dim1-k0-idt0] SKIPPED +test_fpgadataflow_convinputgenerator.py::test_fpgadataflow_analytical_characterization_slidingwindow[False-1-1-1-2-dilation1-stride0-2-ifm_dim1-k1-idt0] SKIPPED +test_fpgadataflow_convinputgenerator.py::test_fpgadataflow_analytical_characterization_slidingwindow[False-1-1-1-2-dilation1-stride0-2-ifm_dim1-k2-idt0] SKIPPED +test_fpgadataflow_convinputgenerator.py::test_fpgadataflow_analytical_characterization_slidingwindow[False-1-1-1-2-dilation1-stride0-4-ifm_dim0-k0-idt0] TAV length delta: 62 +TAV length delta: 62 +FAILED +test_fpgadataflow_convinputgenerator.py::test_fpgadataflow_analytical_characterization_slidingwindow[False-1-1-1-2-dilation1-stride0-4-ifm_dim0-k1-idt0] TAV length delta: 124 +TAV length delta: 124 +FAILED +test_fpgadataflow_convinputgenerator.py::test_fpgadataflow_analytical_characterization_slidingwindow[False-1-1-1-2-dilation1-stride0-4-ifm_dim0-k2-idt0] SKIPPED +test_fpgadataflow_convinputgenerator.py::test_fpgadataflow_analytical_characterization_slidingwindow[False-1-1-1-2-dilation1-stride0-4-ifm_dim1-k0-idt0] SKIPPED +test_fpgadataflow_convinputgenerator.py::test_fpgadataflow_analytical_characterization_slidingwindow[False-1-1-1-2-dilation1-stride0-4-ifm_dim1-k1-idt0] SKIPPED +test_fpgadataflow_convinputgenerator.py::test_fpgadataflow_analytical_characterization_slidingwindow[False-1-1-1-2-dilation1-stride0-4-ifm_dim1-k2-idt0] SKIPPED 
+test_fpgadataflow_convinputgenerator.py::test_fpgadataflow_analytical_characterization_slidingwindow[False-1-1-1-2-dilation1-stride1-2-ifm_dim0-k0-idt0] TAV peak volume delta: 0 +TAV peak volume delta: 10 +PASSED +test_fpgadataflow_convinputgenerator.py::test_fpgadataflow_analytical_characterization_slidingwindow[False-1-1-1-2-dilation1-stride1-2-ifm_dim0-k1-idt0] TAV length delta: 8 +TAV peak volume delta: 56 +TAV length delta: 8 +TAV peak volume delta: 15 +FAILED +test_fpgadataflow_convinputgenerator.py::test_fpgadataflow_analytical_characterization_slidingwindow[False-1-1-1-2-dilation1-stride1-2-ifm_dim0-k2-idt0] SKIPPED +test_fpgadataflow_convinputgenerator.py::test_fpgadataflow_analytical_characterization_slidingwindow[False-1-1-1-2-dilation1-stride1-2-ifm_dim1-k0-idt0] SKIPPED +test_fpgadataflow_convinputgenerator.py::test_fpgadataflow_analytical_characterization_slidingwindow[False-1-1-1-2-dilation1-stride1-2-ifm_dim1-k1-idt0] SKIPPED +test_fpgadataflow_convinputgenerator.py::test_fpgadataflow_analytical_characterization_slidingwindow[False-1-1-1-2-dilation1-stride1-2-ifm_dim1-k2-idt0] SKIPPED +test_fpgadataflow_convinputgenerator.py::test_fpgadataflow_analytical_characterization_slidingwindow[False-1-1-1-2-dilation1-stride1-4-ifm_dim0-k0-idt0] TAV length delta: 24 +TAV peak volume delta: 20 +TAV length delta: 24 +TAV peak volume delta: 28 +PASSED +test_fpgadataflow_convinputgenerator.py::test_fpgadataflow_analytical_characterization_slidingwindow[False-1-1-1-2-dilation1-stride1-4-ifm_dim0-k1-idt0] TAV length delta: 72 +TAV length delta: 72 +FAILED +test_fpgadataflow_convinputgenerator.py::test_fpgadataflow_analytical_characterization_slidingwindow[False-1-1-1-2-dilation1-stride1-4-ifm_dim0-k2-idt0] SKIPPED +test_fpgadataflow_convinputgenerator.py::test_fpgadataflow_analytical_characterization_slidingwindow[False-1-1-1-2-dilation1-stride1-4-ifm_dim1-k0-idt0] SKIPPED 
+test_fpgadataflow_convinputgenerator.py::test_fpgadataflow_analytical_characterization_slidingwindow[False-1-1-1-2-dilation1-stride1-4-ifm_dim1-k1-idt0] SKIPPED +test_fpgadataflow_convinputgenerator.py::test_fpgadataflow_analytical_characterization_slidingwindow[False-1-1-1-2-dilation1-stride1-4-ifm_dim1-k2-idt0] SKIPPED + +=================================== FAILURES =================================== +_ test_fpgadataflow_analytical_characterization_slidingwindow[False-1-0-0-1-dilation0-stride0-2-ifm_dim1-k2-idt0] _ + +idt = INT2, k = [1, 5], ifm_dim = [1, 21], ifm_ch = 2, stride = [1, 1] +dilation = [1, 1], simd = 1, dw = 0, parallel_window = 0, m = 1, flip = False + + @pytest.mark.parametrize("idt", [DataType["INT2"]]) + # kernel size + @pytest.mark.parametrize("k", [[2, 2], [3, 3], [1, 5]]) + # input dimension + @pytest.mark.parametrize("ifm_dim", [[8, 8], [1, 21]]) + # input channels + @pytest.mark.parametrize("ifm_ch", [2, 4]) + # Stride + @pytest.mark.parametrize("stride", [[1, 1], [2, 2]]) + # Dilation + @pytest.mark.parametrize("dilation", [[1, 1], [2, 2]]) + # input channel parallelism ("SIMD") + @pytest.mark.parametrize("simd", [1, 2]) + # depthwise + @pytest.mark.parametrize("dw", [0, 1]) + # parallel_window enable (MMV_out = M*K) + @pytest.mark.parametrize("parallel_window", [0, 1]) + # in/out MMV ("M") + @pytest.mark.parametrize("m", [1]) + # Flip dimensions + @pytest.mark.parametrize("flip", [False]) + @pytest.mark.fpgadataflow + @pytest.mark.slow + @pytest.mark.vivado + @pytest.mark.node_tree_modeling + def test_fpgadataflow_analytical_characterization_slidingwindow( + idt, + k, + ifm_dim, + ifm_ch, + stride, + dilation, + simd, + dw, + parallel_window, + m, + flip, + ): + if flip: + if ( + ifm_dim[0] == ifm_dim[1] + and k[0] == k[1] + and stride[0] == stride[1] + and dilation[0] == dilation[1] + ): + pytest.skip("Dimension flip would have no effect") + k = k[::-1] + ifm_dim = ifm_dim[::-1] + stride = stride[::-1] + dilation = dilation[::-1] + + 
k_h, k_w = k + ifm_dim_h, ifm_dim_w = ifm_dim + stride_h, stride_w = stride + dilation_h, dilation_w = dilation + + kernel_width = (k_w - 1) * dilation_w + 1 # incl. dilation + kernel_height = (k_h - 1) * dilation_h + 1 # incl. dilation + + if simd > ifm_ch: + pytest.skip("SIMD cannot be larger than number of input channels") + if ifm_ch % simd != 0: + pytest.skip("SIMD must divide number of input channels") + if kernel_height > ifm_dim_h or stride_h > ifm_dim_h: + pytest.skip("Illegal convolution configuration: kernel or stride > FM dimension") + if kernel_width > ifm_dim_w or stride_w > ifm_dim_w: + pytest.skip("Illegal convolution configuration: kernel or stride > FM dimension") + if (k_h == 1 and dilation_h != 1) or (k_w == 1 and dilation_w != 1): + pytest.skip("Illegal convolution configuration: dilation for unitary kernel dim") + if ((stride_h > k_h) or (stride_w > k_w)) and not (parallel_window or (k_h == 1 and k_w == 1)): + pytest.skip("Not all combinations for stride > k edge case supported in default mode") + if parallel_window and simd != ifm_ch and not (dw or (k_h == 1 and k_w == 1)): + pytest.skip("Parallel window requires SIMD=C for non-depthwise case") + + ofm_dim_h = compute_conv_output_dim(ifm_dim_h, k_h, stride_h, 0, dilation_h) + ofm_dim_w = compute_conv_output_dim(ifm_dim_w, k_w, stride_w, 0, dilation_w) + ofm_dim = [ofm_dim_h, ofm_dim_w] + + model = make_single_im2col_modelwrapper(k, ifm_ch, ifm_dim, ofm_dim, stride, dilation, idt, dw) + + + model = model.transform(to_hw.InferConvInpGen()) + + # set simd + inst = getCustomOp(model.graph.node[0]) + inst.set_nodeattr("SIMD", simd) + optype = model.graph.node[0].op_type + if optype == "ConvolutionInputGenerator_rtl": + inst.set_nodeattr("parallel_window", parallel_window) + inst.set_nodeattr("M", m) + if optype == "ConvolutionInputGenerator_hls": + if inst.get_nodeattr("is1D"): + inst.set_nodeattr("parallel_window", parallel_window) + + node_details = ( + "ConvolutionInputGenerator", + k, + 
ifm_ch, + ifm_dim, + ofm_dim, + stride, + dilation, + idt, + dw, + "hls", + ) + part = "xc7z020clg400-1" + target_clk_ns = 4 + max_allowed_volume_delta = 40 + +> assert tree_model_test( + model, node_details, part, target_clk_ns, max_allowed_volume_delta,0,True + ), "characterized TAV does not match RTLsim'd one!" +E AssertionError: characterized TAV does not match RTLsim'd one! +E assert False +E + where False = tree_model_test(, ('ConvolutionInputGenerator', [1, 5], 2, [1, 21], [1, 17], [1, 1], ...), 'xc7z020clg400-1', 4, 40, 0, True) + +test_fpgadataflow_convinputgenerator.py:343: AssertionError +_ test_fpgadataflow_analytical_characterization_slidingwindow[False-1-0-0-1-dilation0-stride0-4-ifm_dim1-k2-idt0] _ + +idt = INT2, k = [1, 5], ifm_dim = [1, 21], ifm_ch = 4, stride = [1, 1] +dilation = [1, 1], simd = 1, dw = 0, parallel_window = 0, m = 1, flip = False + + @pytest.mark.parametrize("idt", [DataType["INT2"]]) + # kernel size + @pytest.mark.parametrize("k", [[2, 2], [3, 3], [1, 5]]) + # input dimension + @pytest.mark.parametrize("ifm_dim", [[8, 8], [1, 21]]) + # input channels + @pytest.mark.parametrize("ifm_ch", [2, 4]) + # Stride + @pytest.mark.parametrize("stride", [[1, 1], [2, 2]]) + # Dilation + @pytest.mark.parametrize("dilation", [[1, 1], [2, 2]]) + # input channel parallelism ("SIMD") + @pytest.mark.parametrize("simd", [1, 2]) + # depthwise + @pytest.mark.parametrize("dw", [0, 1]) + # parallel_window enable (MMV_out = M*K) + @pytest.mark.parametrize("parallel_window", [0, 1]) + # in/out MMV ("M") + @pytest.mark.parametrize("m", [1]) + # Flip dimensions + @pytest.mark.parametrize("flip", [False]) + @pytest.mark.fpgadataflow + @pytest.mark.slow + @pytest.mark.vivado + @pytest.mark.node_tree_modeling + def test_fpgadataflow_analytical_characterization_slidingwindow( + idt, + k, + ifm_dim, + ifm_ch, + stride, + dilation, + simd, + dw, + parallel_window, + m, + flip, + ): + if flip: + if ( + ifm_dim[0] == ifm_dim[1] + and k[0] == k[1] + and stride[0] == 
stride[1] + and dilation[0] == dilation[1] + ): + pytest.skip("Dimension flip would have no effect") + k = k[::-1] + ifm_dim = ifm_dim[::-1] + stride = stride[::-1] + dilation = dilation[::-1] + + k_h, k_w = k + ifm_dim_h, ifm_dim_w = ifm_dim + stride_h, stride_w = stride + dilation_h, dilation_w = dilation + + kernel_width = (k_w - 1) * dilation_w + 1 # incl. dilation + kernel_height = (k_h - 1) * dilation_h + 1 # incl. dilation + + if simd > ifm_ch: + pytest.skip("SIMD cannot be larger than number of input channels") + if ifm_ch % simd != 0: + pytest.skip("SIMD must divide number of input channels") + if kernel_height > ifm_dim_h or stride_h > ifm_dim_h: + pytest.skip("Illegal convolution configuration: kernel or stride > FM dimension") + if kernel_width > ifm_dim_w or stride_w > ifm_dim_w: + pytest.skip("Illegal convolution configuration: kernel or stride > FM dimension") + if (k_h == 1 and dilation_h != 1) or (k_w == 1 and dilation_w != 1): + pytest.skip("Illegal convolution configuration: dilation for unitary kernel dim") + if ((stride_h > k_h) or (stride_w > k_w)) and not (parallel_window or (k_h == 1 and k_w == 1)): + pytest.skip("Not all combinations for stride > k edge case supported in default mode") + if parallel_window and simd != ifm_ch and not (dw or (k_h == 1 and k_w == 1)): + pytest.skip("Parallel window requires SIMD=C for non-depthwise case") + + ofm_dim_h = compute_conv_output_dim(ifm_dim_h, k_h, stride_h, 0, dilation_h) + ofm_dim_w = compute_conv_output_dim(ifm_dim_w, k_w, stride_w, 0, dilation_w) + ofm_dim = [ofm_dim_h, ofm_dim_w] + + model = make_single_im2col_modelwrapper(k, ifm_ch, ifm_dim, ofm_dim, stride, dilation, idt, dw) + + + model = model.transform(to_hw.InferConvInpGen()) + + # set simd + inst = getCustomOp(model.graph.node[0]) + inst.set_nodeattr("SIMD", simd) + optype = model.graph.node[0].op_type + if optype == "ConvolutionInputGenerator_rtl": + inst.set_nodeattr("parallel_window", parallel_window) + inst.set_nodeattr("M", m) + if 
optype == "ConvolutionInputGenerator_hls": + if inst.get_nodeattr("is1D"): + inst.set_nodeattr("parallel_window", parallel_window) + + node_details = ( + "ConvolutionInputGenerator", + k, + ifm_ch, + ifm_dim, + ofm_dim, + stride, + dilation, + idt, + dw, + "hls", + ) + part = "xc7z020clg400-1" + target_clk_ns = 4 + max_allowed_volume_delta = 40 + +> assert tree_model_test( + model, node_details, part, target_clk_ns, max_allowed_volume_delta,0,True + ), "characterized TAV does not match RTLsim'd one!" +E AssertionError: characterized TAV does not match RTLsim'd one! +E assert False +E + where False = tree_model_test(, ('ConvolutionInputGenerator', [1, 5], 4, [1, 21], [1, 17], [1, 1], ...), 'xc7z020clg400-1', 4, 40, 0, True) + +test_fpgadataflow_convinputgenerator.py:343: AssertionError +_ test_fpgadataflow_analytical_characterization_slidingwindow[False-1-0-0-1-dilation0-stride1-4-ifm_dim0-k1-idt0] _ + +idt = INT2, k = [3, 3], ifm_dim = [8, 8], ifm_ch = 4, stride = [2, 2] +dilation = [1, 1], simd = 1, dw = 0, parallel_window = 0, m = 1, flip = False + + @pytest.mark.parametrize("idt", [DataType["INT2"]]) + # kernel size + @pytest.mark.parametrize("k", [[2, 2], [3, 3], [1, 5]]) + # input dimension + @pytest.mark.parametrize("ifm_dim", [[8, 8], [1, 21]]) + # input channels + @pytest.mark.parametrize("ifm_ch", [2, 4]) + # Stride + @pytest.mark.parametrize("stride", [[1, 1], [2, 2]]) + # Dilation + @pytest.mark.parametrize("dilation", [[1, 1], [2, 2]]) + # input channel parallelism ("SIMD") + @pytest.mark.parametrize("simd", [1, 2]) + # depthwise + @pytest.mark.parametrize("dw", [0, 1]) + # parallel_window enable (MMV_out = M*K) + @pytest.mark.parametrize("parallel_window", [0, 1]) + # in/out MMV ("M") + @pytest.mark.parametrize("m", [1]) + # Flip dimensions + @pytest.mark.parametrize("flip", [False]) + @pytest.mark.fpgadataflow + @pytest.mark.slow + @pytest.mark.vivado + @pytest.mark.node_tree_modeling + def test_fpgadataflow_analytical_characterization_slidingwindow( 
+ idt, + k, + ifm_dim, + ifm_ch, + stride, + dilation, + simd, + dw, + parallel_window, + m, + flip, + ): + if flip: + if ( + ifm_dim[0] == ifm_dim[1] + and k[0] == k[1] + and stride[0] == stride[1] + and dilation[0] == dilation[1] + ): + pytest.skip("Dimension flip would have no effect") + k = k[::-1] + ifm_dim = ifm_dim[::-1] + stride = stride[::-1] + dilation = dilation[::-1] + + k_h, k_w = k + ifm_dim_h, ifm_dim_w = ifm_dim + stride_h, stride_w = stride + dilation_h, dilation_w = dilation + + kernel_width = (k_w - 1) * dilation_w + 1 # incl. dilation + kernel_height = (k_h - 1) * dilation_h + 1 # incl. dilation + + if simd > ifm_ch: + pytest.skip("SIMD cannot be larger than number of input channels") + if ifm_ch % simd != 0: + pytest.skip("SIMD must divide number of input channels") + if kernel_height > ifm_dim_h or stride_h > ifm_dim_h: + pytest.skip("Illegal convolution configuration: kernel or stride > FM dimension") + if kernel_width > ifm_dim_w or stride_w > ifm_dim_w: + pytest.skip("Illegal convolution configuration: kernel or stride > FM dimension") + if (k_h == 1 and dilation_h != 1) or (k_w == 1 and dilation_w != 1): + pytest.skip("Illegal convolution configuration: dilation for unitary kernel dim") + if ((stride_h > k_h) or (stride_w > k_w)) and not (parallel_window or (k_h == 1 and k_w == 1)): + pytest.skip("Not all combinations for stride > k edge case supported in default mode") + if parallel_window and simd != ifm_ch and not (dw or (k_h == 1 and k_w == 1)): + pytest.skip("Parallel window requires SIMD=C for non-depthwise case") + + ofm_dim_h = compute_conv_output_dim(ifm_dim_h, k_h, stride_h, 0, dilation_h) + ofm_dim_w = compute_conv_output_dim(ifm_dim_w, k_w, stride_w, 0, dilation_w) + ofm_dim = [ofm_dim_h, ofm_dim_w] + + model = make_single_im2col_modelwrapper(k, ifm_ch, ifm_dim, ofm_dim, stride, dilation, idt, dw) + + + model = model.transform(to_hw.InferConvInpGen()) + + # set simd + inst = getCustomOp(model.graph.node[0]) + 
inst.set_nodeattr("SIMD", simd) + optype = model.graph.node[0].op_type + if optype == "ConvolutionInputGenerator_rtl": + inst.set_nodeattr("parallel_window", parallel_window) + inst.set_nodeattr("M", m) + if optype == "ConvolutionInputGenerator_hls": + if inst.get_nodeattr("is1D"): + inst.set_nodeattr("parallel_window", parallel_window) + + node_details = ( + "ConvolutionInputGenerator", + k, + ifm_ch, + ifm_dim, + ofm_dim, + stride, + dilation, + idt, + dw, + "hls", + ) + part = "xc7z020clg400-1" + target_clk_ns = 4 + max_allowed_volume_delta = 40 + +> assert tree_model_test( + model, node_details, part, target_clk_ns, max_allowed_volume_delta,0,True + ), "characterized TAV does not match RTLsim'd one!" +E AssertionError: characterized TAV does not match RTLsim'd one! +E assert False +E + where False = tree_model_test(, ('ConvolutionInputGenerator', [3, 3], 4, [8, 8], [3, 3], [2, 2], ...), 'xc7z020clg400-1', 4, 40, 0, True) + +test_fpgadataflow_convinputgenerator.py:343: AssertionError +_ test_fpgadataflow_analytical_characterization_slidingwindow[False-1-0-0-1-dilation1-stride0-2-ifm_dim0-k0-idt0] _ + +idt = INT2, k = [2, 2], ifm_dim = [8, 8], ifm_ch = 2, stride = [1, 1] +dilation = [2, 2], simd = 1, dw = 0, parallel_window = 0, m = 1, flip = False + + @pytest.mark.parametrize("idt", [DataType["INT2"]]) + # kernel size + @pytest.mark.parametrize("k", [[2, 2], [3, 3], [1, 5]]) + # input dimension + @pytest.mark.parametrize("ifm_dim", [[8, 8], [1, 21]]) + # input channels + @pytest.mark.parametrize("ifm_ch", [2, 4]) + # Stride + @pytest.mark.parametrize("stride", [[1, 1], [2, 2]]) + # Dilation + @pytest.mark.parametrize("dilation", [[1, 1], [2, 2]]) + # input channel parallelism ("SIMD") + @pytest.mark.parametrize("simd", [1, 2]) + # depthwise + @pytest.mark.parametrize("dw", [0, 1]) + # parallel_window enable (MMV_out = M*K) + @pytest.mark.parametrize("parallel_window", [0, 1]) + # in/out MMV ("M") + @pytest.mark.parametrize("m", [1]) + # Flip dimensions + 
@pytest.mark.parametrize("flip", [False]) + @pytest.mark.fpgadataflow + @pytest.mark.slow + @pytest.mark.vivado + @pytest.mark.node_tree_modeling + def test_fpgadataflow_analytical_characterization_slidingwindow( + idt, + k, + ifm_dim, + ifm_ch, + stride, + dilation, + simd, + dw, + parallel_window, + m, + flip, + ): + if flip: + if ( + ifm_dim[0] == ifm_dim[1] + and k[0] == k[1] + and stride[0] == stride[1] + and dilation[0] == dilation[1] + ): + pytest.skip("Dimension flip would have no effect") + k = k[::-1] + ifm_dim = ifm_dim[::-1] + stride = stride[::-1] + dilation = dilation[::-1] + + k_h, k_w = k + ifm_dim_h, ifm_dim_w = ifm_dim + stride_h, stride_w = stride + dilation_h, dilation_w = dilation + + kernel_width = (k_w - 1) * dilation_w + 1 # incl. dilation + kernel_height = (k_h - 1) * dilation_h + 1 # incl. dilation + + if simd > ifm_ch: + pytest.skip("SIMD cannot be larger than number of input channels") + if ifm_ch % simd != 0: + pytest.skip("SIMD must divide number of input channels") + if kernel_height > ifm_dim_h or stride_h > ifm_dim_h: + pytest.skip("Illegal convolution configuration: kernel or stride > FM dimension") + if kernel_width > ifm_dim_w or stride_w > ifm_dim_w: + pytest.skip("Illegal convolution configuration: kernel or stride > FM dimension") + if (k_h == 1 and dilation_h != 1) or (k_w == 1 and dilation_w != 1): + pytest.skip("Illegal convolution configuration: dilation for unitary kernel dim") + if ((stride_h > k_h) or (stride_w > k_w)) and not (parallel_window or (k_h == 1 and k_w == 1)): + pytest.skip("Not all combinations for stride > k edge case supported in default mode") + if parallel_window and simd != ifm_ch and not (dw or (k_h == 1 and k_w == 1)): + pytest.skip("Parallel window requires SIMD=C for non-depthwise case") + + ofm_dim_h = compute_conv_output_dim(ifm_dim_h, k_h, stride_h, 0, dilation_h) + ofm_dim_w = compute_conv_output_dim(ifm_dim_w, k_w, stride_w, 0, dilation_w) + ofm_dim = [ofm_dim_h, ofm_dim_w] + + model = 
make_single_im2col_modelwrapper(k, ifm_ch, ifm_dim, ofm_dim, stride, dilation, idt, dw) + + + model = model.transform(to_hw.InferConvInpGen()) + + # set simd + inst = getCustomOp(model.graph.node[0]) + inst.set_nodeattr("SIMD", simd) + optype = model.graph.node[0].op_type + if optype == "ConvolutionInputGenerator_rtl": + inst.set_nodeattr("parallel_window", parallel_window) + inst.set_nodeattr("M", m) + if optype == "ConvolutionInputGenerator_hls": + if inst.get_nodeattr("is1D"): + inst.set_nodeattr("parallel_window", parallel_window) + + node_details = ( + "ConvolutionInputGenerator", + k, + ifm_ch, + ifm_dim, + ofm_dim, + stride, + dilation, + idt, + dw, + "hls", + ) + part = "xc7z020clg400-1" + target_clk_ns = 4 + max_allowed_volume_delta = 40 + +> assert tree_model_test( + model, node_details, part, target_clk_ns, max_allowed_volume_delta,0,True + ), "characterized TAV does not match RTLsim'd one!" +E AssertionError: characterized TAV does not match RTLsim'd one! +E assert False +E + where False = tree_model_test(, ('ConvolutionInputGenerator', [2, 2], 2, [8, 8], [6, 6], [1, 1], ...), 'xc7z020clg400-1', 4, 40, 0, True) + +test_fpgadataflow_convinputgenerator.py:343: AssertionError +_ test_fpgadataflow_analytical_characterization_slidingwindow[False-1-0-0-1-dilation1-stride0-2-ifm_dim0-k1-idt0] _ + +idt = INT2, k = [3, 3], ifm_dim = [8, 8], ifm_ch = 2, stride = [1, 1] +dilation = [2, 2], simd = 1, dw = 0, parallel_window = 0, m = 1, flip = False + + @pytest.mark.parametrize("idt", [DataType["INT2"]]) + # kernel size + @pytest.mark.parametrize("k", [[2, 2], [3, 3], [1, 5]]) + # input dimension + @pytest.mark.parametrize("ifm_dim", [[8, 8], [1, 21]]) + # input channels + @pytest.mark.parametrize("ifm_ch", [2, 4]) + # Stride + @pytest.mark.parametrize("stride", [[1, 1], [2, 2]]) + # Dilation + @pytest.mark.parametrize("dilation", [[1, 1], [2, 2]]) + # input channel parallelism ("SIMD") + @pytest.mark.parametrize("simd", [1, 2]) + # depthwise + 
@pytest.mark.parametrize("dw", [0, 1]) + # parallel_window enable (MMV_out = M*K) + @pytest.mark.parametrize("parallel_window", [0, 1]) + # in/out MMV ("M") + @pytest.mark.parametrize("m", [1]) + # Flip dimensions + @pytest.mark.parametrize("flip", [False]) + @pytest.mark.fpgadataflow + @pytest.mark.slow + @pytest.mark.vivado + @pytest.mark.node_tree_modeling + def test_fpgadataflow_analytical_characterization_slidingwindow( + idt, + k, + ifm_dim, + ifm_ch, + stride, + dilation, + simd, + dw, + parallel_window, + m, + flip, + ): + if flip: + if ( + ifm_dim[0] == ifm_dim[1] + and k[0] == k[1] + and stride[0] == stride[1] + and dilation[0] == dilation[1] + ): + pytest.skip("Dimension flip would have no effect") + k = k[::-1] + ifm_dim = ifm_dim[::-1] + stride = stride[::-1] + dilation = dilation[::-1] + + k_h, k_w = k + ifm_dim_h, ifm_dim_w = ifm_dim + stride_h, stride_w = stride + dilation_h, dilation_w = dilation + + kernel_width = (k_w - 1) * dilation_w + 1 # incl. dilation + kernel_height = (k_h - 1) * dilation_h + 1 # incl. 
dilation + + if simd > ifm_ch: + pytest.skip("SIMD cannot be larger than number of input channels") + if ifm_ch % simd != 0: + pytest.skip("SIMD must divide number of input channels") + if kernel_height > ifm_dim_h or stride_h > ifm_dim_h: + pytest.skip("Illegal convolution configuration: kernel or stride > FM dimension") + if kernel_width > ifm_dim_w or stride_w > ifm_dim_w: + pytest.skip("Illegal convolution configuration: kernel or stride > FM dimension") + if (k_h == 1 and dilation_h != 1) or (k_w == 1 and dilation_w != 1): + pytest.skip("Illegal convolution configuration: dilation for unitary kernel dim") + if ((stride_h > k_h) or (stride_w > k_w)) and not (parallel_window or (k_h == 1 and k_w == 1)): + pytest.skip("Not all combinations for stride > k edge case supported in default mode") + if parallel_window and simd != ifm_ch and not (dw or (k_h == 1 and k_w == 1)): + pytest.skip("Parallel window requires SIMD=C for non-depthwise case") + + ofm_dim_h = compute_conv_output_dim(ifm_dim_h, k_h, stride_h, 0, dilation_h) + ofm_dim_w = compute_conv_output_dim(ifm_dim_w, k_w, stride_w, 0, dilation_w) + ofm_dim = [ofm_dim_h, ofm_dim_w] + + model = make_single_im2col_modelwrapper(k, ifm_ch, ifm_dim, ofm_dim, stride, dilation, idt, dw) + + + model = model.transform(to_hw.InferConvInpGen()) + + # set simd + inst = getCustomOp(model.graph.node[0]) + inst.set_nodeattr("SIMD", simd) + optype = model.graph.node[0].op_type + if optype == "ConvolutionInputGenerator_rtl": + inst.set_nodeattr("parallel_window", parallel_window) + inst.set_nodeattr("M", m) + if optype == "ConvolutionInputGenerator_hls": + if inst.get_nodeattr("is1D"): + inst.set_nodeattr("parallel_window", parallel_window) + + node_details = ( + "ConvolutionInputGenerator", + k, + ifm_ch, + ifm_dim, + ofm_dim, + stride, + dilation, + idt, + dw, + "hls", + ) + part = "xc7z020clg400-1" + target_clk_ns = 4 + max_allowed_volume_delta = 40 + +> assert tree_model_test( + model, node_details, part, target_clk_ns, 
max_allowed_volume_delta,0,True + ), "characterized TAV does not match RTLsim'd one!" +E AssertionError: characterized TAV does not match RTLsim'd one! +E assert False +E + where False = tree_model_test(, ('ConvolutionInputGenerator', [3, 3], 2, [8, 8], [4, 4], [1, 1], ...), 'xc7z020clg400-1', 4, 40, 0, True) + +test_fpgadataflow_convinputgenerator.py:343: AssertionError +_ test_fpgadataflow_analytical_characterization_slidingwindow[False-1-0-0-1-dilation1-stride0-4-ifm_dim0-k0-idt0] _ + +idt = INT2, k = [2, 2], ifm_dim = [8, 8], ifm_ch = 4, stride = [1, 1] +dilation = [2, 2], simd = 1, dw = 0, parallel_window = 0, m = 1, flip = False + + @pytest.mark.parametrize("idt", [DataType["INT2"]]) + # kernel size + @pytest.mark.parametrize("k", [[2, 2], [3, 3], [1, 5]]) + # input dimension + @pytest.mark.parametrize("ifm_dim", [[8, 8], [1, 21]]) + # input channels + @pytest.mark.parametrize("ifm_ch", [2, 4]) + # Stride + @pytest.mark.parametrize("stride", [[1, 1], [2, 2]]) + # Dilation + @pytest.mark.parametrize("dilation", [[1, 1], [2, 2]]) + # input channel parallelism ("SIMD") + @pytest.mark.parametrize("simd", [1, 2]) + # depthwise + @pytest.mark.parametrize("dw", [0, 1]) + # parallel_window enable (MMV_out = M*K) + @pytest.mark.parametrize("parallel_window", [0, 1]) + # in/out MMV ("M") + @pytest.mark.parametrize("m", [1]) + # Flip dimensions + @pytest.mark.parametrize("flip", [False]) + @pytest.mark.fpgadataflow + @pytest.mark.slow + @pytest.mark.vivado + @pytest.mark.node_tree_modeling + def test_fpgadataflow_analytical_characterization_slidingwindow( + idt, + k, + ifm_dim, + ifm_ch, + stride, + dilation, + simd, + dw, + parallel_window, + m, + flip, + ): + if flip: + if ( + ifm_dim[0] == ifm_dim[1] + and k[0] == k[1] + and stride[0] == stride[1] + and dilation[0] == dilation[1] + ): + pytest.skip("Dimension flip would have no effect") + k = k[::-1] + ifm_dim = ifm_dim[::-1] + stride = stride[::-1] + dilation = dilation[::-1] + + k_h, k_w = k + ifm_dim_h, ifm_dim_w 
= ifm_dim + stride_h, stride_w = stride + dilation_h, dilation_w = dilation + + kernel_width = (k_w - 1) * dilation_w + 1 # incl. dilation + kernel_height = (k_h - 1) * dilation_h + 1 # incl. dilation + + if simd > ifm_ch: + pytest.skip("SIMD cannot be larger than number of input channels") + if ifm_ch % simd != 0: + pytest.skip("SIMD must divide number of input channels") + if kernel_height > ifm_dim_h or stride_h > ifm_dim_h: + pytest.skip("Illegal convolution configuration: kernel or stride > FM dimension") + if kernel_width > ifm_dim_w or stride_w > ifm_dim_w: + pytest.skip("Illegal convolution configuration: kernel or stride > FM dimension") + if (k_h == 1 and dilation_h != 1) or (k_w == 1 and dilation_w != 1): + pytest.skip("Illegal convolution configuration: dilation for unitary kernel dim") + if ((stride_h > k_h) or (stride_w > k_w)) and not (parallel_window or (k_h == 1 and k_w == 1)): + pytest.skip("Not all combinations for stride > k edge case supported in default mode") + if parallel_window and simd != ifm_ch and not (dw or (k_h == 1 and k_w == 1)): + pytest.skip("Parallel window requires SIMD=C for non-depthwise case") + + ofm_dim_h = compute_conv_output_dim(ifm_dim_h, k_h, stride_h, 0, dilation_h) + ofm_dim_w = compute_conv_output_dim(ifm_dim_w, k_w, stride_w, 0, dilation_w) + ofm_dim = [ofm_dim_h, ofm_dim_w] + + model = make_single_im2col_modelwrapper(k, ifm_ch, ifm_dim, ofm_dim, stride, dilation, idt, dw) + + + model = model.transform(to_hw.InferConvInpGen()) + + # set simd + inst = getCustomOp(model.graph.node[0]) + inst.set_nodeattr("SIMD", simd) + optype = model.graph.node[0].op_type + if optype == "ConvolutionInputGenerator_rtl": + inst.set_nodeattr("parallel_window", parallel_window) + inst.set_nodeattr("M", m) + if optype == "ConvolutionInputGenerator_hls": + if inst.get_nodeattr("is1D"): + inst.set_nodeattr("parallel_window", parallel_window) + + node_details = ( + "ConvolutionInputGenerator", + k, + ifm_ch, + ifm_dim, + ofm_dim, + stride, + 
dilation, + idt, + dw, + "hls", + ) + part = "xc7z020clg400-1" + target_clk_ns = 4 + max_allowed_volume_delta = 40 + +> assert tree_model_test( + model, node_details, part, target_clk_ns, max_allowed_volume_delta,0,True + ), "characterized TAV does not match RTLsim'd one!" +E AssertionError: characterized TAV does not match RTLsim'd one! +E assert False +E + where False = tree_model_test(, ('ConvolutionInputGenerator', [2, 2], 4, [8, 8], [6, 6], [1, 1], ...), 'xc7z020clg400-1', 4, 40, 0, True) + +test_fpgadataflow_convinputgenerator.py:343: AssertionError +_ test_fpgadataflow_analytical_characterization_slidingwindow[False-1-0-0-1-dilation1-stride0-4-ifm_dim0-k1-idt0] _ + +idt = INT2, k = [3, 3], ifm_dim = [8, 8], ifm_ch = 4, stride = [1, 1] +dilation = [2, 2], simd = 1, dw = 0, parallel_window = 0, m = 1, flip = False + + @pytest.mark.parametrize("idt", [DataType["INT2"]]) + # kernel size + @pytest.mark.parametrize("k", [[2, 2], [3, 3], [1, 5]]) + # input dimension + @pytest.mark.parametrize("ifm_dim", [[8, 8], [1, 21]]) + # input channels + @pytest.mark.parametrize("ifm_ch", [2, 4]) + # Stride + @pytest.mark.parametrize("stride", [[1, 1], [2, 2]]) + # Dilation + @pytest.mark.parametrize("dilation", [[1, 1], [2, 2]]) + # input channel parallelism ("SIMD") + @pytest.mark.parametrize("simd", [1, 2]) + # depthwise + @pytest.mark.parametrize("dw", [0, 1]) + # parallel_window enable (MMV_out = M*K) + @pytest.mark.parametrize("parallel_window", [0, 1]) + # in/out MMV ("M") + @pytest.mark.parametrize("m", [1]) + # Flip dimensions + @pytest.mark.parametrize("flip", [False]) + @pytest.mark.fpgadataflow + @pytest.mark.slow + @pytest.mark.vivado + @pytest.mark.node_tree_modeling + def test_fpgadataflow_analytical_characterization_slidingwindow( + idt, + k, + ifm_dim, + ifm_ch, + stride, + dilation, + simd, + dw, + parallel_window, + m, + flip, + ): + if flip: + if ( + ifm_dim[0] == ifm_dim[1] + and k[0] == k[1] + and stride[0] == stride[1] + and dilation[0] == dilation[1] + 
): + pytest.skip("Dimension flip would have no effect") + k = k[::-1] + ifm_dim = ifm_dim[::-1] + stride = stride[::-1] + dilation = dilation[::-1] + + k_h, k_w = k + ifm_dim_h, ifm_dim_w = ifm_dim + stride_h, stride_w = stride + dilation_h, dilation_w = dilation + + kernel_width = (k_w - 1) * dilation_w + 1 # incl. dilation + kernel_height = (k_h - 1) * dilation_h + 1 # incl. dilation + + if simd > ifm_ch: + pytest.skip("SIMD cannot be larger than number of input channels") + if ifm_ch % simd != 0: + pytest.skip("SIMD must divide number of input channels") + if kernel_height > ifm_dim_h or stride_h > ifm_dim_h: + pytest.skip("Illegal convolution configuration: kernel or stride > FM dimension") + if kernel_width > ifm_dim_w or stride_w > ifm_dim_w: + pytest.skip("Illegal convolution configuration: kernel or stride > FM dimension") + if (k_h == 1 and dilation_h != 1) or (k_w == 1 and dilation_w != 1): + pytest.skip("Illegal convolution configuration: dilation for unitary kernel dim") + if ((stride_h > k_h) or (stride_w > k_w)) and not (parallel_window or (k_h == 1 and k_w == 1)): + pytest.skip("Not all combinations for stride > k edge case supported in default mode") + if parallel_window and simd != ifm_ch and not (dw or (k_h == 1 and k_w == 1)): + pytest.skip("Parallel window requires SIMD=C for non-depthwise case") + + ofm_dim_h = compute_conv_output_dim(ifm_dim_h, k_h, stride_h, 0, dilation_h) + ofm_dim_w = compute_conv_output_dim(ifm_dim_w, k_w, stride_w, 0, dilation_w) + ofm_dim = [ofm_dim_h, ofm_dim_w] + + model = make_single_im2col_modelwrapper(k, ifm_ch, ifm_dim, ofm_dim, stride, dilation, idt, dw) + + + model = model.transform(to_hw.InferConvInpGen()) + + # set simd + inst = getCustomOp(model.graph.node[0]) + inst.set_nodeattr("SIMD", simd) + optype = model.graph.node[0].op_type + if optype == "ConvolutionInputGenerator_rtl": + inst.set_nodeattr("parallel_window", parallel_window) + inst.set_nodeattr("M", m) + if optype == "ConvolutionInputGenerator_hls": + 
if inst.get_nodeattr("is1D"): + inst.set_nodeattr("parallel_window", parallel_window) + + node_details = ( + "ConvolutionInputGenerator", + k, + ifm_ch, + ifm_dim, + ofm_dim, + stride, + dilation, + idt, + dw, + "hls", + ) + part = "xc7z020clg400-1" + target_clk_ns = 4 + max_allowed_volume_delta = 40 + +> assert tree_model_test( + model, node_details, part, target_clk_ns, max_allowed_volume_delta,0,True + ), "characterized TAV does not match RTLsim'd one!" +E AssertionError: characterized TAV does not match RTLsim'd one! +E assert False +E + where False = tree_model_test(, ('ConvolutionInputGenerator', [3, 3], 4, [8, 8], [4, 4], [1, 1], ...), 'xc7z020clg400-1', 4, 40, 0, True) + +test_fpgadataflow_convinputgenerator.py:343: AssertionError +_ test_fpgadataflow_analytical_characterization_slidingwindow[False-1-0-0-1-dilation1-stride1-4-ifm_dim0-k1-idt0] _ + +idt = INT2, k = [3, 3], ifm_dim = [8, 8], ifm_ch = 4, stride = [2, 2] +dilation = [2, 2], simd = 1, dw = 0, parallel_window = 0, m = 1, flip = False + + @pytest.mark.parametrize("idt", [DataType["INT2"]]) + # kernel size + @pytest.mark.parametrize("k", [[2, 2], [3, 3], [1, 5]]) + # input dimension + @pytest.mark.parametrize("ifm_dim", [[8, 8], [1, 21]]) + # input channels + @pytest.mark.parametrize("ifm_ch", [2, 4]) + # Stride + @pytest.mark.parametrize("stride", [[1, 1], [2, 2]]) + # Dilation + @pytest.mark.parametrize("dilation", [[1, 1], [2, 2]]) + # input channel parallelism ("SIMD") + @pytest.mark.parametrize("simd", [1, 2]) + # depthwise + @pytest.mark.parametrize("dw", [0, 1]) + # parallel_window enable (MMV_out = M*K) + @pytest.mark.parametrize("parallel_window", [0, 1]) + # in/out MMV ("M") + @pytest.mark.parametrize("m", [1]) + # Flip dimensions + @pytest.mark.parametrize("flip", [False]) + @pytest.mark.fpgadataflow + @pytest.mark.slow + @pytest.mark.vivado + @pytest.mark.node_tree_modeling + def test_fpgadataflow_analytical_characterization_slidingwindow( + idt, + k, + ifm_dim, + ifm_ch, + stride, + 
dilation, + simd, + dw, + parallel_window, + m, + flip, + ): + if flip: + if ( + ifm_dim[0] == ifm_dim[1] + and k[0] == k[1] + and stride[0] == stride[1] + and dilation[0] == dilation[1] + ): + pytest.skip("Dimension flip would have no effect") + k = k[::-1] + ifm_dim = ifm_dim[::-1] + stride = stride[::-1] + dilation = dilation[::-1] + + k_h, k_w = k + ifm_dim_h, ifm_dim_w = ifm_dim + stride_h, stride_w = stride + dilation_h, dilation_w = dilation + + kernel_width = (k_w - 1) * dilation_w + 1 # incl. dilation + kernel_height = (k_h - 1) * dilation_h + 1 # incl. dilation + + if simd > ifm_ch: + pytest.skip("SIMD cannot be larger than number of input channels") + if ifm_ch % simd != 0: + pytest.skip("SIMD must divide number of input channels") + if kernel_height > ifm_dim_h or stride_h > ifm_dim_h: + pytest.skip("Illegal convolution configuration: kernel or stride > FM dimension") + if kernel_width > ifm_dim_w or stride_w > ifm_dim_w: + pytest.skip("Illegal convolution configuration: kernel or stride > FM dimension") + if (k_h == 1 and dilation_h != 1) or (k_w == 1 and dilation_w != 1): + pytest.skip("Illegal convolution configuration: dilation for unitary kernel dim") + if ((stride_h > k_h) or (stride_w > k_w)) and not (parallel_window or (k_h == 1 and k_w == 1)): + pytest.skip("Not all combinations for stride > k edge case supported in default mode") + if parallel_window and simd != ifm_ch and not (dw or (k_h == 1 and k_w == 1)): + pytest.skip("Parallel window requires SIMD=C for non-depthwise case") + + ofm_dim_h = compute_conv_output_dim(ifm_dim_h, k_h, stride_h, 0, dilation_h) + ofm_dim_w = compute_conv_output_dim(ifm_dim_w, k_w, stride_w, 0, dilation_w) + ofm_dim = [ofm_dim_h, ofm_dim_w] + + model = make_single_im2col_modelwrapper(k, ifm_ch, ifm_dim, ofm_dim, stride, dilation, idt, dw) + + + model = model.transform(to_hw.InferConvInpGen()) + + # set simd + inst = getCustomOp(model.graph.node[0]) + inst.set_nodeattr("SIMD", simd) + optype = 
model.graph.node[0].op_type + if optype == "ConvolutionInputGenerator_rtl": + inst.set_nodeattr("parallel_window", parallel_window) + inst.set_nodeattr("M", m) + if optype == "ConvolutionInputGenerator_hls": + if inst.get_nodeattr("is1D"): + inst.set_nodeattr("parallel_window", parallel_window) + + node_details = ( + "ConvolutionInputGenerator", + k, + ifm_ch, + ifm_dim, + ofm_dim, + stride, + dilation, + idt, + dw, + "hls", + ) + part = "xc7z020clg400-1" + target_clk_ns = 4 + max_allowed_volume_delta = 40 + +> assert tree_model_test( + model, node_details, part, target_clk_ns, max_allowed_volume_delta,0,True + ), "characterized TAV does not match RTLsim'd one!" +E AssertionError: characterized TAV does not match RTLsim'd one! +E assert False +E + where False = tree_model_test(, ('ConvolutionInputGenerator', [3, 3], 4, [8, 8], [2, 2], [2, 2], ...), 'xc7z020clg400-1', 4, 40, 0, True) + +test_fpgadataflow_convinputgenerator.py:343: AssertionError +_ test_fpgadataflow_analytical_characterization_slidingwindow[False-1-0-0-2-dilation0-stride0-2-ifm_dim1-k2-idt0] _ + +idt = INT2, k = [1, 5], ifm_dim = [1, 21], ifm_ch = 2, stride = [1, 1] +dilation = [1, 1], simd = 2, dw = 0, parallel_window = 0, m = 1, flip = False + + @pytest.mark.parametrize("idt", [DataType["INT2"]]) + # kernel size + @pytest.mark.parametrize("k", [[2, 2], [3, 3], [1, 5]]) + # input dimension + @pytest.mark.parametrize("ifm_dim", [[8, 8], [1, 21]]) + # input channels + @pytest.mark.parametrize("ifm_ch", [2, 4]) + # Stride + @pytest.mark.parametrize("stride", [[1, 1], [2, 2]]) + # Dilation + @pytest.mark.parametrize("dilation", [[1, 1], [2, 2]]) + # input channel parallelism ("SIMD") + @pytest.mark.parametrize("simd", [1, 2]) + # depthwise + @pytest.mark.parametrize("dw", [0, 1]) + # parallel_window enable (MMV_out = M*K) + @pytest.mark.parametrize("parallel_window", [0, 1]) + # in/out MMV ("M") + @pytest.mark.parametrize("m", [1]) + # Flip dimensions + @pytest.mark.parametrize("flip", [False]) + 
@pytest.mark.fpgadataflow + @pytest.mark.slow + @pytest.mark.vivado + @pytest.mark.node_tree_modeling + def test_fpgadataflow_analytical_characterization_slidingwindow( + idt, + k, + ifm_dim, + ifm_ch, + stride, + dilation, + simd, + dw, + parallel_window, + m, + flip, + ): + if flip: + if ( + ifm_dim[0] == ifm_dim[1] + and k[0] == k[1] + and stride[0] == stride[1] + and dilation[0] == dilation[1] + ): + pytest.skip("Dimension flip would have no effect") + k = k[::-1] + ifm_dim = ifm_dim[::-1] + stride = stride[::-1] + dilation = dilation[::-1] + + k_h, k_w = k + ifm_dim_h, ifm_dim_w = ifm_dim + stride_h, stride_w = stride + dilation_h, dilation_w = dilation + + kernel_width = (k_w - 1) * dilation_w + 1 # incl. dilation + kernel_height = (k_h - 1) * dilation_h + 1 # incl. dilation + + if simd > ifm_ch: + pytest.skip("SIMD cannot be larger than number of input channels") + if ifm_ch % simd != 0: + pytest.skip("SIMD must divide number of input channels") + if kernel_height > ifm_dim_h or stride_h > ifm_dim_h: + pytest.skip("Illegal convolution configuration: kernel or stride > FM dimension") + if kernel_width > ifm_dim_w or stride_w > ifm_dim_w: + pytest.skip("Illegal convolution configuration: kernel or stride > FM dimension") + if (k_h == 1 and dilation_h != 1) or (k_w == 1 and dilation_w != 1): + pytest.skip("Illegal convolution configuration: dilation for unitary kernel dim") + if ((stride_h > k_h) or (stride_w > k_w)) and not (parallel_window or (k_h == 1 and k_w == 1)): + pytest.skip("Not all combinations for stride > k edge case supported in default mode") + if parallel_window and simd != ifm_ch and not (dw or (k_h == 1 and k_w == 1)): + pytest.skip("Parallel window requires SIMD=C for non-depthwise case") + + ofm_dim_h = compute_conv_output_dim(ifm_dim_h, k_h, stride_h, 0, dilation_h) + ofm_dim_w = compute_conv_output_dim(ifm_dim_w, k_w, stride_w, 0, dilation_w) + ofm_dim = [ofm_dim_h, ofm_dim_w] + + model = make_single_im2col_modelwrapper(k, ifm_ch, ifm_dim, 
ofm_dim, stride, dilation, idt, dw) + + + model = model.transform(to_hw.InferConvInpGen()) + + # set simd + inst = getCustomOp(model.graph.node[0]) + inst.set_nodeattr("SIMD", simd) + optype = model.graph.node[0].op_type + if optype == "ConvolutionInputGenerator_rtl": + inst.set_nodeattr("parallel_window", parallel_window) + inst.set_nodeattr("M", m) + if optype == "ConvolutionInputGenerator_hls": + if inst.get_nodeattr("is1D"): + inst.set_nodeattr("parallel_window", parallel_window) + + node_details = ( + "ConvolutionInputGenerator", + k, + ifm_ch, + ifm_dim, + ofm_dim, + stride, + dilation, + idt, + dw, + "hls", + ) + part = "xc7z020clg400-1" + target_clk_ns = 4 + max_allowed_volume_delta = 40 + +> assert tree_model_test( + model, node_details, part, target_clk_ns, max_allowed_volume_delta,0,True + ), "characterized TAV does not match RTLsim'd one!" +E AssertionError: characterized TAV does not match RTLsim'd one! +E assert False +E + where False = tree_model_test(, ('ConvolutionInputGenerator', [1, 5], 2, [1, 21], [1, 17], [1, 1], ...), 'xc7z020clg400-1', 4, 40, 0, True) + +test_fpgadataflow_convinputgenerator.py:343: AssertionError +_ test_fpgadataflow_analytical_characterization_slidingwindow[False-1-0-0-2-dilation0-stride0-4-ifm_dim1-k2-idt0] _ + +idt = INT2, k = [1, 5], ifm_dim = [1, 21], ifm_ch = 4, stride = [1, 1] +dilation = [1, 1], simd = 2, dw = 0, parallel_window = 0, m = 1, flip = False + + @pytest.mark.parametrize("idt", [DataType["INT2"]]) + # kernel size + @pytest.mark.parametrize("k", [[2, 2], [3, 3], [1, 5]]) + # input dimension + @pytest.mark.parametrize("ifm_dim", [[8, 8], [1, 21]]) + # input channels + @pytest.mark.parametrize("ifm_ch", [2, 4]) + # Stride + @pytest.mark.parametrize("stride", [[1, 1], [2, 2]]) + # Dilation + @pytest.mark.parametrize("dilation", [[1, 1], [2, 2]]) + # input channel parallelism ("SIMD") + @pytest.mark.parametrize("simd", [1, 2]) + # depthwise + @pytest.mark.parametrize("dw", [0, 1]) + # parallel_window enable 
(MMV_out = M*K) + @pytest.mark.parametrize("parallel_window", [0, 1]) + # in/out MMV ("M") + @pytest.mark.parametrize("m", [1]) + # Flip dimensions + @pytest.mark.parametrize("flip", [False]) + @pytest.mark.fpgadataflow + @pytest.mark.slow + @pytest.mark.vivado + @pytest.mark.node_tree_modeling + def test_fpgadataflow_analytical_characterization_slidingwindow( + idt, + k, + ifm_dim, + ifm_ch, + stride, + dilation, + simd, + dw, + parallel_window, + m, + flip, + ): + if flip: + if ( + ifm_dim[0] == ifm_dim[1] + and k[0] == k[1] + and stride[0] == stride[1] + and dilation[0] == dilation[1] + ): + pytest.skip("Dimension flip would have no effect") + k = k[::-1] + ifm_dim = ifm_dim[::-1] + stride = stride[::-1] + dilation = dilation[::-1] + + k_h, k_w = k + ifm_dim_h, ifm_dim_w = ifm_dim + stride_h, stride_w = stride + dilation_h, dilation_w = dilation + + kernel_width = (k_w - 1) * dilation_w + 1 # incl. dilation + kernel_height = (k_h - 1) * dilation_h + 1 # incl. dilation + + if simd > ifm_ch: + pytest.skip("SIMD cannot be larger than number of input channels") + if ifm_ch % simd != 0: + pytest.skip("SIMD must divide number of input channels") + if kernel_height > ifm_dim_h or stride_h > ifm_dim_h: + pytest.skip("Illegal convolution configuration: kernel or stride > FM dimension") + if kernel_width > ifm_dim_w or stride_w > ifm_dim_w: + pytest.skip("Illegal convolution configuration: kernel or stride > FM dimension") + if (k_h == 1 and dilation_h != 1) or (k_w == 1 and dilation_w != 1): + pytest.skip("Illegal convolution configuration: dilation for unitary kernel dim") + if ((stride_h > k_h) or (stride_w > k_w)) and not (parallel_window or (k_h == 1 and k_w == 1)): + pytest.skip("Not all combinations for stride > k edge case supported in default mode") + if parallel_window and simd != ifm_ch and not (dw or (k_h == 1 and k_w == 1)): + pytest.skip("Parallel window requires SIMD=C for non-depthwise case") + + ofm_dim_h = compute_conv_output_dim(ifm_dim_h, k_h, 
stride_h, 0, dilation_h) + ofm_dim_w = compute_conv_output_dim(ifm_dim_w, k_w, stride_w, 0, dilation_w) + ofm_dim = [ofm_dim_h, ofm_dim_w] + + model = make_single_im2col_modelwrapper(k, ifm_ch, ifm_dim, ofm_dim, stride, dilation, idt, dw) + + + model = model.transform(to_hw.InferConvInpGen()) + + # set simd + inst = getCustomOp(model.graph.node[0]) + inst.set_nodeattr("SIMD", simd) + optype = model.graph.node[0].op_type + if optype == "ConvolutionInputGenerator_rtl": + inst.set_nodeattr("parallel_window", parallel_window) + inst.set_nodeattr("M", m) + if optype == "ConvolutionInputGenerator_hls": + if inst.get_nodeattr("is1D"): + inst.set_nodeattr("parallel_window", parallel_window) + + node_details = ( + "ConvolutionInputGenerator", + k, + ifm_ch, + ifm_dim, + ofm_dim, + stride, + dilation, + idt, + dw, + "hls", + ) + part = "xc7z020clg400-1" + target_clk_ns = 4 + max_allowed_volume_delta = 40 + +> assert tree_model_test( + model, node_details, part, target_clk_ns, max_allowed_volume_delta,0,True + ), "characterized TAV does not match RTLsim'd one!" +E AssertionError: characterized TAV does not match RTLsim'd one! 
+E assert False +E + where False = tree_model_test(, ('ConvolutionInputGenerator', [1, 5], 4, [1, 21], [1, 17], [1, 1], ...), 'xc7z020clg400-1', 4, 40, 0, True) + +test_fpgadataflow_convinputgenerator.py:343: AssertionError +_ test_fpgadataflow_analytical_characterization_slidingwindow[False-1-0-0-2-dilation0-stride1-4-ifm_dim0-k1-idt0] _ + +idt = INT2, k = [3, 3], ifm_dim = [8, 8], ifm_ch = 4, stride = [2, 2] +dilation = [1, 1], simd = 2, dw = 0, parallel_window = 0, m = 1, flip = False + + @pytest.mark.parametrize("idt", [DataType["INT2"]]) + # kernel size + @pytest.mark.parametrize("k", [[2, 2], [3, 3], [1, 5]]) + # input dimension + @pytest.mark.parametrize("ifm_dim", [[8, 8], [1, 21]]) + # input channels + @pytest.mark.parametrize("ifm_ch", [2, 4]) + # Stride + @pytest.mark.parametrize("stride", [[1, 1], [2, 2]]) + # Dilation + @pytest.mark.parametrize("dilation", [[1, 1], [2, 2]]) + # input channel parallelism ("SIMD") + @pytest.mark.parametrize("simd", [1, 2]) + # depthwise + @pytest.mark.parametrize("dw", [0, 1]) + # parallel_window enable (MMV_out = M*K) + @pytest.mark.parametrize("parallel_window", [0, 1]) + # in/out MMV ("M") + @pytest.mark.parametrize("m", [1]) + # Flip dimensions + @pytest.mark.parametrize("flip", [False]) + @pytest.mark.fpgadataflow + @pytest.mark.slow + @pytest.mark.vivado + @pytest.mark.node_tree_modeling + def test_fpgadataflow_analytical_characterization_slidingwindow( + idt, + k, + ifm_dim, + ifm_ch, + stride, + dilation, + simd, + dw, + parallel_window, + m, + flip, + ): + if flip: + if ( + ifm_dim[0] == ifm_dim[1] + and k[0] == k[1] + and stride[0] == stride[1] + and dilation[0] == dilation[1] + ): + pytest.skip("Dimension flip would have no effect") + k = k[::-1] + ifm_dim = ifm_dim[::-1] + stride = stride[::-1] + dilation = dilation[::-1] + + k_h, k_w = k + ifm_dim_h, ifm_dim_w = ifm_dim + stride_h, stride_w = stride + dilation_h, dilation_w = dilation + + kernel_width = (k_w - 1) * dilation_w + 1 # incl. 
dilation + kernel_height = (k_h - 1) * dilation_h + 1 # incl. dilation + + if simd > ifm_ch: + pytest.skip("SIMD cannot be larger than number of input channels") + if ifm_ch % simd != 0: + pytest.skip("SIMD must divide number of input channels") + if kernel_height > ifm_dim_h or stride_h > ifm_dim_h: + pytest.skip("Illegal convolution configuration: kernel or stride > FM dimension") + if kernel_width > ifm_dim_w or stride_w > ifm_dim_w: + pytest.skip("Illegal convolution configuration: kernel or stride > FM dimension") + if (k_h == 1 and dilation_h != 1) or (k_w == 1 and dilation_w != 1): + pytest.skip("Illegal convolution configuration: dilation for unitary kernel dim") + if ((stride_h > k_h) or (stride_w > k_w)) and not (parallel_window or (k_h == 1 and k_w == 1)): + pytest.skip("Not all combinations for stride > k edge case supported in default mode") + if parallel_window and simd != ifm_ch and not (dw or (k_h == 1 and k_w == 1)): + pytest.skip("Parallel window requires SIMD=C for non-depthwise case") + + ofm_dim_h = compute_conv_output_dim(ifm_dim_h, k_h, stride_h, 0, dilation_h) + ofm_dim_w = compute_conv_output_dim(ifm_dim_w, k_w, stride_w, 0, dilation_w) + ofm_dim = [ofm_dim_h, ofm_dim_w] + + model = make_single_im2col_modelwrapper(k, ifm_ch, ifm_dim, ofm_dim, stride, dilation, idt, dw) + + + model = model.transform(to_hw.InferConvInpGen()) + + # set simd + inst = getCustomOp(model.graph.node[0]) + inst.set_nodeattr("SIMD", simd) + optype = model.graph.node[0].op_type + if optype == "ConvolutionInputGenerator_rtl": + inst.set_nodeattr("parallel_window", parallel_window) + inst.set_nodeattr("M", m) + if optype == "ConvolutionInputGenerator_hls": + if inst.get_nodeattr("is1D"): + inst.set_nodeattr("parallel_window", parallel_window) + + node_details = ( + "ConvolutionInputGenerator", + k, + ifm_ch, + ifm_dim, + ofm_dim, + stride, + dilation, + idt, + dw, + "hls", + ) + part = "xc7z020clg400-1" + target_clk_ns = 4 + max_allowed_volume_delta = 40 + +> assert 
tree_model_test( + model, node_details, part, target_clk_ns, max_allowed_volume_delta,0,True + ), "characterized TAV does not match RTLsim'd one!" +E AssertionError: characterized TAV does not match RTLsim'd one! +E assert False +E + where False = tree_model_test(, ('ConvolutionInputGenerator', [3, 3], 4, [8, 8], [3, 3], [2, 2], ...), 'xc7z020clg400-1', 4, 40, 0, True) + +test_fpgadataflow_convinputgenerator.py:343: AssertionError +_ test_fpgadataflow_analytical_characterization_slidingwindow[False-1-0-0-2-dilation1-stride0-2-ifm_dim0-k0-idt0] _ + +idt = INT2, k = [2, 2], ifm_dim = [8, 8], ifm_ch = 2, stride = [1, 1] +dilation = [2, 2], simd = 2, dw = 0, parallel_window = 0, m = 1, flip = False + + @pytest.mark.parametrize("idt", [DataType["INT2"]]) + # kernel size + @pytest.mark.parametrize("k", [[2, 2], [3, 3], [1, 5]]) + # input dimension + @pytest.mark.parametrize("ifm_dim", [[8, 8], [1, 21]]) + # input channels + @pytest.mark.parametrize("ifm_ch", [2, 4]) + # Stride + @pytest.mark.parametrize("stride", [[1, 1], [2, 2]]) + # Dilation + @pytest.mark.parametrize("dilation", [[1, 1], [2, 2]]) + # input channel parallelism ("SIMD") + @pytest.mark.parametrize("simd", [1, 2]) + # depthwise + @pytest.mark.parametrize("dw", [0, 1]) + # parallel_window enable (MMV_out = M*K) + @pytest.mark.parametrize("parallel_window", [0, 1]) + # in/out MMV ("M") + @pytest.mark.parametrize("m", [1]) + # Flip dimensions + @pytest.mark.parametrize("flip", [False]) + @pytest.mark.fpgadataflow + @pytest.mark.slow + @pytest.mark.vivado + @pytest.mark.node_tree_modeling + def test_fpgadataflow_analytical_characterization_slidingwindow( + idt, + k, + ifm_dim, + ifm_ch, + stride, + dilation, + simd, + dw, + parallel_window, + m, + flip, + ): + if flip: + if ( + ifm_dim[0] == ifm_dim[1] + and k[0] == k[1] + and stride[0] == stride[1] + and dilation[0] == dilation[1] + ): + pytest.skip("Dimension flip would have no effect") + k = k[::-1] + ifm_dim = ifm_dim[::-1] + stride = stride[::-1] + 
dilation = dilation[::-1] + + k_h, k_w = k + ifm_dim_h, ifm_dim_w = ifm_dim + stride_h, stride_w = stride + dilation_h, dilation_w = dilation + + kernel_width = (k_w - 1) * dilation_w + 1 # incl. dilation + kernel_height = (k_h - 1) * dilation_h + 1 # incl. dilation + + if simd > ifm_ch: + pytest.skip("SIMD cannot be larger than number of input channels") + if ifm_ch % simd != 0: + pytest.skip("SIMD must divide number of input channels") + if kernel_height > ifm_dim_h or stride_h > ifm_dim_h: + pytest.skip("Illegal convolution configuration: kernel or stride > FM dimension") + if kernel_width > ifm_dim_w or stride_w > ifm_dim_w: + pytest.skip("Illegal convolution configuration: kernel or stride > FM dimension") + if (k_h == 1 and dilation_h != 1) or (k_w == 1 and dilation_w != 1): + pytest.skip("Illegal convolution configuration: dilation for unitary kernel dim") + if ((stride_h > k_h) or (stride_w > k_w)) and not (parallel_window or (k_h == 1 and k_w == 1)): + pytest.skip("Not all combinations for stride > k edge case supported in default mode") + if parallel_window and simd != ifm_ch and not (dw or (k_h == 1 and k_w == 1)): + pytest.skip("Parallel window requires SIMD=C for non-depthwise case") + + ofm_dim_h = compute_conv_output_dim(ifm_dim_h, k_h, stride_h, 0, dilation_h) + ofm_dim_w = compute_conv_output_dim(ifm_dim_w, k_w, stride_w, 0, dilation_w) + ofm_dim = [ofm_dim_h, ofm_dim_w] + + model = make_single_im2col_modelwrapper(k, ifm_ch, ifm_dim, ofm_dim, stride, dilation, idt, dw) + + + model = model.transform(to_hw.InferConvInpGen()) + + # set simd + inst = getCustomOp(model.graph.node[0]) + inst.set_nodeattr("SIMD", simd) + optype = model.graph.node[0].op_type + if optype == "ConvolutionInputGenerator_rtl": + inst.set_nodeattr("parallel_window", parallel_window) + inst.set_nodeattr("M", m) + if optype == "ConvolutionInputGenerator_hls": + if inst.get_nodeattr("is1D"): + inst.set_nodeattr("parallel_window", parallel_window) + + node_details = ( + 
"ConvolutionInputGenerator", + k, + ifm_ch, + ifm_dim, + ofm_dim, + stride, + dilation, + idt, + dw, + "hls", + ) + part = "xc7z020clg400-1" + target_clk_ns = 4 + max_allowed_volume_delta = 40 + +> assert tree_model_test( + model, node_details, part, target_clk_ns, max_allowed_volume_delta,0,True + ), "characterized TAV does not match RTLsim'd one!" +E AssertionError: characterized TAV does not match RTLsim'd one! +E assert False +E + where False = tree_model_test(, ('ConvolutionInputGenerator', [2, 2], 2, [8, 8], [6, 6], [1, 1], ...), 'xc7z020clg400-1', 4, 40, 0, True) + +test_fpgadataflow_convinputgenerator.py:343: AssertionError +_ test_fpgadataflow_analytical_characterization_slidingwindow[False-1-0-0-2-dilation1-stride0-2-ifm_dim0-k1-idt0] _ + +idt = INT2, k = [3, 3], ifm_dim = [8, 8], ifm_ch = 2, stride = [1, 1] +dilation = [2, 2], simd = 2, dw = 0, parallel_window = 0, m = 1, flip = False + + @pytest.mark.parametrize("idt", [DataType["INT2"]]) + # kernel size + @pytest.mark.parametrize("k", [[2, 2], [3, 3], [1, 5]]) + # input dimension + @pytest.mark.parametrize("ifm_dim", [[8, 8], [1, 21]]) + # input channels + @pytest.mark.parametrize("ifm_ch", [2, 4]) + # Stride + @pytest.mark.parametrize("stride", [[1, 1], [2, 2]]) + # Dilation + @pytest.mark.parametrize("dilation", [[1, 1], [2, 2]]) + # input channel parallelism ("SIMD") + @pytest.mark.parametrize("simd", [1, 2]) + # depthwise + @pytest.mark.parametrize("dw", [0, 1]) + # parallel_window enable (MMV_out = M*K) + @pytest.mark.parametrize("parallel_window", [0, 1]) + # in/out MMV ("M") + @pytest.mark.parametrize("m", [1]) + # Flip dimensions + @pytest.mark.parametrize("flip", [False]) + @pytest.mark.fpgadataflow + @pytest.mark.slow + @pytest.mark.vivado + @pytest.mark.node_tree_modeling + def test_fpgadataflow_analytical_characterization_slidingwindow( + idt, + k, + ifm_dim, + ifm_ch, + stride, + dilation, + simd, + dw, + parallel_window, + m, + flip, + ): + if flip: + if ( + ifm_dim[0] == ifm_dim[1] + and 
k[0] == k[1] + and stride[0] == stride[1] + and dilation[0] == dilation[1] + ): + pytest.skip("Dimension flip would have no effect") + k = k[::-1] + ifm_dim = ifm_dim[::-1] + stride = stride[::-1] + dilation = dilation[::-1] + + k_h, k_w = k + ifm_dim_h, ifm_dim_w = ifm_dim + stride_h, stride_w = stride + dilation_h, dilation_w = dilation + + kernel_width = (k_w - 1) * dilation_w + 1 # incl. dilation + kernel_height = (k_h - 1) * dilation_h + 1 # incl. dilation + + if simd > ifm_ch: + pytest.skip("SIMD cannot be larger than number of input channels") + if ifm_ch % simd != 0: + pytest.skip("SIMD must divide number of input channels") + if kernel_height > ifm_dim_h or stride_h > ifm_dim_h: + pytest.skip("Illegal convolution configuration: kernel or stride > FM dimension") + if kernel_width > ifm_dim_w or stride_w > ifm_dim_w: + pytest.skip("Illegal convolution configuration: kernel or stride > FM dimension") + if (k_h == 1 and dilation_h != 1) or (k_w == 1 and dilation_w != 1): + pytest.skip("Illegal convolution configuration: dilation for unitary kernel dim") + if ((stride_h > k_h) or (stride_w > k_w)) and not (parallel_window or (k_h == 1 and k_w == 1)): + pytest.skip("Not all combinations for stride > k edge case supported in default mode") + if parallel_window and simd != ifm_ch and not (dw or (k_h == 1 and k_w == 1)): + pytest.skip("Parallel window requires SIMD=C for non-depthwise case") + + ofm_dim_h = compute_conv_output_dim(ifm_dim_h, k_h, stride_h, 0, dilation_h) + ofm_dim_w = compute_conv_output_dim(ifm_dim_w, k_w, stride_w, 0, dilation_w) + ofm_dim = [ofm_dim_h, ofm_dim_w] + + model = make_single_im2col_modelwrapper(k, ifm_ch, ifm_dim, ofm_dim, stride, dilation, idt, dw) + + + model = model.transform(to_hw.InferConvInpGen()) + + # set simd + inst = getCustomOp(model.graph.node[0]) + inst.set_nodeattr("SIMD", simd) + optype = model.graph.node[0].op_type + if optype == "ConvolutionInputGenerator_rtl": + inst.set_nodeattr("parallel_window", parallel_window) 
+ inst.set_nodeattr("M", m) + if optype == "ConvolutionInputGenerator_hls": + if inst.get_nodeattr("is1D"): + inst.set_nodeattr("parallel_window", parallel_window) + + node_details = ( + "ConvolutionInputGenerator", + k, + ifm_ch, + ifm_dim, + ofm_dim, + stride, + dilation, + idt, + dw, + "hls", + ) + part = "xc7z020clg400-1" + target_clk_ns = 4 + max_allowed_volume_delta = 40 + +> assert tree_model_test( + model, node_details, part, target_clk_ns, max_allowed_volume_delta,0,True + ), "characterized TAV does not match RTLsim'd one!" +E AssertionError: characterized TAV does not match RTLsim'd one! +E assert False +E + where False = tree_model_test(, ('ConvolutionInputGenerator', [3, 3], 2, [8, 8], [4, 4], [1, 1], ...), 'xc7z020clg400-1', 4, 40, 0, True) + +test_fpgadataflow_convinputgenerator.py:343: AssertionError +_ test_fpgadataflow_analytical_characterization_slidingwindow[False-1-0-0-2-dilation1-stride0-4-ifm_dim0-k0-idt0] _ + +idt = INT2, k = [2, 2], ifm_dim = [8, 8], ifm_ch = 4, stride = [1, 1] +dilation = [2, 2], simd = 2, dw = 0, parallel_window = 0, m = 1, flip = False + + @pytest.mark.parametrize("idt", [DataType["INT2"]]) + # kernel size + @pytest.mark.parametrize("k", [[2, 2], [3, 3], [1, 5]]) + # input dimension + @pytest.mark.parametrize("ifm_dim", [[8, 8], [1, 21]]) + # input channels + @pytest.mark.parametrize("ifm_ch", [2, 4]) + # Stride + @pytest.mark.parametrize("stride", [[1, 1], [2, 2]]) + # Dilation + @pytest.mark.parametrize("dilation", [[1, 1], [2, 2]]) + # input channel parallelism ("SIMD") + @pytest.mark.parametrize("simd", [1, 2]) + # depthwise + @pytest.mark.parametrize("dw", [0, 1]) + # parallel_window enable (MMV_out = M*K) + @pytest.mark.parametrize("parallel_window", [0, 1]) + # in/out MMV ("M") + @pytest.mark.parametrize("m", [1]) + # Flip dimensions + @pytest.mark.parametrize("flip", [False]) + @pytest.mark.fpgadataflow + @pytest.mark.slow + @pytest.mark.vivado + @pytest.mark.node_tree_modeling + def 
test_fpgadataflow_analytical_characterization_slidingwindow( + idt, + k, + ifm_dim, + ifm_ch, + stride, + dilation, + simd, + dw, + parallel_window, + m, + flip, + ): + if flip: + if ( + ifm_dim[0] == ifm_dim[1] + and k[0] == k[1] + and stride[0] == stride[1] + and dilation[0] == dilation[1] + ): + pytest.skip("Dimension flip would have no effect") + k = k[::-1] + ifm_dim = ifm_dim[::-1] + stride = stride[::-1] + dilation = dilation[::-1] + + k_h, k_w = k + ifm_dim_h, ifm_dim_w = ifm_dim + stride_h, stride_w = stride + dilation_h, dilation_w = dilation + + kernel_width = (k_w - 1) * dilation_w + 1 # incl. dilation + kernel_height = (k_h - 1) * dilation_h + 1 # incl. dilation + + if simd > ifm_ch: + pytest.skip("SIMD cannot be larger than number of input channels") + if ifm_ch % simd != 0: + pytest.skip("SIMD must divide number of input channels") + if kernel_height > ifm_dim_h or stride_h > ifm_dim_h: + pytest.skip("Illegal convolution configuration: kernel or stride > FM dimension") + if kernel_width > ifm_dim_w or stride_w > ifm_dim_w: + pytest.skip("Illegal convolution configuration: kernel or stride > FM dimension") + if (k_h == 1 and dilation_h != 1) or (k_w == 1 and dilation_w != 1): + pytest.skip("Illegal convolution configuration: dilation for unitary kernel dim") + if ((stride_h > k_h) or (stride_w > k_w)) and not (parallel_window or (k_h == 1 and k_w == 1)): + pytest.skip("Not all combinations for stride > k edge case supported in default mode") + if parallel_window and simd != ifm_ch and not (dw or (k_h == 1 and k_w == 1)): + pytest.skip("Parallel window requires SIMD=C for non-depthwise case") + + ofm_dim_h = compute_conv_output_dim(ifm_dim_h, k_h, stride_h, 0, dilation_h) + ofm_dim_w = compute_conv_output_dim(ifm_dim_w, k_w, stride_w, 0, dilation_w) + ofm_dim = [ofm_dim_h, ofm_dim_w] + + model = make_single_im2col_modelwrapper(k, ifm_ch, ifm_dim, ofm_dim, stride, dilation, idt, dw) + + + model = model.transform(to_hw.InferConvInpGen()) + + # set simd + 
inst = getCustomOp(model.graph.node[0]) + inst.set_nodeattr("SIMD", simd) + optype = model.graph.node[0].op_type + if optype == "ConvolutionInputGenerator_rtl": + inst.set_nodeattr("parallel_window", parallel_window) + inst.set_nodeattr("M", m) + if optype == "ConvolutionInputGenerator_hls": + if inst.get_nodeattr("is1D"): + inst.set_nodeattr("parallel_window", parallel_window) + + node_details = ( + "ConvolutionInputGenerator", + k, + ifm_ch, + ifm_dim, + ofm_dim, + stride, + dilation, + idt, + dw, + "hls", + ) + part = "xc7z020clg400-1" + target_clk_ns = 4 + max_allowed_volume_delta = 40 + +> assert tree_model_test( + model, node_details, part, target_clk_ns, max_allowed_volume_delta,0,True + ), "characterized TAV does not match RTLsim'd one!" +E AssertionError: characterized TAV does not match RTLsim'd one! +E assert False +E + where False = tree_model_test(, ('ConvolutionInputGenerator', [2, 2], 4, [8, 8], [6, 6], [1, 1], ...), 'xc7z020clg400-1', 4, 40, 0, True) + +test_fpgadataflow_convinputgenerator.py:343: AssertionError +_ test_fpgadataflow_analytical_characterization_slidingwindow[False-1-0-0-2-dilation1-stride0-4-ifm_dim0-k1-idt0] _ + +idt = INT2, k = [3, 3], ifm_dim = [8, 8], ifm_ch = 4, stride = [1, 1] +dilation = [2, 2], simd = 2, dw = 0, parallel_window = 0, m = 1, flip = False + + @pytest.mark.parametrize("idt", [DataType["INT2"]]) + # kernel size + @pytest.mark.parametrize("k", [[2, 2], [3, 3], [1, 5]]) + # input dimension + @pytest.mark.parametrize("ifm_dim", [[8, 8], [1, 21]]) + # input channels + @pytest.mark.parametrize("ifm_ch", [2, 4]) + # Stride + @pytest.mark.parametrize("stride", [[1, 1], [2, 2]]) + # Dilation + @pytest.mark.parametrize("dilation", [[1, 1], [2, 2]]) + # input channel parallelism ("SIMD") + @pytest.mark.parametrize("simd", [1, 2]) + # depthwise + @pytest.mark.parametrize("dw", [0, 1]) + # parallel_window enable (MMV_out = M*K) + @pytest.mark.parametrize("parallel_window", [0, 1]) + # in/out MMV ("M") + 
@pytest.mark.parametrize("m", [1]) + # Flip dimensions + @pytest.mark.parametrize("flip", [False]) + @pytest.mark.fpgadataflow + @pytest.mark.slow + @pytest.mark.vivado + @pytest.mark.node_tree_modeling + def test_fpgadataflow_analytical_characterization_slidingwindow( + idt, + k, + ifm_dim, + ifm_ch, + stride, + dilation, + simd, + dw, + parallel_window, + m, + flip, + ): + if flip: + if ( + ifm_dim[0] == ifm_dim[1] + and k[0] == k[1] + and stride[0] == stride[1] + and dilation[0] == dilation[1] + ): + pytest.skip("Dimension flip would have no effect") + k = k[::-1] + ifm_dim = ifm_dim[::-1] + stride = stride[::-1] + dilation = dilation[::-1] + + k_h, k_w = k + ifm_dim_h, ifm_dim_w = ifm_dim + stride_h, stride_w = stride + dilation_h, dilation_w = dilation + + kernel_width = (k_w - 1) * dilation_w + 1 # incl. dilation + kernel_height = (k_h - 1) * dilation_h + 1 # incl. dilation + + if simd > ifm_ch: + pytest.skip("SIMD cannot be larger than number of input channels") + if ifm_ch % simd != 0: + pytest.skip("SIMD must divide number of input channels") + if kernel_height > ifm_dim_h or stride_h > ifm_dim_h: + pytest.skip("Illegal convolution configuration: kernel or stride > FM dimension") + if kernel_width > ifm_dim_w or stride_w > ifm_dim_w: + pytest.skip("Illegal convolution configuration: kernel or stride > FM dimension") + if (k_h == 1 and dilation_h != 1) or (k_w == 1 and dilation_w != 1): + pytest.skip("Illegal convolution configuration: dilation for unitary kernel dim") + if ((stride_h > k_h) or (stride_w > k_w)) and not (parallel_window or (k_h == 1 and k_w == 1)): + pytest.skip("Not all combinations for stride > k edge case supported in default mode") + if parallel_window and simd != ifm_ch and not (dw or (k_h == 1 and k_w == 1)): + pytest.skip("Parallel window requires SIMD=C for non-depthwise case") + + ofm_dim_h = compute_conv_output_dim(ifm_dim_h, k_h, stride_h, 0, dilation_h) + ofm_dim_w = compute_conv_output_dim(ifm_dim_w, k_w, stride_w, 0, 
dilation_w) + ofm_dim = [ofm_dim_h, ofm_dim_w] + + model = make_single_im2col_modelwrapper(k, ifm_ch, ifm_dim, ofm_dim, stride, dilation, idt, dw) + + + model = model.transform(to_hw.InferConvInpGen()) + + # set simd + inst = getCustomOp(model.graph.node[0]) + inst.set_nodeattr("SIMD", simd) + optype = model.graph.node[0].op_type + if optype == "ConvolutionInputGenerator_rtl": + inst.set_nodeattr("parallel_window", parallel_window) + inst.set_nodeattr("M", m) + if optype == "ConvolutionInputGenerator_hls": + if inst.get_nodeattr("is1D"): + inst.set_nodeattr("parallel_window", parallel_window) + + node_details = ( + "ConvolutionInputGenerator", + k, + ifm_ch, + ifm_dim, + ofm_dim, + stride, + dilation, + idt, + dw, + "hls", + ) + part = "xc7z020clg400-1" + target_clk_ns = 4 + max_allowed_volume_delta = 40 + +> assert tree_model_test( + model, node_details, part, target_clk_ns, max_allowed_volume_delta,0,True + ), "characterized TAV does not match RTLsim'd one!" +E AssertionError: characterized TAV does not match RTLsim'd one! 
+E assert False +E + where False = tree_model_test(, ('ConvolutionInputGenerator', [3, 3], 4, [8, 8], [4, 4], [1, 1], ...), 'xc7z020clg400-1', 4, 40, 0, True) + +test_fpgadataflow_convinputgenerator.py:343: AssertionError +_ test_fpgadataflow_analytical_characterization_slidingwindow[False-1-0-0-2-dilation1-stride1-4-ifm_dim0-k1-idt0] _ + +idt = INT2, k = [3, 3], ifm_dim = [8, 8], ifm_ch = 4, stride = [2, 2] +dilation = [2, 2], simd = 2, dw = 0, parallel_window = 0, m = 1, flip = False + + @pytest.mark.parametrize("idt", [DataType["INT2"]]) + # kernel size + @pytest.mark.parametrize("k", [[2, 2], [3, 3], [1, 5]]) + # input dimension + @pytest.mark.parametrize("ifm_dim", [[8, 8], [1, 21]]) + # input channels + @pytest.mark.parametrize("ifm_ch", [2, 4]) + # Stride + @pytest.mark.parametrize("stride", [[1, 1], [2, 2]]) + # Dilation + @pytest.mark.parametrize("dilation", [[1, 1], [2, 2]]) + # input channel parallelism ("SIMD") + @pytest.mark.parametrize("simd", [1, 2]) + # depthwise + @pytest.mark.parametrize("dw", [0, 1]) + # parallel_window enable (MMV_out = M*K) + @pytest.mark.parametrize("parallel_window", [0, 1]) + # in/out MMV ("M") + @pytest.mark.parametrize("m", [1]) + # Flip dimensions + @pytest.mark.parametrize("flip", [False]) + @pytest.mark.fpgadataflow + @pytest.mark.slow + @pytest.mark.vivado + @pytest.mark.node_tree_modeling + def test_fpgadataflow_analytical_characterization_slidingwindow( + idt, + k, + ifm_dim, + ifm_ch, + stride, + dilation, + simd, + dw, + parallel_window, + m, + flip, + ): + if flip: + if ( + ifm_dim[0] == ifm_dim[1] + and k[0] == k[1] + and stride[0] == stride[1] + and dilation[0] == dilation[1] + ): + pytest.skip("Dimension flip would have no effect") + k = k[::-1] + ifm_dim = ifm_dim[::-1] + stride = stride[::-1] + dilation = dilation[::-1] + + k_h, k_w = k + ifm_dim_h, ifm_dim_w = ifm_dim + stride_h, stride_w = stride + dilation_h, dilation_w = dilation + + kernel_width = (k_w - 1) * dilation_w + 1 # incl. 
dilation + kernel_height = (k_h - 1) * dilation_h + 1 # incl. dilation + + if simd > ifm_ch: + pytest.skip("SIMD cannot be larger than number of input channels") + if ifm_ch % simd != 0: + pytest.skip("SIMD must divide number of input channels") + if kernel_height > ifm_dim_h or stride_h > ifm_dim_h: + pytest.skip("Illegal convolution configuration: kernel or stride > FM dimension") + if kernel_width > ifm_dim_w or stride_w > ifm_dim_w: + pytest.skip("Illegal convolution configuration: kernel or stride > FM dimension") + if (k_h == 1 and dilation_h != 1) or (k_w == 1 and dilation_w != 1): + pytest.skip("Illegal convolution configuration: dilation for unitary kernel dim") + if ((stride_h > k_h) or (stride_w > k_w)) and not (parallel_window or (k_h == 1 and k_w == 1)): + pytest.skip("Not all combinations for stride > k edge case supported in default mode") + if parallel_window and simd != ifm_ch and not (dw or (k_h == 1 and k_w == 1)): + pytest.skip("Parallel window requires SIMD=C for non-depthwise case") + + ofm_dim_h = compute_conv_output_dim(ifm_dim_h, k_h, stride_h, 0, dilation_h) + ofm_dim_w = compute_conv_output_dim(ifm_dim_w, k_w, stride_w, 0, dilation_w) + ofm_dim = [ofm_dim_h, ofm_dim_w] + + model = make_single_im2col_modelwrapper(k, ifm_ch, ifm_dim, ofm_dim, stride, dilation, idt, dw) + + + model = model.transform(to_hw.InferConvInpGen()) + + # set simd + inst = getCustomOp(model.graph.node[0]) + inst.set_nodeattr("SIMD", simd) + optype = model.graph.node[0].op_type + if optype == "ConvolutionInputGenerator_rtl": + inst.set_nodeattr("parallel_window", parallel_window) + inst.set_nodeattr("M", m) + if optype == "ConvolutionInputGenerator_hls": + if inst.get_nodeattr("is1D"): + inst.set_nodeattr("parallel_window", parallel_window) + + node_details = ( + "ConvolutionInputGenerator", + k, + ifm_ch, + ifm_dim, + ofm_dim, + stride, + dilation, + idt, + dw, + "hls", + ) + part = "xc7z020clg400-1" + target_clk_ns = 4 + max_allowed_volume_delta = 40 + +> assert 
tree_model_test( + model, node_details, part, target_clk_ns, max_allowed_volume_delta,0,True + ), "characterized TAV does not match RTLsim'd one!" +E AssertionError: characterized TAV does not match RTLsim'd one! +E assert False +E + where False = tree_model_test(, ('ConvolutionInputGenerator', [3, 3], 4, [8, 8], [2, 2], [2, 2], ...), 'xc7z020clg400-1', 4, 40, 0, True) + +test_fpgadataflow_convinputgenerator.py:343: AssertionError +_ test_fpgadataflow_analytical_characterization_slidingwindow[False-1-0-1-1-dilation0-stride0-4-ifm_dim0-k2-idt0] _ + +idt = INT2, k = [1, 5], ifm_dim = [8, 8], ifm_ch = 4, stride = [1, 1] +dilation = [1, 1], simd = 1, dw = 1, parallel_window = 0, m = 1, flip = False + + @pytest.mark.parametrize("idt", [DataType["INT2"]]) + # kernel size + @pytest.mark.parametrize("k", [[2, 2], [3, 3], [1, 5]]) + # input dimension + @pytest.mark.parametrize("ifm_dim", [[8, 8], [1, 21]]) + # input channels + @pytest.mark.parametrize("ifm_ch", [2, 4]) + # Stride + @pytest.mark.parametrize("stride", [[1, 1], [2, 2]]) + # Dilation + @pytest.mark.parametrize("dilation", [[1, 1], [2, 2]]) + # input channel parallelism ("SIMD") + @pytest.mark.parametrize("simd", [1, 2]) + # depthwise + @pytest.mark.parametrize("dw", [0, 1]) + # parallel_window enable (MMV_out = M*K) + @pytest.mark.parametrize("parallel_window", [0, 1]) + # in/out MMV ("M") + @pytest.mark.parametrize("m", [1]) + # Flip dimensions + @pytest.mark.parametrize("flip", [False]) + @pytest.mark.fpgadataflow + @pytest.mark.slow + @pytest.mark.vivado + @pytest.mark.node_tree_modeling + def test_fpgadataflow_analytical_characterization_slidingwindow( + idt, + k, + ifm_dim, + ifm_ch, + stride, + dilation, + simd, + dw, + parallel_window, + m, + flip, + ): + if flip: + if ( + ifm_dim[0] == ifm_dim[1] + and k[0] == k[1] + and stride[0] == stride[1] + and dilation[0] == dilation[1] + ): + pytest.skip("Dimension flip would have no effect") + k = k[::-1] + ifm_dim = ifm_dim[::-1] + stride = stride[::-1] + 
dilation = dilation[::-1] + + k_h, k_w = k + ifm_dim_h, ifm_dim_w = ifm_dim + stride_h, stride_w = stride + dilation_h, dilation_w = dilation + + kernel_width = (k_w - 1) * dilation_w + 1 # incl. dilation + kernel_height = (k_h - 1) * dilation_h + 1 # incl. dilation + + if simd > ifm_ch: + pytest.skip("SIMD cannot be larger than number of input channels") + if ifm_ch % simd != 0: + pytest.skip("SIMD must divide number of input channels") + if kernel_height > ifm_dim_h or stride_h > ifm_dim_h: + pytest.skip("Illegal convolution configuration: kernel or stride > FM dimension") + if kernel_width > ifm_dim_w or stride_w > ifm_dim_w: + pytest.skip("Illegal convolution configuration: kernel or stride > FM dimension") + if (k_h == 1 and dilation_h != 1) or (k_w == 1 and dilation_w != 1): + pytest.skip("Illegal convolution configuration: dilation for unitary kernel dim") + if ((stride_h > k_h) or (stride_w > k_w)) and not (parallel_window or (k_h == 1 and k_w == 1)): + pytest.skip("Not all combinations for stride > k edge case supported in default mode") + if parallel_window and simd != ifm_ch and not (dw or (k_h == 1 and k_w == 1)): + pytest.skip("Parallel window requires SIMD=C for non-depthwise case") + + ofm_dim_h = compute_conv_output_dim(ifm_dim_h, k_h, stride_h, 0, dilation_h) + ofm_dim_w = compute_conv_output_dim(ifm_dim_w, k_w, stride_w, 0, dilation_w) + ofm_dim = [ofm_dim_h, ofm_dim_w] + + model = make_single_im2col_modelwrapper(k, ifm_ch, ifm_dim, ofm_dim, stride, dilation, idt, dw) + + + model = model.transform(to_hw.InferConvInpGen()) + + # set simd + inst = getCustomOp(model.graph.node[0]) + inst.set_nodeattr("SIMD", simd) + optype = model.graph.node[0].op_type + if optype == "ConvolutionInputGenerator_rtl": + inst.set_nodeattr("parallel_window", parallel_window) + inst.set_nodeattr("M", m) + if optype == "ConvolutionInputGenerator_hls": + if inst.get_nodeattr("is1D"): + inst.set_nodeattr("parallel_window", parallel_window) + + node_details = ( + 
"ConvolutionInputGenerator", + k, + ifm_ch, + ifm_dim, + ofm_dim, + stride, + dilation, + idt, + dw, + "hls", + ) + part = "xc7z020clg400-1" + target_clk_ns = 4 + max_allowed_volume_delta = 40 + +> assert tree_model_test( + model, node_details, part, target_clk_ns, max_allowed_volume_delta,0,True + ), "characterized TAV does not match RTLsim'd one!" +E AssertionError: characterized TAV does not match RTLsim'd one! +E assert False +E + where False = tree_model_test(, ('ConvolutionInputGenerator', [1, 5], 4, [8, 8], [8, 4], [1, 1], ...), 'xc7z020clg400-1', 4, 40, 0, True) + +test_fpgadataflow_convinputgenerator.py:343: AssertionError +_ test_fpgadataflow_analytical_characterization_slidingwindow[False-1-0-1-1-dilation0-stride0-4-ifm_dim1-k2-idt0] _ + +idt = INT2, k = [1, 5], ifm_dim = [1, 21], ifm_ch = 4, stride = [1, 1] +dilation = [1, 1], simd = 1, dw = 1, parallel_window = 0, m = 1, flip = False + + @pytest.mark.parametrize("idt", [DataType["INT2"]]) + # kernel size + @pytest.mark.parametrize("k", [[2, 2], [3, 3], [1, 5]]) + # input dimension + @pytest.mark.parametrize("ifm_dim", [[8, 8], [1, 21]]) + # input channels + @pytest.mark.parametrize("ifm_ch", [2, 4]) + # Stride + @pytest.mark.parametrize("stride", [[1, 1], [2, 2]]) + # Dilation + @pytest.mark.parametrize("dilation", [[1, 1], [2, 2]]) + # input channel parallelism ("SIMD") + @pytest.mark.parametrize("simd", [1, 2]) + # depthwise + @pytest.mark.parametrize("dw", [0, 1]) + # parallel_window enable (MMV_out = M*K) + @pytest.mark.parametrize("parallel_window", [0, 1]) + # in/out MMV ("M") + @pytest.mark.parametrize("m", [1]) + # Flip dimensions + @pytest.mark.parametrize("flip", [False]) + @pytest.mark.fpgadataflow + @pytest.mark.slow + @pytest.mark.vivado + @pytest.mark.node_tree_modeling + def test_fpgadataflow_analytical_characterization_slidingwindow( + idt, + k, + ifm_dim, + ifm_ch, + stride, + dilation, + simd, + dw, + parallel_window, + m, + flip, + ): + if flip: + if ( + ifm_dim[0] == ifm_dim[1] + 
and k[0] == k[1] + and stride[0] == stride[1] + and dilation[0] == dilation[1] + ): + pytest.skip("Dimension flip would have no effect") + k = k[::-1] + ifm_dim = ifm_dim[::-1] + stride = stride[::-1] + dilation = dilation[::-1] + + k_h, k_w = k + ifm_dim_h, ifm_dim_w = ifm_dim + stride_h, stride_w = stride + dilation_h, dilation_w = dilation + + kernel_width = (k_w - 1) * dilation_w + 1 # incl. dilation + kernel_height = (k_h - 1) * dilation_h + 1 # incl. dilation + + if simd > ifm_ch: + pytest.skip("SIMD cannot be larger than number of input channels") + if ifm_ch % simd != 0: + pytest.skip("SIMD must divide number of input channels") + if kernel_height > ifm_dim_h or stride_h > ifm_dim_h: + pytest.skip("Illegal convolution configuration: kernel or stride > FM dimension") + if kernel_width > ifm_dim_w or stride_w > ifm_dim_w: + pytest.skip("Illegal convolution configuration: kernel or stride > FM dimension") + if (k_h == 1 and dilation_h != 1) or (k_w == 1 and dilation_w != 1): + pytest.skip("Illegal convolution configuration: dilation for unitary kernel dim") + if ((stride_h > k_h) or (stride_w > k_w)) and not (parallel_window or (k_h == 1 and k_w == 1)): + pytest.skip("Not all combinations for stride > k edge case supported in default mode") + if parallel_window and simd != ifm_ch and not (dw or (k_h == 1 and k_w == 1)): + pytest.skip("Parallel window requires SIMD=C for non-depthwise case") + + ofm_dim_h = compute_conv_output_dim(ifm_dim_h, k_h, stride_h, 0, dilation_h) + ofm_dim_w = compute_conv_output_dim(ifm_dim_w, k_w, stride_w, 0, dilation_w) + ofm_dim = [ofm_dim_h, ofm_dim_w] + + model = make_single_im2col_modelwrapper(k, ifm_ch, ifm_dim, ofm_dim, stride, dilation, idt, dw) + + + model = model.transform(to_hw.InferConvInpGen()) + + # set simd + inst = getCustomOp(model.graph.node[0]) + inst.set_nodeattr("SIMD", simd) + optype = model.graph.node[0].op_type + if optype == "ConvolutionInputGenerator_rtl": + inst.set_nodeattr("parallel_window", 
parallel_window) + inst.set_nodeattr("M", m) + if optype == "ConvolutionInputGenerator_hls": + if inst.get_nodeattr("is1D"): + inst.set_nodeattr("parallel_window", parallel_window) + + node_details = ( + "ConvolutionInputGenerator", + k, + ifm_ch, + ifm_dim, + ofm_dim, + stride, + dilation, + idt, + dw, + "hls", + ) + part = "xc7z020clg400-1" + target_clk_ns = 4 + max_allowed_volume_delta = 40 + +> assert tree_model_test( + model, node_details, part, target_clk_ns, max_allowed_volume_delta,0,True + ), "characterized TAV does not match RTLsim'd one!" +E AssertionError: characterized TAV does not match RTLsim'd one! +E assert False +E + where False = tree_model_test(, ('ConvolutionInputGenerator', [1, 5], 4, [1, 21], [1, 17], [1, 1], ...), 'xc7z020clg400-1', 4, 40, 0, True) + +test_fpgadataflow_convinputgenerator.py:343: AssertionError +_ test_fpgadataflow_analytical_characterization_slidingwindow[False-1-0-1-1-dilation0-stride1-2-ifm_dim0-k0-idt0] _ + +idt = INT2, k = [2, 2], ifm_dim = [8, 8], ifm_ch = 2, stride = [2, 2] +dilation = [1, 1], simd = 1, dw = 1, parallel_window = 0, m = 1, flip = False + + @pytest.mark.parametrize("idt", [DataType["INT2"]]) + # kernel size + @pytest.mark.parametrize("k", [[2, 2], [3, 3], [1, 5]]) + # input dimension + @pytest.mark.parametrize("ifm_dim", [[8, 8], [1, 21]]) + # input channels + @pytest.mark.parametrize("ifm_ch", [2, 4]) + # Stride + @pytest.mark.parametrize("stride", [[1, 1], [2, 2]]) + # Dilation + @pytest.mark.parametrize("dilation", [[1, 1], [2, 2]]) + # input channel parallelism ("SIMD") + @pytest.mark.parametrize("simd", [1, 2]) + # depthwise + @pytest.mark.parametrize("dw", [0, 1]) + # parallel_window enable (MMV_out = M*K) + @pytest.mark.parametrize("parallel_window", [0, 1]) + # in/out MMV ("M") + @pytest.mark.parametrize("m", [1]) + # Flip dimensions + @pytest.mark.parametrize("flip", [False]) + @pytest.mark.fpgadataflow + @pytest.mark.slow + @pytest.mark.vivado + @pytest.mark.node_tree_modeling + def 
test_fpgadataflow_analytical_characterization_slidingwindow( + idt, + k, + ifm_dim, + ifm_ch, + stride, + dilation, + simd, + dw, + parallel_window, + m, + flip, + ): + if flip: + if ( + ifm_dim[0] == ifm_dim[1] + and k[0] == k[1] + and stride[0] == stride[1] + and dilation[0] == dilation[1] + ): + pytest.skip("Dimension flip would have no effect") + k = k[::-1] + ifm_dim = ifm_dim[::-1] + stride = stride[::-1] + dilation = dilation[::-1] + + k_h, k_w = k + ifm_dim_h, ifm_dim_w = ifm_dim + stride_h, stride_w = stride + dilation_h, dilation_w = dilation + + kernel_width = (k_w - 1) * dilation_w + 1 # incl. dilation + kernel_height = (k_h - 1) * dilation_h + 1 # incl. dilation + + if simd > ifm_ch: + pytest.skip("SIMD cannot be larger than number of input channels") + if ifm_ch % simd != 0: + pytest.skip("SIMD must divide number of input channels") + if kernel_height > ifm_dim_h or stride_h > ifm_dim_h: + pytest.skip("Illegal convolution configuration: kernel or stride > FM dimension") + if kernel_width > ifm_dim_w or stride_w > ifm_dim_w: + pytest.skip("Illegal convolution configuration: kernel or stride > FM dimension") + if (k_h == 1 and dilation_h != 1) or (k_w == 1 and dilation_w != 1): + pytest.skip("Illegal convolution configuration: dilation for unitary kernel dim") + if ((stride_h > k_h) or (stride_w > k_w)) and not (parallel_window or (k_h == 1 and k_w == 1)): + pytest.skip("Not all combinations for stride > k edge case supported in default mode") + if parallel_window and simd != ifm_ch and not (dw or (k_h == 1 and k_w == 1)): + pytest.skip("Parallel window requires SIMD=C for non-depthwise case") + + ofm_dim_h = compute_conv_output_dim(ifm_dim_h, k_h, stride_h, 0, dilation_h) + ofm_dim_w = compute_conv_output_dim(ifm_dim_w, k_w, stride_w, 0, dilation_w) + ofm_dim = [ofm_dim_h, ofm_dim_w] + + model = make_single_im2col_modelwrapper(k, ifm_ch, ifm_dim, ofm_dim, stride, dilation, idt, dw) + + + model = model.transform(to_hw.InferConvInpGen()) + + # set simd + 
inst = getCustomOp(model.graph.node[0]) + inst.set_nodeattr("SIMD", simd) + optype = model.graph.node[0].op_type + if optype == "ConvolutionInputGenerator_rtl": + inst.set_nodeattr("parallel_window", parallel_window) + inst.set_nodeattr("M", m) + if optype == "ConvolutionInputGenerator_hls": + if inst.get_nodeattr("is1D"): + inst.set_nodeattr("parallel_window", parallel_window) + + node_details = ( + "ConvolutionInputGenerator", + k, + ifm_ch, + ifm_dim, + ofm_dim, + stride, + dilation, + idt, + dw, + "hls", + ) + part = "xc7z020clg400-1" + target_clk_ns = 4 + max_allowed_volume_delta = 40 + +> assert tree_model_test( + model, node_details, part, target_clk_ns, max_allowed_volume_delta,0,True + ), "characterized TAV does not match RTLsim'd one!" +E AssertionError: characterized TAV does not match RTLsim'd one! +E assert False +E + where False = tree_model_test(, ('ConvolutionInputGenerator', [2, 2], 2, [8, 8], [4, 4], [2, 2], ...), 'xc7z020clg400-1', 4, 40, 0, True) + +test_fpgadataflow_convinputgenerator.py:343: AssertionError +_ test_fpgadataflow_analytical_characterization_slidingwindow[False-1-0-1-1-dilation0-stride1-4-ifm_dim0-k0-idt0] _ + +idt = INT2, k = [2, 2], ifm_dim = [8, 8], ifm_ch = 4, stride = [2, 2] +dilation = [1, 1], simd = 1, dw = 1, parallel_window = 0, m = 1, flip = False + + @pytest.mark.parametrize("idt", [DataType["INT2"]]) + # kernel size + @pytest.mark.parametrize("k", [[2, 2], [3, 3], [1, 5]]) + # input dimension + @pytest.mark.parametrize("ifm_dim", [[8, 8], [1, 21]]) + # input channels + @pytest.mark.parametrize("ifm_ch", [2, 4]) + # Stride + @pytest.mark.parametrize("stride", [[1, 1], [2, 2]]) + # Dilation + @pytest.mark.parametrize("dilation", [[1, 1], [2, 2]]) + # input channel parallelism ("SIMD") + @pytest.mark.parametrize("simd", [1, 2]) + # depthwise + @pytest.mark.parametrize("dw", [0, 1]) + # parallel_window enable (MMV_out = M*K) + @pytest.mark.parametrize("parallel_window", [0, 1]) + # in/out MMV ("M") + 
@pytest.mark.parametrize("m", [1]) + # Flip dimensions + @pytest.mark.parametrize("flip", [False]) + @pytest.mark.fpgadataflow + @pytest.mark.slow + @pytest.mark.vivado + @pytest.mark.node_tree_modeling + def test_fpgadataflow_analytical_characterization_slidingwindow( + idt, + k, + ifm_dim, + ifm_ch, + stride, + dilation, + simd, + dw, + parallel_window, + m, + flip, + ): + if flip: + if ( + ifm_dim[0] == ifm_dim[1] + and k[0] == k[1] + and stride[0] == stride[1] + and dilation[0] == dilation[1] + ): + pytest.skip("Dimension flip would have no effect") + k = k[::-1] + ifm_dim = ifm_dim[::-1] + stride = stride[::-1] + dilation = dilation[::-1] + + k_h, k_w = k + ifm_dim_h, ifm_dim_w = ifm_dim + stride_h, stride_w = stride + dilation_h, dilation_w = dilation + + kernel_width = (k_w - 1) * dilation_w + 1 # incl. dilation + kernel_height = (k_h - 1) * dilation_h + 1 # incl. dilation + + if simd > ifm_ch: + pytest.skip("SIMD cannot be larger than number of input channels") + if ifm_ch % simd != 0: + pytest.skip("SIMD must divide number of input channels") + if kernel_height > ifm_dim_h or stride_h > ifm_dim_h: + pytest.skip("Illegal convolution configuration: kernel or stride > FM dimension") + if kernel_width > ifm_dim_w or stride_w > ifm_dim_w: + pytest.skip("Illegal convolution configuration: kernel or stride > FM dimension") + if (k_h == 1 and dilation_h != 1) or (k_w == 1 and dilation_w != 1): + pytest.skip("Illegal convolution configuration: dilation for unitary kernel dim") + if ((stride_h > k_h) or (stride_w > k_w)) and not (parallel_window or (k_h == 1 and k_w == 1)): + pytest.skip("Not all combinations for stride > k edge case supported in default mode") + if parallel_window and simd != ifm_ch and not (dw or (k_h == 1 and k_w == 1)): + pytest.skip("Parallel window requires SIMD=C for non-depthwise case") + + ofm_dim_h = compute_conv_output_dim(ifm_dim_h, k_h, stride_h, 0, dilation_h) + ofm_dim_w = compute_conv_output_dim(ifm_dim_w, k_w, stride_w, 0, 
dilation_w) + ofm_dim = [ofm_dim_h, ofm_dim_w] + + model = make_single_im2col_modelwrapper(k, ifm_ch, ifm_dim, ofm_dim, stride, dilation, idt, dw) + + + model = model.transform(to_hw.InferConvInpGen()) + + # set simd + inst = getCustomOp(model.graph.node[0]) + inst.set_nodeattr("SIMD", simd) + optype = model.graph.node[0].op_type + if optype == "ConvolutionInputGenerator_rtl": + inst.set_nodeattr("parallel_window", parallel_window) + inst.set_nodeattr("M", m) + if optype == "ConvolutionInputGenerator_hls": + if inst.get_nodeattr("is1D"): + inst.set_nodeattr("parallel_window", parallel_window) + + node_details = ( + "ConvolutionInputGenerator", + k, + ifm_ch, + ifm_dim, + ofm_dim, + stride, + dilation, + idt, + dw, + "hls", + ) + part = "xc7z020clg400-1" + target_clk_ns = 4 + max_allowed_volume_delta = 40 + +> assert tree_model_test( + model, node_details, part, target_clk_ns, max_allowed_volume_delta,0,True + ), "characterized TAV does not match RTLsim'd one!" +E AssertionError: characterized TAV does not match RTLsim'd one! 
+E assert False +E + where False = tree_model_test(, ('ConvolutionInputGenerator', [2, 2], 4, [8, 8], [4, 4], [2, 2], ...), 'xc7z020clg400-1', 4, 40, 0, True) + +test_fpgadataflow_convinputgenerator.py:343: AssertionError +_ test_fpgadataflow_analytical_characterization_slidingwindow[False-1-0-1-1-dilation0-stride1-4-ifm_dim0-k1-idt0] _ + +idt = INT2, k = [3, 3], ifm_dim = [8, 8], ifm_ch = 4, stride = [2, 2] +dilation = [1, 1], simd = 1, dw = 1, parallel_window = 0, m = 1, flip = False + + @pytest.mark.parametrize("idt", [DataType["INT2"]]) + # kernel size + @pytest.mark.parametrize("k", [[2, 2], [3, 3], [1, 5]]) + # input dimension + @pytest.mark.parametrize("ifm_dim", [[8, 8], [1, 21]]) + # input channels + @pytest.mark.parametrize("ifm_ch", [2, 4]) + # Stride + @pytest.mark.parametrize("stride", [[1, 1], [2, 2]]) + # Dilation + @pytest.mark.parametrize("dilation", [[1, 1], [2, 2]]) + # input channel parallelism ("SIMD") + @pytest.mark.parametrize("simd", [1, 2]) + # depthwise + @pytest.mark.parametrize("dw", [0, 1]) + # parallel_window enable (MMV_out = M*K) + @pytest.mark.parametrize("parallel_window", [0, 1]) + # in/out MMV ("M") + @pytest.mark.parametrize("m", [1]) + # Flip dimensions + @pytest.mark.parametrize("flip", [False]) + @pytest.mark.fpgadataflow + @pytest.mark.slow + @pytest.mark.vivado + @pytest.mark.node_tree_modeling + def test_fpgadataflow_analytical_characterization_slidingwindow( + idt, + k, + ifm_dim, + ifm_ch, + stride, + dilation, + simd, + dw, + parallel_window, + m, + flip, + ): + if flip: + if ( + ifm_dim[0] == ifm_dim[1] + and k[0] == k[1] + and stride[0] == stride[1] + and dilation[0] == dilation[1] + ): + pytest.skip("Dimension flip would have no effect") + k = k[::-1] + ifm_dim = ifm_dim[::-1] + stride = stride[::-1] + dilation = dilation[::-1] + + k_h, k_w = k + ifm_dim_h, ifm_dim_w = ifm_dim + stride_h, stride_w = stride + dilation_h, dilation_w = dilation + + kernel_width = (k_w - 1) * dilation_w + 1 # incl. 
dilation + kernel_height = (k_h - 1) * dilation_h + 1 # incl. dilation + + if simd > ifm_ch: + pytest.skip("SIMD cannot be larger than number of input channels") + if ifm_ch % simd != 0: + pytest.skip("SIMD must divide number of input channels") + if kernel_height > ifm_dim_h or stride_h > ifm_dim_h: + pytest.skip("Illegal convolution configuration: kernel or stride > FM dimension") + if kernel_width > ifm_dim_w or stride_w > ifm_dim_w: + pytest.skip("Illegal convolution configuration: kernel or stride > FM dimension") + if (k_h == 1 and dilation_h != 1) or (k_w == 1 and dilation_w != 1): + pytest.skip("Illegal convolution configuration: dilation for unitary kernel dim") + if ((stride_h > k_h) or (stride_w > k_w)) and not (parallel_window or (k_h == 1 and k_w == 1)): + pytest.skip("Not all combinations for stride > k edge case supported in default mode") + if parallel_window and simd != ifm_ch and not (dw or (k_h == 1 and k_w == 1)): + pytest.skip("Parallel window requires SIMD=C for non-depthwise case") + + ofm_dim_h = compute_conv_output_dim(ifm_dim_h, k_h, stride_h, 0, dilation_h) + ofm_dim_w = compute_conv_output_dim(ifm_dim_w, k_w, stride_w, 0, dilation_w) + ofm_dim = [ofm_dim_h, ofm_dim_w] + + model = make_single_im2col_modelwrapper(k, ifm_ch, ifm_dim, ofm_dim, stride, dilation, idt, dw) + + + model = model.transform(to_hw.InferConvInpGen()) + + # set simd + inst = getCustomOp(model.graph.node[0]) + inst.set_nodeattr("SIMD", simd) + optype = model.graph.node[0].op_type + if optype == "ConvolutionInputGenerator_rtl": + inst.set_nodeattr("parallel_window", parallel_window) + inst.set_nodeattr("M", m) + if optype == "ConvolutionInputGenerator_hls": + if inst.get_nodeattr("is1D"): + inst.set_nodeattr("parallel_window", parallel_window) + + node_details = ( + "ConvolutionInputGenerator", + k, + ifm_ch, + ifm_dim, + ofm_dim, + stride, + dilation, + idt, + dw, + "hls", + ) + part = "xc7z020clg400-1" + target_clk_ns = 4 + max_allowed_volume_delta = 40 + +> assert 
tree_model_test( + model, node_details, part, target_clk_ns, max_allowed_volume_delta,0,True + ), "characterized TAV does not match RTLsim'd one!" +E AssertionError: characterized TAV does not match RTLsim'd one! +E assert False +E + where False = tree_model_test(, ('ConvolutionInputGenerator', [3, 3], 4, [8, 8], [3, 3], [2, 2], ...), 'xc7z020clg400-1', 4, 40, 0, True) + +test_fpgadataflow_convinputgenerator.py:343: AssertionError +_ test_fpgadataflow_analytical_characterization_slidingwindow[False-1-0-1-1-dilation1-stride0-2-ifm_dim0-k1-idt0] _ + +idt = INT2, k = [3, 3], ifm_dim = [8, 8], ifm_ch = 2, stride = [1, 1] +dilation = [2, 2], simd = 1, dw = 1, parallel_window = 0, m = 1, flip = False + + @pytest.mark.parametrize("idt", [DataType["INT2"]]) + # kernel size + @pytest.mark.parametrize("k", [[2, 2], [3, 3], [1, 5]]) + # input dimension + @pytest.mark.parametrize("ifm_dim", [[8, 8], [1, 21]]) + # input channels + @pytest.mark.parametrize("ifm_ch", [2, 4]) + # Stride + @pytest.mark.parametrize("stride", [[1, 1], [2, 2]]) + # Dilation + @pytest.mark.parametrize("dilation", [[1, 1], [2, 2]]) + # input channel parallelism ("SIMD") + @pytest.mark.parametrize("simd", [1, 2]) + # depthwise + @pytest.mark.parametrize("dw", [0, 1]) + # parallel_window enable (MMV_out = M*K) + @pytest.mark.parametrize("parallel_window", [0, 1]) + # in/out MMV ("M") + @pytest.mark.parametrize("m", [1]) + # Flip dimensions + @pytest.mark.parametrize("flip", [False]) + @pytest.mark.fpgadataflow + @pytest.mark.slow + @pytest.mark.vivado + @pytest.mark.node_tree_modeling + def test_fpgadataflow_analytical_characterization_slidingwindow( + idt, + k, + ifm_dim, + ifm_ch, + stride, + dilation, + simd, + dw, + parallel_window, + m, + flip, + ): + if flip: + if ( + ifm_dim[0] == ifm_dim[1] + and k[0] == k[1] + and stride[0] == stride[1] + and dilation[0] == dilation[1] + ): + pytest.skip("Dimension flip would have no effect") + k = k[::-1] + ifm_dim = ifm_dim[::-1] + stride = stride[::-1] + 
dilation = dilation[::-1] + + k_h, k_w = k + ifm_dim_h, ifm_dim_w = ifm_dim + stride_h, stride_w = stride + dilation_h, dilation_w = dilation + + kernel_width = (k_w - 1) * dilation_w + 1 # incl. dilation + kernel_height = (k_h - 1) * dilation_h + 1 # incl. dilation + + if simd > ifm_ch: + pytest.skip("SIMD cannot be larger than number of input channels") + if ifm_ch % simd != 0: + pytest.skip("SIMD must divide number of input channels") + if kernel_height > ifm_dim_h or stride_h > ifm_dim_h: + pytest.skip("Illegal convolution configuration: kernel or stride > FM dimension") + if kernel_width > ifm_dim_w or stride_w > ifm_dim_w: + pytest.skip("Illegal convolution configuration: kernel or stride > FM dimension") + if (k_h == 1 and dilation_h != 1) or (k_w == 1 and dilation_w != 1): + pytest.skip("Illegal convolution configuration: dilation for unitary kernel dim") + if ((stride_h > k_h) or (stride_w > k_w)) and not (parallel_window or (k_h == 1 and k_w == 1)): + pytest.skip("Not all combinations for stride > k edge case supported in default mode") + if parallel_window and simd != ifm_ch and not (dw or (k_h == 1 and k_w == 1)): + pytest.skip("Parallel window requires SIMD=C for non-depthwise case") + + ofm_dim_h = compute_conv_output_dim(ifm_dim_h, k_h, stride_h, 0, dilation_h) + ofm_dim_w = compute_conv_output_dim(ifm_dim_w, k_w, stride_w, 0, dilation_w) + ofm_dim = [ofm_dim_h, ofm_dim_w] + + model = make_single_im2col_modelwrapper(k, ifm_ch, ifm_dim, ofm_dim, stride, dilation, idt, dw) + + + model = model.transform(to_hw.InferConvInpGen()) + + # set simd + inst = getCustomOp(model.graph.node[0]) + inst.set_nodeattr("SIMD", simd) + optype = model.graph.node[0].op_type + if optype == "ConvolutionInputGenerator_rtl": + inst.set_nodeattr("parallel_window", parallel_window) + inst.set_nodeattr("M", m) + if optype == "ConvolutionInputGenerator_hls": + if inst.get_nodeattr("is1D"): + inst.set_nodeattr("parallel_window", parallel_window) + + node_details = ( + 
"ConvolutionInputGenerator", + k, + ifm_ch, + ifm_dim, + ofm_dim, + stride, + dilation, + idt, + dw, + "hls", + ) + part = "xc7z020clg400-1" + target_clk_ns = 4 + max_allowed_volume_delta = 40 + +> assert tree_model_test( + model, node_details, part, target_clk_ns, max_allowed_volume_delta,0,True + ), "characterized TAV does not match RTLsim'd one!" +E AssertionError: characterized TAV does not match RTLsim'd one! +E assert False +E + where False = tree_model_test(, ('ConvolutionInputGenerator', [3, 3], 2, [8, 8], [4, 4], [1, 1], ...), 'xc7z020clg400-1', 4, 40, 0, True) + +test_fpgadataflow_convinputgenerator.py:343: AssertionError +_ test_fpgadataflow_analytical_characterization_slidingwindow[False-1-0-1-1-dilation1-stride0-4-ifm_dim0-k0-idt0] _ + +idt = INT2, k = [2, 2], ifm_dim = [8, 8], ifm_ch = 4, stride = [1, 1] +dilation = [2, 2], simd = 1, dw = 1, parallel_window = 0, m = 1, flip = False + + @pytest.mark.parametrize("idt", [DataType["INT2"]]) + # kernel size + @pytest.mark.parametrize("k", [[2, 2], [3, 3], [1, 5]]) + # input dimension + @pytest.mark.parametrize("ifm_dim", [[8, 8], [1, 21]]) + # input channels + @pytest.mark.parametrize("ifm_ch", [2, 4]) + # Stride + @pytest.mark.parametrize("stride", [[1, 1], [2, 2]]) + # Dilation + @pytest.mark.parametrize("dilation", [[1, 1], [2, 2]]) + # input channel parallelism ("SIMD") + @pytest.mark.parametrize("simd", [1, 2]) + # depthwise + @pytest.mark.parametrize("dw", [0, 1]) + # parallel_window enable (MMV_out = M*K) + @pytest.mark.parametrize("parallel_window", [0, 1]) + # in/out MMV ("M") + @pytest.mark.parametrize("m", [1]) + # Flip dimensions + @pytest.mark.parametrize("flip", [False]) + @pytest.mark.fpgadataflow + @pytest.mark.slow + @pytest.mark.vivado + @pytest.mark.node_tree_modeling + def test_fpgadataflow_analytical_characterization_slidingwindow( + idt, + k, + ifm_dim, + ifm_ch, + stride, + dilation, + simd, + dw, + parallel_window, + m, + flip, + ): + if flip: + if ( + ifm_dim[0] == ifm_dim[1] + and 
k[0] == k[1] + and stride[0] == stride[1] + and dilation[0] == dilation[1] + ): + pytest.skip("Dimension flip would have no effect") + k = k[::-1] + ifm_dim = ifm_dim[::-1] + stride = stride[::-1] + dilation = dilation[::-1] + + k_h, k_w = k + ifm_dim_h, ifm_dim_w = ifm_dim + stride_h, stride_w = stride + dilation_h, dilation_w = dilation + + kernel_width = (k_w - 1) * dilation_w + 1 # incl. dilation + kernel_height = (k_h - 1) * dilation_h + 1 # incl. dilation + + if simd > ifm_ch: + pytest.skip("SIMD cannot be larger than number of input channels") + if ifm_ch % simd != 0: + pytest.skip("SIMD must divide number of input channels") + if kernel_height > ifm_dim_h or stride_h > ifm_dim_h: + pytest.skip("Illegal convolution configuration: kernel or stride > FM dimension") + if kernel_width > ifm_dim_w or stride_w > ifm_dim_w: + pytest.skip("Illegal convolution configuration: kernel or stride > FM dimension") + if (k_h == 1 and dilation_h != 1) or (k_w == 1 and dilation_w != 1): + pytest.skip("Illegal convolution configuration: dilation for unitary kernel dim") + if ((stride_h > k_h) or (stride_w > k_w)) and not (parallel_window or (k_h == 1 and k_w == 1)): + pytest.skip("Not all combinations for stride > k edge case supported in default mode") + if parallel_window and simd != ifm_ch and not (dw or (k_h == 1 and k_w == 1)): + pytest.skip("Parallel window requires SIMD=C for non-depthwise case") + + ofm_dim_h = compute_conv_output_dim(ifm_dim_h, k_h, stride_h, 0, dilation_h) + ofm_dim_w = compute_conv_output_dim(ifm_dim_w, k_w, stride_w, 0, dilation_w) + ofm_dim = [ofm_dim_h, ofm_dim_w] + + model = make_single_im2col_modelwrapper(k, ifm_ch, ifm_dim, ofm_dim, stride, dilation, idt, dw) + + + model = model.transform(to_hw.InferConvInpGen()) + + # set simd + inst = getCustomOp(model.graph.node[0]) + inst.set_nodeattr("SIMD", simd) + optype = model.graph.node[0].op_type + if optype == "ConvolutionInputGenerator_rtl": + inst.set_nodeattr("parallel_window", parallel_window) 
+ inst.set_nodeattr("M", m) + if optype == "ConvolutionInputGenerator_hls": + if inst.get_nodeattr("is1D"): + inst.set_nodeattr("parallel_window", parallel_window) + + node_details = ( + "ConvolutionInputGenerator", + k, + ifm_ch, + ifm_dim, + ofm_dim, + stride, + dilation, + idt, + dw, + "hls", + ) + part = "xc7z020clg400-1" + target_clk_ns = 4 + max_allowed_volume_delta = 40 + +> assert tree_model_test( + model, node_details, part, target_clk_ns, max_allowed_volume_delta,0,True + ), "characterized TAV does not match RTLsim'd one!" +E AssertionError: characterized TAV does not match RTLsim'd one! +E assert False +E + where False = tree_model_test(, ('ConvolutionInputGenerator', [2, 2], 4, [8, 8], [6, 6], [1, 1], ...), 'xc7z020clg400-1', 4, 40, 0, True) + +test_fpgadataflow_convinputgenerator.py:343: AssertionError +_ test_fpgadataflow_analytical_characterization_slidingwindow[False-1-0-1-1-dilation1-stride0-4-ifm_dim0-k1-idt0] _ + +idt = INT2, k = [3, 3], ifm_dim = [8, 8], ifm_ch = 4, stride = [1, 1] +dilation = [2, 2], simd = 1, dw = 1, parallel_window = 0, m = 1, flip = False + + @pytest.mark.parametrize("idt", [DataType["INT2"]]) + # kernel size + @pytest.mark.parametrize("k", [[2, 2], [3, 3], [1, 5]]) + # input dimension + @pytest.mark.parametrize("ifm_dim", [[8, 8], [1, 21]]) + # input channels + @pytest.mark.parametrize("ifm_ch", [2, 4]) + # Stride + @pytest.mark.parametrize("stride", [[1, 1], [2, 2]]) + # Dilation + @pytest.mark.parametrize("dilation", [[1, 1], [2, 2]]) + # input channel parallelism ("SIMD") + @pytest.mark.parametrize("simd", [1, 2]) + # depthwise + @pytest.mark.parametrize("dw", [0, 1]) + # parallel_window enable (MMV_out = M*K) + @pytest.mark.parametrize("parallel_window", [0, 1]) + # in/out MMV ("M") + @pytest.mark.parametrize("m", [1]) + # Flip dimensions + @pytest.mark.parametrize("flip", [False]) + @pytest.mark.fpgadataflow + @pytest.mark.slow + @pytest.mark.vivado + @pytest.mark.node_tree_modeling + def 
test_fpgadataflow_analytical_characterization_slidingwindow( + idt, + k, + ifm_dim, + ifm_ch, + stride, + dilation, + simd, + dw, + parallel_window, + m, + flip, + ): + if flip: + if ( + ifm_dim[0] == ifm_dim[1] + and k[0] == k[1] + and stride[0] == stride[1] + and dilation[0] == dilation[1] + ): + pytest.skip("Dimension flip would have no effect") + k = k[::-1] + ifm_dim = ifm_dim[::-1] + stride = stride[::-1] + dilation = dilation[::-1] + + k_h, k_w = k + ifm_dim_h, ifm_dim_w = ifm_dim + stride_h, stride_w = stride + dilation_h, dilation_w = dilation + + kernel_width = (k_w - 1) * dilation_w + 1 # incl. dilation + kernel_height = (k_h - 1) * dilation_h + 1 # incl. dilation + + if simd > ifm_ch: + pytest.skip("SIMD cannot be larger than number of input channels") + if ifm_ch % simd != 0: + pytest.skip("SIMD must divide number of input channels") + if kernel_height > ifm_dim_h or stride_h > ifm_dim_h: + pytest.skip("Illegal convolution configuration: kernel or stride > FM dimension") + if kernel_width > ifm_dim_w or stride_w > ifm_dim_w: + pytest.skip("Illegal convolution configuration: kernel or stride > FM dimension") + if (k_h == 1 and dilation_h != 1) or (k_w == 1 and dilation_w != 1): + pytest.skip("Illegal convolution configuration: dilation for unitary kernel dim") + if ((stride_h > k_h) or (stride_w > k_w)) and not (parallel_window or (k_h == 1 and k_w == 1)): + pytest.skip("Not all combinations for stride > k edge case supported in default mode") + if parallel_window and simd != ifm_ch and not (dw or (k_h == 1 and k_w == 1)): + pytest.skip("Parallel window requires SIMD=C for non-depthwise case") + + ofm_dim_h = compute_conv_output_dim(ifm_dim_h, k_h, stride_h, 0, dilation_h) + ofm_dim_w = compute_conv_output_dim(ifm_dim_w, k_w, stride_w, 0, dilation_w) + ofm_dim = [ofm_dim_h, ofm_dim_w] + + model = make_single_im2col_modelwrapper(k, ifm_ch, ifm_dim, ofm_dim, stride, dilation, idt, dw) + + + model = model.transform(to_hw.InferConvInpGen()) + + # set simd + 
inst = getCustomOp(model.graph.node[0]) + inst.set_nodeattr("SIMD", simd) + optype = model.graph.node[0].op_type + if optype == "ConvolutionInputGenerator_rtl": + inst.set_nodeattr("parallel_window", parallel_window) + inst.set_nodeattr("M", m) + if optype == "ConvolutionInputGenerator_hls": + if inst.get_nodeattr("is1D"): + inst.set_nodeattr("parallel_window", parallel_window) + + node_details = ( + "ConvolutionInputGenerator", + k, + ifm_ch, + ifm_dim, + ofm_dim, + stride, + dilation, + idt, + dw, + "hls", + ) + part = "xc7z020clg400-1" + target_clk_ns = 4 + max_allowed_volume_delta = 40 + +> assert tree_model_test( + model, node_details, part, target_clk_ns, max_allowed_volume_delta,0,True + ), "characterized TAV does not match RTLsim'd one!" +E AssertionError: characterized TAV does not match RTLsim'd one! +E assert False +E + where False = tree_model_test(, ('ConvolutionInputGenerator', [3, 3], 4, [8, 8], [4, 4], [1, 1], ...), 'xc7z020clg400-1', 4, 40, 0, True) + +test_fpgadataflow_convinputgenerator.py:343: AssertionError +_ test_fpgadataflow_analytical_characterization_slidingwindow[False-1-0-1-1-dilation1-stride1-2-ifm_dim0-k1-idt0] _ + +idt = INT2, k = [3, 3], ifm_dim = [8, 8], ifm_ch = 2, stride = [2, 2] +dilation = [2, 2], simd = 1, dw = 1, parallel_window = 0, m = 1, flip = False + + @pytest.mark.parametrize("idt", [DataType["INT2"]]) + # kernel size + @pytest.mark.parametrize("k", [[2, 2], [3, 3], [1, 5]]) + # input dimension + @pytest.mark.parametrize("ifm_dim", [[8, 8], [1, 21]]) + # input channels + @pytest.mark.parametrize("ifm_ch", [2, 4]) + # Stride + @pytest.mark.parametrize("stride", [[1, 1], [2, 2]]) + # Dilation + @pytest.mark.parametrize("dilation", [[1, 1], [2, 2]]) + # input channel parallelism ("SIMD") + @pytest.mark.parametrize("simd", [1, 2]) + # depthwise + @pytest.mark.parametrize("dw", [0, 1]) + # parallel_window enable (MMV_out = M*K) + @pytest.mark.parametrize("parallel_window", [0, 1]) + # in/out MMV ("M") + 
@pytest.mark.parametrize("m", [1]) + # Flip dimensions + @pytest.mark.parametrize("flip", [False]) + @pytest.mark.fpgadataflow + @pytest.mark.slow + @pytest.mark.vivado + @pytest.mark.node_tree_modeling + def test_fpgadataflow_analytical_characterization_slidingwindow( + idt, + k, + ifm_dim, + ifm_ch, + stride, + dilation, + simd, + dw, + parallel_window, + m, + flip, + ): + if flip: + if ( + ifm_dim[0] == ifm_dim[1] + and k[0] == k[1] + and stride[0] == stride[1] + and dilation[0] == dilation[1] + ): + pytest.skip("Dimension flip would have no effect") + k = k[::-1] + ifm_dim = ifm_dim[::-1] + stride = stride[::-1] + dilation = dilation[::-1] + + k_h, k_w = k + ifm_dim_h, ifm_dim_w = ifm_dim + stride_h, stride_w = stride + dilation_h, dilation_w = dilation + + kernel_width = (k_w - 1) * dilation_w + 1 # incl. dilation + kernel_height = (k_h - 1) * dilation_h + 1 # incl. dilation + + if simd > ifm_ch: + pytest.skip("SIMD cannot be larger than number of input channels") + if ifm_ch % simd != 0: + pytest.skip("SIMD must divide number of input channels") + if kernel_height > ifm_dim_h or stride_h > ifm_dim_h: + pytest.skip("Illegal convolution configuration: kernel or stride > FM dimension") + if kernel_width > ifm_dim_w or stride_w > ifm_dim_w: + pytest.skip("Illegal convolution configuration: kernel or stride > FM dimension") + if (k_h == 1 and dilation_h != 1) or (k_w == 1 and dilation_w != 1): + pytest.skip("Illegal convolution configuration: dilation for unitary kernel dim") + if ((stride_h > k_h) or (stride_w > k_w)) and not (parallel_window or (k_h == 1 and k_w == 1)): + pytest.skip("Not all combinations for stride > k edge case supported in default mode") + if parallel_window and simd != ifm_ch and not (dw or (k_h == 1 and k_w == 1)): + pytest.skip("Parallel window requires SIMD=C for non-depthwise case") + + ofm_dim_h = compute_conv_output_dim(ifm_dim_h, k_h, stride_h, 0, dilation_h) + ofm_dim_w = compute_conv_output_dim(ifm_dim_w, k_w, stride_w, 0, 
dilation_w) + ofm_dim = [ofm_dim_h, ofm_dim_w] + + model = make_single_im2col_modelwrapper(k, ifm_ch, ifm_dim, ofm_dim, stride, dilation, idt, dw) + + + model = model.transform(to_hw.InferConvInpGen()) + + # set simd + inst = getCustomOp(model.graph.node[0]) + inst.set_nodeattr("SIMD", simd) + optype = model.graph.node[0].op_type + if optype == "ConvolutionInputGenerator_rtl": + inst.set_nodeattr("parallel_window", parallel_window) + inst.set_nodeattr("M", m) + if optype == "ConvolutionInputGenerator_hls": + if inst.get_nodeattr("is1D"): + inst.set_nodeattr("parallel_window", parallel_window) + + node_details = ( + "ConvolutionInputGenerator", + k, + ifm_ch, + ifm_dim, + ofm_dim, + stride, + dilation, + idt, + dw, + "hls", + ) + part = "xc7z020clg400-1" + target_clk_ns = 4 + max_allowed_volume_delta = 40 + +> assert tree_model_test( + model, node_details, part, target_clk_ns, max_allowed_volume_delta,0,True + ), "characterized TAV does not match RTLsim'd one!" +E AssertionError: characterized TAV does not match RTLsim'd one! 
+E assert False +E + where False = tree_model_test(, ('ConvolutionInputGenerator', [3, 3], 2, [8, 8], [2, 2], [2, 2], ...), 'xc7z020clg400-1', 4, 40, 0, True) + +test_fpgadataflow_convinputgenerator.py:343: AssertionError +_ test_fpgadataflow_analytical_characterization_slidingwindow[False-1-0-1-1-dilation1-stride1-4-ifm_dim0-k1-idt0] _ + +idt = INT2, k = [3, 3], ifm_dim = [8, 8], ifm_ch = 4, stride = [2, 2] +dilation = [2, 2], simd = 1, dw = 1, parallel_window = 0, m = 1, flip = False + + @pytest.mark.parametrize("idt", [DataType["INT2"]]) + # kernel size + @pytest.mark.parametrize("k", [[2, 2], [3, 3], [1, 5]]) + # input dimension + @pytest.mark.parametrize("ifm_dim", [[8, 8], [1, 21]]) + # input channels + @pytest.mark.parametrize("ifm_ch", [2, 4]) + # Stride + @pytest.mark.parametrize("stride", [[1, 1], [2, 2]]) + # Dilation + @pytest.mark.parametrize("dilation", [[1, 1], [2, 2]]) + # input channel parallelism ("SIMD") + @pytest.mark.parametrize("simd", [1, 2]) + # depthwise + @pytest.mark.parametrize("dw", [0, 1]) + # parallel_window enable (MMV_out = M*K) + @pytest.mark.parametrize("parallel_window", [0, 1]) + # in/out MMV ("M") + @pytest.mark.parametrize("m", [1]) + # Flip dimensions + @pytest.mark.parametrize("flip", [False]) + @pytest.mark.fpgadataflow + @pytest.mark.slow + @pytest.mark.vivado + @pytest.mark.node_tree_modeling + def test_fpgadataflow_analytical_characterization_slidingwindow( + idt, + k, + ifm_dim, + ifm_ch, + stride, + dilation, + simd, + dw, + parallel_window, + m, + flip, + ): + if flip: + if ( + ifm_dim[0] == ifm_dim[1] + and k[0] == k[1] + and stride[0] == stride[1] + and dilation[0] == dilation[1] + ): + pytest.skip("Dimension flip would have no effect") + k = k[::-1] + ifm_dim = ifm_dim[::-1] + stride = stride[::-1] + dilation = dilation[::-1] + + k_h, k_w = k + ifm_dim_h, ifm_dim_w = ifm_dim + stride_h, stride_w = stride + dilation_h, dilation_w = dilation + + kernel_width = (k_w - 1) * dilation_w + 1 # incl. 
dilation + kernel_height = (k_h - 1) * dilation_h + 1 # incl. dilation + + if simd > ifm_ch: + pytest.skip("SIMD cannot be larger than number of input channels") + if ifm_ch % simd != 0: + pytest.skip("SIMD must divide number of input channels") + if kernel_height > ifm_dim_h or stride_h > ifm_dim_h: + pytest.skip("Illegal convolution configuration: kernel or stride > FM dimension") + if kernel_width > ifm_dim_w or stride_w > ifm_dim_w: + pytest.skip("Illegal convolution configuration: kernel or stride > FM dimension") + if (k_h == 1 and dilation_h != 1) or (k_w == 1 and dilation_w != 1): + pytest.skip("Illegal convolution configuration: dilation for unitary kernel dim") + if ((stride_h > k_h) or (stride_w > k_w)) and not (parallel_window or (k_h == 1 and k_w == 1)): + pytest.skip("Not all combinations for stride > k edge case supported in default mode") + if parallel_window and simd != ifm_ch and not (dw or (k_h == 1 and k_w == 1)): + pytest.skip("Parallel window requires SIMD=C for non-depthwise case") + + ofm_dim_h = compute_conv_output_dim(ifm_dim_h, k_h, stride_h, 0, dilation_h) + ofm_dim_w = compute_conv_output_dim(ifm_dim_w, k_w, stride_w, 0, dilation_w) + ofm_dim = [ofm_dim_h, ofm_dim_w] + + model = make_single_im2col_modelwrapper(k, ifm_ch, ifm_dim, ofm_dim, stride, dilation, idt, dw) + + + model = model.transform(to_hw.InferConvInpGen()) + + # set simd + inst = getCustomOp(model.graph.node[0]) + inst.set_nodeattr("SIMD", simd) + optype = model.graph.node[0].op_type + if optype == "ConvolutionInputGenerator_rtl": + inst.set_nodeattr("parallel_window", parallel_window) + inst.set_nodeattr("M", m) + if optype == "ConvolutionInputGenerator_hls": + if inst.get_nodeattr("is1D"): + inst.set_nodeattr("parallel_window", parallel_window) + + node_details = ( + "ConvolutionInputGenerator", + k, + ifm_ch, + ifm_dim, + ofm_dim, + stride, + dilation, + idt, + dw, + "hls", + ) + part = "xc7z020clg400-1" + target_clk_ns = 4 + max_allowed_volume_delta = 40 + +> assert 
tree_model_test( + model, node_details, part, target_clk_ns, max_allowed_volume_delta,0,True + ), "characterized TAV does not match RTLsim'd one!" +E AssertionError: characterized TAV does not match RTLsim'd one! +E assert False +E + where False = tree_model_test(, ('ConvolutionInputGenerator', [3, 3], 4, [8, 8], [2, 2], [2, 2], ...), 'xc7z020clg400-1', 4, 40, 0, True) + +test_fpgadataflow_convinputgenerator.py:343: AssertionError +_ test_fpgadataflow_analytical_characterization_slidingwindow[False-1-0-1-2-dilation0-stride0-4-ifm_dim0-k2-idt0] _ + +idt = INT2, k = [1, 5], ifm_dim = [8, 8], ifm_ch = 4, stride = [1, 1] +dilation = [1, 1], simd = 2, dw = 1, parallel_window = 0, m = 1, flip = False + + @pytest.mark.parametrize("idt", [DataType["INT2"]]) + # kernel size + @pytest.mark.parametrize("k", [[2, 2], [3, 3], [1, 5]]) + # input dimension + @pytest.mark.parametrize("ifm_dim", [[8, 8], [1, 21]]) + # input channels + @pytest.mark.parametrize("ifm_ch", [2, 4]) + # Stride + @pytest.mark.parametrize("stride", [[1, 1], [2, 2]]) + # Dilation + @pytest.mark.parametrize("dilation", [[1, 1], [2, 2]]) + # input channel parallelism ("SIMD") + @pytest.mark.parametrize("simd", [1, 2]) + # depthwise + @pytest.mark.parametrize("dw", [0, 1]) + # parallel_window enable (MMV_out = M*K) + @pytest.mark.parametrize("parallel_window", [0, 1]) + # in/out MMV ("M") + @pytest.mark.parametrize("m", [1]) + # Flip dimensions + @pytest.mark.parametrize("flip", [False]) + @pytest.mark.fpgadataflow + @pytest.mark.slow + @pytest.mark.vivado + @pytest.mark.node_tree_modeling + def test_fpgadataflow_analytical_characterization_slidingwindow( + idt, + k, + ifm_dim, + ifm_ch, + stride, + dilation, + simd, + dw, + parallel_window, + m, + flip, + ): + if flip: + if ( + ifm_dim[0] == ifm_dim[1] + and k[0] == k[1] + and stride[0] == stride[1] + and dilation[0] == dilation[1] + ): + pytest.skip("Dimension flip would have no effect") + k = k[::-1] + ifm_dim = ifm_dim[::-1] + stride = stride[::-1] + 
dilation = dilation[::-1] + + k_h, k_w = k + ifm_dim_h, ifm_dim_w = ifm_dim + stride_h, stride_w = stride + dilation_h, dilation_w = dilation + + kernel_width = (k_w - 1) * dilation_w + 1 # incl. dilation + kernel_height = (k_h - 1) * dilation_h + 1 # incl. dilation + + if simd > ifm_ch: + pytest.skip("SIMD cannot be larger than number of input channels") + if ifm_ch % simd != 0: + pytest.skip("SIMD must divide number of input channels") + if kernel_height > ifm_dim_h or stride_h > ifm_dim_h: + pytest.skip("Illegal convolution configuration: kernel or stride > FM dimension") + if kernel_width > ifm_dim_w or stride_w > ifm_dim_w: + pytest.skip("Illegal convolution configuration: kernel or stride > FM dimension") + if (k_h == 1 and dilation_h != 1) or (k_w == 1 and dilation_w != 1): + pytest.skip("Illegal convolution configuration: dilation for unitary kernel dim") + if ((stride_h > k_h) or (stride_w > k_w)) and not (parallel_window or (k_h == 1 and k_w == 1)): + pytest.skip("Not all combinations for stride > k edge case supported in default mode") + if parallel_window and simd != ifm_ch and not (dw or (k_h == 1 and k_w == 1)): + pytest.skip("Parallel window requires SIMD=C for non-depthwise case") + + ofm_dim_h = compute_conv_output_dim(ifm_dim_h, k_h, stride_h, 0, dilation_h) + ofm_dim_w = compute_conv_output_dim(ifm_dim_w, k_w, stride_w, 0, dilation_w) + ofm_dim = [ofm_dim_h, ofm_dim_w] + + model = make_single_im2col_modelwrapper(k, ifm_ch, ifm_dim, ofm_dim, stride, dilation, idt, dw) + + + model = model.transform(to_hw.InferConvInpGen()) + + # set simd + inst = getCustomOp(model.graph.node[0]) + inst.set_nodeattr("SIMD", simd) + optype = model.graph.node[0].op_type + if optype == "ConvolutionInputGenerator_rtl": + inst.set_nodeattr("parallel_window", parallel_window) + inst.set_nodeattr("M", m) + if optype == "ConvolutionInputGenerator_hls": + if inst.get_nodeattr("is1D"): + inst.set_nodeattr("parallel_window", parallel_window) + + node_details = ( + 
"ConvolutionInputGenerator", + k, + ifm_ch, + ifm_dim, + ofm_dim, + stride, + dilation, + idt, + dw, + "hls", + ) + part = "xc7z020clg400-1" + target_clk_ns = 4 + max_allowed_volume_delta = 40 + +> assert tree_model_test( + model, node_details, part, target_clk_ns, max_allowed_volume_delta,0,True + ), "characterized TAV does not match RTLsim'd one!" +E AssertionError: characterized TAV does not match RTLsim'd one! +E assert False +E + where False = tree_model_test(, ('ConvolutionInputGenerator', [1, 5], 4, [8, 8], [8, 4], [1, 1], ...), 'xc7z020clg400-1', 4, 40, 0, True) + +test_fpgadataflow_convinputgenerator.py:343: AssertionError +_ test_fpgadataflow_analytical_characterization_slidingwindow[False-1-0-1-2-dilation0-stride0-4-ifm_dim1-k2-idt0] _ + +idt = INT2, k = [1, 5], ifm_dim = [1, 21], ifm_ch = 4, stride = [1, 1] +dilation = [1, 1], simd = 2, dw = 1, parallel_window = 0, m = 1, flip = False + + @pytest.mark.parametrize("idt", [DataType["INT2"]]) + # kernel size + @pytest.mark.parametrize("k", [[2, 2], [3, 3], [1, 5]]) + # input dimension + @pytest.mark.parametrize("ifm_dim", [[8, 8], [1, 21]]) + # input channels + @pytest.mark.parametrize("ifm_ch", [2, 4]) + # Stride + @pytest.mark.parametrize("stride", [[1, 1], [2, 2]]) + # Dilation + @pytest.mark.parametrize("dilation", [[1, 1], [2, 2]]) + # input channel parallelism ("SIMD") + @pytest.mark.parametrize("simd", [1, 2]) + # depthwise + @pytest.mark.parametrize("dw", [0, 1]) + # parallel_window enable (MMV_out = M*K) + @pytest.mark.parametrize("parallel_window", [0, 1]) + # in/out MMV ("M") + @pytest.mark.parametrize("m", [1]) + # Flip dimensions + @pytest.mark.parametrize("flip", [False]) + @pytest.mark.fpgadataflow + @pytest.mark.slow + @pytest.mark.vivado + @pytest.mark.node_tree_modeling + def test_fpgadataflow_analytical_characterization_slidingwindow( + idt, + k, + ifm_dim, + ifm_ch, + stride, + dilation, + simd, + dw, + parallel_window, + m, + flip, + ): + if flip: + if ( + ifm_dim[0] == ifm_dim[1] + 
and k[0] == k[1] + and stride[0] == stride[1] + and dilation[0] == dilation[1] + ): + pytest.skip("Dimension flip would have no effect") + k = k[::-1] + ifm_dim = ifm_dim[::-1] + stride = stride[::-1] + dilation = dilation[::-1] + + k_h, k_w = k + ifm_dim_h, ifm_dim_w = ifm_dim + stride_h, stride_w = stride + dilation_h, dilation_w = dilation + + kernel_width = (k_w - 1) * dilation_w + 1 # incl. dilation + kernel_height = (k_h - 1) * dilation_h + 1 # incl. dilation + + if simd > ifm_ch: + pytest.skip("SIMD cannot be larger than number of input channels") + if ifm_ch % simd != 0: + pytest.skip("SIMD must divide number of input channels") + if kernel_height > ifm_dim_h or stride_h > ifm_dim_h: + pytest.skip("Illegal convolution configuration: kernel or stride > FM dimension") + if kernel_width > ifm_dim_w or stride_w > ifm_dim_w: + pytest.skip("Illegal convolution configuration: kernel or stride > FM dimension") + if (k_h == 1 and dilation_h != 1) or (k_w == 1 and dilation_w != 1): + pytest.skip("Illegal convolution configuration: dilation for unitary kernel dim") + if ((stride_h > k_h) or (stride_w > k_w)) and not (parallel_window or (k_h == 1 and k_w == 1)): + pytest.skip("Not all combinations for stride > k edge case supported in default mode") + if parallel_window and simd != ifm_ch and not (dw or (k_h == 1 and k_w == 1)): + pytest.skip("Parallel window requires SIMD=C for non-depthwise case") + + ofm_dim_h = compute_conv_output_dim(ifm_dim_h, k_h, stride_h, 0, dilation_h) + ofm_dim_w = compute_conv_output_dim(ifm_dim_w, k_w, stride_w, 0, dilation_w) + ofm_dim = [ofm_dim_h, ofm_dim_w] + + model = make_single_im2col_modelwrapper(k, ifm_ch, ifm_dim, ofm_dim, stride, dilation, idt, dw) + + + model = model.transform(to_hw.InferConvInpGen()) + + # set simd + inst = getCustomOp(model.graph.node[0]) + inst.set_nodeattr("SIMD", simd) + optype = model.graph.node[0].op_type + if optype == "ConvolutionInputGenerator_rtl": + inst.set_nodeattr("parallel_window", 
parallel_window) + inst.set_nodeattr("M", m) + if optype == "ConvolutionInputGenerator_hls": + if inst.get_nodeattr("is1D"): + inst.set_nodeattr("parallel_window", parallel_window) + + node_details = ( + "ConvolutionInputGenerator", + k, + ifm_ch, + ifm_dim, + ofm_dim, + stride, + dilation, + idt, + dw, + "hls", + ) + part = "xc7z020clg400-1" + target_clk_ns = 4 + max_allowed_volume_delta = 40 + +> assert tree_model_test( + model, node_details, part, target_clk_ns, max_allowed_volume_delta,0,True + ), "characterized TAV does not match RTLsim'd one!" +E AssertionError: characterized TAV does not match RTLsim'd one! +E assert False +E + where False = tree_model_test(, ('ConvolutionInputGenerator', [1, 5], 4, [1, 21], [1, 17], [1, 1], ...), 'xc7z020clg400-1', 4, 40, 0, True) + +test_fpgadataflow_convinputgenerator.py:343: AssertionError +_ test_fpgadataflow_analytical_characterization_slidingwindow[False-1-0-1-2-dilation0-stride1-2-ifm_dim0-k0-idt0] _ + +idt = INT2, k = [2, 2], ifm_dim = [8, 8], ifm_ch = 2, stride = [2, 2] +dilation = [1, 1], simd = 2, dw = 1, parallel_window = 0, m = 1, flip = False + + @pytest.mark.parametrize("idt", [DataType["INT2"]]) + # kernel size + @pytest.mark.parametrize("k", [[2, 2], [3, 3], [1, 5]]) + # input dimension + @pytest.mark.parametrize("ifm_dim", [[8, 8], [1, 21]]) + # input channels + @pytest.mark.parametrize("ifm_ch", [2, 4]) + # Stride + @pytest.mark.parametrize("stride", [[1, 1], [2, 2]]) + # Dilation + @pytest.mark.parametrize("dilation", [[1, 1], [2, 2]]) + # input channel parallelism ("SIMD") + @pytest.mark.parametrize("simd", [1, 2]) + # depthwise + @pytest.mark.parametrize("dw", [0, 1]) + # parallel_window enable (MMV_out = M*K) + @pytest.mark.parametrize("parallel_window", [0, 1]) + # in/out MMV ("M") + @pytest.mark.parametrize("m", [1]) + # Flip dimensions + @pytest.mark.parametrize("flip", [False]) + @pytest.mark.fpgadataflow + @pytest.mark.slow + @pytest.mark.vivado + @pytest.mark.node_tree_modeling + def 
test_fpgadataflow_analytical_characterization_slidingwindow( + idt, + k, + ifm_dim, + ifm_ch, + stride, + dilation, + simd, + dw, + parallel_window, + m, + flip, + ): + if flip: + if ( + ifm_dim[0] == ifm_dim[1] + and k[0] == k[1] + and stride[0] == stride[1] + and dilation[0] == dilation[1] + ): + pytest.skip("Dimension flip would have no effect") + k = k[::-1] + ifm_dim = ifm_dim[::-1] + stride = stride[::-1] + dilation = dilation[::-1] + + k_h, k_w = k + ifm_dim_h, ifm_dim_w = ifm_dim + stride_h, stride_w = stride + dilation_h, dilation_w = dilation + + kernel_width = (k_w - 1) * dilation_w + 1 # incl. dilation + kernel_height = (k_h - 1) * dilation_h + 1 # incl. dilation + + if simd > ifm_ch: + pytest.skip("SIMD cannot be larger than number of input channels") + if ifm_ch % simd != 0: + pytest.skip("SIMD must divide number of input channels") + if kernel_height > ifm_dim_h or stride_h > ifm_dim_h: + pytest.skip("Illegal convolution configuration: kernel or stride > FM dimension") + if kernel_width > ifm_dim_w or stride_w > ifm_dim_w: + pytest.skip("Illegal convolution configuration: kernel or stride > FM dimension") + if (k_h == 1 and dilation_h != 1) or (k_w == 1 and dilation_w != 1): + pytest.skip("Illegal convolution configuration: dilation for unitary kernel dim") + if ((stride_h > k_h) or (stride_w > k_w)) and not (parallel_window or (k_h == 1 and k_w == 1)): + pytest.skip("Not all combinations for stride > k edge case supported in default mode") + if parallel_window and simd != ifm_ch and not (dw or (k_h == 1 and k_w == 1)): + pytest.skip("Parallel window requires SIMD=C for non-depthwise case") + + ofm_dim_h = compute_conv_output_dim(ifm_dim_h, k_h, stride_h, 0, dilation_h) + ofm_dim_w = compute_conv_output_dim(ifm_dim_w, k_w, stride_w, 0, dilation_w) + ofm_dim = [ofm_dim_h, ofm_dim_w] + + model = make_single_im2col_modelwrapper(k, ifm_ch, ifm_dim, ofm_dim, stride, dilation, idt, dw) + + + model = model.transform(to_hw.InferConvInpGen()) + + # set simd + 
inst = getCustomOp(model.graph.node[0]) + inst.set_nodeattr("SIMD", simd) + optype = model.graph.node[0].op_type + if optype == "ConvolutionInputGenerator_rtl": + inst.set_nodeattr("parallel_window", parallel_window) + inst.set_nodeattr("M", m) + if optype == "ConvolutionInputGenerator_hls": + if inst.get_nodeattr("is1D"): + inst.set_nodeattr("parallel_window", parallel_window) + + node_details = ( + "ConvolutionInputGenerator", + k, + ifm_ch, + ifm_dim, + ofm_dim, + stride, + dilation, + idt, + dw, + "hls", + ) + part = "xc7z020clg400-1" + target_clk_ns = 4 + max_allowed_volume_delta = 40 + +> assert tree_model_test( + model, node_details, part, target_clk_ns, max_allowed_volume_delta,0,True + ), "characterized TAV does not match RTLsim'd one!" +E AssertionError: characterized TAV does not match RTLsim'd one! +E assert False +E + where False = tree_model_test(, ('ConvolutionInputGenerator', [2, 2], 2, [8, 8], [4, 4], [2, 2], ...), 'xc7z020clg400-1', 4, 40, 0, True) + +test_fpgadataflow_convinputgenerator.py:343: AssertionError +_ test_fpgadataflow_analytical_characterization_slidingwindow[False-1-0-1-2-dilation0-stride1-4-ifm_dim0-k0-idt0] _ + +idt = INT2, k = [2, 2], ifm_dim = [8, 8], ifm_ch = 4, stride = [2, 2] +dilation = [1, 1], simd = 2, dw = 1, parallel_window = 0, m = 1, flip = False + + @pytest.mark.parametrize("idt", [DataType["INT2"]]) + # kernel size + @pytest.mark.parametrize("k", [[2, 2], [3, 3], [1, 5]]) + # input dimension + @pytest.mark.parametrize("ifm_dim", [[8, 8], [1, 21]]) + # input channels + @pytest.mark.parametrize("ifm_ch", [2, 4]) + # Stride + @pytest.mark.parametrize("stride", [[1, 1], [2, 2]]) + # Dilation + @pytest.mark.parametrize("dilation", [[1, 1], [2, 2]]) + # input channel parallelism ("SIMD") + @pytest.mark.parametrize("simd", [1, 2]) + # depthwise + @pytest.mark.parametrize("dw", [0, 1]) + # parallel_window enable (MMV_out = M*K) + @pytest.mark.parametrize("parallel_window", [0, 1]) + # in/out MMV ("M") + 
@pytest.mark.parametrize("m", [1]) + # Flip dimensions + @pytest.mark.parametrize("flip", [False]) + @pytest.mark.fpgadataflow + @pytest.mark.slow + @pytest.mark.vivado + @pytest.mark.node_tree_modeling + def test_fpgadataflow_analytical_characterization_slidingwindow( + idt, + k, + ifm_dim, + ifm_ch, + stride, + dilation, + simd, + dw, + parallel_window, + m, + flip, + ): + if flip: + if ( + ifm_dim[0] == ifm_dim[1] + and k[0] == k[1] + and stride[0] == stride[1] + and dilation[0] == dilation[1] + ): + pytest.skip("Dimension flip would have no effect") + k = k[::-1] + ifm_dim = ifm_dim[::-1] + stride = stride[::-1] + dilation = dilation[::-1] + + k_h, k_w = k + ifm_dim_h, ifm_dim_w = ifm_dim + stride_h, stride_w = stride + dilation_h, dilation_w = dilation + + kernel_width = (k_w - 1) * dilation_w + 1 # incl. dilation + kernel_height = (k_h - 1) * dilation_h + 1 # incl. dilation + + if simd > ifm_ch: + pytest.skip("SIMD cannot be larger than number of input channels") + if ifm_ch % simd != 0: + pytest.skip("SIMD must divide number of input channels") + if kernel_height > ifm_dim_h or stride_h > ifm_dim_h: + pytest.skip("Illegal convolution configuration: kernel or stride > FM dimension") + if kernel_width > ifm_dim_w or stride_w > ifm_dim_w: + pytest.skip("Illegal convolution configuration: kernel or stride > FM dimension") + if (k_h == 1 and dilation_h != 1) or (k_w == 1 and dilation_w != 1): + pytest.skip("Illegal convolution configuration: dilation for unitary kernel dim") + if ((stride_h > k_h) or (stride_w > k_w)) and not (parallel_window or (k_h == 1 and k_w == 1)): + pytest.skip("Not all combinations for stride > k edge case supported in default mode") + if parallel_window and simd != ifm_ch and not (dw or (k_h == 1 and k_w == 1)): + pytest.skip("Parallel window requires SIMD=C for non-depthwise case") + + ofm_dim_h = compute_conv_output_dim(ifm_dim_h, k_h, stride_h, 0, dilation_h) + ofm_dim_w = compute_conv_output_dim(ifm_dim_w, k_w, stride_w, 0, 
dilation_w) + ofm_dim = [ofm_dim_h, ofm_dim_w] + + model = make_single_im2col_modelwrapper(k, ifm_ch, ifm_dim, ofm_dim, stride, dilation, idt, dw) + + + model = model.transform(to_hw.InferConvInpGen()) + + # set simd + inst = getCustomOp(model.graph.node[0]) + inst.set_nodeattr("SIMD", simd) + optype = model.graph.node[0].op_type + if optype == "ConvolutionInputGenerator_rtl": + inst.set_nodeattr("parallel_window", parallel_window) + inst.set_nodeattr("M", m) + if optype == "ConvolutionInputGenerator_hls": + if inst.get_nodeattr("is1D"): + inst.set_nodeattr("parallel_window", parallel_window) + + node_details = ( + "ConvolutionInputGenerator", + k, + ifm_ch, + ifm_dim, + ofm_dim, + stride, + dilation, + idt, + dw, + "hls", + ) + part = "xc7z020clg400-1" + target_clk_ns = 4 + max_allowed_volume_delta = 40 + +> assert tree_model_test( + model, node_details, part, target_clk_ns, max_allowed_volume_delta,0,True + ), "characterized TAV does not match RTLsim'd one!" +E AssertionError: characterized TAV does not match RTLsim'd one! 
+E assert False +E + where False = tree_model_test(, ('ConvolutionInputGenerator', [2, 2], 4, [8, 8], [4, 4], [2, 2], ...), 'xc7z020clg400-1', 4, 40, 0, True) + +test_fpgadataflow_convinputgenerator.py:343: AssertionError +_ test_fpgadataflow_analytical_characterization_slidingwindow[False-1-0-1-2-dilation0-stride1-4-ifm_dim0-k1-idt0] _ + +idt = INT2, k = [3, 3], ifm_dim = [8, 8], ifm_ch = 4, stride = [2, 2] +dilation = [1, 1], simd = 2, dw = 1, parallel_window = 0, m = 1, flip = False + + @pytest.mark.parametrize("idt", [DataType["INT2"]]) + # kernel size + @pytest.mark.parametrize("k", [[2, 2], [3, 3], [1, 5]]) + # input dimension + @pytest.mark.parametrize("ifm_dim", [[8, 8], [1, 21]]) + # input channels + @pytest.mark.parametrize("ifm_ch", [2, 4]) + # Stride + @pytest.mark.parametrize("stride", [[1, 1], [2, 2]]) + # Dilation + @pytest.mark.parametrize("dilation", [[1, 1], [2, 2]]) + # input channel parallelism ("SIMD") + @pytest.mark.parametrize("simd", [1, 2]) + # depthwise + @pytest.mark.parametrize("dw", [0, 1]) + # parallel_window enable (MMV_out = M*K) + @pytest.mark.parametrize("parallel_window", [0, 1]) + # in/out MMV ("M") + @pytest.mark.parametrize("m", [1]) + # Flip dimensions + @pytest.mark.parametrize("flip", [False]) + @pytest.mark.fpgadataflow + @pytest.mark.slow + @pytest.mark.vivado + @pytest.mark.node_tree_modeling + def test_fpgadataflow_analytical_characterization_slidingwindow( + idt, + k, + ifm_dim, + ifm_ch, + stride, + dilation, + simd, + dw, + parallel_window, + m, + flip, + ): + if flip: + if ( + ifm_dim[0] == ifm_dim[1] + and k[0] == k[1] + and stride[0] == stride[1] + and dilation[0] == dilation[1] + ): + pytest.skip("Dimension flip would have no effect") + k = k[::-1] + ifm_dim = ifm_dim[::-1] + stride = stride[::-1] + dilation = dilation[::-1] + + k_h, k_w = k + ifm_dim_h, ifm_dim_w = ifm_dim + stride_h, stride_w = stride + dilation_h, dilation_w = dilation + + kernel_width = (k_w - 1) * dilation_w + 1 # incl. 
dilation + kernel_height = (k_h - 1) * dilation_h + 1 # incl. dilation + + if simd > ifm_ch: + pytest.skip("SIMD cannot be larger than number of input channels") + if ifm_ch % simd != 0: + pytest.skip("SIMD must divide number of input channels") + if kernel_height > ifm_dim_h or stride_h > ifm_dim_h: + pytest.skip("Illegal convolution configuration: kernel or stride > FM dimension") + if kernel_width > ifm_dim_w or stride_w > ifm_dim_w: + pytest.skip("Illegal convolution configuration: kernel or stride > FM dimension") + if (k_h == 1 and dilation_h != 1) or (k_w == 1 and dilation_w != 1): + pytest.skip("Illegal convolution configuration: dilation for unitary kernel dim") + if ((stride_h > k_h) or (stride_w > k_w)) and not (parallel_window or (k_h == 1 and k_w == 1)): + pytest.skip("Not all combinations for stride > k edge case supported in default mode") + if parallel_window and simd != ifm_ch and not (dw or (k_h == 1 and k_w == 1)): + pytest.skip("Parallel window requires SIMD=C for non-depthwise case") + + ofm_dim_h = compute_conv_output_dim(ifm_dim_h, k_h, stride_h, 0, dilation_h) + ofm_dim_w = compute_conv_output_dim(ifm_dim_w, k_w, stride_w, 0, dilation_w) + ofm_dim = [ofm_dim_h, ofm_dim_w] + + model = make_single_im2col_modelwrapper(k, ifm_ch, ifm_dim, ofm_dim, stride, dilation, idt, dw) + + + model = model.transform(to_hw.InferConvInpGen()) + + # set simd + inst = getCustomOp(model.graph.node[0]) + inst.set_nodeattr("SIMD", simd) + optype = model.graph.node[0].op_type + if optype == "ConvolutionInputGenerator_rtl": + inst.set_nodeattr("parallel_window", parallel_window) + inst.set_nodeattr("M", m) + if optype == "ConvolutionInputGenerator_hls": + if inst.get_nodeattr("is1D"): + inst.set_nodeattr("parallel_window", parallel_window) + + node_details = ( + "ConvolutionInputGenerator", + k, + ifm_ch, + ifm_dim, + ofm_dim, + stride, + dilation, + idt, + dw, + "hls", + ) + part = "xc7z020clg400-1" + target_clk_ns = 4 + max_allowed_volume_delta = 40 + +> assert 
tree_model_test( + model, node_details, part, target_clk_ns, max_allowed_volume_delta,0,True + ), "characterized TAV does not match RTLsim'd one!" +E AssertionError: characterized TAV does not match RTLsim'd one! +E assert False +E + where False = tree_model_test(, ('ConvolutionInputGenerator', [3, 3], 4, [8, 8], [3, 3], [2, 2], ...), 'xc7z020clg400-1', 4, 40, 0, True) + +test_fpgadataflow_convinputgenerator.py:343: AssertionError +_ test_fpgadataflow_analytical_characterization_slidingwindow[False-1-0-1-2-dilation1-stride0-2-ifm_dim0-k1-idt0] _ + +idt = INT2, k = [3, 3], ifm_dim = [8, 8], ifm_ch = 2, stride = [1, 1] +dilation = [2, 2], simd = 2, dw = 1, parallel_window = 0, m = 1, flip = False + + @pytest.mark.parametrize("idt", [DataType["INT2"]]) + # kernel size + @pytest.mark.parametrize("k", [[2, 2], [3, 3], [1, 5]]) + # input dimension + @pytest.mark.parametrize("ifm_dim", [[8, 8], [1, 21]]) + # input channels + @pytest.mark.parametrize("ifm_ch", [2, 4]) + # Stride + @pytest.mark.parametrize("stride", [[1, 1], [2, 2]]) + # Dilation + @pytest.mark.parametrize("dilation", [[1, 1], [2, 2]]) + # input channel parallelism ("SIMD") + @pytest.mark.parametrize("simd", [1, 2]) + # depthwise + @pytest.mark.parametrize("dw", [0, 1]) + # parallel_window enable (MMV_out = M*K) + @pytest.mark.parametrize("parallel_window", [0, 1]) + # in/out MMV ("M") + @pytest.mark.parametrize("m", [1]) + # Flip dimensions + @pytest.mark.parametrize("flip", [False]) + @pytest.mark.fpgadataflow + @pytest.mark.slow + @pytest.mark.vivado + @pytest.mark.node_tree_modeling + def test_fpgadataflow_analytical_characterization_slidingwindow( + idt, + k, + ifm_dim, + ifm_ch, + stride, + dilation, + simd, + dw, + parallel_window, + m, + flip, + ): + if flip: + if ( + ifm_dim[0] == ifm_dim[1] + and k[0] == k[1] + and stride[0] == stride[1] + and dilation[0] == dilation[1] + ): + pytest.skip("Dimension flip would have no effect") + k = k[::-1] + ifm_dim = ifm_dim[::-1] + stride = stride[::-1] + 
dilation = dilation[::-1] + + k_h, k_w = k + ifm_dim_h, ifm_dim_w = ifm_dim + stride_h, stride_w = stride + dilation_h, dilation_w = dilation + + kernel_width = (k_w - 1) * dilation_w + 1 # incl. dilation + kernel_height = (k_h - 1) * dilation_h + 1 # incl. dilation + + if simd > ifm_ch: + pytest.skip("SIMD cannot be larger than number of input channels") + if ifm_ch % simd != 0: + pytest.skip("SIMD must divide number of input channels") + if kernel_height > ifm_dim_h or stride_h > ifm_dim_h: + pytest.skip("Illegal convolution configuration: kernel or stride > FM dimension") + if kernel_width > ifm_dim_w or stride_w > ifm_dim_w: + pytest.skip("Illegal convolution configuration: kernel or stride > FM dimension") + if (k_h == 1 and dilation_h != 1) or (k_w == 1 and dilation_w != 1): + pytest.skip("Illegal convolution configuration: dilation for unitary kernel dim") + if ((stride_h > k_h) or (stride_w > k_w)) and not (parallel_window or (k_h == 1 and k_w == 1)): + pytest.skip("Not all combinations for stride > k edge case supported in default mode") + if parallel_window and simd != ifm_ch and not (dw or (k_h == 1 and k_w == 1)): + pytest.skip("Parallel window requires SIMD=C for non-depthwise case") + + ofm_dim_h = compute_conv_output_dim(ifm_dim_h, k_h, stride_h, 0, dilation_h) + ofm_dim_w = compute_conv_output_dim(ifm_dim_w, k_w, stride_w, 0, dilation_w) + ofm_dim = [ofm_dim_h, ofm_dim_w] + + model = make_single_im2col_modelwrapper(k, ifm_ch, ifm_dim, ofm_dim, stride, dilation, idt, dw) + + + model = model.transform(to_hw.InferConvInpGen()) + + # set simd + inst = getCustomOp(model.graph.node[0]) + inst.set_nodeattr("SIMD", simd) + optype = model.graph.node[0].op_type + if optype == "ConvolutionInputGenerator_rtl": + inst.set_nodeattr("parallel_window", parallel_window) + inst.set_nodeattr("M", m) + if optype == "ConvolutionInputGenerator_hls": + if inst.get_nodeattr("is1D"): + inst.set_nodeattr("parallel_window", parallel_window) + + node_details = ( + 
"ConvolutionInputGenerator", + k, + ifm_ch, + ifm_dim, + ofm_dim, + stride, + dilation, + idt, + dw, + "hls", + ) + part = "xc7z020clg400-1" + target_clk_ns = 4 + max_allowed_volume_delta = 40 + +> assert tree_model_test( + model, node_details, part, target_clk_ns, max_allowed_volume_delta,0,True + ), "characterized TAV does not match RTLsim'd one!" +E AssertionError: characterized TAV does not match RTLsim'd one! +E assert False +E + where False = tree_model_test(, ('ConvolutionInputGenerator', [3, 3], 2, [8, 8], [4, 4], [1, 1], ...), 'xc7z020clg400-1', 4, 40, 0, True) + +test_fpgadataflow_convinputgenerator.py:343: AssertionError +_ test_fpgadataflow_analytical_characterization_slidingwindow[False-1-0-1-2-dilation1-stride0-4-ifm_dim0-k0-idt0] _ + +idt = INT2, k = [2, 2], ifm_dim = [8, 8], ifm_ch = 4, stride = [1, 1] +dilation = [2, 2], simd = 2, dw = 1, parallel_window = 0, m = 1, flip = False + + @pytest.mark.parametrize("idt", [DataType["INT2"]]) + # kernel size + @pytest.mark.parametrize("k", [[2, 2], [3, 3], [1, 5]]) + # input dimension + @pytest.mark.parametrize("ifm_dim", [[8, 8], [1, 21]]) + # input channels + @pytest.mark.parametrize("ifm_ch", [2, 4]) + # Stride + @pytest.mark.parametrize("stride", [[1, 1], [2, 2]]) + # Dilation + @pytest.mark.parametrize("dilation", [[1, 1], [2, 2]]) + # input channel parallelism ("SIMD") + @pytest.mark.parametrize("simd", [1, 2]) + # depthwise + @pytest.mark.parametrize("dw", [0, 1]) + # parallel_window enable (MMV_out = M*K) + @pytest.mark.parametrize("parallel_window", [0, 1]) + # in/out MMV ("M") + @pytest.mark.parametrize("m", [1]) + # Flip dimensions + @pytest.mark.parametrize("flip", [False]) + @pytest.mark.fpgadataflow + @pytest.mark.slow + @pytest.mark.vivado + @pytest.mark.node_tree_modeling + def test_fpgadataflow_analytical_characterization_slidingwindow( + idt, + k, + ifm_dim, + ifm_ch, + stride, + dilation, + simd, + dw, + parallel_window, + m, + flip, + ): + if flip: + if ( + ifm_dim[0] == ifm_dim[1] + and 
k[0] == k[1] + and stride[0] == stride[1] + and dilation[0] == dilation[1] + ): + pytest.skip("Dimension flip would have no effect") + k = k[::-1] + ifm_dim = ifm_dim[::-1] + stride = stride[::-1] + dilation = dilation[::-1] + + k_h, k_w = k + ifm_dim_h, ifm_dim_w = ifm_dim + stride_h, stride_w = stride + dilation_h, dilation_w = dilation + + kernel_width = (k_w - 1) * dilation_w + 1 # incl. dilation + kernel_height = (k_h - 1) * dilation_h + 1 # incl. dilation + + if simd > ifm_ch: + pytest.skip("SIMD cannot be larger than number of input channels") + if ifm_ch % simd != 0: + pytest.skip("SIMD must divide number of input channels") + if kernel_height > ifm_dim_h or stride_h > ifm_dim_h: + pytest.skip("Illegal convolution configuration: kernel or stride > FM dimension") + if kernel_width > ifm_dim_w or stride_w > ifm_dim_w: + pytest.skip("Illegal convolution configuration: kernel or stride > FM dimension") + if (k_h == 1 and dilation_h != 1) or (k_w == 1 and dilation_w != 1): + pytest.skip("Illegal convolution configuration: dilation for unitary kernel dim") + if ((stride_h > k_h) or (stride_w > k_w)) and not (parallel_window or (k_h == 1 and k_w == 1)): + pytest.skip("Not all combinations for stride > k edge case supported in default mode") + if parallel_window and simd != ifm_ch and not (dw or (k_h == 1 and k_w == 1)): + pytest.skip("Parallel window requires SIMD=C for non-depthwise case") + + ofm_dim_h = compute_conv_output_dim(ifm_dim_h, k_h, stride_h, 0, dilation_h) + ofm_dim_w = compute_conv_output_dim(ifm_dim_w, k_w, stride_w, 0, dilation_w) + ofm_dim = [ofm_dim_h, ofm_dim_w] + + model = make_single_im2col_modelwrapper(k, ifm_ch, ifm_dim, ofm_dim, stride, dilation, idt, dw) + + + model = model.transform(to_hw.InferConvInpGen()) + + # set simd + inst = getCustomOp(model.graph.node[0]) + inst.set_nodeattr("SIMD", simd) + optype = model.graph.node[0].op_type + if optype == "ConvolutionInputGenerator_rtl": + inst.set_nodeattr("parallel_window", parallel_window) 
+ inst.set_nodeattr("M", m) + if optype == "ConvolutionInputGenerator_hls": + if inst.get_nodeattr("is1D"): + inst.set_nodeattr("parallel_window", parallel_window) + + node_details = ( + "ConvolutionInputGenerator", + k, + ifm_ch, + ifm_dim, + ofm_dim, + stride, + dilation, + idt, + dw, + "hls", + ) + part = "xc7z020clg400-1" + target_clk_ns = 4 + max_allowed_volume_delta = 40 + +> assert tree_model_test( + model, node_details, part, target_clk_ns, max_allowed_volume_delta,0,True + ), "characterized TAV does not match RTLsim'd one!" +E AssertionError: characterized TAV does not match RTLsim'd one! +E assert False +E + where False = tree_model_test(, ('ConvolutionInputGenerator', [2, 2], 4, [8, 8], [6, 6], [1, 1], ...), 'xc7z020clg400-1', 4, 40, 0, True) + +test_fpgadataflow_convinputgenerator.py:343: AssertionError +_ test_fpgadataflow_analytical_characterization_slidingwindow[False-1-0-1-2-dilation1-stride0-4-ifm_dim0-k1-idt0] _ + +idt = INT2, k = [3, 3], ifm_dim = [8, 8], ifm_ch = 4, stride = [1, 1] +dilation = [2, 2], simd = 2, dw = 1, parallel_window = 0, m = 1, flip = False + + @pytest.mark.parametrize("idt", [DataType["INT2"]]) + # kernel size + @pytest.mark.parametrize("k", [[2, 2], [3, 3], [1, 5]]) + # input dimension + @pytest.mark.parametrize("ifm_dim", [[8, 8], [1, 21]]) + # input channels + @pytest.mark.parametrize("ifm_ch", [2, 4]) + # Stride + @pytest.mark.parametrize("stride", [[1, 1], [2, 2]]) + # Dilation + @pytest.mark.parametrize("dilation", [[1, 1], [2, 2]]) + # input channel parallelism ("SIMD") + @pytest.mark.parametrize("simd", [1, 2]) + # depthwise + @pytest.mark.parametrize("dw", [0, 1]) + # parallel_window enable (MMV_out = M*K) + @pytest.mark.parametrize("parallel_window", [0, 1]) + # in/out MMV ("M") + @pytest.mark.parametrize("m", [1]) + # Flip dimensions + @pytest.mark.parametrize("flip", [False]) + @pytest.mark.fpgadataflow + @pytest.mark.slow + @pytest.mark.vivado + @pytest.mark.node_tree_modeling + def 
test_fpgadataflow_analytical_characterization_slidingwindow( + idt, + k, + ifm_dim, + ifm_ch, + stride, + dilation, + simd, + dw, + parallel_window, + m, + flip, + ): + if flip: + if ( + ifm_dim[0] == ifm_dim[1] + and k[0] == k[1] + and stride[0] == stride[1] + and dilation[0] == dilation[1] + ): + pytest.skip("Dimension flip would have no effect") + k = k[::-1] + ifm_dim = ifm_dim[::-1] + stride = stride[::-1] + dilation = dilation[::-1] + + k_h, k_w = k + ifm_dim_h, ifm_dim_w = ifm_dim + stride_h, stride_w = stride + dilation_h, dilation_w = dilation + + kernel_width = (k_w - 1) * dilation_w + 1 # incl. dilation + kernel_height = (k_h - 1) * dilation_h + 1 # incl. dilation + + if simd > ifm_ch: + pytest.skip("SIMD cannot be larger than number of input channels") + if ifm_ch % simd != 0: + pytest.skip("SIMD must divide number of input channels") + if kernel_height > ifm_dim_h or stride_h > ifm_dim_h: + pytest.skip("Illegal convolution configuration: kernel or stride > FM dimension") + if kernel_width > ifm_dim_w or stride_w > ifm_dim_w: + pytest.skip("Illegal convolution configuration: kernel or stride > FM dimension") + if (k_h == 1 and dilation_h != 1) or (k_w == 1 and dilation_w != 1): + pytest.skip("Illegal convolution configuration: dilation for unitary kernel dim") + if ((stride_h > k_h) or (stride_w > k_w)) and not (parallel_window or (k_h == 1 and k_w == 1)): + pytest.skip("Not all combinations for stride > k edge case supported in default mode") + if parallel_window and simd != ifm_ch and not (dw or (k_h == 1 and k_w == 1)): + pytest.skip("Parallel window requires SIMD=C for non-depthwise case") + + ofm_dim_h = compute_conv_output_dim(ifm_dim_h, k_h, stride_h, 0, dilation_h) + ofm_dim_w = compute_conv_output_dim(ifm_dim_w, k_w, stride_w, 0, dilation_w) + ofm_dim = [ofm_dim_h, ofm_dim_w] + + model = make_single_im2col_modelwrapper(k, ifm_ch, ifm_dim, ofm_dim, stride, dilation, idt, dw) + + + model = model.transform(to_hw.InferConvInpGen()) + + # set simd + 
inst = getCustomOp(model.graph.node[0]) + inst.set_nodeattr("SIMD", simd) + optype = model.graph.node[0].op_type + if optype == "ConvolutionInputGenerator_rtl": + inst.set_nodeattr("parallel_window", parallel_window) + inst.set_nodeattr("M", m) + if optype == "ConvolutionInputGenerator_hls": + if inst.get_nodeattr("is1D"): + inst.set_nodeattr("parallel_window", parallel_window) + + node_details = ( + "ConvolutionInputGenerator", + k, + ifm_ch, + ifm_dim, + ofm_dim, + stride, + dilation, + idt, + dw, + "hls", + ) + part = "xc7z020clg400-1" + target_clk_ns = 4 + max_allowed_volume_delta = 40 + +> assert tree_model_test( + model, node_details, part, target_clk_ns, max_allowed_volume_delta,0,True + ), "characterized TAV does not match RTLsim'd one!" +E AssertionError: characterized TAV does not match RTLsim'd one! +E assert False +E + where False = tree_model_test(, ('ConvolutionInputGenerator', [3, 3], 4, [8, 8], [4, 4], [1, 1], ...), 'xc7z020clg400-1', 4, 40, 0, True) + +test_fpgadataflow_convinputgenerator.py:343: AssertionError +_ test_fpgadataflow_analytical_characterization_slidingwindow[False-1-0-1-2-dilation1-stride1-2-ifm_dim0-k1-idt0] _ + +idt = INT2, k = [3, 3], ifm_dim = [8, 8], ifm_ch = 2, stride = [2, 2] +dilation = [2, 2], simd = 2, dw = 1, parallel_window = 0, m = 1, flip = False + + @pytest.mark.parametrize("idt", [DataType["INT2"]]) + # kernel size + @pytest.mark.parametrize("k", [[2, 2], [3, 3], [1, 5]]) + # input dimension + @pytest.mark.parametrize("ifm_dim", [[8, 8], [1, 21]]) + # input channels + @pytest.mark.parametrize("ifm_ch", [2, 4]) + # Stride + @pytest.mark.parametrize("stride", [[1, 1], [2, 2]]) + # Dilation + @pytest.mark.parametrize("dilation", [[1, 1], [2, 2]]) + # input channel parallelism ("SIMD") + @pytest.mark.parametrize("simd", [1, 2]) + # depthwise + @pytest.mark.parametrize("dw", [0, 1]) + # parallel_window enable (MMV_out = M*K) + @pytest.mark.parametrize("parallel_window", [0, 1]) + # in/out MMV ("M") + 
@pytest.mark.parametrize("m", [1]) + # Flip dimensions + @pytest.mark.parametrize("flip", [False]) + @pytest.mark.fpgadataflow + @pytest.mark.slow + @pytest.mark.vivado + @pytest.mark.node_tree_modeling + def test_fpgadataflow_analytical_characterization_slidingwindow( + idt, + k, + ifm_dim, + ifm_ch, + stride, + dilation, + simd, + dw, + parallel_window, + m, + flip, + ): + if flip: + if ( + ifm_dim[0] == ifm_dim[1] + and k[0] == k[1] + and stride[0] == stride[1] + and dilation[0] == dilation[1] + ): + pytest.skip("Dimension flip would have no effect") + k = k[::-1] + ifm_dim = ifm_dim[::-1] + stride = stride[::-1] + dilation = dilation[::-1] + + k_h, k_w = k + ifm_dim_h, ifm_dim_w = ifm_dim + stride_h, stride_w = stride + dilation_h, dilation_w = dilation + + kernel_width = (k_w - 1) * dilation_w + 1 # incl. dilation + kernel_height = (k_h - 1) * dilation_h + 1 # incl. dilation + + if simd > ifm_ch: + pytest.skip("SIMD cannot be larger than number of input channels") + if ifm_ch % simd != 0: + pytest.skip("SIMD must divide number of input channels") + if kernel_height > ifm_dim_h or stride_h > ifm_dim_h: + pytest.skip("Illegal convolution configuration: kernel or stride > FM dimension") + if kernel_width > ifm_dim_w or stride_w > ifm_dim_w: + pytest.skip("Illegal convolution configuration: kernel or stride > FM dimension") + if (k_h == 1 and dilation_h != 1) or (k_w == 1 and dilation_w != 1): + pytest.skip("Illegal convolution configuration: dilation for unitary kernel dim") + if ((stride_h > k_h) or (stride_w > k_w)) and not (parallel_window or (k_h == 1 and k_w == 1)): + pytest.skip("Not all combinations for stride > k edge case supported in default mode") + if parallel_window and simd != ifm_ch and not (dw or (k_h == 1 and k_w == 1)): + pytest.skip("Parallel window requires SIMD=C for non-depthwise case") + + ofm_dim_h = compute_conv_output_dim(ifm_dim_h, k_h, stride_h, 0, dilation_h) + ofm_dim_w = compute_conv_output_dim(ifm_dim_w, k_w, stride_w, 0, 
dilation_w) + ofm_dim = [ofm_dim_h, ofm_dim_w] + + model = make_single_im2col_modelwrapper(k, ifm_ch, ifm_dim, ofm_dim, stride, dilation, idt, dw) + + + model = model.transform(to_hw.InferConvInpGen()) + + # set simd + inst = getCustomOp(model.graph.node[0]) + inst.set_nodeattr("SIMD", simd) + optype = model.graph.node[0].op_type + if optype == "ConvolutionInputGenerator_rtl": + inst.set_nodeattr("parallel_window", parallel_window) + inst.set_nodeattr("M", m) + if optype == "ConvolutionInputGenerator_hls": + if inst.get_nodeattr("is1D"): + inst.set_nodeattr("parallel_window", parallel_window) + + node_details = ( + "ConvolutionInputGenerator", + k, + ifm_ch, + ifm_dim, + ofm_dim, + stride, + dilation, + idt, + dw, + "hls", + ) + part = "xc7z020clg400-1" + target_clk_ns = 4 + max_allowed_volume_delta = 40 + +> assert tree_model_test( + model, node_details, part, target_clk_ns, max_allowed_volume_delta,0,True + ), "characterized TAV does not match RTLsim'd one!" +E AssertionError: characterized TAV does not match RTLsim'd one! 
+E assert False +E + where False = tree_model_test(, ('ConvolutionInputGenerator', [3, 3], 2, [8, 8], [2, 2], [2, 2], ...), 'xc7z020clg400-1', 4, 40, 0, True) + +test_fpgadataflow_convinputgenerator.py:343: AssertionError +_ test_fpgadataflow_analytical_characterization_slidingwindow[False-1-0-1-2-dilation1-stride1-4-ifm_dim0-k1-idt0] _ + +idt = INT2, k = [3, 3], ifm_dim = [8, 8], ifm_ch = 4, stride = [2, 2] +dilation = [2, 2], simd = 2, dw = 1, parallel_window = 0, m = 1, flip = False + + @pytest.mark.parametrize("idt", [DataType["INT2"]]) + # kernel size + @pytest.mark.parametrize("k", [[2, 2], [3, 3], [1, 5]]) + # input dimension + @pytest.mark.parametrize("ifm_dim", [[8, 8], [1, 21]]) + # input channels + @pytest.mark.parametrize("ifm_ch", [2, 4]) + # Stride + @pytest.mark.parametrize("stride", [[1, 1], [2, 2]]) + # Dilation + @pytest.mark.parametrize("dilation", [[1, 1], [2, 2]]) + # input channel parallelism ("SIMD") + @pytest.mark.parametrize("simd", [1, 2]) + # depthwise + @pytest.mark.parametrize("dw", [0, 1]) + # parallel_window enable (MMV_out = M*K) + @pytest.mark.parametrize("parallel_window", [0, 1]) + # in/out MMV ("M") + @pytest.mark.parametrize("m", [1]) + # Flip dimensions + @pytest.mark.parametrize("flip", [False]) + @pytest.mark.fpgadataflow + @pytest.mark.slow + @pytest.mark.vivado + @pytest.mark.node_tree_modeling + def test_fpgadataflow_analytical_characterization_slidingwindow( + idt, + k, + ifm_dim, + ifm_ch, + stride, + dilation, + simd, + dw, + parallel_window, + m, + flip, + ): + if flip: + if ( + ifm_dim[0] == ifm_dim[1] + and k[0] == k[1] + and stride[0] == stride[1] + and dilation[0] == dilation[1] + ): + pytest.skip("Dimension flip would have no effect") + k = k[::-1] + ifm_dim = ifm_dim[::-1] + stride = stride[::-1] + dilation = dilation[::-1] + + k_h, k_w = k + ifm_dim_h, ifm_dim_w = ifm_dim + stride_h, stride_w = stride + dilation_h, dilation_w = dilation + + kernel_width = (k_w - 1) * dilation_w + 1 # incl. 
dilation + kernel_height = (k_h - 1) * dilation_h + 1 # incl. dilation + + if simd > ifm_ch: + pytest.skip("SIMD cannot be larger than number of input channels") + if ifm_ch % simd != 0: + pytest.skip("SIMD must divide number of input channels") + if kernel_height > ifm_dim_h or stride_h > ifm_dim_h: + pytest.skip("Illegal convolution configuration: kernel or stride > FM dimension") + if kernel_width > ifm_dim_w or stride_w > ifm_dim_w: + pytest.skip("Illegal convolution configuration: kernel or stride > FM dimension") + if (k_h == 1 and dilation_h != 1) or (k_w == 1 and dilation_w != 1): + pytest.skip("Illegal convolution configuration: dilation for unitary kernel dim") + if ((stride_h > k_h) or (stride_w > k_w)) and not (parallel_window or (k_h == 1 and k_w == 1)): + pytest.skip("Not all combinations for stride > k edge case supported in default mode") + if parallel_window and simd != ifm_ch and not (dw or (k_h == 1 and k_w == 1)): + pytest.skip("Parallel window requires SIMD=C for non-depthwise case") + + ofm_dim_h = compute_conv_output_dim(ifm_dim_h, k_h, stride_h, 0, dilation_h) + ofm_dim_w = compute_conv_output_dim(ifm_dim_w, k_w, stride_w, 0, dilation_w) + ofm_dim = [ofm_dim_h, ofm_dim_w] + + model = make_single_im2col_modelwrapper(k, ifm_ch, ifm_dim, ofm_dim, stride, dilation, idt, dw) + + + model = model.transform(to_hw.InferConvInpGen()) + + # set simd + inst = getCustomOp(model.graph.node[0]) + inst.set_nodeattr("SIMD", simd) + optype = model.graph.node[0].op_type + if optype == "ConvolutionInputGenerator_rtl": + inst.set_nodeattr("parallel_window", parallel_window) + inst.set_nodeattr("M", m) + if optype == "ConvolutionInputGenerator_hls": + if inst.get_nodeattr("is1D"): + inst.set_nodeattr("parallel_window", parallel_window) + + node_details = ( + "ConvolutionInputGenerator", + k, + ifm_ch, + ifm_dim, + ofm_dim, + stride, + dilation, + idt, + dw, + "hls", + ) + part = "xc7z020clg400-1" + target_clk_ns = 4 + max_allowed_volume_delta = 40 + +> assert 
tree_model_test( + model, node_details, part, target_clk_ns, max_allowed_volume_delta,0,True + ), "characterized TAV does not match RTLsim'd one!" +E AssertionError: characterized TAV does not match RTLsim'd one! +E assert False +E + where False = tree_model_test(, ('ConvolutionInputGenerator', [3, 3], 4, [8, 8], [2, 2], [2, 2], ...), 'xc7z020clg400-1', 4, 40, 0, True) + +test_fpgadataflow_convinputgenerator.py:343: AssertionError +_ test_fpgadataflow_analytical_characterization_slidingwindow[False-1-1-0-2-dilation0-stride0-2-ifm_dim1-k2-idt0] _ + +idt = INT2, k = [1, 5], ifm_dim = [1, 21], ifm_ch = 2, stride = [1, 1] +dilation = [1, 1], simd = 2, dw = 0, parallel_window = 1, m = 1, flip = False + + @pytest.mark.parametrize("idt", [DataType["INT2"]]) + # kernel size + @pytest.mark.parametrize("k", [[2, 2], [3, 3], [1, 5]]) + # input dimension + @pytest.mark.parametrize("ifm_dim", [[8, 8], [1, 21]]) + # input channels + @pytest.mark.parametrize("ifm_ch", [2, 4]) + # Stride + @pytest.mark.parametrize("stride", [[1, 1], [2, 2]]) + # Dilation + @pytest.mark.parametrize("dilation", [[1, 1], [2, 2]]) + # input channel parallelism ("SIMD") + @pytest.mark.parametrize("simd", [1, 2]) + # depthwise + @pytest.mark.parametrize("dw", [0, 1]) + # parallel_window enable (MMV_out = M*K) + @pytest.mark.parametrize("parallel_window", [0, 1]) + # in/out MMV ("M") + @pytest.mark.parametrize("m", [1]) + # Flip dimensions + @pytest.mark.parametrize("flip", [False]) + @pytest.mark.fpgadataflow + @pytest.mark.slow + @pytest.mark.vivado + @pytest.mark.node_tree_modeling + def test_fpgadataflow_analytical_characterization_slidingwindow( + idt, + k, + ifm_dim, + ifm_ch, + stride, + dilation, + simd, + dw, + parallel_window, + m, + flip, + ): + if flip: + if ( + ifm_dim[0] == ifm_dim[1] + and k[0] == k[1] + and stride[0] == stride[1] + and dilation[0] == dilation[1] + ): + pytest.skip("Dimension flip would have no effect") + k = k[::-1] + ifm_dim = ifm_dim[::-1] + stride = stride[::-1] + 
dilation = dilation[::-1] + + k_h, k_w = k + ifm_dim_h, ifm_dim_w = ifm_dim + stride_h, stride_w = stride + dilation_h, dilation_w = dilation + + kernel_width = (k_w - 1) * dilation_w + 1 # incl. dilation + kernel_height = (k_h - 1) * dilation_h + 1 # incl. dilation + + if simd > ifm_ch: + pytest.skip("SIMD cannot be larger than number of input channels") + if ifm_ch % simd != 0: + pytest.skip("SIMD must divide number of input channels") + if kernel_height > ifm_dim_h or stride_h > ifm_dim_h: + pytest.skip("Illegal convolution configuration: kernel or stride > FM dimension") + if kernel_width > ifm_dim_w or stride_w > ifm_dim_w: + pytest.skip("Illegal convolution configuration: kernel or stride > FM dimension") + if (k_h == 1 and dilation_h != 1) or (k_w == 1 and dilation_w != 1): + pytest.skip("Illegal convolution configuration: dilation for unitary kernel dim") + if ((stride_h > k_h) or (stride_w > k_w)) and not (parallel_window or (k_h == 1 and k_w == 1)): + pytest.skip("Not all combinations for stride > k edge case supported in default mode") + if parallel_window and simd != ifm_ch and not (dw or (k_h == 1 and k_w == 1)): + pytest.skip("Parallel window requires SIMD=C for non-depthwise case") + + ofm_dim_h = compute_conv_output_dim(ifm_dim_h, k_h, stride_h, 0, dilation_h) + ofm_dim_w = compute_conv_output_dim(ifm_dim_w, k_w, stride_w, 0, dilation_w) + ofm_dim = [ofm_dim_h, ofm_dim_w] + + model = make_single_im2col_modelwrapper(k, ifm_ch, ifm_dim, ofm_dim, stride, dilation, idt, dw) + + + model = model.transform(to_hw.InferConvInpGen()) + + # set simd + inst = getCustomOp(model.graph.node[0]) + inst.set_nodeattr("SIMD", simd) + optype = model.graph.node[0].op_type + if optype == "ConvolutionInputGenerator_rtl": + inst.set_nodeattr("parallel_window", parallel_window) + inst.set_nodeattr("M", m) + if optype == "ConvolutionInputGenerator_hls": + if inst.get_nodeattr("is1D"): + inst.set_nodeattr("parallel_window", parallel_window) + + node_details = ( + 
"ConvolutionInputGenerator", + k, + ifm_ch, + ifm_dim, + ofm_dim, + stride, + dilation, + idt, + dw, + "hls", + ) + part = "xc7z020clg400-1" + target_clk_ns = 4 + max_allowed_volume_delta = 40 + +> assert tree_model_test( + model, node_details, part, target_clk_ns, max_allowed_volume_delta,0,True + ), "characterized TAV does not match RTLsim'd one!" +E AssertionError: characterized TAV does not match RTLsim'd one! +E assert False +E + where False = tree_model_test(, ('ConvolutionInputGenerator', [1, 5], 2, [1, 21], [1, 17], [1, 1], ...), 'xc7z020clg400-1', 4, 40, 0, True) + +test_fpgadataflow_convinputgenerator.py:343: AssertionError +_ test_fpgadataflow_analytical_characterization_slidingwindow[False-1-1-0-2-dilation0-stride1-2-ifm_dim0-k2-idt0] _ + +idt = INT2, k = [1, 5], ifm_dim = [8, 8], ifm_ch = 2, stride = [2, 2] +dilation = [1, 1], simd = 2, dw = 0, parallel_window = 1, m = 1, flip = False + + @pytest.mark.parametrize("idt", [DataType["INT2"]]) + # kernel size + @pytest.mark.parametrize("k", [[2, 2], [3, 3], [1, 5]]) + # input dimension + @pytest.mark.parametrize("ifm_dim", [[8, 8], [1, 21]]) + # input channels + @pytest.mark.parametrize("ifm_ch", [2, 4]) + # Stride + @pytest.mark.parametrize("stride", [[1, 1], [2, 2]]) + # Dilation + @pytest.mark.parametrize("dilation", [[1, 1], [2, 2]]) + # input channel parallelism ("SIMD") + @pytest.mark.parametrize("simd", [1, 2]) + # depthwise + @pytest.mark.parametrize("dw", [0, 1]) + # parallel_window enable (MMV_out = M*K) + @pytest.mark.parametrize("parallel_window", [0, 1]) + # in/out MMV ("M") + @pytest.mark.parametrize("m", [1]) + # Flip dimensions + @pytest.mark.parametrize("flip", [False]) + @pytest.mark.fpgadataflow + @pytest.mark.slow + @pytest.mark.vivado + @pytest.mark.node_tree_modeling + def test_fpgadataflow_analytical_characterization_slidingwindow( + idt, + k, + ifm_dim, + ifm_ch, + stride, + dilation, + simd, + dw, + parallel_window, + m, + flip, + ): + if flip: + if ( + ifm_dim[0] == ifm_dim[1] + 
and k[0] == k[1] + and stride[0] == stride[1] + and dilation[0] == dilation[1] + ): + pytest.skip("Dimension flip would have no effect") + k = k[::-1] + ifm_dim = ifm_dim[::-1] + stride = stride[::-1] + dilation = dilation[::-1] + + k_h, k_w = k + ifm_dim_h, ifm_dim_w = ifm_dim + stride_h, stride_w = stride + dilation_h, dilation_w = dilation + + kernel_width = (k_w - 1) * dilation_w + 1 # incl. dilation + kernel_height = (k_h - 1) * dilation_h + 1 # incl. dilation + + if simd > ifm_ch: + pytest.skip("SIMD cannot be larger than number of input channels") + if ifm_ch % simd != 0: + pytest.skip("SIMD must divide number of input channels") + if kernel_height > ifm_dim_h or stride_h > ifm_dim_h: + pytest.skip("Illegal convolution configuration: kernel or stride > FM dimension") + if kernel_width > ifm_dim_w or stride_w > ifm_dim_w: + pytest.skip("Illegal convolution configuration: kernel or stride > FM dimension") + if (k_h == 1 and dilation_h != 1) or (k_w == 1 and dilation_w != 1): + pytest.skip("Illegal convolution configuration: dilation for unitary kernel dim") + if ((stride_h > k_h) or (stride_w > k_w)) and not (parallel_window or (k_h == 1 and k_w == 1)): + pytest.skip("Not all combinations for stride > k edge case supported in default mode") + if parallel_window and simd != ifm_ch and not (dw or (k_h == 1 and k_w == 1)): + pytest.skip("Parallel window requires SIMD=C for non-depthwise case") + + ofm_dim_h = compute_conv_output_dim(ifm_dim_h, k_h, stride_h, 0, dilation_h) + ofm_dim_w = compute_conv_output_dim(ifm_dim_w, k_w, stride_w, 0, dilation_w) + ofm_dim = [ofm_dim_h, ofm_dim_w] + + model = make_single_im2col_modelwrapper(k, ifm_ch, ifm_dim, ofm_dim, stride, dilation, idt, dw) + + + model = model.transform(to_hw.InferConvInpGen()) + + # set simd + inst = getCustomOp(model.graph.node[0]) + inst.set_nodeattr("SIMD", simd) + optype = model.graph.node[0].op_type + if optype == "ConvolutionInputGenerator_rtl": + inst.set_nodeattr("parallel_window", 
parallel_window) + inst.set_nodeattr("M", m) + if optype == "ConvolutionInputGenerator_hls": + if inst.get_nodeattr("is1D"): + inst.set_nodeattr("parallel_window", parallel_window) + + node_details = ( + "ConvolutionInputGenerator", + k, + ifm_ch, + ifm_dim, + ofm_dim, + stride, + dilation, + idt, + dw, + "hls", + ) + part = "xc7z020clg400-1" + target_clk_ns = 4 + max_allowed_volume_delta = 40 + +> assert tree_model_test( + model, node_details, part, target_clk_ns, max_allowed_volume_delta,0,True + ), "characterized TAV does not match RTLsim'd one!" + +test_fpgadataflow_convinputgenerator.py:343: +_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ +../../src/finn/util/test.py:423: in tree_model_test + node_rtlsim = get_characteristic_fnc( +../../src/finn/util/test.py:264: in get_characteristic_fnc + _codegen_single_node(node, model, part, target_clk_ns) +../../src/finn/transformation/fpgadataflow/prepare_ip.py:54: in _codegen_single_node + inst.code_generation_ipgen(model, fpgapart, clk) +../../src/finn/custom_op/fpgadataflow/rtlbackend.py:88: in code_generation_ipgen + self.generate_hdl(model, fpgapart, clk) +../../src/finn/custom_op/fpgadataflow/rtl/convolutioninputgenerator_rtl.py:798: in generate_hdl + template_path, code_gen_dict = self.prepare_codegen_default() +_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ + +self = + + def prepare_codegen_default(self): + """Fills code generation dict for the default implementation style by computing + the incremental addressing scheme for the circular buffer.""" + if self.get_nodeattr("dynamic_mode"): + template_select = "/finn-rtllib/swg/swg_template_default_dynamic.sv" + else: + template_select = "/finn-rtllib/swg/swg_template_default.sv" + template_path = os.environ["FINN_ROOT"] + template_select + code_gen_dict = {} + + ifm_ch = self.get_nodeattr("IFMChannels") + k = self.get_nodeattr("ConvKernelDim") + ifm_dim = self.get_nodeattr("IFMDim") + stride = 
self.get_nodeattr("Stride") + dilation = self.get_nodeattr("Dilation") + depthwise = self.get_nodeattr("depthwise") + simd = self.get_nodeattr("SIMD") + + k_h, k_w = k + h, w = ifm_dim + pad = [0, 0, 0, 0] # padding happens in separate padding node for now + stride_h, stride_w = stride + dilation_h, dilation_w = dilation + pad_h = pad[0] + pad[2] + pad_w = pad[1] + pad[3] + out_dim_h = im2col.compute_conv_output_dim(h, k_h, stride_h, pad_h, dilation_h) + out_dim_w = im2col.compute_conv_output_dim(w, k_w, stride_w, pad_w, dilation_w) + mmv_in = 1 + mmv_out = 1 + channel_factor = int(ifm_ch / simd) + + # compute minimal buffer length (assuming it holds 1 complete window) + buffer_min_size = ((k_h - 1) * dilation_h * w + (k_w - 1) * dilation_w + 1) * channel_factor + + buffer_actual_size = self.get_buffer_depth() + code_gen_dict["$BUF_ELEM_TOTAL$"] = [str(buffer_actual_size)] + + # compute some intermediate values, e.g., kernel "width" = k_w incl. dilation + # or cols/rows that are skipped due to imperfect stride<->dim combination + kernel_width = (k_w - 1) * dilation_w + 1 + kernel_height = (k_h - 1) * dilation_h + 1 + skip_columns = w % (kernel_width + (out_dim_w - 1) * stride_w) + skip_rows = h % (kernel_height + (out_dim_h - 1) * stride_h) + + # compute address increment values for 5-loop nest + addr_incr_end_simd = 1 + addr_incr_end_window_elem = (dilation_w - 1) * channel_factor + 1 + addr_incr_end_window_row = ( + ((w - kernel_width) * channel_factor) # remaining line + + ((dilation_h - 1) * w * channel_factor) # skip lines + + 1 # wrap-around of minimally sized buffer + ) + addr_incr_end_window = -buffer_min_size + stride_w * channel_factor + 1 + addr_incr_end_row = ( + -buffer_min_size + + ((skip_columns + kernel_width) * channel_factor) # remaining line + + ((stride_h - 1) * w * channel_factor) # skip lines + + 1 + ) + + # re-use same controller structure -> re-assign address increments + if depthwise: + addr_incr_end_window_elem = dilation_w * 
channel_factor + addr_incr_end_window_row = ( + channel_factor + + (w - kernel_width) * channel_factor + + (dilation_h - 1) * w * channel_factor + ) + addr_incr_end_simd = -buffer_min_size + (channel_factor + 1) + + # sanity check for wrap logic + assert not ( + abs(addr_incr_end_window) > buffer_actual_size + ), "ERROR: W increment > buffer size, try setting parallel_window=1" + assert not ( +> abs(addr_incr_end_row) > buffer_actual_size + ), "ERROR: H increment > buffer size, try setting parallel_window=1" +E AssertionError: ERROR: H increment > buffer size, try setting parallel_window=1 + +../../src/finn/custom_op/fpgadataflow/rtl/convolutioninputgenerator_rtl.py:378: AssertionError +_ test_fpgadataflow_analytical_characterization_slidingwindow[False-1-1-0-2-dilation1-stride0-2-ifm_dim0-k0-idt0] _ + +idt = INT2, k = [2, 2], ifm_dim = [8, 8], ifm_ch = 2, stride = [1, 1] +dilation = [2, 2], simd = 2, dw = 0, parallel_window = 1, m = 1, flip = False + + @pytest.mark.parametrize("idt", [DataType["INT2"]]) + # kernel size + @pytest.mark.parametrize("k", [[2, 2], [3, 3], [1, 5]]) + # input dimension + @pytest.mark.parametrize("ifm_dim", [[8, 8], [1, 21]]) + # input channels + @pytest.mark.parametrize("ifm_ch", [2, 4]) + # Stride + @pytest.mark.parametrize("stride", [[1, 1], [2, 2]]) + # Dilation + @pytest.mark.parametrize("dilation", [[1, 1], [2, 2]]) + # input channel parallelism ("SIMD") + @pytest.mark.parametrize("simd", [1, 2]) + # depthwise + @pytest.mark.parametrize("dw", [0, 1]) + # parallel_window enable (MMV_out = M*K) + @pytest.mark.parametrize("parallel_window", [0, 1]) + # in/out MMV ("M") + @pytest.mark.parametrize("m", [1]) + # Flip dimensions + @pytest.mark.parametrize("flip", [False]) + @pytest.mark.fpgadataflow + @pytest.mark.slow + @pytest.mark.vivado + @pytest.mark.node_tree_modeling + def test_fpgadataflow_analytical_characterization_slidingwindow( + idt, + k, + ifm_dim, + ifm_ch, + stride, + dilation, + simd, + dw, + parallel_window, + m, + flip, 
+ ): + if flip: + if ( + ifm_dim[0] == ifm_dim[1] + and k[0] == k[1] + and stride[0] == stride[1] + and dilation[0] == dilation[1] + ): + pytest.skip("Dimension flip would have no effect") + k = k[::-1] + ifm_dim = ifm_dim[::-1] + stride = stride[::-1] + dilation = dilation[::-1] + + k_h, k_w = k + ifm_dim_h, ifm_dim_w = ifm_dim + stride_h, stride_w = stride + dilation_h, dilation_w = dilation + + kernel_width = (k_w - 1) * dilation_w + 1 # incl. dilation + kernel_height = (k_h - 1) * dilation_h + 1 # incl. dilation + + if simd > ifm_ch: + pytest.skip("SIMD cannot be larger than number of input channels") + if ifm_ch % simd != 0: + pytest.skip("SIMD must divide number of input channels") + if kernel_height > ifm_dim_h or stride_h > ifm_dim_h: + pytest.skip("Illegal convolution configuration: kernel or stride > FM dimension") + if kernel_width > ifm_dim_w or stride_w > ifm_dim_w: + pytest.skip("Illegal convolution configuration: kernel or stride > FM dimension") + if (k_h == 1 and dilation_h != 1) or (k_w == 1 and dilation_w != 1): + pytest.skip("Illegal convolution configuration: dilation for unitary kernel dim") + if ((stride_h > k_h) or (stride_w > k_w)) and not (parallel_window or (k_h == 1 and k_w == 1)): + pytest.skip("Not all combinations for stride > k edge case supported in default mode") + if parallel_window and simd != ifm_ch and not (dw or (k_h == 1 and k_w == 1)): + pytest.skip("Parallel window requires SIMD=C for non-depthwise case") + + ofm_dim_h = compute_conv_output_dim(ifm_dim_h, k_h, stride_h, 0, dilation_h) + ofm_dim_w = compute_conv_output_dim(ifm_dim_w, k_w, stride_w, 0, dilation_w) + ofm_dim = [ofm_dim_h, ofm_dim_w] + + model = make_single_im2col_modelwrapper(k, ifm_ch, ifm_dim, ofm_dim, stride, dilation, idt, dw) + + + model = model.transform(to_hw.InferConvInpGen()) + + # set simd + inst = getCustomOp(model.graph.node[0]) + inst.set_nodeattr("SIMD", simd) + optype = model.graph.node[0].op_type + if optype == "ConvolutionInputGenerator_rtl": 
+ inst.set_nodeattr("parallel_window", parallel_window) + inst.set_nodeattr("M", m) + if optype == "ConvolutionInputGenerator_hls": + if inst.get_nodeattr("is1D"): + inst.set_nodeattr("parallel_window", parallel_window) + + node_details = ( + "ConvolutionInputGenerator", + k, + ifm_ch, + ifm_dim, + ofm_dim, + stride, + dilation, + idt, + dw, + "hls", + ) + part = "xc7z020clg400-1" + target_clk_ns = 4 + max_allowed_volume_delta = 40 + +> assert tree_model_test( + model, node_details, part, target_clk_ns, max_allowed_volume_delta,0,True + ), "characterized TAV does not match RTLsim'd one!" +E AssertionError: characterized TAV does not match RTLsim'd one! +E assert False +E + where False = tree_model_test(, ('ConvolutionInputGenerator', [2, 2], 2, [8, 8], [6, 6], [1, 1], ...), 'xc7z020clg400-1', 4, 40, 0, True) + +test_fpgadataflow_convinputgenerator.py:343: AssertionError +_ test_fpgadataflow_analytical_characterization_slidingwindow[False-1-1-0-2-dilation1-stride0-2-ifm_dim0-k1-idt0] _ + +idt = INT2, k = [3, 3], ifm_dim = [8, 8], ifm_ch = 2, stride = [1, 1] +dilation = [2, 2], simd = 2, dw = 0, parallel_window = 1, m = 1, flip = False + + @pytest.mark.parametrize("idt", [DataType["INT2"]]) + # kernel size + @pytest.mark.parametrize("k", [[2, 2], [3, 3], [1, 5]]) + # input dimension + @pytest.mark.parametrize("ifm_dim", [[8, 8], [1, 21]]) + # input channels + @pytest.mark.parametrize("ifm_ch", [2, 4]) + # Stride + @pytest.mark.parametrize("stride", [[1, 1], [2, 2]]) + # Dilation + @pytest.mark.parametrize("dilation", [[1, 1], [2, 2]]) + # input channel parallelism ("SIMD") + @pytest.mark.parametrize("simd", [1, 2]) + # depthwise + @pytest.mark.parametrize("dw", [0, 1]) + # parallel_window enable (MMV_out = M*K) + @pytest.mark.parametrize("parallel_window", [0, 1]) + # in/out MMV ("M") + @pytest.mark.parametrize("m", [1]) + # Flip dimensions + @pytest.mark.parametrize("flip", [False]) + @pytest.mark.fpgadataflow + @pytest.mark.slow + @pytest.mark.vivado + 
@pytest.mark.node_tree_modeling + def test_fpgadataflow_analytical_characterization_slidingwindow( + idt, + k, + ifm_dim, + ifm_ch, + stride, + dilation, + simd, + dw, + parallel_window, + m, + flip, + ): + if flip: + if ( + ifm_dim[0] == ifm_dim[1] + and k[0] == k[1] + and stride[0] == stride[1] + and dilation[0] == dilation[1] + ): + pytest.skip("Dimension flip would have no effect") + k = k[::-1] + ifm_dim = ifm_dim[::-1] + stride = stride[::-1] + dilation = dilation[::-1] + + k_h, k_w = k + ifm_dim_h, ifm_dim_w = ifm_dim + stride_h, stride_w = stride + dilation_h, dilation_w = dilation + + kernel_width = (k_w - 1) * dilation_w + 1 # incl. dilation + kernel_height = (k_h - 1) * dilation_h + 1 # incl. dilation + + if simd > ifm_ch: + pytest.skip("SIMD cannot be larger than number of input channels") + if ifm_ch % simd != 0: + pytest.skip("SIMD must divide number of input channels") + if kernel_height > ifm_dim_h or stride_h > ifm_dim_h: + pytest.skip("Illegal convolution configuration: kernel or stride > FM dimension") + if kernel_width > ifm_dim_w or stride_w > ifm_dim_w: + pytest.skip("Illegal convolution configuration: kernel or stride > FM dimension") + if (k_h == 1 and dilation_h != 1) or (k_w == 1 and dilation_w != 1): + pytest.skip("Illegal convolution configuration: dilation for unitary kernel dim") + if ((stride_h > k_h) or (stride_w > k_w)) and not (parallel_window or (k_h == 1 and k_w == 1)): + pytest.skip("Not all combinations for stride > k edge case supported in default mode") + if parallel_window and simd != ifm_ch and not (dw or (k_h == 1 and k_w == 1)): + pytest.skip("Parallel window requires SIMD=C for non-depthwise case") + + ofm_dim_h = compute_conv_output_dim(ifm_dim_h, k_h, stride_h, 0, dilation_h) + ofm_dim_w = compute_conv_output_dim(ifm_dim_w, k_w, stride_w, 0, dilation_w) + ofm_dim = [ofm_dim_h, ofm_dim_w] + + model = make_single_im2col_modelwrapper(k, ifm_ch, ifm_dim, ofm_dim, stride, dilation, idt, dw) + + + model = 
model.transform(to_hw.InferConvInpGen()) + + # set simd + inst = getCustomOp(model.graph.node[0]) + inst.set_nodeattr("SIMD", simd) + optype = model.graph.node[0].op_type + if optype == "ConvolutionInputGenerator_rtl": + inst.set_nodeattr("parallel_window", parallel_window) + inst.set_nodeattr("M", m) + if optype == "ConvolutionInputGenerator_hls": + if inst.get_nodeattr("is1D"): + inst.set_nodeattr("parallel_window", parallel_window) + + node_details = ( + "ConvolutionInputGenerator", + k, + ifm_ch, + ifm_dim, + ofm_dim, + stride, + dilation, + idt, + dw, + "hls", + ) + part = "xc7z020clg400-1" + target_clk_ns = 4 + max_allowed_volume_delta = 40 + +> assert tree_model_test( + model, node_details, part, target_clk_ns, max_allowed_volume_delta,0,True + ), "characterized TAV does not match RTLsim'd one!" +E AssertionError: characterized TAV does not match RTLsim'd one! +E assert False +E + where False = tree_model_test(, ('ConvolutionInputGenerator', [3, 3], 2, [8, 8], [4, 4], [1, 1], ...), 'xc7z020clg400-1', 4, 40, 0, True) + +test_fpgadataflow_convinputgenerator.py:343: AssertionError +_ test_fpgadataflow_analytical_characterization_slidingwindow[False-1-1-1-1-dilation0-stride0-4-ifm_dim0-k2-idt0] _ + +idt = INT2, k = [1, 5], ifm_dim = [8, 8], ifm_ch = 4, stride = [1, 1] +dilation = [1, 1], simd = 1, dw = 1, parallel_window = 1, m = 1, flip = False + + @pytest.mark.parametrize("idt", [DataType["INT2"]]) + # kernel size + @pytest.mark.parametrize("k", [[2, 2], [3, 3], [1, 5]]) + # input dimension + @pytest.mark.parametrize("ifm_dim", [[8, 8], [1, 21]]) + # input channels + @pytest.mark.parametrize("ifm_ch", [2, 4]) + # Stride + @pytest.mark.parametrize("stride", [[1, 1], [2, 2]]) + # Dilation + @pytest.mark.parametrize("dilation", [[1, 1], [2, 2]]) + # input channel parallelism ("SIMD") + @pytest.mark.parametrize("simd", [1, 2]) + # depthwise + @pytest.mark.parametrize("dw", [0, 1]) + # parallel_window enable (MMV_out = M*K) + 
@pytest.mark.parametrize("parallel_window", [0, 1]) + # in/out MMV ("M") + @pytest.mark.parametrize("m", [1]) + # Flip dimensions + @pytest.mark.parametrize("flip", [False]) + @pytest.mark.fpgadataflow + @pytest.mark.slow + @pytest.mark.vivado + @pytest.mark.node_tree_modeling + def test_fpgadataflow_analytical_characterization_slidingwindow( + idt, + k, + ifm_dim, + ifm_ch, + stride, + dilation, + simd, + dw, + parallel_window, + m, + flip, + ): + if flip: + if ( + ifm_dim[0] == ifm_dim[1] + and k[0] == k[1] + and stride[0] == stride[1] + and dilation[0] == dilation[1] + ): + pytest.skip("Dimension flip would have no effect") + k = k[::-1] + ifm_dim = ifm_dim[::-1] + stride = stride[::-1] + dilation = dilation[::-1] + + k_h, k_w = k + ifm_dim_h, ifm_dim_w = ifm_dim + stride_h, stride_w = stride + dilation_h, dilation_w = dilation + + kernel_width = (k_w - 1) * dilation_w + 1 # incl. dilation + kernel_height = (k_h - 1) * dilation_h + 1 # incl. dilation + + if simd > ifm_ch: + pytest.skip("SIMD cannot be larger than number of input channels") + if ifm_ch % simd != 0: + pytest.skip("SIMD must divide number of input channels") + if kernel_height > ifm_dim_h or stride_h > ifm_dim_h: + pytest.skip("Illegal convolution configuration: kernel or stride > FM dimension") + if kernel_width > ifm_dim_w or stride_w > ifm_dim_w: + pytest.skip("Illegal convolution configuration: kernel or stride > FM dimension") + if (k_h == 1 and dilation_h != 1) or (k_w == 1 and dilation_w != 1): + pytest.skip("Illegal convolution configuration: dilation for unitary kernel dim") + if ((stride_h > k_h) or (stride_w > k_w)) and not (parallel_window or (k_h == 1 and k_w == 1)): + pytest.skip("Not all combinations for stride > k edge case supported in default mode") + if parallel_window and simd != ifm_ch and not (dw or (k_h == 1 and k_w == 1)): + pytest.skip("Parallel window requires SIMD=C for non-depthwise case") + + ofm_dim_h = compute_conv_output_dim(ifm_dim_h, k_h, stride_h, 0, dilation_h) + 
ofm_dim_w = compute_conv_output_dim(ifm_dim_w, k_w, stride_w, 0, dilation_w) + ofm_dim = [ofm_dim_h, ofm_dim_w] + + model = make_single_im2col_modelwrapper(k, ifm_ch, ifm_dim, ofm_dim, stride, dilation, idt, dw) + + + model = model.transform(to_hw.InferConvInpGen()) + + # set simd + inst = getCustomOp(model.graph.node[0]) + inst.set_nodeattr("SIMD", simd) + optype = model.graph.node[0].op_type + if optype == "ConvolutionInputGenerator_rtl": + inst.set_nodeattr("parallel_window", parallel_window) + inst.set_nodeattr("M", m) + if optype == "ConvolutionInputGenerator_hls": + if inst.get_nodeattr("is1D"): + inst.set_nodeattr("parallel_window", parallel_window) + + node_details = ( + "ConvolutionInputGenerator", + k, + ifm_ch, + ifm_dim, + ofm_dim, + stride, + dilation, + idt, + dw, + "hls", + ) + part = "xc7z020clg400-1" + target_clk_ns = 4 + max_allowed_volume_delta = 40 + +> assert tree_model_test( + model, node_details, part, target_clk_ns, max_allowed_volume_delta,0,True + ), "characterized TAV does not match RTLsim'd one!" +E AssertionError: characterized TAV does not match RTLsim'd one! 
+E assert False +E + where False = tree_model_test(, ('ConvolutionInputGenerator', [1, 5], 4, [8, 8], [8, 4], [1, 1], ...), 'xc7z020clg400-1', 4, 40, 0, True) + +test_fpgadataflow_convinputgenerator.py:343: AssertionError +_ test_fpgadataflow_analytical_characterization_slidingwindow[False-1-1-1-1-dilation0-stride0-4-ifm_dim1-k2-idt0] _ + +idt = INT2, k = [1, 5], ifm_dim = [1, 21], ifm_ch = 4, stride = [1, 1] +dilation = [1, 1], simd = 1, dw = 1, parallel_window = 1, m = 1, flip = False + + @pytest.mark.parametrize("idt", [DataType["INT2"]]) + # kernel size + @pytest.mark.parametrize("k", [[2, 2], [3, 3], [1, 5]]) + # input dimension + @pytest.mark.parametrize("ifm_dim", [[8, 8], [1, 21]]) + # input channels + @pytest.mark.parametrize("ifm_ch", [2, 4]) + # Stride + @pytest.mark.parametrize("stride", [[1, 1], [2, 2]]) + # Dilation + @pytest.mark.parametrize("dilation", [[1, 1], [2, 2]]) + # input channel parallelism ("SIMD") + @pytest.mark.parametrize("simd", [1, 2]) + # depthwise + @pytest.mark.parametrize("dw", [0, 1]) + # parallel_window enable (MMV_out = M*K) + @pytest.mark.parametrize("parallel_window", [0, 1]) + # in/out MMV ("M") + @pytest.mark.parametrize("m", [1]) + # Flip dimensions + @pytest.mark.parametrize("flip", [False]) + @pytest.mark.fpgadataflow + @pytest.mark.slow + @pytest.mark.vivado + @pytest.mark.node_tree_modeling + def test_fpgadataflow_analytical_characterization_slidingwindow( + idt, + k, + ifm_dim, + ifm_ch, + stride, + dilation, + simd, + dw, + parallel_window, + m, + flip, + ): + if flip: + if ( + ifm_dim[0] == ifm_dim[1] + and k[0] == k[1] + and stride[0] == stride[1] + and dilation[0] == dilation[1] + ): + pytest.skip("Dimension flip would have no effect") + k = k[::-1] + ifm_dim = ifm_dim[::-1] + stride = stride[::-1] + dilation = dilation[::-1] + + k_h, k_w = k + ifm_dim_h, ifm_dim_w = ifm_dim + stride_h, stride_w = stride + dilation_h, dilation_w = dilation + + kernel_width = (k_w - 1) * dilation_w + 1 # incl. 
dilation + kernel_height = (k_h - 1) * dilation_h + 1 # incl. dilation + + if simd > ifm_ch: + pytest.skip("SIMD cannot be larger than number of input channels") + if ifm_ch % simd != 0: + pytest.skip("SIMD must divide number of input channels") + if kernel_height > ifm_dim_h or stride_h > ifm_dim_h: + pytest.skip("Illegal convolution configuration: kernel or stride > FM dimension") + if kernel_width > ifm_dim_w or stride_w > ifm_dim_w: + pytest.skip("Illegal convolution configuration: kernel or stride > FM dimension") + if (k_h == 1 and dilation_h != 1) or (k_w == 1 and dilation_w != 1): + pytest.skip("Illegal convolution configuration: dilation for unitary kernel dim") + if ((stride_h > k_h) or (stride_w > k_w)) and not (parallel_window or (k_h == 1 and k_w == 1)): + pytest.skip("Not all combinations for stride > k edge case supported in default mode") + if parallel_window and simd != ifm_ch and not (dw or (k_h == 1 and k_w == 1)): + pytest.skip("Parallel window requires SIMD=C for non-depthwise case") + + ofm_dim_h = compute_conv_output_dim(ifm_dim_h, k_h, stride_h, 0, dilation_h) + ofm_dim_w = compute_conv_output_dim(ifm_dim_w, k_w, stride_w, 0, dilation_w) + ofm_dim = [ofm_dim_h, ofm_dim_w] + + model = make_single_im2col_modelwrapper(k, ifm_ch, ifm_dim, ofm_dim, stride, dilation, idt, dw) + + + model = model.transform(to_hw.InferConvInpGen()) + + # set simd + inst = getCustomOp(model.graph.node[0]) + inst.set_nodeattr("SIMD", simd) + optype = model.graph.node[0].op_type + if optype == "ConvolutionInputGenerator_rtl": + inst.set_nodeattr("parallel_window", parallel_window) + inst.set_nodeattr("M", m) + if optype == "ConvolutionInputGenerator_hls": + if inst.get_nodeattr("is1D"): + inst.set_nodeattr("parallel_window", parallel_window) + + node_details = ( + "ConvolutionInputGenerator", + k, + ifm_ch, + ifm_dim, + ofm_dim, + stride, + dilation, + idt, + dw, + "hls", + ) + part = "xc7z020clg400-1" + target_clk_ns = 4 + max_allowed_volume_delta = 40 + +> assert 
tree_model_test( + model, node_details, part, target_clk_ns, max_allowed_volume_delta,0,True + ), "characterized TAV does not match RTLsim'd one!" +E AssertionError: characterized TAV does not match RTLsim'd one! +E assert False +E + where False = tree_model_test(, ('ConvolutionInputGenerator', [1, 5], 4, [1, 21], [1, 17], [1, 1], ...), 'xc7z020clg400-1', 4, 40, 0, True) + +test_fpgadataflow_convinputgenerator.py:343: AssertionError +_ test_fpgadataflow_analytical_characterization_slidingwindow[False-1-1-1-1-dilation0-stride1-2-ifm_dim0-k0-idt0] _ + +idt = INT2, k = [2, 2], ifm_dim = [8, 8], ifm_ch = 2, stride = [2, 2] +dilation = [1, 1], simd = 1, dw = 1, parallel_window = 1, m = 1, flip = False + + @pytest.mark.parametrize("idt", [DataType["INT2"]]) + # kernel size + @pytest.mark.parametrize("k", [[2, 2], [3, 3], [1, 5]]) + # input dimension + @pytest.mark.parametrize("ifm_dim", [[8, 8], [1, 21]]) + # input channels + @pytest.mark.parametrize("ifm_ch", [2, 4]) + # Stride + @pytest.mark.parametrize("stride", [[1, 1], [2, 2]]) + # Dilation + @pytest.mark.parametrize("dilation", [[1, 1], [2, 2]]) + # input channel parallelism ("SIMD") + @pytest.mark.parametrize("simd", [1, 2]) + # depthwise + @pytest.mark.parametrize("dw", [0, 1]) + # parallel_window enable (MMV_out = M*K) + @pytest.mark.parametrize("parallel_window", [0, 1]) + # in/out MMV ("M") + @pytest.mark.parametrize("m", [1]) + # Flip dimensions + @pytest.mark.parametrize("flip", [False]) + @pytest.mark.fpgadataflow + @pytest.mark.slow + @pytest.mark.vivado + @pytest.mark.node_tree_modeling + def test_fpgadataflow_analytical_characterization_slidingwindow( + idt, + k, + ifm_dim, + ifm_ch, + stride, + dilation, + simd, + dw, + parallel_window, + m, + flip, + ): + if flip: + if ( + ifm_dim[0] == ifm_dim[1] + and k[0] == k[1] + and stride[0] == stride[1] + and dilation[0] == dilation[1] + ): + pytest.skip("Dimension flip would have no effect") + k = k[::-1] + ifm_dim = ifm_dim[::-1] + stride = stride[::-1] + 
dilation = dilation[::-1] + + k_h, k_w = k + ifm_dim_h, ifm_dim_w = ifm_dim + stride_h, stride_w = stride + dilation_h, dilation_w = dilation + + kernel_width = (k_w - 1) * dilation_w + 1 # incl. dilation + kernel_height = (k_h - 1) * dilation_h + 1 # incl. dilation + + if simd > ifm_ch: + pytest.skip("SIMD cannot be larger than number of input channels") + if ifm_ch % simd != 0: + pytest.skip("SIMD must divide number of input channels") + if kernel_height > ifm_dim_h or stride_h > ifm_dim_h: + pytest.skip("Illegal convolution configuration: kernel or stride > FM dimension") + if kernel_width > ifm_dim_w or stride_w > ifm_dim_w: + pytest.skip("Illegal convolution configuration: kernel or stride > FM dimension") + if (k_h == 1 and dilation_h != 1) or (k_w == 1 and dilation_w != 1): + pytest.skip("Illegal convolution configuration: dilation for unitary kernel dim") + if ((stride_h > k_h) or (stride_w > k_w)) and not (parallel_window or (k_h == 1 and k_w == 1)): + pytest.skip("Not all combinations for stride > k edge case supported in default mode") + if parallel_window and simd != ifm_ch and not (dw or (k_h == 1 and k_w == 1)): + pytest.skip("Parallel window requires SIMD=C for non-depthwise case") + + ofm_dim_h = compute_conv_output_dim(ifm_dim_h, k_h, stride_h, 0, dilation_h) + ofm_dim_w = compute_conv_output_dim(ifm_dim_w, k_w, stride_w, 0, dilation_w) + ofm_dim = [ofm_dim_h, ofm_dim_w] + + model = make_single_im2col_modelwrapper(k, ifm_ch, ifm_dim, ofm_dim, stride, dilation, idt, dw) + + + model = model.transform(to_hw.InferConvInpGen()) + + # set simd + inst = getCustomOp(model.graph.node[0]) + inst.set_nodeattr("SIMD", simd) + optype = model.graph.node[0].op_type + if optype == "ConvolutionInputGenerator_rtl": + inst.set_nodeattr("parallel_window", parallel_window) + inst.set_nodeattr("M", m) + if optype == "ConvolutionInputGenerator_hls": + if inst.get_nodeattr("is1D"): + inst.set_nodeattr("parallel_window", parallel_window) + + node_details = ( + 
"ConvolutionInputGenerator", + k, + ifm_ch, + ifm_dim, + ofm_dim, + stride, + dilation, + idt, + dw, + "hls", + ) + part = "xc7z020clg400-1" + target_clk_ns = 4 + max_allowed_volume_delta = 40 + +> assert tree_model_test( + model, node_details, part, target_clk_ns, max_allowed_volume_delta,0,True + ), "characterized TAV does not match RTLsim'd one!" +E AssertionError: characterized TAV does not match RTLsim'd one! +E assert False +E + where False = tree_model_test(, ('ConvolutionInputGenerator', [2, 2], 2, [8, 8], [4, 4], [2, 2], ...), 'xc7z020clg400-1', 4, 40, 0, True) + +test_fpgadataflow_convinputgenerator.py:343: AssertionError +_ test_fpgadataflow_analytical_characterization_slidingwindow[False-1-1-1-1-dilation0-stride1-2-ifm_dim0-k2-idt0] _ + +idt = INT2, k = [1, 5], ifm_dim = [8, 8], ifm_ch = 2, stride = [2, 2] +dilation = [1, 1], simd = 1, dw = 1, parallel_window = 1, m = 1, flip = False + + @pytest.mark.parametrize("idt", [DataType["INT2"]]) + # kernel size + @pytest.mark.parametrize("k", [[2, 2], [3, 3], [1, 5]]) + # input dimension + @pytest.mark.parametrize("ifm_dim", [[8, 8], [1, 21]]) + # input channels + @pytest.mark.parametrize("ifm_ch", [2, 4]) + # Stride + @pytest.mark.parametrize("stride", [[1, 1], [2, 2]]) + # Dilation + @pytest.mark.parametrize("dilation", [[1, 1], [2, 2]]) + # input channel parallelism ("SIMD") + @pytest.mark.parametrize("simd", [1, 2]) + # depthwise + @pytest.mark.parametrize("dw", [0, 1]) + # parallel_window enable (MMV_out = M*K) + @pytest.mark.parametrize("parallel_window", [0, 1]) + # in/out MMV ("M") + @pytest.mark.parametrize("m", [1]) + # Flip dimensions + @pytest.mark.parametrize("flip", [False]) + @pytest.mark.fpgadataflow + @pytest.mark.slow + @pytest.mark.vivado + @pytest.mark.node_tree_modeling + def test_fpgadataflow_analytical_characterization_slidingwindow( + idt, + k, + ifm_dim, + ifm_ch, + stride, + dilation, + simd, + dw, + parallel_window, + m, + flip, + ): + if flip: + if ( + ifm_dim[0] == ifm_dim[1] + and 
k[0] == k[1] + and stride[0] == stride[1] + and dilation[0] == dilation[1] + ): + pytest.skip("Dimension flip would have no effect") + k = k[::-1] + ifm_dim = ifm_dim[::-1] + stride = stride[::-1] + dilation = dilation[::-1] + + k_h, k_w = k + ifm_dim_h, ifm_dim_w = ifm_dim + stride_h, stride_w = stride + dilation_h, dilation_w = dilation + + kernel_width = (k_w - 1) * dilation_w + 1 # incl. dilation + kernel_height = (k_h - 1) * dilation_h + 1 # incl. dilation + + if simd > ifm_ch: + pytest.skip("SIMD cannot be larger than number of input channels") + if ifm_ch % simd != 0: + pytest.skip("SIMD must divide number of input channels") + if kernel_height > ifm_dim_h or stride_h > ifm_dim_h: + pytest.skip("Illegal convolution configuration: kernel or stride > FM dimension") + if kernel_width > ifm_dim_w or stride_w > ifm_dim_w: + pytest.skip("Illegal convolution configuration: kernel or stride > FM dimension") + if (k_h == 1 and dilation_h != 1) or (k_w == 1 and dilation_w != 1): + pytest.skip("Illegal convolution configuration: dilation for unitary kernel dim") + if ((stride_h > k_h) or (stride_w > k_w)) and not (parallel_window or (k_h == 1 and k_w == 1)): + pytest.skip("Not all combinations for stride > k edge case supported in default mode") + if parallel_window and simd != ifm_ch and not (dw or (k_h == 1 and k_w == 1)): + pytest.skip("Parallel window requires SIMD=C for non-depthwise case") + + ofm_dim_h = compute_conv_output_dim(ifm_dim_h, k_h, stride_h, 0, dilation_h) + ofm_dim_w = compute_conv_output_dim(ifm_dim_w, k_w, stride_w, 0, dilation_w) + ofm_dim = [ofm_dim_h, ofm_dim_w] + + model = make_single_im2col_modelwrapper(k, ifm_ch, ifm_dim, ofm_dim, stride, dilation, idt, dw) + + + model = model.transform(to_hw.InferConvInpGen()) + + # set simd + inst = getCustomOp(model.graph.node[0]) + inst.set_nodeattr("SIMD", simd) + optype = model.graph.node[0].op_type + if optype == "ConvolutionInputGenerator_rtl": + inst.set_nodeattr("parallel_window", parallel_window) 
+ inst.set_nodeattr("M", m) + if optype == "ConvolutionInputGenerator_hls": + if inst.get_nodeattr("is1D"): + inst.set_nodeattr("parallel_window", parallel_window) + + node_details = ( + "ConvolutionInputGenerator", + k, + ifm_ch, + ifm_dim, + ofm_dim, + stride, + dilation, + idt, + dw, + "hls", + ) + part = "xc7z020clg400-1" + target_clk_ns = 4 + max_allowed_volume_delta = 40 + +> assert tree_model_test( + model, node_details, part, target_clk_ns, max_allowed_volume_delta,0,True + ), "characterized TAV does not match RTLsim'd one!" + +test_fpgadataflow_convinputgenerator.py:343: +_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ +../../src/finn/util/test.py:423: in tree_model_test + node_rtlsim = get_characteristic_fnc( +../../src/finn/util/test.py:264: in get_characteristic_fnc + _codegen_single_node(node, model, part, target_clk_ns) +../../src/finn/transformation/fpgadataflow/prepare_ip.py:54: in _codegen_single_node + inst.code_generation_ipgen(model, fpgapart, clk) +../../src/finn/custom_op/fpgadataflow/rtlbackend.py:88: in code_generation_ipgen + self.generate_hdl(model, fpgapart, clk) +../../src/finn/custom_op/fpgadataflow/rtl/convolutioninputgenerator_rtl.py:798: in generate_hdl + template_path, code_gen_dict = self.prepare_codegen_default() +_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ + +self = + + def prepare_codegen_default(self): + """Fills code generation dict for the default implementation style by computing + the incremental addressing scheme for the circular buffer.""" + if self.get_nodeattr("dynamic_mode"): + template_select = "/finn-rtllib/swg/swg_template_default_dynamic.sv" + else: + template_select = "/finn-rtllib/swg/swg_template_default.sv" + template_path = os.environ["FINN_ROOT"] + template_select + code_gen_dict = {} + + ifm_ch = self.get_nodeattr("IFMChannels") + k = self.get_nodeattr("ConvKernelDim") + ifm_dim = self.get_nodeattr("IFMDim") + stride = 
self.get_nodeattr("Stride") + dilation = self.get_nodeattr("Dilation") + depthwise = self.get_nodeattr("depthwise") + simd = self.get_nodeattr("SIMD") + + k_h, k_w = k + h, w = ifm_dim + pad = [0, 0, 0, 0] # padding happens in separate padding node for now + stride_h, stride_w = stride + dilation_h, dilation_w = dilation + pad_h = pad[0] + pad[2] + pad_w = pad[1] + pad[3] + out_dim_h = im2col.compute_conv_output_dim(h, k_h, stride_h, pad_h, dilation_h) + out_dim_w = im2col.compute_conv_output_dim(w, k_w, stride_w, pad_w, dilation_w) + mmv_in = 1 + mmv_out = 1 + channel_factor = int(ifm_ch / simd) + + # compute minimal buffer length (assuming it holds 1 complete window) + buffer_min_size = ((k_h - 1) * dilation_h * w + (k_w - 1) * dilation_w + 1) * channel_factor + + buffer_actual_size = self.get_buffer_depth() + code_gen_dict["$BUF_ELEM_TOTAL$"] = [str(buffer_actual_size)] + + # compute some intermediate values, e.g., kernel "width" = k_w incl. dilation + # or cols/rows that are skipped due to imperfect stride<->dim combination + kernel_width = (k_w - 1) * dilation_w + 1 + kernel_height = (k_h - 1) * dilation_h + 1 + skip_columns = w % (kernel_width + (out_dim_w - 1) * stride_w) + skip_rows = h % (kernel_height + (out_dim_h - 1) * stride_h) + + # compute address increment values for 5-loop nest + addr_incr_end_simd = 1 + addr_incr_end_window_elem = (dilation_w - 1) * channel_factor + 1 + addr_incr_end_window_row = ( + ((w - kernel_width) * channel_factor) # remaining line + + ((dilation_h - 1) * w * channel_factor) # skip lines + + 1 # wrap-around of minimally sized buffer + ) + addr_incr_end_window = -buffer_min_size + stride_w * channel_factor + 1 + addr_incr_end_row = ( + -buffer_min_size + + ((skip_columns + kernel_width) * channel_factor) # remaining line + + ((stride_h - 1) * w * channel_factor) # skip lines + + 1 + ) + + # re-use same controller structure -> re-assign address increments + if depthwise: + addr_incr_end_window_elem = dilation_w * 
channel_factor + addr_incr_end_window_row = ( + channel_factor + + (w - kernel_width) * channel_factor + + (dilation_h - 1) * w * channel_factor + ) + addr_incr_end_simd = -buffer_min_size + (channel_factor + 1) + + # sanity check for wrap logic + assert not ( + abs(addr_incr_end_window) > buffer_actual_size + ), "ERROR: W increment > buffer size, try setting parallel_window=1" + assert not ( +> abs(addr_incr_end_row) > buffer_actual_size + ), "ERROR: H increment > buffer size, try setting parallel_window=1" +E AssertionError: ERROR: H increment > buffer size, try setting parallel_window=1 + +../../src/finn/custom_op/fpgadataflow/rtl/convolutioninputgenerator_rtl.py:378: AssertionError +_ test_fpgadataflow_analytical_characterization_slidingwindow[False-1-1-1-1-dilation0-stride1-4-ifm_dim0-k0-idt0] _ + +idt = INT2, k = [2, 2], ifm_dim = [8, 8], ifm_ch = 4, stride = [2, 2] +dilation = [1, 1], simd = 1, dw = 1, parallel_window = 1, m = 1, flip = False + + @pytest.mark.parametrize("idt", [DataType["INT2"]]) + # kernel size + @pytest.mark.parametrize("k", [[2, 2], [3, 3], [1, 5]]) + # input dimension + @pytest.mark.parametrize("ifm_dim", [[8, 8], [1, 21]]) + # input channels + @pytest.mark.parametrize("ifm_ch", [2, 4]) + # Stride + @pytest.mark.parametrize("stride", [[1, 1], [2, 2]]) + # Dilation + @pytest.mark.parametrize("dilation", [[1, 1], [2, 2]]) + # input channel parallelism ("SIMD") + @pytest.mark.parametrize("simd", [1, 2]) + # depthwise + @pytest.mark.parametrize("dw", [0, 1]) + # parallel_window enable (MMV_out = M*K) + @pytest.mark.parametrize("parallel_window", [0, 1]) + # in/out MMV ("M") + @pytest.mark.parametrize("m", [1]) + # Flip dimensions + @pytest.mark.parametrize("flip", [False]) + @pytest.mark.fpgadataflow + @pytest.mark.slow + @pytest.mark.vivado + @pytest.mark.node_tree_modeling + def test_fpgadataflow_analytical_characterization_slidingwindow( + idt, + k, + ifm_dim, + ifm_ch, + stride, + dilation, + simd, + dw, + parallel_window, + m, + flip, 
+ ): + if flip: + if ( + ifm_dim[0] == ifm_dim[1] + and k[0] == k[1] + and stride[0] == stride[1] + and dilation[0] == dilation[1] + ): + pytest.skip("Dimension flip would have no effect") + k = k[::-1] + ifm_dim = ifm_dim[::-1] + stride = stride[::-1] + dilation = dilation[::-1] + + k_h, k_w = k + ifm_dim_h, ifm_dim_w = ifm_dim + stride_h, stride_w = stride + dilation_h, dilation_w = dilation + + kernel_width = (k_w - 1) * dilation_w + 1 # incl. dilation + kernel_height = (k_h - 1) * dilation_h + 1 # incl. dilation + + if simd > ifm_ch: + pytest.skip("SIMD cannot be larger than number of input channels") + if ifm_ch % simd != 0: + pytest.skip("SIMD must divide number of input channels") + if kernel_height > ifm_dim_h or stride_h > ifm_dim_h: + pytest.skip("Illegal convolution configuration: kernel or stride > FM dimension") + if kernel_width > ifm_dim_w or stride_w > ifm_dim_w: + pytest.skip("Illegal convolution configuration: kernel or stride > FM dimension") + if (k_h == 1 and dilation_h != 1) or (k_w == 1 and dilation_w != 1): + pytest.skip("Illegal convolution configuration: dilation for unitary kernel dim") + if ((stride_h > k_h) or (stride_w > k_w)) and not (parallel_window or (k_h == 1 and k_w == 1)): + pytest.skip("Not all combinations for stride > k edge case supported in default mode") + if parallel_window and simd != ifm_ch and not (dw or (k_h == 1 and k_w == 1)): + pytest.skip("Parallel window requires SIMD=C for non-depthwise case") + + ofm_dim_h = compute_conv_output_dim(ifm_dim_h, k_h, stride_h, 0, dilation_h) + ofm_dim_w = compute_conv_output_dim(ifm_dim_w, k_w, stride_w, 0, dilation_w) + ofm_dim = [ofm_dim_h, ofm_dim_w] + + model = make_single_im2col_modelwrapper(k, ifm_ch, ifm_dim, ofm_dim, stride, dilation, idt, dw) + + + model = model.transform(to_hw.InferConvInpGen()) + + # set simd + inst = getCustomOp(model.graph.node[0]) + inst.set_nodeattr("SIMD", simd) + optype = model.graph.node[0].op_type + if optype == "ConvolutionInputGenerator_rtl": 
+ inst.set_nodeattr("parallel_window", parallel_window) + inst.set_nodeattr("M", m) + if optype == "ConvolutionInputGenerator_hls": + if inst.get_nodeattr("is1D"): + inst.set_nodeattr("parallel_window", parallel_window) + + node_details = ( + "ConvolutionInputGenerator", + k, + ifm_ch, + ifm_dim, + ofm_dim, + stride, + dilation, + idt, + dw, + "hls", + ) + part = "xc7z020clg400-1" + target_clk_ns = 4 + max_allowed_volume_delta = 40 + +> assert tree_model_test( + model, node_details, part, target_clk_ns, max_allowed_volume_delta,0,True + ), "characterized TAV does not match RTLsim'd one!" +E AssertionError: characterized TAV does not match RTLsim'd one! +E assert False +E + where False = tree_model_test(, ('ConvolutionInputGenerator', [2, 2], 4, [8, 8], [4, 4], [2, 2], ...), 'xc7z020clg400-1', 4, 40, 0, True) + +test_fpgadataflow_convinputgenerator.py:343: AssertionError +_ test_fpgadataflow_analytical_characterization_slidingwindow[False-1-1-1-1-dilation0-stride1-4-ifm_dim0-k1-idt0] _ + +idt = INT2, k = [3, 3], ifm_dim = [8, 8], ifm_ch = 4, stride = [2, 2] +dilation = [1, 1], simd = 1, dw = 1, parallel_window = 1, m = 1, flip = False + + @pytest.mark.parametrize("idt", [DataType["INT2"]]) + # kernel size + @pytest.mark.parametrize("k", [[2, 2], [3, 3], [1, 5]]) + # input dimension + @pytest.mark.parametrize("ifm_dim", [[8, 8], [1, 21]]) + # input channels + @pytest.mark.parametrize("ifm_ch", [2, 4]) + # Stride + @pytest.mark.parametrize("stride", [[1, 1], [2, 2]]) + # Dilation + @pytest.mark.parametrize("dilation", [[1, 1], [2, 2]]) + # input channel parallelism ("SIMD") + @pytest.mark.parametrize("simd", [1, 2]) + # depthwise + @pytest.mark.parametrize("dw", [0, 1]) + # parallel_window enable (MMV_out = M*K) + @pytest.mark.parametrize("parallel_window", [0, 1]) + # in/out MMV ("M") + @pytest.mark.parametrize("m", [1]) + # Flip dimensions + @pytest.mark.parametrize("flip", [False]) + @pytest.mark.fpgadataflow + @pytest.mark.slow + @pytest.mark.vivado + 
@pytest.mark.node_tree_modeling + def test_fpgadataflow_analytical_characterization_slidingwindow( + idt, + k, + ifm_dim, + ifm_ch, + stride, + dilation, + simd, + dw, + parallel_window, + m, + flip, + ): + if flip: + if ( + ifm_dim[0] == ifm_dim[1] + and k[0] == k[1] + and stride[0] == stride[1] + and dilation[0] == dilation[1] + ): + pytest.skip("Dimension flip would have no effect") + k = k[::-1] + ifm_dim = ifm_dim[::-1] + stride = stride[::-1] + dilation = dilation[::-1] + + k_h, k_w = k + ifm_dim_h, ifm_dim_w = ifm_dim + stride_h, stride_w = stride + dilation_h, dilation_w = dilation + + kernel_width = (k_w - 1) * dilation_w + 1 # incl. dilation + kernel_height = (k_h - 1) * dilation_h + 1 # incl. dilation + + if simd > ifm_ch: + pytest.skip("SIMD cannot be larger than number of input channels") + if ifm_ch % simd != 0: + pytest.skip("SIMD must divide number of input channels") + if kernel_height > ifm_dim_h or stride_h > ifm_dim_h: + pytest.skip("Illegal convolution configuration: kernel or stride > FM dimension") + if kernel_width > ifm_dim_w or stride_w > ifm_dim_w: + pytest.skip("Illegal convolution configuration: kernel or stride > FM dimension") + if (k_h == 1 and dilation_h != 1) or (k_w == 1 and dilation_w != 1): + pytest.skip("Illegal convolution configuration: dilation for unitary kernel dim") + if ((stride_h > k_h) or (stride_w > k_w)) and not (parallel_window or (k_h == 1 and k_w == 1)): + pytest.skip("Not all combinations for stride > k edge case supported in default mode") + if parallel_window and simd != ifm_ch and not (dw or (k_h == 1 and k_w == 1)): + pytest.skip("Parallel window requires SIMD=C for non-depthwise case") + + ofm_dim_h = compute_conv_output_dim(ifm_dim_h, k_h, stride_h, 0, dilation_h) + ofm_dim_w = compute_conv_output_dim(ifm_dim_w, k_w, stride_w, 0, dilation_w) + ofm_dim = [ofm_dim_h, ofm_dim_w] + + model = make_single_im2col_modelwrapper(k, ifm_ch, ifm_dim, ofm_dim, stride, dilation, idt, dw) + + + model = 
model.transform(to_hw.InferConvInpGen()) + + # set simd + inst = getCustomOp(model.graph.node[0]) + inst.set_nodeattr("SIMD", simd) + optype = model.graph.node[0].op_type + if optype == "ConvolutionInputGenerator_rtl": + inst.set_nodeattr("parallel_window", parallel_window) + inst.set_nodeattr("M", m) + if optype == "ConvolutionInputGenerator_hls": + if inst.get_nodeattr("is1D"): + inst.set_nodeattr("parallel_window", parallel_window) + + node_details = ( + "ConvolutionInputGenerator", + k, + ifm_ch, + ifm_dim, + ofm_dim, + stride, + dilation, + idt, + dw, + "hls", + ) + part = "xc7z020clg400-1" + target_clk_ns = 4 + max_allowed_volume_delta = 40 + +> assert tree_model_test( + model, node_details, part, target_clk_ns, max_allowed_volume_delta,0,True + ), "characterized TAV does not match RTLsim'd one!" +E AssertionError: characterized TAV does not match RTLsim'd one! +E assert False +E + where False = tree_model_test(, ('ConvolutionInputGenerator', [3, 3], 4, [8, 8], [3, 3], [2, 2], ...), 'xc7z020clg400-1', 4, 40, 0, True) + +test_fpgadataflow_convinputgenerator.py:343: AssertionError +_ test_fpgadataflow_analytical_characterization_slidingwindow[False-1-1-1-1-dilation0-stride1-4-ifm_dim0-k2-idt0] _ + +idt = INT2, k = [1, 5], ifm_dim = [8, 8], ifm_ch = 4, stride = [2, 2] +dilation = [1, 1], simd = 1, dw = 1, parallel_window = 1, m = 1, flip = False + + @pytest.mark.parametrize("idt", [DataType["INT2"]]) + # kernel size + @pytest.mark.parametrize("k", [[2, 2], [3, 3], [1, 5]]) + # input dimension + @pytest.mark.parametrize("ifm_dim", [[8, 8], [1, 21]]) + # input channels + @pytest.mark.parametrize("ifm_ch", [2, 4]) + # Stride + @pytest.mark.parametrize("stride", [[1, 1], [2, 2]]) + # Dilation + @pytest.mark.parametrize("dilation", [[1, 1], [2, 2]]) + # input channel parallelism ("SIMD") + @pytest.mark.parametrize("simd", [1, 2]) + # depthwise + @pytest.mark.parametrize("dw", [0, 1]) + # parallel_window enable (MMV_out = M*K) + 
@pytest.mark.parametrize("parallel_window", [0, 1]) + # in/out MMV ("M") + @pytest.mark.parametrize("m", [1]) + # Flip dimensions + @pytest.mark.parametrize("flip", [False]) + @pytest.mark.fpgadataflow + @pytest.mark.slow + @pytest.mark.vivado + @pytest.mark.node_tree_modeling + def test_fpgadataflow_analytical_characterization_slidingwindow( + idt, + k, + ifm_dim, + ifm_ch, + stride, + dilation, + simd, + dw, + parallel_window, + m, + flip, + ): + if flip: + if ( + ifm_dim[0] == ifm_dim[1] + and k[0] == k[1] + and stride[0] == stride[1] + and dilation[0] == dilation[1] + ): + pytest.skip("Dimension flip would have no effect") + k = k[::-1] + ifm_dim = ifm_dim[::-1] + stride = stride[::-1] + dilation = dilation[::-1] + + k_h, k_w = k + ifm_dim_h, ifm_dim_w = ifm_dim + stride_h, stride_w = stride + dilation_h, dilation_w = dilation + + kernel_width = (k_w - 1) * dilation_w + 1 # incl. dilation + kernel_height = (k_h - 1) * dilation_h + 1 # incl. dilation + + if simd > ifm_ch: + pytest.skip("SIMD cannot be larger than number of input channels") + if ifm_ch % simd != 0: + pytest.skip("SIMD must divide number of input channels") + if kernel_height > ifm_dim_h or stride_h > ifm_dim_h: + pytest.skip("Illegal convolution configuration: kernel or stride > FM dimension") + if kernel_width > ifm_dim_w or stride_w > ifm_dim_w: + pytest.skip("Illegal convolution configuration: kernel or stride > FM dimension") + if (k_h == 1 and dilation_h != 1) or (k_w == 1 and dilation_w != 1): + pytest.skip("Illegal convolution configuration: dilation for unitary kernel dim") + if ((stride_h > k_h) or (stride_w > k_w)) and not (parallel_window or (k_h == 1 and k_w == 1)): + pytest.skip("Not all combinations for stride > k edge case supported in default mode") + if parallel_window and simd != ifm_ch and not (dw or (k_h == 1 and k_w == 1)): + pytest.skip("Parallel window requires SIMD=C for non-depthwise case") + + ofm_dim_h = compute_conv_output_dim(ifm_dim_h, k_h, stride_h, 0, dilation_h) + 
ofm_dim_w = compute_conv_output_dim(ifm_dim_w, k_w, stride_w, 0, dilation_w) + ofm_dim = [ofm_dim_h, ofm_dim_w] + + model = make_single_im2col_modelwrapper(k, ifm_ch, ifm_dim, ofm_dim, stride, dilation, idt, dw) + + + model = model.transform(to_hw.InferConvInpGen()) + + # set simd + inst = getCustomOp(model.graph.node[0]) + inst.set_nodeattr("SIMD", simd) + optype = model.graph.node[0].op_type + if optype == "ConvolutionInputGenerator_rtl": + inst.set_nodeattr("parallel_window", parallel_window) + inst.set_nodeattr("M", m) + if optype == "ConvolutionInputGenerator_hls": + if inst.get_nodeattr("is1D"): + inst.set_nodeattr("parallel_window", parallel_window) + + node_details = ( + "ConvolutionInputGenerator", + k, + ifm_ch, + ifm_dim, + ofm_dim, + stride, + dilation, + idt, + dw, + "hls", + ) + part = "xc7z020clg400-1" + target_clk_ns = 4 + max_allowed_volume_delta = 40 + +> assert tree_model_test( + model, node_details, part, target_clk_ns, max_allowed_volume_delta,0,True + ), "characterized TAV does not match RTLsim'd one!" 
+ +test_fpgadataflow_convinputgenerator.py:343: +_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ +../../src/finn/util/test.py:423: in tree_model_test + node_rtlsim = get_characteristic_fnc( +../../src/finn/util/test.py:264: in get_characteristic_fnc + _codegen_single_node(node, model, part, target_clk_ns) +../../src/finn/transformation/fpgadataflow/prepare_ip.py:54: in _codegen_single_node + inst.code_generation_ipgen(model, fpgapart, clk) +../../src/finn/custom_op/fpgadataflow/rtlbackend.py:88: in code_generation_ipgen + self.generate_hdl(model, fpgapart, clk) +../../src/finn/custom_op/fpgadataflow/rtl/convolutioninputgenerator_rtl.py:798: in generate_hdl + template_path, code_gen_dict = self.prepare_codegen_default() +_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ + +self = + + def prepare_codegen_default(self): + """Fills code generation dict for the default implementation style by computing + the incremental addressing scheme for the circular buffer.""" + if self.get_nodeattr("dynamic_mode"): + template_select = "/finn-rtllib/swg/swg_template_default_dynamic.sv" + else: + template_select = "/finn-rtllib/swg/swg_template_default.sv" + template_path = os.environ["FINN_ROOT"] + template_select + code_gen_dict = {} + + ifm_ch = self.get_nodeattr("IFMChannels") + k = self.get_nodeattr("ConvKernelDim") + ifm_dim = self.get_nodeattr("IFMDim") + stride = self.get_nodeattr("Stride") + dilation = self.get_nodeattr("Dilation") + depthwise = self.get_nodeattr("depthwise") + simd = self.get_nodeattr("SIMD") + + k_h, k_w = k + h, w = ifm_dim + pad = [0, 0, 0, 0] # padding happens in separate padding node for now + stride_h, stride_w = stride + dilation_h, dilation_w = dilation + pad_h = pad[0] + pad[2] + pad_w = pad[1] + pad[3] + out_dim_h = im2col.compute_conv_output_dim(h, k_h, stride_h, pad_h, dilation_h) + out_dim_w = im2col.compute_conv_output_dim(w, k_w, stride_w, pad_w, dilation_w) + mmv_in = 1 + mmv_out 
= 1 + channel_factor = int(ifm_ch / simd) + + # compute minimal buffer length (assuming it holds 1 complete window) + buffer_min_size = ((k_h - 1) * dilation_h * w + (k_w - 1) * dilation_w + 1) * channel_factor + + buffer_actual_size = self.get_buffer_depth() + code_gen_dict["$BUF_ELEM_TOTAL$"] = [str(buffer_actual_size)] + + # compute some intermediate values, e.g., kernel "width" = k_w incl. dilation + # or cols/rows that are skipped due to imperfect stride<->dim combination + kernel_width = (k_w - 1) * dilation_w + 1 + kernel_height = (k_h - 1) * dilation_h + 1 + skip_columns = w % (kernel_width + (out_dim_w - 1) * stride_w) + skip_rows = h % (kernel_height + (out_dim_h - 1) * stride_h) + + # compute address increment values for 5-loop nest + addr_incr_end_simd = 1 + addr_incr_end_window_elem = (dilation_w - 1) * channel_factor + 1 + addr_incr_end_window_row = ( + ((w - kernel_width) * channel_factor) # remaining line + + ((dilation_h - 1) * w * channel_factor) # skip lines + + 1 # wrap-around of minimally sized buffer + ) + addr_incr_end_window = -buffer_min_size + stride_w * channel_factor + 1 + addr_incr_end_row = ( + -buffer_min_size + + ((skip_columns + kernel_width) * channel_factor) # remaining line + + ((stride_h - 1) * w * channel_factor) # skip lines + + 1 + ) + + # re-use same controller structure -> re-assign address increments + if depthwise: + addr_incr_end_window_elem = dilation_w * channel_factor + addr_incr_end_window_row = ( + channel_factor + + (w - kernel_width) * channel_factor + + (dilation_h - 1) * w * channel_factor + ) + addr_incr_end_simd = -buffer_min_size + (channel_factor + 1) + + # sanity check for wrap logic + assert not ( + abs(addr_incr_end_window) > buffer_actual_size + ), "ERROR: W increment > buffer size, try setting parallel_window=1" + assert not ( +> abs(addr_incr_end_row) > buffer_actual_size + ), "ERROR: H increment > buffer size, try setting parallel_window=1" +E AssertionError: ERROR: H increment > buffer size, try 
setting parallel_window=1 + +../../src/finn/custom_op/fpgadataflow/rtl/convolutioninputgenerator_rtl.py:378: AssertionError +_ test_fpgadataflow_analytical_characterization_slidingwindow[False-1-1-1-1-dilation1-stride0-2-ifm_dim0-k1-idt0] _ + +idt = INT2, k = [3, 3], ifm_dim = [8, 8], ifm_ch = 2, stride = [1, 1] +dilation = [2, 2], simd = 1, dw = 1, parallel_window = 1, m = 1, flip = False + + @pytest.mark.parametrize("idt", [DataType["INT2"]]) + # kernel size + @pytest.mark.parametrize("k", [[2, 2], [3, 3], [1, 5]]) + # input dimension + @pytest.mark.parametrize("ifm_dim", [[8, 8], [1, 21]]) + # input channels + @pytest.mark.parametrize("ifm_ch", [2, 4]) + # Stride + @pytest.mark.parametrize("stride", [[1, 1], [2, 2]]) + # Dilation + @pytest.mark.parametrize("dilation", [[1, 1], [2, 2]]) + # input channel parallelism ("SIMD") + @pytest.mark.parametrize("simd", [1, 2]) + # depthwise + @pytest.mark.parametrize("dw", [0, 1]) + # parallel_window enable (MMV_out = M*K) + @pytest.mark.parametrize("parallel_window", [0, 1]) + # in/out MMV ("M") + @pytest.mark.parametrize("m", [1]) + # Flip dimensions + @pytest.mark.parametrize("flip", [False]) + @pytest.mark.fpgadataflow + @pytest.mark.slow + @pytest.mark.vivado + @pytest.mark.node_tree_modeling + def test_fpgadataflow_analytical_characterization_slidingwindow( + idt, + k, + ifm_dim, + ifm_ch, + stride, + dilation, + simd, + dw, + parallel_window, + m, + flip, + ): + if flip: + if ( + ifm_dim[0] == ifm_dim[1] + and k[0] == k[1] + and stride[0] == stride[1] + and dilation[0] == dilation[1] + ): + pytest.skip("Dimension flip would have no effect") + k = k[::-1] + ifm_dim = ifm_dim[::-1] + stride = stride[::-1] + dilation = dilation[::-1] + + k_h, k_w = k + ifm_dim_h, ifm_dim_w = ifm_dim + stride_h, stride_w = stride + dilation_h, dilation_w = dilation + + kernel_width = (k_w - 1) * dilation_w + 1 # incl. dilation + kernel_height = (k_h - 1) * dilation_h + 1 # incl. 
dilation + + if simd > ifm_ch: + pytest.skip("SIMD cannot be larger than number of input channels") + if ifm_ch % simd != 0: + pytest.skip("SIMD must divide number of input channels") + if kernel_height > ifm_dim_h or stride_h > ifm_dim_h: + pytest.skip("Illegal convolution configuration: kernel or stride > FM dimension") + if kernel_width > ifm_dim_w or stride_w > ifm_dim_w: + pytest.skip("Illegal convolution configuration: kernel or stride > FM dimension") + if (k_h == 1 and dilation_h != 1) or (k_w == 1 and dilation_w != 1): + pytest.skip("Illegal convolution configuration: dilation for unitary kernel dim") + if ((stride_h > k_h) or (stride_w > k_w)) and not (parallel_window or (k_h == 1 and k_w == 1)): + pytest.skip("Not all combinations for stride > k edge case supported in default mode") + if parallel_window and simd != ifm_ch and not (dw or (k_h == 1 and k_w == 1)): + pytest.skip("Parallel window requires SIMD=C for non-depthwise case") + + ofm_dim_h = compute_conv_output_dim(ifm_dim_h, k_h, stride_h, 0, dilation_h) + ofm_dim_w = compute_conv_output_dim(ifm_dim_w, k_w, stride_w, 0, dilation_w) + ofm_dim = [ofm_dim_h, ofm_dim_w] + + model = make_single_im2col_modelwrapper(k, ifm_ch, ifm_dim, ofm_dim, stride, dilation, idt, dw) + + + model = model.transform(to_hw.InferConvInpGen()) + + # set simd + inst = getCustomOp(model.graph.node[0]) + inst.set_nodeattr("SIMD", simd) + optype = model.graph.node[0].op_type + if optype == "ConvolutionInputGenerator_rtl": + inst.set_nodeattr("parallel_window", parallel_window) + inst.set_nodeattr("M", m) + if optype == "ConvolutionInputGenerator_hls": + if inst.get_nodeattr("is1D"): + inst.set_nodeattr("parallel_window", parallel_window) + + node_details = ( + "ConvolutionInputGenerator", + k, + ifm_ch, + ifm_dim, + ofm_dim, + stride, + dilation, + idt, + dw, + "hls", + ) + part = "xc7z020clg400-1" + target_clk_ns = 4 + max_allowed_volume_delta = 40 + +> assert tree_model_test( + model, node_details, part, target_clk_ns, 
max_allowed_volume_delta,0,True + ), "characterized TAV does not match RTLsim'd one!" +E AssertionError: characterized TAV does not match RTLsim'd one! +E assert False +E + where False = tree_model_test(, ('ConvolutionInputGenerator', [3, 3], 2, [8, 8], [4, 4], [1, 1], ...), 'xc7z020clg400-1', 4, 40, 0, True) + +test_fpgadataflow_convinputgenerator.py:343: AssertionError +_ test_fpgadataflow_analytical_characterization_slidingwindow[False-1-1-1-1-dilation1-stride0-4-ifm_dim0-k0-idt0] _ + +idt = INT2, k = [2, 2], ifm_dim = [8, 8], ifm_ch = 4, stride = [1, 1] +dilation = [2, 2], simd = 1, dw = 1, parallel_window = 1, m = 1, flip = False + + @pytest.mark.parametrize("idt", [DataType["INT2"]]) + # kernel size + @pytest.mark.parametrize("k", [[2, 2], [3, 3], [1, 5]]) + # input dimension + @pytest.mark.parametrize("ifm_dim", [[8, 8], [1, 21]]) + # input channels + @pytest.mark.parametrize("ifm_ch", [2, 4]) + # Stride + @pytest.mark.parametrize("stride", [[1, 1], [2, 2]]) + # Dilation + @pytest.mark.parametrize("dilation", [[1, 1], [2, 2]]) + # input channel parallelism ("SIMD") + @pytest.mark.parametrize("simd", [1, 2]) + # depthwise + @pytest.mark.parametrize("dw", [0, 1]) + # parallel_window enable (MMV_out = M*K) + @pytest.mark.parametrize("parallel_window", [0, 1]) + # in/out MMV ("M") + @pytest.mark.parametrize("m", [1]) + # Flip dimensions + @pytest.mark.parametrize("flip", [False]) + @pytest.mark.fpgadataflow + @pytest.mark.slow + @pytest.mark.vivado + @pytest.mark.node_tree_modeling + def test_fpgadataflow_analytical_characterization_slidingwindow( + idt, + k, + ifm_dim, + ifm_ch, + stride, + dilation, + simd, + dw, + parallel_window, + m, + flip, + ): + if flip: + if ( + ifm_dim[0] == ifm_dim[1] + and k[0] == k[1] + and stride[0] == stride[1] + and dilation[0] == dilation[1] + ): + pytest.skip("Dimension flip would have no effect") + k = k[::-1] + ifm_dim = ifm_dim[::-1] + stride = stride[::-1] + dilation = dilation[::-1] + + k_h, k_w = k + ifm_dim_h, ifm_dim_w 
= ifm_dim + stride_h, stride_w = stride + dilation_h, dilation_w = dilation + + kernel_width = (k_w - 1) * dilation_w + 1 # incl. dilation + kernel_height = (k_h - 1) * dilation_h + 1 # incl. dilation + + if simd > ifm_ch: + pytest.skip("SIMD cannot be larger than number of input channels") + if ifm_ch % simd != 0: + pytest.skip("SIMD must divide number of input channels") + if kernel_height > ifm_dim_h or stride_h > ifm_dim_h: + pytest.skip("Illegal convolution configuration: kernel or stride > FM dimension") + if kernel_width > ifm_dim_w or stride_w > ifm_dim_w: + pytest.skip("Illegal convolution configuration: kernel or stride > FM dimension") + if (k_h == 1 and dilation_h != 1) or (k_w == 1 and dilation_w != 1): + pytest.skip("Illegal convolution configuration: dilation for unitary kernel dim") + if ((stride_h > k_h) or (stride_w > k_w)) and not (parallel_window or (k_h == 1 and k_w == 1)): + pytest.skip("Not all combinations for stride > k edge case supported in default mode") + if parallel_window and simd != ifm_ch and not (dw or (k_h == 1 and k_w == 1)): + pytest.skip("Parallel window requires SIMD=C for non-depthwise case") + + ofm_dim_h = compute_conv_output_dim(ifm_dim_h, k_h, stride_h, 0, dilation_h) + ofm_dim_w = compute_conv_output_dim(ifm_dim_w, k_w, stride_w, 0, dilation_w) + ofm_dim = [ofm_dim_h, ofm_dim_w] + + model = make_single_im2col_modelwrapper(k, ifm_ch, ifm_dim, ofm_dim, stride, dilation, idt, dw) + + + model = model.transform(to_hw.InferConvInpGen()) + + # set simd + inst = getCustomOp(model.graph.node[0]) + inst.set_nodeattr("SIMD", simd) + optype = model.graph.node[0].op_type + if optype == "ConvolutionInputGenerator_rtl": + inst.set_nodeattr("parallel_window", parallel_window) + inst.set_nodeattr("M", m) + if optype == "ConvolutionInputGenerator_hls": + if inst.get_nodeattr("is1D"): + inst.set_nodeattr("parallel_window", parallel_window) + + node_details = ( + "ConvolutionInputGenerator", + k, + ifm_ch, + ifm_dim, + ofm_dim, + stride, + 
dilation, + idt, + dw, + "hls", + ) + part = "xc7z020clg400-1" + target_clk_ns = 4 + max_allowed_volume_delta = 40 + +> assert tree_model_test( + model, node_details, part, target_clk_ns, max_allowed_volume_delta,0,True + ), "characterized TAV does not match RTLsim'd one!" +E AssertionError: characterized TAV does not match RTLsim'd one! +E assert False +E + where False = tree_model_test(, ('ConvolutionInputGenerator', [2, 2], 4, [8, 8], [6, 6], [1, 1], ...), 'xc7z020clg400-1', 4, 40, 0, True) + +test_fpgadataflow_convinputgenerator.py:343: AssertionError +_ test_fpgadataflow_analytical_characterization_slidingwindow[False-1-1-1-1-dilation1-stride0-4-ifm_dim0-k1-idt0] _ + +idt = INT2, k = [3, 3], ifm_dim = [8, 8], ifm_ch = 4, stride = [1, 1] +dilation = [2, 2], simd = 1, dw = 1, parallel_window = 1, m = 1, flip = False + + @pytest.mark.parametrize("idt", [DataType["INT2"]]) + # kernel size + @pytest.mark.parametrize("k", [[2, 2], [3, 3], [1, 5]]) + # input dimension + @pytest.mark.parametrize("ifm_dim", [[8, 8], [1, 21]]) + # input channels + @pytest.mark.parametrize("ifm_ch", [2, 4]) + # Stride + @pytest.mark.parametrize("stride", [[1, 1], [2, 2]]) + # Dilation + @pytest.mark.parametrize("dilation", [[1, 1], [2, 2]]) + # input channel parallelism ("SIMD") + @pytest.mark.parametrize("simd", [1, 2]) + # depthwise + @pytest.mark.parametrize("dw", [0, 1]) + # parallel_window enable (MMV_out = M*K) + @pytest.mark.parametrize("parallel_window", [0, 1]) + # in/out MMV ("M") + @pytest.mark.parametrize("m", [1]) + # Flip dimensions + @pytest.mark.parametrize("flip", [False]) + @pytest.mark.fpgadataflow + @pytest.mark.slow + @pytest.mark.vivado + @pytest.mark.node_tree_modeling + def test_fpgadataflow_analytical_characterization_slidingwindow( + idt, + k, + ifm_dim, + ifm_ch, + stride, + dilation, + simd, + dw, + parallel_window, + m, + flip, + ): + if flip: + if ( + ifm_dim[0] == ifm_dim[1] + and k[0] == k[1] + and stride[0] == stride[1] + and dilation[0] == dilation[1] + 
): + pytest.skip("Dimension flip would have no effect") + k = k[::-1] + ifm_dim = ifm_dim[::-1] + stride = stride[::-1] + dilation = dilation[::-1] + + k_h, k_w = k + ifm_dim_h, ifm_dim_w = ifm_dim + stride_h, stride_w = stride + dilation_h, dilation_w = dilation + + kernel_width = (k_w - 1) * dilation_w + 1 # incl. dilation + kernel_height = (k_h - 1) * dilation_h + 1 # incl. dilation + + if simd > ifm_ch: + pytest.skip("SIMD cannot be larger than number of input channels") + if ifm_ch % simd != 0: + pytest.skip("SIMD must divide number of input channels") + if kernel_height > ifm_dim_h or stride_h > ifm_dim_h: + pytest.skip("Illegal convolution configuration: kernel or stride > FM dimension") + if kernel_width > ifm_dim_w or stride_w > ifm_dim_w: + pytest.skip("Illegal convolution configuration: kernel or stride > FM dimension") + if (k_h == 1 and dilation_h != 1) or (k_w == 1 and dilation_w != 1): + pytest.skip("Illegal convolution configuration: dilation for unitary kernel dim") + if ((stride_h > k_h) or (stride_w > k_w)) and not (parallel_window or (k_h == 1 and k_w == 1)): + pytest.skip("Not all combinations for stride > k edge case supported in default mode") + if parallel_window and simd != ifm_ch and not (dw or (k_h == 1 and k_w == 1)): + pytest.skip("Parallel window requires SIMD=C for non-depthwise case") + + ofm_dim_h = compute_conv_output_dim(ifm_dim_h, k_h, stride_h, 0, dilation_h) + ofm_dim_w = compute_conv_output_dim(ifm_dim_w, k_w, stride_w, 0, dilation_w) + ofm_dim = [ofm_dim_h, ofm_dim_w] + + model = make_single_im2col_modelwrapper(k, ifm_ch, ifm_dim, ofm_dim, stride, dilation, idt, dw) + + + model = model.transform(to_hw.InferConvInpGen()) + + # set simd + inst = getCustomOp(model.graph.node[0]) + inst.set_nodeattr("SIMD", simd) + optype = model.graph.node[0].op_type + if optype == "ConvolutionInputGenerator_rtl": + inst.set_nodeattr("parallel_window", parallel_window) + inst.set_nodeattr("M", m) + if optype == "ConvolutionInputGenerator_hls": + 
if inst.get_nodeattr("is1D"): + inst.set_nodeattr("parallel_window", parallel_window) + + node_details = ( + "ConvolutionInputGenerator", + k, + ifm_ch, + ifm_dim, + ofm_dim, + stride, + dilation, + idt, + dw, + "hls", + ) + part = "xc7z020clg400-1" + target_clk_ns = 4 + max_allowed_volume_delta = 40 + +> assert tree_model_test( + model, node_details, part, target_clk_ns, max_allowed_volume_delta,0,True + ), "characterized TAV does not match RTLsim'd one!" +E AssertionError: characterized TAV does not match RTLsim'd one! +E assert False +E + where False = tree_model_test(, ('ConvolutionInputGenerator', [3, 3], 4, [8, 8], [4, 4], [1, 1], ...), 'xc7z020clg400-1', 4, 40, 0, True) + +test_fpgadataflow_convinputgenerator.py:343: AssertionError +_ test_fpgadataflow_analytical_characterization_slidingwindow[False-1-1-1-1-dilation1-stride1-2-ifm_dim0-k1-idt0] _ + +idt = INT2, k = [3, 3], ifm_dim = [8, 8], ifm_ch = 2, stride = [2, 2] +dilation = [2, 2], simd = 1, dw = 1, parallel_window = 1, m = 1, flip = False + + @pytest.mark.parametrize("idt", [DataType["INT2"]]) + # kernel size + @pytest.mark.parametrize("k", [[2, 2], [3, 3], [1, 5]]) + # input dimension + @pytest.mark.parametrize("ifm_dim", [[8, 8], [1, 21]]) + # input channels + @pytest.mark.parametrize("ifm_ch", [2, 4]) + # Stride + @pytest.mark.parametrize("stride", [[1, 1], [2, 2]]) + # Dilation + @pytest.mark.parametrize("dilation", [[1, 1], [2, 2]]) + # input channel parallelism ("SIMD") + @pytest.mark.parametrize("simd", [1, 2]) + # depthwise + @pytest.mark.parametrize("dw", [0, 1]) + # parallel_window enable (MMV_out = M*K) + @pytest.mark.parametrize("parallel_window", [0, 1]) + # in/out MMV ("M") + @pytest.mark.parametrize("m", [1]) + # Flip dimensions + @pytest.mark.parametrize("flip", [False]) + @pytest.mark.fpgadataflow + @pytest.mark.slow + @pytest.mark.vivado + @pytest.mark.node_tree_modeling + def test_fpgadataflow_analytical_characterization_slidingwindow( + idt, + k, + ifm_dim, + ifm_ch, + stride, + 
dilation, + simd, + dw, + parallel_window, + m, + flip, + ): + if flip: + if ( + ifm_dim[0] == ifm_dim[1] + and k[0] == k[1] + and stride[0] == stride[1] + and dilation[0] == dilation[1] + ): + pytest.skip("Dimension flip would have no effect") + k = k[::-1] + ifm_dim = ifm_dim[::-1] + stride = stride[::-1] + dilation = dilation[::-1] + + k_h, k_w = k + ifm_dim_h, ifm_dim_w = ifm_dim + stride_h, stride_w = stride + dilation_h, dilation_w = dilation + + kernel_width = (k_w - 1) * dilation_w + 1 # incl. dilation + kernel_height = (k_h - 1) * dilation_h + 1 # incl. dilation + + if simd > ifm_ch: + pytest.skip("SIMD cannot be larger than number of input channels") + if ifm_ch % simd != 0: + pytest.skip("SIMD must divide number of input channels") + if kernel_height > ifm_dim_h or stride_h > ifm_dim_h: + pytest.skip("Illegal convolution configuration: kernel or stride > FM dimension") + if kernel_width > ifm_dim_w or stride_w > ifm_dim_w: + pytest.skip("Illegal convolution configuration: kernel or stride > FM dimension") + if (k_h == 1 and dilation_h != 1) or (k_w == 1 and dilation_w != 1): + pytest.skip("Illegal convolution configuration: dilation for unitary kernel dim") + if ((stride_h > k_h) or (stride_w > k_w)) and not (parallel_window or (k_h == 1 and k_w == 1)): + pytest.skip("Not all combinations for stride > k edge case supported in default mode") + if parallel_window and simd != ifm_ch and not (dw or (k_h == 1 and k_w == 1)): + pytest.skip("Parallel window requires SIMD=C for non-depthwise case") + + ofm_dim_h = compute_conv_output_dim(ifm_dim_h, k_h, stride_h, 0, dilation_h) + ofm_dim_w = compute_conv_output_dim(ifm_dim_w, k_w, stride_w, 0, dilation_w) + ofm_dim = [ofm_dim_h, ofm_dim_w] + + model = make_single_im2col_modelwrapper(k, ifm_ch, ifm_dim, ofm_dim, stride, dilation, idt, dw) + + + model = model.transform(to_hw.InferConvInpGen()) + + # set simd + inst = getCustomOp(model.graph.node[0]) + inst.set_nodeattr("SIMD", simd) + optype = 
model.graph.node[0].op_type + if optype == "ConvolutionInputGenerator_rtl": + inst.set_nodeattr("parallel_window", parallel_window) + inst.set_nodeattr("M", m) + if optype == "ConvolutionInputGenerator_hls": + if inst.get_nodeattr("is1D"): + inst.set_nodeattr("parallel_window", parallel_window) + + node_details = ( + "ConvolutionInputGenerator", + k, + ifm_ch, + ifm_dim, + ofm_dim, + stride, + dilation, + idt, + dw, + "hls", + ) + part = "xc7z020clg400-1" + target_clk_ns = 4 + max_allowed_volume_delta = 40 + +> assert tree_model_test( + model, node_details, part, target_clk_ns, max_allowed_volume_delta,0,True + ), "characterized TAV does not match RTLsim'd one!" +E AssertionError: characterized TAV does not match RTLsim'd one! +E assert False +E + where False = tree_model_test(, ('ConvolutionInputGenerator', [3, 3], 2, [8, 8], [2, 2], [2, 2], ...), 'xc7z020clg400-1', 4, 40, 0, True) + +test_fpgadataflow_convinputgenerator.py:343: AssertionError +_ test_fpgadataflow_analytical_characterization_slidingwindow[False-1-1-1-1-dilation1-stride1-4-ifm_dim0-k1-idt0] _ + +idt = INT2, k = [3, 3], ifm_dim = [8, 8], ifm_ch = 4, stride = [2, 2] +dilation = [2, 2], simd = 1, dw = 1, parallel_window = 1, m = 1, flip = False + + @pytest.mark.parametrize("idt", [DataType["INT2"]]) + # kernel size + @pytest.mark.parametrize("k", [[2, 2], [3, 3], [1, 5]]) + # input dimension + @pytest.mark.parametrize("ifm_dim", [[8, 8], [1, 21]]) + # input channels + @pytest.mark.parametrize("ifm_ch", [2, 4]) + # Stride + @pytest.mark.parametrize("stride", [[1, 1], [2, 2]]) + # Dilation + @pytest.mark.parametrize("dilation", [[1, 1], [2, 2]]) + # input channel parallelism ("SIMD") + @pytest.mark.parametrize("simd", [1, 2]) + # depthwise + @pytest.mark.parametrize("dw", [0, 1]) + # parallel_window enable (MMV_out = M*K) + @pytest.mark.parametrize("parallel_window", [0, 1]) + # in/out MMV ("M") + @pytest.mark.parametrize("m", [1]) + # Flip dimensions + @pytest.mark.parametrize("flip", [False]) + 
@pytest.mark.fpgadataflow + @pytest.mark.slow + @pytest.mark.vivado + @pytest.mark.node_tree_modeling + def test_fpgadataflow_analytical_characterization_slidingwindow( + idt, + k, + ifm_dim, + ifm_ch, + stride, + dilation, + simd, + dw, + parallel_window, + m, + flip, + ): + if flip: + if ( + ifm_dim[0] == ifm_dim[1] + and k[0] == k[1] + and stride[0] == stride[1] + and dilation[0] == dilation[1] + ): + pytest.skip("Dimension flip would have no effect") + k = k[::-1] + ifm_dim = ifm_dim[::-1] + stride = stride[::-1] + dilation = dilation[::-1] + + k_h, k_w = k + ifm_dim_h, ifm_dim_w = ifm_dim + stride_h, stride_w = stride + dilation_h, dilation_w = dilation + + kernel_width = (k_w - 1) * dilation_w + 1 # incl. dilation + kernel_height = (k_h - 1) * dilation_h + 1 # incl. dilation + + if simd > ifm_ch: + pytest.skip("SIMD cannot be larger than number of input channels") + if ifm_ch % simd != 0: + pytest.skip("SIMD must divide number of input channels") + if kernel_height > ifm_dim_h or stride_h > ifm_dim_h: + pytest.skip("Illegal convolution configuration: kernel or stride > FM dimension") + if kernel_width > ifm_dim_w or stride_w > ifm_dim_w: + pytest.skip("Illegal convolution configuration: kernel or stride > FM dimension") + if (k_h == 1 and dilation_h != 1) or (k_w == 1 and dilation_w != 1): + pytest.skip("Illegal convolution configuration: dilation for unitary kernel dim") + if ((stride_h > k_h) or (stride_w > k_w)) and not (parallel_window or (k_h == 1 and k_w == 1)): + pytest.skip("Not all combinations for stride > k edge case supported in default mode") + if parallel_window and simd != ifm_ch and not (dw or (k_h == 1 and k_w == 1)): + pytest.skip("Parallel window requires SIMD=C for non-depthwise case") + + ofm_dim_h = compute_conv_output_dim(ifm_dim_h, k_h, stride_h, 0, dilation_h) + ofm_dim_w = compute_conv_output_dim(ifm_dim_w, k_w, stride_w, 0, dilation_w) + ofm_dim = [ofm_dim_h, ofm_dim_w] + + model = make_single_im2col_modelwrapper(k, ifm_ch, ifm_dim, 
ofm_dim, stride, dilation, idt, dw) + + + model = model.transform(to_hw.InferConvInpGen()) + + # set simd + inst = getCustomOp(model.graph.node[0]) + inst.set_nodeattr("SIMD", simd) + optype = model.graph.node[0].op_type + if optype == "ConvolutionInputGenerator_rtl": + inst.set_nodeattr("parallel_window", parallel_window) + inst.set_nodeattr("M", m) + if optype == "ConvolutionInputGenerator_hls": + if inst.get_nodeattr("is1D"): + inst.set_nodeattr("parallel_window", parallel_window) + + node_details = ( + "ConvolutionInputGenerator", + k, + ifm_ch, + ifm_dim, + ofm_dim, + stride, + dilation, + idt, + dw, + "hls", + ) + part = "xc7z020clg400-1" + target_clk_ns = 4 + max_allowed_volume_delta = 40 + +> assert tree_model_test( + model, node_details, part, target_clk_ns, max_allowed_volume_delta,0,True + ), "characterized TAV does not match RTLsim'd one!" +E AssertionError: characterized TAV does not match RTLsim'd one! +E assert False +E + where False = tree_model_test(, ('ConvolutionInputGenerator', [3, 3], 4, [8, 8], [2, 2], [2, 2], ...), 'xc7z020clg400-1', 4, 40, 0, True) + +test_fpgadataflow_convinputgenerator.py:343: AssertionError +_ test_fpgadataflow_analytical_characterization_slidingwindow[False-1-1-1-2-dilation0-stride0-4-ifm_dim0-k2-idt0] _ + +idt = INT2, k = [1, 5], ifm_dim = [8, 8], ifm_ch = 4, stride = [1, 1] +dilation = [1, 1], simd = 2, dw = 1, parallel_window = 1, m = 1, flip = False + + @pytest.mark.parametrize("idt", [DataType["INT2"]]) + # kernel size + @pytest.mark.parametrize("k", [[2, 2], [3, 3], [1, 5]]) + # input dimension + @pytest.mark.parametrize("ifm_dim", [[8, 8], [1, 21]]) + # input channels + @pytest.mark.parametrize("ifm_ch", [2, 4]) + # Stride + @pytest.mark.parametrize("stride", [[1, 1], [2, 2]]) + # Dilation + @pytest.mark.parametrize("dilation", [[1, 1], [2, 2]]) + # input channel parallelism ("SIMD") + @pytest.mark.parametrize("simd", [1, 2]) + # depthwise + @pytest.mark.parametrize("dw", [0, 1]) + # parallel_window enable 
(MMV_out = M*K) + @pytest.mark.parametrize("parallel_window", [0, 1]) + # in/out MMV ("M") + @pytest.mark.parametrize("m", [1]) + # Flip dimensions + @pytest.mark.parametrize("flip", [False]) + @pytest.mark.fpgadataflow + @pytest.mark.slow + @pytest.mark.vivado + @pytest.mark.node_tree_modeling + def test_fpgadataflow_analytical_characterization_slidingwindow( + idt, + k, + ifm_dim, + ifm_ch, + stride, + dilation, + simd, + dw, + parallel_window, + m, + flip, + ): + if flip: + if ( + ifm_dim[0] == ifm_dim[1] + and k[0] == k[1] + and stride[0] == stride[1] + and dilation[0] == dilation[1] + ): + pytest.skip("Dimension flip would have no effect") + k = k[::-1] + ifm_dim = ifm_dim[::-1] + stride = stride[::-1] + dilation = dilation[::-1] + + k_h, k_w = k + ifm_dim_h, ifm_dim_w = ifm_dim + stride_h, stride_w = stride + dilation_h, dilation_w = dilation + + kernel_width = (k_w - 1) * dilation_w + 1 # incl. dilation + kernel_height = (k_h - 1) * dilation_h + 1 # incl. dilation + + if simd > ifm_ch: + pytest.skip("SIMD cannot be larger than number of input channels") + if ifm_ch % simd != 0: + pytest.skip("SIMD must divide number of input channels") + if kernel_height > ifm_dim_h or stride_h > ifm_dim_h: + pytest.skip("Illegal convolution configuration: kernel or stride > FM dimension") + if kernel_width > ifm_dim_w or stride_w > ifm_dim_w: + pytest.skip("Illegal convolution configuration: kernel or stride > FM dimension") + if (k_h == 1 and dilation_h != 1) or (k_w == 1 and dilation_w != 1): + pytest.skip("Illegal convolution configuration: dilation for unitary kernel dim") + if ((stride_h > k_h) or (stride_w > k_w)) and not (parallel_window or (k_h == 1 and k_w == 1)): + pytest.skip("Not all combinations for stride > k edge case supported in default mode") + if parallel_window and simd != ifm_ch and not (dw or (k_h == 1 and k_w == 1)): + pytest.skip("Parallel window requires SIMD=C for non-depthwise case") + + ofm_dim_h = compute_conv_output_dim(ifm_dim_h, k_h, 
stride_h, 0, dilation_h) + ofm_dim_w = compute_conv_output_dim(ifm_dim_w, k_w, stride_w, 0, dilation_w) + ofm_dim = [ofm_dim_h, ofm_dim_w] + + model = make_single_im2col_modelwrapper(k, ifm_ch, ifm_dim, ofm_dim, stride, dilation, idt, dw) + + + model = model.transform(to_hw.InferConvInpGen()) + + # set simd + inst = getCustomOp(model.graph.node[0]) + inst.set_nodeattr("SIMD", simd) + optype = model.graph.node[0].op_type + if optype == "ConvolutionInputGenerator_rtl": + inst.set_nodeattr("parallel_window", parallel_window) + inst.set_nodeattr("M", m) + if optype == "ConvolutionInputGenerator_hls": + if inst.get_nodeattr("is1D"): + inst.set_nodeattr("parallel_window", parallel_window) + + node_details = ( + "ConvolutionInputGenerator", + k, + ifm_ch, + ifm_dim, + ofm_dim, + stride, + dilation, + idt, + dw, + "hls", + ) + part = "xc7z020clg400-1" + target_clk_ns = 4 + max_allowed_volume_delta = 40 + +> assert tree_model_test( + model, node_details, part, target_clk_ns, max_allowed_volume_delta,0,True + ), "characterized TAV does not match RTLsim'd one!" +E AssertionError: characterized TAV does not match RTLsim'd one! 
+E assert False +E + where False = tree_model_test(, ('ConvolutionInputGenerator', [1, 5], 4, [8, 8], [8, 4], [1, 1], ...), 'xc7z020clg400-1', 4, 40, 0, True) + +test_fpgadataflow_convinputgenerator.py:343: AssertionError +_ test_fpgadataflow_analytical_characterization_slidingwindow[False-1-1-1-2-dilation0-stride0-4-ifm_dim1-k2-idt0] _ + +idt = INT2, k = [1, 5], ifm_dim = [1, 21], ifm_ch = 4, stride = [1, 1] +dilation = [1, 1], simd = 2, dw = 1, parallel_window = 1, m = 1, flip = False + + @pytest.mark.parametrize("idt", [DataType["INT2"]]) + # kernel size + @pytest.mark.parametrize("k", [[2, 2], [3, 3], [1, 5]]) + # input dimension + @pytest.mark.parametrize("ifm_dim", [[8, 8], [1, 21]]) + # input channels + @pytest.mark.parametrize("ifm_ch", [2, 4]) + # Stride + @pytest.mark.parametrize("stride", [[1, 1], [2, 2]]) + # Dilation + @pytest.mark.parametrize("dilation", [[1, 1], [2, 2]]) + # input channel parallelism ("SIMD") + @pytest.mark.parametrize("simd", [1, 2]) + # depthwise + @pytest.mark.parametrize("dw", [0, 1]) + # parallel_window enable (MMV_out = M*K) + @pytest.mark.parametrize("parallel_window", [0, 1]) + # in/out MMV ("M") + @pytest.mark.parametrize("m", [1]) + # Flip dimensions + @pytest.mark.parametrize("flip", [False]) + @pytest.mark.fpgadataflow + @pytest.mark.slow + @pytest.mark.vivado + @pytest.mark.node_tree_modeling + def test_fpgadataflow_analytical_characterization_slidingwindow( + idt, + k, + ifm_dim, + ifm_ch, + stride, + dilation, + simd, + dw, + parallel_window, + m, + flip, + ): + if flip: + if ( + ifm_dim[0] == ifm_dim[1] + and k[0] == k[1] + and stride[0] == stride[1] + and dilation[0] == dilation[1] + ): + pytest.skip("Dimension flip would have no effect") + k = k[::-1] + ifm_dim = ifm_dim[::-1] + stride = stride[::-1] + dilation = dilation[::-1] + + k_h, k_w = k + ifm_dim_h, ifm_dim_w = ifm_dim + stride_h, stride_w = stride + dilation_h, dilation_w = dilation + + kernel_width = (k_w - 1) * dilation_w + 1 # incl. 
dilation + kernel_height = (k_h - 1) * dilation_h + 1 # incl. dilation + + if simd > ifm_ch: + pytest.skip("SIMD cannot be larger than number of input channels") + if ifm_ch % simd != 0: + pytest.skip("SIMD must divide number of input channels") + if kernel_height > ifm_dim_h or stride_h > ifm_dim_h: + pytest.skip("Illegal convolution configuration: kernel or stride > FM dimension") + if kernel_width > ifm_dim_w or stride_w > ifm_dim_w: + pytest.skip("Illegal convolution configuration: kernel or stride > FM dimension") + if (k_h == 1 and dilation_h != 1) or (k_w == 1 and dilation_w != 1): + pytest.skip("Illegal convolution configuration: dilation for unitary kernel dim") + if ((stride_h > k_h) or (stride_w > k_w)) and not (parallel_window or (k_h == 1 and k_w == 1)): + pytest.skip("Not all combinations for stride > k edge case supported in default mode") + if parallel_window and simd != ifm_ch and not (dw or (k_h == 1 and k_w == 1)): + pytest.skip("Parallel window requires SIMD=C for non-depthwise case") + + ofm_dim_h = compute_conv_output_dim(ifm_dim_h, k_h, stride_h, 0, dilation_h) + ofm_dim_w = compute_conv_output_dim(ifm_dim_w, k_w, stride_w, 0, dilation_w) + ofm_dim = [ofm_dim_h, ofm_dim_w] + + model = make_single_im2col_modelwrapper(k, ifm_ch, ifm_dim, ofm_dim, stride, dilation, idt, dw) + + + model = model.transform(to_hw.InferConvInpGen()) + + # set simd + inst = getCustomOp(model.graph.node[0]) + inst.set_nodeattr("SIMD", simd) + optype = model.graph.node[0].op_type + if optype == "ConvolutionInputGenerator_rtl": + inst.set_nodeattr("parallel_window", parallel_window) + inst.set_nodeattr("M", m) + if optype == "ConvolutionInputGenerator_hls": + if inst.get_nodeattr("is1D"): + inst.set_nodeattr("parallel_window", parallel_window) + + node_details = ( + "ConvolutionInputGenerator", + k, + ifm_ch, + ifm_dim, + ofm_dim, + stride, + dilation, + idt, + dw, + "hls", + ) + part = "xc7z020clg400-1" + target_clk_ns = 4 + max_allowed_volume_delta = 40 + +> assert 
tree_model_test( + model, node_details, part, target_clk_ns, max_allowed_volume_delta,0,True + ), "characterized TAV does not match RTLsim'd one!" +E AssertionError: characterized TAV does not match RTLsim'd one! +E assert False +E + where False = tree_model_test(, ('ConvolutionInputGenerator', [1, 5], 4, [1, 21], [1, 17], [1, 1], ...), 'xc7z020clg400-1', 4, 40, 0, True) + +test_fpgadataflow_convinputgenerator.py:343: AssertionError +_ test_fpgadataflow_analytical_characterization_slidingwindow[False-1-1-1-2-dilation0-stride1-2-ifm_dim0-k0-idt0] _ + +idt = INT2, k = [2, 2], ifm_dim = [8, 8], ifm_ch = 2, stride = [2, 2] +dilation = [1, 1], simd = 2, dw = 1, parallel_window = 1, m = 1, flip = False + + @pytest.mark.parametrize("idt", [DataType["INT2"]]) + # kernel size + @pytest.mark.parametrize("k", [[2, 2], [3, 3], [1, 5]]) + # input dimension + @pytest.mark.parametrize("ifm_dim", [[8, 8], [1, 21]]) + # input channels + @pytest.mark.parametrize("ifm_ch", [2, 4]) + # Stride + @pytest.mark.parametrize("stride", [[1, 1], [2, 2]]) + # Dilation + @pytest.mark.parametrize("dilation", [[1, 1], [2, 2]]) + # input channel parallelism ("SIMD") + @pytest.mark.parametrize("simd", [1, 2]) + # depthwise + @pytest.mark.parametrize("dw", [0, 1]) + # parallel_window enable (MMV_out = M*K) + @pytest.mark.parametrize("parallel_window", [0, 1]) + # in/out MMV ("M") + @pytest.mark.parametrize("m", [1]) + # Flip dimensions + @pytest.mark.parametrize("flip", [False]) + @pytest.mark.fpgadataflow + @pytest.mark.slow + @pytest.mark.vivado + @pytest.mark.node_tree_modeling + def test_fpgadataflow_analytical_characterization_slidingwindow( + idt, + k, + ifm_dim, + ifm_ch, + stride, + dilation, + simd, + dw, + parallel_window, + m, + flip, + ): + if flip: + if ( + ifm_dim[0] == ifm_dim[1] + and k[0] == k[1] + and stride[0] == stride[1] + and dilation[0] == dilation[1] + ): + pytest.skip("Dimension flip would have no effect") + k = k[::-1] + ifm_dim = ifm_dim[::-1] + stride = stride[::-1] + 
dilation = dilation[::-1] + + k_h, k_w = k + ifm_dim_h, ifm_dim_w = ifm_dim + stride_h, stride_w = stride + dilation_h, dilation_w = dilation + + kernel_width = (k_w - 1) * dilation_w + 1 # incl. dilation + kernel_height = (k_h - 1) * dilation_h + 1 # incl. dilation + + if simd > ifm_ch: + pytest.skip("SIMD cannot be larger than number of input channels") + if ifm_ch % simd != 0: + pytest.skip("SIMD must divide number of input channels") + if kernel_height > ifm_dim_h or stride_h > ifm_dim_h: + pytest.skip("Illegal convolution configuration: kernel or stride > FM dimension") + if kernel_width > ifm_dim_w or stride_w > ifm_dim_w: + pytest.skip("Illegal convolution configuration: kernel or stride > FM dimension") + if (k_h == 1 and dilation_h != 1) or (k_w == 1 and dilation_w != 1): + pytest.skip("Illegal convolution configuration: dilation for unitary kernel dim") + if ((stride_h > k_h) or (stride_w > k_w)) and not (parallel_window or (k_h == 1 and k_w == 1)): + pytest.skip("Not all combinations for stride > k edge case supported in default mode") + if parallel_window and simd != ifm_ch and not (dw or (k_h == 1 and k_w == 1)): + pytest.skip("Parallel window requires SIMD=C for non-depthwise case") + + ofm_dim_h = compute_conv_output_dim(ifm_dim_h, k_h, stride_h, 0, dilation_h) + ofm_dim_w = compute_conv_output_dim(ifm_dim_w, k_w, stride_w, 0, dilation_w) + ofm_dim = [ofm_dim_h, ofm_dim_w] + + model = make_single_im2col_modelwrapper(k, ifm_ch, ifm_dim, ofm_dim, stride, dilation, idt, dw) + + + model = model.transform(to_hw.InferConvInpGen()) + + # set simd + inst = getCustomOp(model.graph.node[0]) + inst.set_nodeattr("SIMD", simd) + optype = model.graph.node[0].op_type + if optype == "ConvolutionInputGenerator_rtl": + inst.set_nodeattr("parallel_window", parallel_window) + inst.set_nodeattr("M", m) + if optype == "ConvolutionInputGenerator_hls": + if inst.get_nodeattr("is1D"): + inst.set_nodeattr("parallel_window", parallel_window) + + node_details = ( + 
"ConvolutionInputGenerator", + k, + ifm_ch, + ifm_dim, + ofm_dim, + stride, + dilation, + idt, + dw, + "hls", + ) + part = "xc7z020clg400-1" + target_clk_ns = 4 + max_allowed_volume_delta = 40 + +> assert tree_model_test( + model, node_details, part, target_clk_ns, max_allowed_volume_delta,0,True + ), "characterized TAV does not match RTLsim'd one!" +E AssertionError: characterized TAV does not match RTLsim'd one! +E assert False +E + where False = tree_model_test(, ('ConvolutionInputGenerator', [2, 2], 2, [8, 8], [4, 4], [2, 2], ...), 'xc7z020clg400-1', 4, 40, 0, True) + +test_fpgadataflow_convinputgenerator.py:343: AssertionError +_ test_fpgadataflow_analytical_characterization_slidingwindow[False-1-1-1-2-dilation0-stride1-2-ifm_dim0-k2-idt0] _ + +idt = INT2, k = [1, 5], ifm_dim = [8, 8], ifm_ch = 2, stride = [2, 2] +dilation = [1, 1], simd = 2, dw = 1, parallel_window = 1, m = 1, flip = False + + @pytest.mark.parametrize("idt", [DataType["INT2"]]) + # kernel size + @pytest.mark.parametrize("k", [[2, 2], [3, 3], [1, 5]]) + # input dimension + @pytest.mark.parametrize("ifm_dim", [[8, 8], [1, 21]]) + # input channels + @pytest.mark.parametrize("ifm_ch", [2, 4]) + # Stride + @pytest.mark.parametrize("stride", [[1, 1], [2, 2]]) + # Dilation + @pytest.mark.parametrize("dilation", [[1, 1], [2, 2]]) + # input channel parallelism ("SIMD") + @pytest.mark.parametrize("simd", [1, 2]) + # depthwise + @pytest.mark.parametrize("dw", [0, 1]) + # parallel_window enable (MMV_out = M*K) + @pytest.mark.parametrize("parallel_window", [0, 1]) + # in/out MMV ("M") + @pytest.mark.parametrize("m", [1]) + # Flip dimensions + @pytest.mark.parametrize("flip", [False]) + @pytest.mark.fpgadataflow + @pytest.mark.slow + @pytest.mark.vivado + @pytest.mark.node_tree_modeling + def test_fpgadataflow_analytical_characterization_slidingwindow( + idt, + k, + ifm_dim, + ifm_ch, + stride, + dilation, + simd, + dw, + parallel_window, + m, + flip, + ): + if flip: + if ( + ifm_dim[0] == ifm_dim[1] + and 
k[0] == k[1] + and stride[0] == stride[1] + and dilation[0] == dilation[1] + ): + pytest.skip("Dimension flip would have no effect") + k = k[::-1] + ifm_dim = ifm_dim[::-1] + stride = stride[::-1] + dilation = dilation[::-1] + + k_h, k_w = k + ifm_dim_h, ifm_dim_w = ifm_dim + stride_h, stride_w = stride + dilation_h, dilation_w = dilation + + kernel_width = (k_w - 1) * dilation_w + 1 # incl. dilation + kernel_height = (k_h - 1) * dilation_h + 1 # incl. dilation + + if simd > ifm_ch: + pytest.skip("SIMD cannot be larger than number of input channels") + if ifm_ch % simd != 0: + pytest.skip("SIMD must divide number of input channels") + if kernel_height > ifm_dim_h or stride_h > ifm_dim_h: + pytest.skip("Illegal convolution configuration: kernel or stride > FM dimension") + if kernel_width > ifm_dim_w or stride_w > ifm_dim_w: + pytest.skip("Illegal convolution configuration: kernel or stride > FM dimension") + if (k_h == 1 and dilation_h != 1) or (k_w == 1 and dilation_w != 1): + pytest.skip("Illegal convolution configuration: dilation for unitary kernel dim") + if ((stride_h > k_h) or (stride_w > k_w)) and not (parallel_window or (k_h == 1 and k_w == 1)): + pytest.skip("Not all combinations for stride > k edge case supported in default mode") + if parallel_window and simd != ifm_ch and not (dw or (k_h == 1 and k_w == 1)): + pytest.skip("Parallel window requires SIMD=C for non-depthwise case") + + ofm_dim_h = compute_conv_output_dim(ifm_dim_h, k_h, stride_h, 0, dilation_h) + ofm_dim_w = compute_conv_output_dim(ifm_dim_w, k_w, stride_w, 0, dilation_w) + ofm_dim = [ofm_dim_h, ofm_dim_w] + + model = make_single_im2col_modelwrapper(k, ifm_ch, ifm_dim, ofm_dim, stride, dilation, idt, dw) + + + model = model.transform(to_hw.InferConvInpGen()) + + # set simd + inst = getCustomOp(model.graph.node[0]) + inst.set_nodeattr("SIMD", simd) + optype = model.graph.node[0].op_type + if optype == "ConvolutionInputGenerator_rtl": + inst.set_nodeattr("parallel_window", parallel_window) 
+ inst.set_nodeattr("M", m) + if optype == "ConvolutionInputGenerator_hls": + if inst.get_nodeattr("is1D"): + inst.set_nodeattr("parallel_window", parallel_window) + + node_details = ( + "ConvolutionInputGenerator", + k, + ifm_ch, + ifm_dim, + ofm_dim, + stride, + dilation, + idt, + dw, + "hls", + ) + part = "xc7z020clg400-1" + target_clk_ns = 4 + max_allowed_volume_delta = 40 + +> assert tree_model_test( + model, node_details, part, target_clk_ns, max_allowed_volume_delta,0,True + ), "characterized TAV does not match RTLsim'd one!" + +test_fpgadataflow_convinputgenerator.py:343: +_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ +../../src/finn/util/test.py:423: in tree_model_test + node_rtlsim = get_characteristic_fnc( +../../src/finn/util/test.py:264: in get_characteristic_fnc + _codegen_single_node(node, model, part, target_clk_ns) +../../src/finn/transformation/fpgadataflow/prepare_ip.py:54: in _codegen_single_node + inst.code_generation_ipgen(model, fpgapart, clk) +../../src/finn/custom_op/fpgadataflow/rtlbackend.py:88: in code_generation_ipgen + self.generate_hdl(model, fpgapart, clk) +../../src/finn/custom_op/fpgadataflow/rtl/convolutioninputgenerator_rtl.py:798: in generate_hdl + template_path, code_gen_dict = self.prepare_codegen_default() +_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ + +self = + + def prepare_codegen_default(self): + """Fills code generation dict for the default implementation style by computing + the incremental addressing scheme for the circular buffer.""" + if self.get_nodeattr("dynamic_mode"): + template_select = "/finn-rtllib/swg/swg_template_default_dynamic.sv" + else: + template_select = "/finn-rtllib/swg/swg_template_default.sv" + template_path = os.environ["FINN_ROOT"] + template_select + code_gen_dict = {} + + ifm_ch = self.get_nodeattr("IFMChannels") + k = self.get_nodeattr("ConvKernelDim") + ifm_dim = self.get_nodeattr("IFMDim") + stride = 
self.get_nodeattr("Stride") + dilation = self.get_nodeattr("Dilation") + depthwise = self.get_nodeattr("depthwise") + simd = self.get_nodeattr("SIMD") + + k_h, k_w = k + h, w = ifm_dim + pad = [0, 0, 0, 0] # padding happens in separate padding node for now + stride_h, stride_w = stride + dilation_h, dilation_w = dilation + pad_h = pad[0] + pad[2] + pad_w = pad[1] + pad[3] + out_dim_h = im2col.compute_conv_output_dim(h, k_h, stride_h, pad_h, dilation_h) + out_dim_w = im2col.compute_conv_output_dim(w, k_w, stride_w, pad_w, dilation_w) + mmv_in = 1 + mmv_out = 1 + channel_factor = int(ifm_ch / simd) + + # compute minimal buffer length (assuming it holds 1 complete window) + buffer_min_size = ((k_h - 1) * dilation_h * w + (k_w - 1) * dilation_w + 1) * channel_factor + + buffer_actual_size = self.get_buffer_depth() + code_gen_dict["$BUF_ELEM_TOTAL$"] = [str(buffer_actual_size)] + + # compute some intermediate values, e.g., kernel "width" = k_w incl. dilation + # or cols/rows that are skipped due to imperfect stride<->dim combination + kernel_width = (k_w - 1) * dilation_w + 1 + kernel_height = (k_h - 1) * dilation_h + 1 + skip_columns = w % (kernel_width + (out_dim_w - 1) * stride_w) + skip_rows = h % (kernel_height + (out_dim_h - 1) * stride_h) + + # compute address increment values for 5-loop nest + addr_incr_end_simd = 1 + addr_incr_end_window_elem = (dilation_w - 1) * channel_factor + 1 + addr_incr_end_window_row = ( + ((w - kernel_width) * channel_factor) # remaining line + + ((dilation_h - 1) * w * channel_factor) # skip lines + + 1 # wrap-around of minimally sized buffer + ) + addr_incr_end_window = -buffer_min_size + stride_w * channel_factor + 1 + addr_incr_end_row = ( + -buffer_min_size + + ((skip_columns + kernel_width) * channel_factor) # remaining line + + ((stride_h - 1) * w * channel_factor) # skip lines + + 1 + ) + + # re-use same controller structure -> re-assign address increments + if depthwise: + addr_incr_end_window_elem = dilation_w * 
channel_factor + addr_incr_end_window_row = ( + channel_factor + + (w - kernel_width) * channel_factor + + (dilation_h - 1) * w * channel_factor + ) + addr_incr_end_simd = -buffer_min_size + (channel_factor + 1) + + # sanity check for wrap logic + assert not ( + abs(addr_incr_end_window) > buffer_actual_size + ), "ERROR: W increment > buffer size, try setting parallel_window=1" + assert not ( +> abs(addr_incr_end_row) > buffer_actual_size + ), "ERROR: H increment > buffer size, try setting parallel_window=1" +E AssertionError: ERROR: H increment > buffer size, try setting parallel_window=1 + +../../src/finn/custom_op/fpgadataflow/rtl/convolutioninputgenerator_rtl.py:378: AssertionError +_ test_fpgadataflow_analytical_characterization_slidingwindow[False-1-1-1-2-dilation0-stride1-4-ifm_dim0-k0-idt0] _ + +idt = INT2, k = [2, 2], ifm_dim = [8, 8], ifm_ch = 4, stride = [2, 2] +dilation = [1, 1], simd = 2, dw = 1, parallel_window = 1, m = 1, flip = False + + @pytest.mark.parametrize("idt", [DataType["INT2"]]) + # kernel size + @pytest.mark.parametrize("k", [[2, 2], [3, 3], [1, 5]]) + # input dimension + @pytest.mark.parametrize("ifm_dim", [[8, 8], [1, 21]]) + # input channels + @pytest.mark.parametrize("ifm_ch", [2, 4]) + # Stride + @pytest.mark.parametrize("stride", [[1, 1], [2, 2]]) + # Dilation + @pytest.mark.parametrize("dilation", [[1, 1], [2, 2]]) + # input channel parallelism ("SIMD") + @pytest.mark.parametrize("simd", [1, 2]) + # depthwise + @pytest.mark.parametrize("dw", [0, 1]) + # parallel_window enable (MMV_out = M*K) + @pytest.mark.parametrize("parallel_window", [0, 1]) + # in/out MMV ("M") + @pytest.mark.parametrize("m", [1]) + # Flip dimensions + @pytest.mark.parametrize("flip", [False]) + @pytest.mark.fpgadataflow + @pytest.mark.slow + @pytest.mark.vivado + @pytest.mark.node_tree_modeling + def test_fpgadataflow_analytical_characterization_slidingwindow( + idt, + k, + ifm_dim, + ifm_ch, + stride, + dilation, + simd, + dw, + parallel_window, + m, + flip, 
+ ): + if flip: + if ( + ifm_dim[0] == ifm_dim[1] + and k[0] == k[1] + and stride[0] == stride[1] + and dilation[0] == dilation[1] + ): + pytest.skip("Dimension flip would have no effect") + k = k[::-1] + ifm_dim = ifm_dim[::-1] + stride = stride[::-1] + dilation = dilation[::-1] + + k_h, k_w = k + ifm_dim_h, ifm_dim_w = ifm_dim + stride_h, stride_w = stride + dilation_h, dilation_w = dilation + + kernel_width = (k_w - 1) * dilation_w + 1 # incl. dilation + kernel_height = (k_h - 1) * dilation_h + 1 # incl. dilation + + if simd > ifm_ch: + pytest.skip("SIMD cannot be larger than number of input channels") + if ifm_ch % simd != 0: + pytest.skip("SIMD must divide number of input channels") + if kernel_height > ifm_dim_h or stride_h > ifm_dim_h: + pytest.skip("Illegal convolution configuration: kernel or stride > FM dimension") + if kernel_width > ifm_dim_w or stride_w > ifm_dim_w: + pytest.skip("Illegal convolution configuration: kernel or stride > FM dimension") + if (k_h == 1 and dilation_h != 1) or (k_w == 1 and dilation_w != 1): + pytest.skip("Illegal convolution configuration: dilation for unitary kernel dim") + if ((stride_h > k_h) or (stride_w > k_w)) and not (parallel_window or (k_h == 1 and k_w == 1)): + pytest.skip("Not all combinations for stride > k edge case supported in default mode") + if parallel_window and simd != ifm_ch and not (dw or (k_h == 1 and k_w == 1)): + pytest.skip("Parallel window requires SIMD=C for non-depthwise case") + + ofm_dim_h = compute_conv_output_dim(ifm_dim_h, k_h, stride_h, 0, dilation_h) + ofm_dim_w = compute_conv_output_dim(ifm_dim_w, k_w, stride_w, 0, dilation_w) + ofm_dim = [ofm_dim_h, ofm_dim_w] + + model = make_single_im2col_modelwrapper(k, ifm_ch, ifm_dim, ofm_dim, stride, dilation, idt, dw) + + + model = model.transform(to_hw.InferConvInpGen()) + + # set simd + inst = getCustomOp(model.graph.node[0]) + inst.set_nodeattr("SIMD", simd) + optype = model.graph.node[0].op_type + if optype == "ConvolutionInputGenerator_rtl": 
+ inst.set_nodeattr("parallel_window", parallel_window) + inst.set_nodeattr("M", m) + if optype == "ConvolutionInputGenerator_hls": + if inst.get_nodeattr("is1D"): + inst.set_nodeattr("parallel_window", parallel_window) + + node_details = ( + "ConvolutionInputGenerator", + k, + ifm_ch, + ifm_dim, + ofm_dim, + stride, + dilation, + idt, + dw, + "hls", + ) + part = "xc7z020clg400-1" + target_clk_ns = 4 + max_allowed_volume_delta = 40 + +> assert tree_model_test( + model, node_details, part, target_clk_ns, max_allowed_volume_delta,0,True + ), "characterized TAV does not match RTLsim'd one!" +E AssertionError: characterized TAV does not match RTLsim'd one! +E assert False +E + where False = tree_model_test(, ('ConvolutionInputGenerator', [2, 2], 4, [8, 8], [4, 4], [2, 2], ...), 'xc7z020clg400-1', 4, 40, 0, True) + +test_fpgadataflow_convinputgenerator.py:343: AssertionError +_ test_fpgadataflow_analytical_characterization_slidingwindow[False-1-1-1-2-dilation0-stride1-4-ifm_dim0-k1-idt0] _ + +idt = INT2, k = [3, 3], ifm_dim = [8, 8], ifm_ch = 4, stride = [2, 2] +dilation = [1, 1], simd = 2, dw = 1, parallel_window = 1, m = 1, flip = False + + @pytest.mark.parametrize("idt", [DataType["INT2"]]) + # kernel size + @pytest.mark.parametrize("k", [[2, 2], [3, 3], [1, 5]]) + # input dimension + @pytest.mark.parametrize("ifm_dim", [[8, 8], [1, 21]]) + # input channels + @pytest.mark.parametrize("ifm_ch", [2, 4]) + # Stride + @pytest.mark.parametrize("stride", [[1, 1], [2, 2]]) + # Dilation + @pytest.mark.parametrize("dilation", [[1, 1], [2, 2]]) + # input channel parallelism ("SIMD") + @pytest.mark.parametrize("simd", [1, 2]) + # depthwise + @pytest.mark.parametrize("dw", [0, 1]) + # parallel_window enable (MMV_out = M*K) + @pytest.mark.parametrize("parallel_window", [0, 1]) + # in/out MMV ("M") + @pytest.mark.parametrize("m", [1]) + # Flip dimensions + @pytest.mark.parametrize("flip", [False]) + @pytest.mark.fpgadataflow + @pytest.mark.slow + @pytest.mark.vivado + 
@pytest.mark.node_tree_modeling + def test_fpgadataflow_analytical_characterization_slidingwindow( + idt, + k, + ifm_dim, + ifm_ch, + stride, + dilation, + simd, + dw, + parallel_window, + m, + flip, + ): + if flip: + if ( + ifm_dim[0] == ifm_dim[1] + and k[0] == k[1] + and stride[0] == stride[1] + and dilation[0] == dilation[1] + ): + pytest.skip("Dimension flip would have no effect") + k = k[::-1] + ifm_dim = ifm_dim[::-1] + stride = stride[::-1] + dilation = dilation[::-1] + + k_h, k_w = k + ifm_dim_h, ifm_dim_w = ifm_dim + stride_h, stride_w = stride + dilation_h, dilation_w = dilation + + kernel_width = (k_w - 1) * dilation_w + 1 # incl. dilation + kernel_height = (k_h - 1) * dilation_h + 1 # incl. dilation + + if simd > ifm_ch: + pytest.skip("SIMD cannot be larger than number of input channels") + if ifm_ch % simd != 0: + pytest.skip("SIMD must divide number of input channels") + if kernel_height > ifm_dim_h or stride_h > ifm_dim_h: + pytest.skip("Illegal convolution configuration: kernel or stride > FM dimension") + if kernel_width > ifm_dim_w or stride_w > ifm_dim_w: + pytest.skip("Illegal convolution configuration: kernel or stride > FM dimension") + if (k_h == 1 and dilation_h != 1) or (k_w == 1 and dilation_w != 1): + pytest.skip("Illegal convolution configuration: dilation for unitary kernel dim") + if ((stride_h > k_h) or (stride_w > k_w)) and not (parallel_window or (k_h == 1 and k_w == 1)): + pytest.skip("Not all combinations for stride > k edge case supported in default mode") + if parallel_window and simd != ifm_ch and not (dw or (k_h == 1 and k_w == 1)): + pytest.skip("Parallel window requires SIMD=C for non-depthwise case") + + ofm_dim_h = compute_conv_output_dim(ifm_dim_h, k_h, stride_h, 0, dilation_h) + ofm_dim_w = compute_conv_output_dim(ifm_dim_w, k_w, stride_w, 0, dilation_w) + ofm_dim = [ofm_dim_h, ofm_dim_w] + + model = make_single_im2col_modelwrapper(k, ifm_ch, ifm_dim, ofm_dim, stride, dilation, idt, dw) + + + model = 
model.transform(to_hw.InferConvInpGen()) + + # set simd + inst = getCustomOp(model.graph.node[0]) + inst.set_nodeattr("SIMD", simd) + optype = model.graph.node[0].op_type + if optype == "ConvolutionInputGenerator_rtl": + inst.set_nodeattr("parallel_window", parallel_window) + inst.set_nodeattr("M", m) + if optype == "ConvolutionInputGenerator_hls": + if inst.get_nodeattr("is1D"): + inst.set_nodeattr("parallel_window", parallel_window) + + node_details = ( + "ConvolutionInputGenerator", + k, + ifm_ch, + ifm_dim, + ofm_dim, + stride, + dilation, + idt, + dw, + "hls", + ) + part = "xc7z020clg400-1" + target_clk_ns = 4 + max_allowed_volume_delta = 40 + +> assert tree_model_test( + model, node_details, part, target_clk_ns, max_allowed_volume_delta,0,True + ), "characterized TAV does not match RTLsim'd one!" +E AssertionError: characterized TAV does not match RTLsim'd one! +E assert False +E + where False = tree_model_test(, ('ConvolutionInputGenerator', [3, 3], 4, [8, 8], [3, 3], [2, 2], ...), 'xc7z020clg400-1', 4, 40, 0, True) + +test_fpgadataflow_convinputgenerator.py:343: AssertionError +_ test_fpgadataflow_analytical_characterization_slidingwindow[False-1-1-1-2-dilation0-stride1-4-ifm_dim0-k2-idt0] _ + +idt = INT2, k = [1, 5], ifm_dim = [8, 8], ifm_ch = 4, stride = [2, 2] +dilation = [1, 1], simd = 2, dw = 1, parallel_window = 1, m = 1, flip = False + + @pytest.mark.parametrize("idt", [DataType["INT2"]]) + # kernel size + @pytest.mark.parametrize("k", [[2, 2], [3, 3], [1, 5]]) + # input dimension + @pytest.mark.parametrize("ifm_dim", [[8, 8], [1, 21]]) + # input channels + @pytest.mark.parametrize("ifm_ch", [2, 4]) + # Stride + @pytest.mark.parametrize("stride", [[1, 1], [2, 2]]) + # Dilation + @pytest.mark.parametrize("dilation", [[1, 1], [2, 2]]) + # input channel parallelism ("SIMD") + @pytest.mark.parametrize("simd", [1, 2]) + # depthwise + @pytest.mark.parametrize("dw", [0, 1]) + # parallel_window enable (MMV_out = M*K) + 
@pytest.mark.parametrize("parallel_window", [0, 1]) + # in/out MMV ("M") + @pytest.mark.parametrize("m", [1]) + # Flip dimensions + @pytest.mark.parametrize("flip", [False]) + @pytest.mark.fpgadataflow + @pytest.mark.slow + @pytest.mark.vivado + @pytest.mark.node_tree_modeling + def test_fpgadataflow_analytical_characterization_slidingwindow( + idt, + k, + ifm_dim, + ifm_ch, + stride, + dilation, + simd, + dw, + parallel_window, + m, + flip, + ): + if flip: + if ( + ifm_dim[0] == ifm_dim[1] + and k[0] == k[1] + and stride[0] == stride[1] + and dilation[0] == dilation[1] + ): + pytest.skip("Dimension flip would have no effect") + k = k[::-1] + ifm_dim = ifm_dim[::-1] + stride = stride[::-1] + dilation = dilation[::-1] + + k_h, k_w = k + ifm_dim_h, ifm_dim_w = ifm_dim + stride_h, stride_w = stride + dilation_h, dilation_w = dilation + + kernel_width = (k_w - 1) * dilation_w + 1 # incl. dilation + kernel_height = (k_h - 1) * dilation_h + 1 # incl. dilation + + if simd > ifm_ch: + pytest.skip("SIMD cannot be larger than number of input channels") + if ifm_ch % simd != 0: + pytest.skip("SIMD must divide number of input channels") + if kernel_height > ifm_dim_h or stride_h > ifm_dim_h: + pytest.skip("Illegal convolution configuration: kernel or stride > FM dimension") + if kernel_width > ifm_dim_w or stride_w > ifm_dim_w: + pytest.skip("Illegal convolution configuration: kernel or stride > FM dimension") + if (k_h == 1 and dilation_h != 1) or (k_w == 1 and dilation_w != 1): + pytest.skip("Illegal convolution configuration: dilation for unitary kernel dim") + if ((stride_h > k_h) or (stride_w > k_w)) and not (parallel_window or (k_h == 1 and k_w == 1)): + pytest.skip("Not all combinations for stride > k edge case supported in default mode") + if parallel_window and simd != ifm_ch and not (dw or (k_h == 1 and k_w == 1)): + pytest.skip("Parallel window requires SIMD=C for non-depthwise case") + + ofm_dim_h = compute_conv_output_dim(ifm_dim_h, k_h, stride_h, 0, dilation_h) + 
ofm_dim_w = compute_conv_output_dim(ifm_dim_w, k_w, stride_w, 0, dilation_w) + ofm_dim = [ofm_dim_h, ofm_dim_w] + + model = make_single_im2col_modelwrapper(k, ifm_ch, ifm_dim, ofm_dim, stride, dilation, idt, dw) + + + model = model.transform(to_hw.InferConvInpGen()) + + # set simd + inst = getCustomOp(model.graph.node[0]) + inst.set_nodeattr("SIMD", simd) + optype = model.graph.node[0].op_type + if optype == "ConvolutionInputGenerator_rtl": + inst.set_nodeattr("parallel_window", parallel_window) + inst.set_nodeattr("M", m) + if optype == "ConvolutionInputGenerator_hls": + if inst.get_nodeattr("is1D"): + inst.set_nodeattr("parallel_window", parallel_window) + + node_details = ( + "ConvolutionInputGenerator", + k, + ifm_ch, + ifm_dim, + ofm_dim, + stride, + dilation, + idt, + dw, + "hls", + ) + part = "xc7z020clg400-1" + target_clk_ns = 4 + max_allowed_volume_delta = 40 + +> assert tree_model_test( + model, node_details, part, target_clk_ns, max_allowed_volume_delta,0,True + ), "characterized TAV does not match RTLsim'd one!" 
+ +test_fpgadataflow_convinputgenerator.py:343: +_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ +../../src/finn/util/test.py:423: in tree_model_test + node_rtlsim = get_characteristic_fnc( +../../src/finn/util/test.py:264: in get_characteristic_fnc + _codegen_single_node(node, model, part, target_clk_ns) +../../src/finn/transformation/fpgadataflow/prepare_ip.py:54: in _codegen_single_node + inst.code_generation_ipgen(model, fpgapart, clk) +../../src/finn/custom_op/fpgadataflow/rtlbackend.py:88: in code_generation_ipgen + self.generate_hdl(model, fpgapart, clk) +../../src/finn/custom_op/fpgadataflow/rtl/convolutioninputgenerator_rtl.py:798: in generate_hdl + template_path, code_gen_dict = self.prepare_codegen_default() +_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ + +self = + + def prepare_codegen_default(self): + """Fills code generation dict for the default implementation style by computing + the incremental addressing scheme for the circular buffer.""" + if self.get_nodeattr("dynamic_mode"): + template_select = "/finn-rtllib/swg/swg_template_default_dynamic.sv" + else: + template_select = "/finn-rtllib/swg/swg_template_default.sv" + template_path = os.environ["FINN_ROOT"] + template_select + code_gen_dict = {} + + ifm_ch = self.get_nodeattr("IFMChannels") + k = self.get_nodeattr("ConvKernelDim") + ifm_dim = self.get_nodeattr("IFMDim") + stride = self.get_nodeattr("Stride") + dilation = self.get_nodeattr("Dilation") + depthwise = self.get_nodeattr("depthwise") + simd = self.get_nodeattr("SIMD") + + k_h, k_w = k + h, w = ifm_dim + pad = [0, 0, 0, 0] # padding happens in separate padding node for now + stride_h, stride_w = stride + dilation_h, dilation_w = dilation + pad_h = pad[0] + pad[2] + pad_w = pad[1] + pad[3] + out_dim_h = im2col.compute_conv_output_dim(h, k_h, stride_h, pad_h, dilation_h) + out_dim_w = im2col.compute_conv_output_dim(w, k_w, stride_w, pad_w, dilation_w) + mmv_in = 1 + mmv_out 
= 1 + channel_factor = int(ifm_ch / simd) + + # compute minimal buffer length (assuming it holds 1 complete window) + buffer_min_size = ((k_h - 1) * dilation_h * w + (k_w - 1) * dilation_w + 1) * channel_factor + + buffer_actual_size = self.get_buffer_depth() + code_gen_dict["$BUF_ELEM_TOTAL$"] = [str(buffer_actual_size)] + + # compute some intermediate values, e.g., kernel "width" = k_w incl. dilation + # or cols/rows that are skipped due to imperfect stride<->dim combination + kernel_width = (k_w - 1) * dilation_w + 1 + kernel_height = (k_h - 1) * dilation_h + 1 + skip_columns = w % (kernel_width + (out_dim_w - 1) * stride_w) + skip_rows = h % (kernel_height + (out_dim_h - 1) * stride_h) + + # compute address increment values for 5-loop nest + addr_incr_end_simd = 1 + addr_incr_end_window_elem = (dilation_w - 1) * channel_factor + 1 + addr_incr_end_window_row = ( + ((w - kernel_width) * channel_factor) # remaining line + + ((dilation_h - 1) * w * channel_factor) # skip lines + + 1 # wrap-around of minimally sized buffer + ) + addr_incr_end_window = -buffer_min_size + stride_w * channel_factor + 1 + addr_incr_end_row = ( + -buffer_min_size + + ((skip_columns + kernel_width) * channel_factor) # remaining line + + ((stride_h - 1) * w * channel_factor) # skip lines + + 1 + ) + + # re-use same controller structure -> re-assign address increments + if depthwise: + addr_incr_end_window_elem = dilation_w * channel_factor + addr_incr_end_window_row = ( + channel_factor + + (w - kernel_width) * channel_factor + + (dilation_h - 1) * w * channel_factor + ) + addr_incr_end_simd = -buffer_min_size + (channel_factor + 1) + + # sanity check for wrap logic + assert not ( + abs(addr_incr_end_window) > buffer_actual_size + ), "ERROR: W increment > buffer size, try setting parallel_window=1" + assert not ( +> abs(addr_incr_end_row) > buffer_actual_size + ), "ERROR: H increment > buffer size, try setting parallel_window=1" +E AssertionError: ERROR: H increment > buffer size, try 
setting parallel_window=1 + +../../src/finn/custom_op/fpgadataflow/rtl/convolutioninputgenerator_rtl.py:378: AssertionError +_ test_fpgadataflow_analytical_characterization_slidingwindow[False-1-1-1-2-dilation1-stride0-2-ifm_dim0-k1-idt0] _ + +idt = INT2, k = [3, 3], ifm_dim = [8, 8], ifm_ch = 2, stride = [1, 1] +dilation = [2, 2], simd = 2, dw = 1, parallel_window = 1, m = 1, flip = False + + @pytest.mark.parametrize("idt", [DataType["INT2"]]) + # kernel size + @pytest.mark.parametrize("k", [[2, 2], [3, 3], [1, 5]]) + # input dimension + @pytest.mark.parametrize("ifm_dim", [[8, 8], [1, 21]]) + # input channels + @pytest.mark.parametrize("ifm_ch", [2, 4]) + # Stride + @pytest.mark.parametrize("stride", [[1, 1], [2, 2]]) + # Dilation + @pytest.mark.parametrize("dilation", [[1, 1], [2, 2]]) + # input channel parallelism ("SIMD") + @pytest.mark.parametrize("simd", [1, 2]) + # depthwise + @pytest.mark.parametrize("dw", [0, 1]) + # parallel_window enable (MMV_out = M*K) + @pytest.mark.parametrize("parallel_window", [0, 1]) + # in/out MMV ("M") + @pytest.mark.parametrize("m", [1]) + # Flip dimensions + @pytest.mark.parametrize("flip", [False]) + @pytest.mark.fpgadataflow + @pytest.mark.slow + @pytest.mark.vivado + @pytest.mark.node_tree_modeling + def test_fpgadataflow_analytical_characterization_slidingwindow( + idt, + k, + ifm_dim, + ifm_ch, + stride, + dilation, + simd, + dw, + parallel_window, + m, + flip, + ): + if flip: + if ( + ifm_dim[0] == ifm_dim[1] + and k[0] == k[1] + and stride[0] == stride[1] + and dilation[0] == dilation[1] + ): + pytest.skip("Dimension flip would have no effect") + k = k[::-1] + ifm_dim = ifm_dim[::-1] + stride = stride[::-1] + dilation = dilation[::-1] + + k_h, k_w = k + ifm_dim_h, ifm_dim_w = ifm_dim + stride_h, stride_w = stride + dilation_h, dilation_w = dilation + + kernel_width = (k_w - 1) * dilation_w + 1 # incl. dilation + kernel_height = (k_h - 1) * dilation_h + 1 # incl. 
dilation + + if simd > ifm_ch: + pytest.skip("SIMD cannot be larger than number of input channels") + if ifm_ch % simd != 0: + pytest.skip("SIMD must divide number of input channels") + if kernel_height > ifm_dim_h or stride_h > ifm_dim_h: + pytest.skip("Illegal convolution configuration: kernel or stride > FM dimension") + if kernel_width > ifm_dim_w or stride_w > ifm_dim_w: + pytest.skip("Illegal convolution configuration: kernel or stride > FM dimension") + if (k_h == 1 and dilation_h != 1) or (k_w == 1 and dilation_w != 1): + pytest.skip("Illegal convolution configuration: dilation for unitary kernel dim") + if ((stride_h > k_h) or (stride_w > k_w)) and not (parallel_window or (k_h == 1 and k_w == 1)): + pytest.skip("Not all combinations for stride > k edge case supported in default mode") + if parallel_window and simd != ifm_ch and not (dw or (k_h == 1 and k_w == 1)): + pytest.skip("Parallel window requires SIMD=C for non-depthwise case") + + ofm_dim_h = compute_conv_output_dim(ifm_dim_h, k_h, stride_h, 0, dilation_h) + ofm_dim_w = compute_conv_output_dim(ifm_dim_w, k_w, stride_w, 0, dilation_w) + ofm_dim = [ofm_dim_h, ofm_dim_w] + + model = make_single_im2col_modelwrapper(k, ifm_ch, ifm_dim, ofm_dim, stride, dilation, idt, dw) + + + model = model.transform(to_hw.InferConvInpGen()) + + # set simd + inst = getCustomOp(model.graph.node[0]) + inst.set_nodeattr("SIMD", simd) + optype = model.graph.node[0].op_type + if optype == "ConvolutionInputGenerator_rtl": + inst.set_nodeattr("parallel_window", parallel_window) + inst.set_nodeattr("M", m) + if optype == "ConvolutionInputGenerator_hls": + if inst.get_nodeattr("is1D"): + inst.set_nodeattr("parallel_window", parallel_window) + + node_details = ( + "ConvolutionInputGenerator", + k, + ifm_ch, + ifm_dim, + ofm_dim, + stride, + dilation, + idt, + dw, + "hls", + ) + part = "xc7z020clg400-1" + target_clk_ns = 4 + max_allowed_volume_delta = 40 + +> assert tree_model_test( + model, node_details, part, target_clk_ns, 
max_allowed_volume_delta,0,True + ), "characterized TAV does not match RTLsim'd one!" +E AssertionError: characterized TAV does not match RTLsim'd one! +E assert False +E + where False = tree_model_test(, ('ConvolutionInputGenerator', [3, 3], 2, [8, 8], [4, 4], [1, 1], ...), 'xc7z020clg400-1', 4, 40, 0, True) + +test_fpgadataflow_convinputgenerator.py:343: AssertionError +_ test_fpgadataflow_analytical_characterization_slidingwindow[False-1-1-1-2-dilation1-stride0-4-ifm_dim0-k0-idt0] _ + +idt = INT2, k = [2, 2], ifm_dim = [8, 8], ifm_ch = 4, stride = [1, 1] +dilation = [2, 2], simd = 2, dw = 1, parallel_window = 1, m = 1, flip = False + + @pytest.mark.parametrize("idt", [DataType["INT2"]]) + # kernel size + @pytest.mark.parametrize("k", [[2, 2], [3, 3], [1, 5]]) + # input dimension + @pytest.mark.parametrize("ifm_dim", [[8, 8], [1, 21]]) + # input channels + @pytest.mark.parametrize("ifm_ch", [2, 4]) + # Stride + @pytest.mark.parametrize("stride", [[1, 1], [2, 2]]) + # Dilation + @pytest.mark.parametrize("dilation", [[1, 1], [2, 2]]) + # input channel parallelism ("SIMD") + @pytest.mark.parametrize("simd", [1, 2]) + # depthwise + @pytest.mark.parametrize("dw", [0, 1]) + # parallel_window enable (MMV_out = M*K) + @pytest.mark.parametrize("parallel_window", [0, 1]) + # in/out MMV ("M") + @pytest.mark.parametrize("m", [1]) + # Flip dimensions + @pytest.mark.parametrize("flip", [False]) + @pytest.mark.fpgadataflow + @pytest.mark.slow + @pytest.mark.vivado + @pytest.mark.node_tree_modeling + def test_fpgadataflow_analytical_characterization_slidingwindow( + idt, + k, + ifm_dim, + ifm_ch, + stride, + dilation, + simd, + dw, + parallel_window, + m, + flip, + ): + if flip: + if ( + ifm_dim[0] == ifm_dim[1] + and k[0] == k[1] + and stride[0] == stride[1] + and dilation[0] == dilation[1] + ): + pytest.skip("Dimension flip would have no effect") + k = k[::-1] + ifm_dim = ifm_dim[::-1] + stride = stride[::-1] + dilation = dilation[::-1] + + k_h, k_w = k + ifm_dim_h, ifm_dim_w 
= ifm_dim + stride_h, stride_w = stride + dilation_h, dilation_w = dilation + + kernel_width = (k_w - 1) * dilation_w + 1 # incl. dilation + kernel_height = (k_h - 1) * dilation_h + 1 # incl. dilation + + if simd > ifm_ch: + pytest.skip("SIMD cannot be larger than number of input channels") + if ifm_ch % simd != 0: + pytest.skip("SIMD must divide number of input channels") + if kernel_height > ifm_dim_h or stride_h > ifm_dim_h: + pytest.skip("Illegal convolution configuration: kernel or stride > FM dimension") + if kernel_width > ifm_dim_w or stride_w > ifm_dim_w: + pytest.skip("Illegal convolution configuration: kernel or stride > FM dimension") + if (k_h == 1 and dilation_h != 1) or (k_w == 1 and dilation_w != 1): + pytest.skip("Illegal convolution configuration: dilation for unitary kernel dim") + if ((stride_h > k_h) or (stride_w > k_w)) and not (parallel_window or (k_h == 1 and k_w == 1)): + pytest.skip("Not all combinations for stride > k edge case supported in default mode") + if parallel_window and simd != ifm_ch and not (dw or (k_h == 1 and k_w == 1)): + pytest.skip("Parallel window requires SIMD=C for non-depthwise case") + + ofm_dim_h = compute_conv_output_dim(ifm_dim_h, k_h, stride_h, 0, dilation_h) + ofm_dim_w = compute_conv_output_dim(ifm_dim_w, k_w, stride_w, 0, dilation_w) + ofm_dim = [ofm_dim_h, ofm_dim_w] + + model = make_single_im2col_modelwrapper(k, ifm_ch, ifm_dim, ofm_dim, stride, dilation, idt, dw) + + + model = model.transform(to_hw.InferConvInpGen()) + + # set simd + inst = getCustomOp(model.graph.node[0]) + inst.set_nodeattr("SIMD", simd) + optype = model.graph.node[0].op_type + if optype == "ConvolutionInputGenerator_rtl": + inst.set_nodeattr("parallel_window", parallel_window) + inst.set_nodeattr("M", m) + if optype == "ConvolutionInputGenerator_hls": + if inst.get_nodeattr("is1D"): + inst.set_nodeattr("parallel_window", parallel_window) + + node_details = ( + "ConvolutionInputGenerator", + k, + ifm_ch, + ifm_dim, + ofm_dim, + stride, + 
dilation, + idt, + dw, + "hls", + ) + part = "xc7z020clg400-1" + target_clk_ns = 4 + max_allowed_volume_delta = 40 + +> assert tree_model_test( + model, node_details, part, target_clk_ns, max_allowed_volume_delta,0,True + ), "characterized TAV does not match RTLsim'd one!" +E AssertionError: characterized TAV does not match RTLsim'd one! +E assert False +E + where False = tree_model_test(, ('ConvolutionInputGenerator', [2, 2], 4, [8, 8], [6, 6], [1, 1], ...), 'xc7z020clg400-1', 4, 40, 0, True) + +test_fpgadataflow_convinputgenerator.py:343: AssertionError +_ test_fpgadataflow_analytical_characterization_slidingwindow[False-1-1-1-2-dilation1-stride0-4-ifm_dim0-k1-idt0] _ + +idt = INT2, k = [3, 3], ifm_dim = [8, 8], ifm_ch = 4, stride = [1, 1] +dilation = [2, 2], simd = 2, dw = 1, parallel_window = 1, m = 1, flip = False + + @pytest.mark.parametrize("idt", [DataType["INT2"]]) + # kernel size + @pytest.mark.parametrize("k", [[2, 2], [3, 3], [1, 5]]) + # input dimension + @pytest.mark.parametrize("ifm_dim", [[8, 8], [1, 21]]) + # input channels + @pytest.mark.parametrize("ifm_ch", [2, 4]) + # Stride + @pytest.mark.parametrize("stride", [[1, 1], [2, 2]]) + # Dilation + @pytest.mark.parametrize("dilation", [[1, 1], [2, 2]]) + # input channel parallelism ("SIMD") + @pytest.mark.parametrize("simd", [1, 2]) + # depthwise + @pytest.mark.parametrize("dw", [0, 1]) + # parallel_window enable (MMV_out = M*K) + @pytest.mark.parametrize("parallel_window", [0, 1]) + # in/out MMV ("M") + @pytest.mark.parametrize("m", [1]) + # Flip dimensions + @pytest.mark.parametrize("flip", [False]) + @pytest.mark.fpgadataflow + @pytest.mark.slow + @pytest.mark.vivado + @pytest.mark.node_tree_modeling + def test_fpgadataflow_analytical_characterization_slidingwindow( + idt, + k, + ifm_dim, + ifm_ch, + stride, + dilation, + simd, + dw, + parallel_window, + m, + flip, + ): + if flip: + if ( + ifm_dim[0] == ifm_dim[1] + and k[0] == k[1] + and stride[0] == stride[1] + and dilation[0] == dilation[1] + 
): + pytest.skip("Dimension flip would have no effect") + k = k[::-1] + ifm_dim = ifm_dim[::-1] + stride = stride[::-1] + dilation = dilation[::-1] + + k_h, k_w = k + ifm_dim_h, ifm_dim_w = ifm_dim + stride_h, stride_w = stride + dilation_h, dilation_w = dilation + + kernel_width = (k_w - 1) * dilation_w + 1 # incl. dilation + kernel_height = (k_h - 1) * dilation_h + 1 # incl. dilation + + if simd > ifm_ch: + pytest.skip("SIMD cannot be larger than number of input channels") + if ifm_ch % simd != 0: + pytest.skip("SIMD must divide number of input channels") + if kernel_height > ifm_dim_h or stride_h > ifm_dim_h: + pytest.skip("Illegal convolution configuration: kernel or stride > FM dimension") + if kernel_width > ifm_dim_w or stride_w > ifm_dim_w: + pytest.skip("Illegal convolution configuration: kernel or stride > FM dimension") + if (k_h == 1 and dilation_h != 1) or (k_w == 1 and dilation_w != 1): + pytest.skip("Illegal convolution configuration: dilation for unitary kernel dim") + if ((stride_h > k_h) or (stride_w > k_w)) and not (parallel_window or (k_h == 1 and k_w == 1)): + pytest.skip("Not all combinations for stride > k edge case supported in default mode") + if parallel_window and simd != ifm_ch and not (dw or (k_h == 1 and k_w == 1)): + pytest.skip("Parallel window requires SIMD=C for non-depthwise case") + + ofm_dim_h = compute_conv_output_dim(ifm_dim_h, k_h, stride_h, 0, dilation_h) + ofm_dim_w = compute_conv_output_dim(ifm_dim_w, k_w, stride_w, 0, dilation_w) + ofm_dim = [ofm_dim_h, ofm_dim_w] + + model = make_single_im2col_modelwrapper(k, ifm_ch, ifm_dim, ofm_dim, stride, dilation, idt, dw) + + + model = model.transform(to_hw.InferConvInpGen()) + + # set simd + inst = getCustomOp(model.graph.node[0]) + inst.set_nodeattr("SIMD", simd) + optype = model.graph.node[0].op_type + if optype == "ConvolutionInputGenerator_rtl": + inst.set_nodeattr("parallel_window", parallel_window) + inst.set_nodeattr("M", m) + if optype == "ConvolutionInputGenerator_hls": + 
if inst.get_nodeattr("is1D"): + inst.set_nodeattr("parallel_window", parallel_window) + + node_details = ( + "ConvolutionInputGenerator", + k, + ifm_ch, + ifm_dim, + ofm_dim, + stride, + dilation, + idt, + dw, + "hls", + ) + part = "xc7z020clg400-1" + target_clk_ns = 4 + max_allowed_volume_delta = 40 + +> assert tree_model_test( + model, node_details, part, target_clk_ns, max_allowed_volume_delta,0,True + ), "characterized TAV does not match RTLsim'd one!" +E AssertionError: characterized TAV does not match RTLsim'd one! +E assert False +E + where False = tree_model_test(, ('ConvolutionInputGenerator', [3, 3], 4, [8, 8], [4, 4], [1, 1], ...), 'xc7z020clg400-1', 4, 40, 0, True) + +test_fpgadataflow_convinputgenerator.py:343: AssertionError +_ test_fpgadataflow_analytical_characterization_slidingwindow[False-1-1-1-2-dilation1-stride1-2-ifm_dim0-k1-idt0] _ + +idt = INT2, k = [3, 3], ifm_dim = [8, 8], ifm_ch = 2, stride = [2, 2] +dilation = [2, 2], simd = 2, dw = 1, parallel_window = 1, m = 1, flip = False + + @pytest.mark.parametrize("idt", [DataType["INT2"]]) + # kernel size + @pytest.mark.parametrize("k", [[2, 2], [3, 3], [1, 5]]) + # input dimension + @pytest.mark.parametrize("ifm_dim", [[8, 8], [1, 21]]) + # input channels + @pytest.mark.parametrize("ifm_ch", [2, 4]) + # Stride + @pytest.mark.parametrize("stride", [[1, 1], [2, 2]]) + # Dilation + @pytest.mark.parametrize("dilation", [[1, 1], [2, 2]]) + # input channel parallelism ("SIMD") + @pytest.mark.parametrize("simd", [1, 2]) + # depthwise + @pytest.mark.parametrize("dw", [0, 1]) + # parallel_window enable (MMV_out = M*K) + @pytest.mark.parametrize("parallel_window", [0, 1]) + # in/out MMV ("M") + @pytest.mark.parametrize("m", [1]) + # Flip dimensions + @pytest.mark.parametrize("flip", [False]) + @pytest.mark.fpgadataflow + @pytest.mark.slow + @pytest.mark.vivado + @pytest.mark.node_tree_modeling + def test_fpgadataflow_analytical_characterization_slidingwindow( + idt, + k, + ifm_dim, + ifm_ch, + stride, + 
dilation, + simd, + dw, + parallel_window, + m, + flip, + ): + if flip: + if ( + ifm_dim[0] == ifm_dim[1] + and k[0] == k[1] + and stride[0] == stride[1] + and dilation[0] == dilation[1] + ): + pytest.skip("Dimension flip would have no effect") + k = k[::-1] + ifm_dim = ifm_dim[::-1] + stride = stride[::-1] + dilation = dilation[::-1] + + k_h, k_w = k + ifm_dim_h, ifm_dim_w = ifm_dim + stride_h, stride_w = stride + dilation_h, dilation_w = dilation + + kernel_width = (k_w - 1) * dilation_w + 1 # incl. dilation + kernel_height = (k_h - 1) * dilation_h + 1 # incl. dilation + + if simd > ifm_ch: + pytest.skip("SIMD cannot be larger than number of input channels") + if ifm_ch % simd != 0: + pytest.skip("SIMD must divide number of input channels") + if kernel_height > ifm_dim_h or stride_h > ifm_dim_h: + pytest.skip("Illegal convolution configuration: kernel or stride > FM dimension") + if kernel_width > ifm_dim_w or stride_w > ifm_dim_w: + pytest.skip("Illegal convolution configuration: kernel or stride > FM dimension") + if (k_h == 1 and dilation_h != 1) or (k_w == 1 and dilation_w != 1): + pytest.skip("Illegal convolution configuration: dilation for unitary kernel dim") + if ((stride_h > k_h) or (stride_w > k_w)) and not (parallel_window or (k_h == 1 and k_w == 1)): + pytest.skip("Not all combinations for stride > k edge case supported in default mode") + if parallel_window and simd != ifm_ch and not (dw or (k_h == 1 and k_w == 1)): + pytest.skip("Parallel window requires SIMD=C for non-depthwise case") + + ofm_dim_h = compute_conv_output_dim(ifm_dim_h, k_h, stride_h, 0, dilation_h) + ofm_dim_w = compute_conv_output_dim(ifm_dim_w, k_w, stride_w, 0, dilation_w) + ofm_dim = [ofm_dim_h, ofm_dim_w] + + model = make_single_im2col_modelwrapper(k, ifm_ch, ifm_dim, ofm_dim, stride, dilation, idt, dw) + + + model = model.transform(to_hw.InferConvInpGen()) + + # set simd + inst = getCustomOp(model.graph.node[0]) + inst.set_nodeattr("SIMD", simd) + optype = 
model.graph.node[0].op_type + if optype == "ConvolutionInputGenerator_rtl": + inst.set_nodeattr("parallel_window", parallel_window) + inst.set_nodeattr("M", m) + if optype == "ConvolutionInputGenerator_hls": + if inst.get_nodeattr("is1D"): + inst.set_nodeattr("parallel_window", parallel_window) + + node_details = ( + "ConvolutionInputGenerator", + k, + ifm_ch, + ifm_dim, + ofm_dim, + stride, + dilation, + idt, + dw, + "hls", + ) + part = "xc7z020clg400-1" + target_clk_ns = 4 + max_allowed_volume_delta = 40 + +> assert tree_model_test( + model, node_details, part, target_clk_ns, max_allowed_volume_delta,0,True + ), "characterized TAV does not match RTLsim'd one!" +E AssertionError: characterized TAV does not match RTLsim'd one! +E assert False +E + where False = tree_model_test(, ('ConvolutionInputGenerator', [3, 3], 2, [8, 8], [2, 2], [2, 2], ...), 'xc7z020clg400-1', 4, 40, 0, True) + +test_fpgadataflow_convinputgenerator.py:343: AssertionError +_ test_fpgadataflow_analytical_characterization_slidingwindow[False-1-1-1-2-dilation1-stride1-4-ifm_dim0-k1-idt0] _ + +idt = INT2, k = [3, 3], ifm_dim = [8, 8], ifm_ch = 4, stride = [2, 2] +dilation = [2, 2], simd = 2, dw = 1, parallel_window = 1, m = 1, flip = False + + @pytest.mark.parametrize("idt", [DataType["INT2"]]) + # kernel size + @pytest.mark.parametrize("k", [[2, 2], [3, 3], [1, 5]]) + # input dimension + @pytest.mark.parametrize("ifm_dim", [[8, 8], [1, 21]]) + # input channels + @pytest.mark.parametrize("ifm_ch", [2, 4]) + # Stride + @pytest.mark.parametrize("stride", [[1, 1], [2, 2]]) + # Dilation + @pytest.mark.parametrize("dilation", [[1, 1], [2, 2]]) + # input channel parallelism ("SIMD") + @pytest.mark.parametrize("simd", [1, 2]) + # depthwise + @pytest.mark.parametrize("dw", [0, 1]) + # parallel_window enable (MMV_out = M*K) + @pytest.mark.parametrize("parallel_window", [0, 1]) + # in/out MMV ("M") + @pytest.mark.parametrize("m", [1]) + # Flip dimensions + @pytest.mark.parametrize("flip", [False]) + 
@pytest.mark.fpgadataflow + @pytest.mark.slow + @pytest.mark.vivado + @pytest.mark.node_tree_modeling + def test_fpgadataflow_analytical_characterization_slidingwindow( + idt, + k, + ifm_dim, + ifm_ch, + stride, + dilation, + simd, + dw, + parallel_window, + m, + flip, + ): + if flip: + if ( + ifm_dim[0] == ifm_dim[1] + and k[0] == k[1] + and stride[0] == stride[1] + and dilation[0] == dilation[1] + ): + pytest.skip("Dimension flip would have no effect") + k = k[::-1] + ifm_dim = ifm_dim[::-1] + stride = stride[::-1] + dilation = dilation[::-1] + + k_h, k_w = k + ifm_dim_h, ifm_dim_w = ifm_dim + stride_h, stride_w = stride + dilation_h, dilation_w = dilation + + kernel_width = (k_w - 1) * dilation_w + 1 # incl. dilation + kernel_height = (k_h - 1) * dilation_h + 1 # incl. dilation + + if simd > ifm_ch: + pytest.skip("SIMD cannot be larger than number of input channels") + if ifm_ch % simd != 0: + pytest.skip("SIMD must divide number of input channels") + if kernel_height > ifm_dim_h or stride_h > ifm_dim_h: + pytest.skip("Illegal convolution configuration: kernel or stride > FM dimension") + if kernel_width > ifm_dim_w or stride_w > ifm_dim_w: + pytest.skip("Illegal convolution configuration: kernel or stride > FM dimension") + if (k_h == 1 and dilation_h != 1) or (k_w == 1 and dilation_w != 1): + pytest.skip("Illegal convolution configuration: dilation for unitary kernel dim") + if ((stride_h > k_h) or (stride_w > k_w)) and not (parallel_window or (k_h == 1 and k_w == 1)): + pytest.skip("Not all combinations for stride > k edge case supported in default mode") + if parallel_window and simd != ifm_ch and not (dw or (k_h == 1 and k_w == 1)): + pytest.skip("Parallel window requires SIMD=C for non-depthwise case") + + ofm_dim_h = compute_conv_output_dim(ifm_dim_h, k_h, stride_h, 0, dilation_h) + ofm_dim_w = compute_conv_output_dim(ifm_dim_w, k_w, stride_w, 0, dilation_w) + ofm_dim = [ofm_dim_h, ofm_dim_w] + + model = make_single_im2col_modelwrapper(k, ifm_ch, ifm_dim, 
ofm_dim, stride, dilation, idt, dw) + + + model = model.transform(to_hw.InferConvInpGen()) + + # set simd + inst = getCustomOp(model.graph.node[0]) + inst.set_nodeattr("SIMD", simd) + optype = model.graph.node[0].op_type + if optype == "ConvolutionInputGenerator_rtl": + inst.set_nodeattr("parallel_window", parallel_window) + inst.set_nodeattr("M", m) + if optype == "ConvolutionInputGenerator_hls": + if inst.get_nodeattr("is1D"): + inst.set_nodeattr("parallel_window", parallel_window) + + node_details = ( + "ConvolutionInputGenerator", + k, + ifm_ch, + ifm_dim, + ofm_dim, + stride, + dilation, + idt, + dw, + "hls", + ) + part = "xc7z020clg400-1" + target_clk_ns = 4 + max_allowed_volume_delta = 40 + +> assert tree_model_test( + model, node_details, part, target_clk_ns, max_allowed_volume_delta,0,True + ), "characterized TAV does not match RTLsim'd one!" +E AssertionError: characterized TAV does not match RTLsim'd one! +E assert False +E + where False = tree_model_test(, ('ConvolutionInputGenerator', [3, 3], 4, [8, 8], [2, 2], [2, 2], ...), 'xc7z020clg400-1', 4, 40, 0, True) + +test_fpgadataflow_convinputgenerator.py:343: AssertionError +=============================== warnings summary =============================== +test_fpgadataflow_convinputgenerator.py:257 + /home/lstasytis/finn_prs/finn/tests/fpgadataflow/test_fpgadataflow_convinputgenerator.py:257: PytestUnknownMarkWarning: Unknown pytest.mark.node_tree_modeling - is this a typo? You can register custom marks to avoid this warning - for details, see https://docs.pytest.org/en/stable/mark.html + @pytest.mark.node_tree_modeling + +tests/fpgadataflow/test_fpgadataflow_convinputgenerator.py: 135 warnings + /home/lstasytis/finn_prs/finn/deps/qonnx/src/qonnx/core/modelwrapper.py:98: UserWarning: Some old-style domain attributes were automatically converted to new-style, + i.e. domain=finn to domain=qonnx.custom_op. 
+ warnings.warn( + +tests/fpgadataflow/test_fpgadataflow_convinputgenerator.py::test_fpgadataflow_analytical_characterization_slidingwindow[False-1-1-0-2-dilation0-stride1-2-ifm_dim0-k2-idt0] +tests/fpgadataflow/test_fpgadataflow_convinputgenerator.py::test_fpgadataflow_analytical_characterization_slidingwindow[False-1-1-1-1-dilation0-stride1-2-ifm_dim0-k2-idt0] +tests/fpgadataflow/test_fpgadataflow_convinputgenerator.py::test_fpgadataflow_analytical_characterization_slidingwindow[False-1-1-1-1-dilation0-stride1-4-ifm_dim0-k2-idt0] +tests/fpgadataflow/test_fpgadataflow_convinputgenerator.py::test_fpgadataflow_analytical_characterization_slidingwindow[False-1-1-1-2-dilation0-stride1-2-ifm_dim0-k2-idt0] +tests/fpgadataflow/test_fpgadataflow_convinputgenerator.py::test_fpgadataflow_analytical_characterization_slidingwindow[False-1-1-1-2-dilation0-stride1-4-ifm_dim0-k2-idt0] + /home/lstasytis/finn_prs/finn/src/finn/custom_op/fpgadataflow/convolutioninputgenerator.py:302: DeprecationWarning: In future, it will be an error for 'np.bool_' scalars to be interpreted as an index + adjustments = sorted( + +-- Docs: https://docs.pytest.org/en/stable/warnings.html +=========================== short test summary info ============================ +FAILED test_fpgadataflow_convinputgenerator.py::test_fpgadataflow_analytical_characterization_slidingwindow[False-1-0-0-1-dilation0-stride0-2-ifm_dim1-k2-idt0] +FAILED test_fpgadataflow_convinputgenerator.py::test_fpgadataflow_analytical_characterization_slidingwindow[False-1-0-0-1-dilation0-stride0-4-ifm_dim1-k2-idt0] +FAILED test_fpgadataflow_convinputgenerator.py::test_fpgadataflow_analytical_characterization_slidingwindow[False-1-0-0-1-dilation0-stride1-4-ifm_dim0-k1-idt0] +FAILED test_fpgadataflow_convinputgenerator.py::test_fpgadataflow_analytical_characterization_slidingwindow[False-1-0-0-1-dilation1-stride0-2-ifm_dim0-k0-idt0] +FAILED 
test_fpgadataflow_convinputgenerator.py::test_fpgadataflow_analytical_characterization_slidingwindow[False-1-0-0-1-dilation1-stride0-2-ifm_dim0-k1-idt0] +FAILED test_fpgadataflow_convinputgenerator.py::test_fpgadataflow_analytical_characterization_slidingwindow[False-1-0-0-1-dilation1-stride0-4-ifm_dim0-k0-idt0] +FAILED test_fpgadataflow_convinputgenerator.py::test_fpgadataflow_analytical_characterization_slidingwindow[False-1-0-0-1-dilation1-stride0-4-ifm_dim0-k1-idt0] +FAILED test_fpgadataflow_convinputgenerator.py::test_fpgadataflow_analytical_characterization_slidingwindow[False-1-0-0-1-dilation1-stride1-4-ifm_dim0-k1-idt0] +FAILED test_fpgadataflow_convinputgenerator.py::test_fpgadataflow_analytical_characterization_slidingwindow[False-1-0-0-2-dilation0-stride0-2-ifm_dim1-k2-idt0] +FAILED test_fpgadataflow_convinputgenerator.py::test_fpgadataflow_analytical_characterization_slidingwindow[False-1-0-0-2-dilation0-stride0-4-ifm_dim1-k2-idt0] +FAILED test_fpgadataflow_convinputgenerator.py::test_fpgadataflow_analytical_characterization_slidingwindow[False-1-0-0-2-dilation0-stride1-4-ifm_dim0-k1-idt0] +FAILED test_fpgadataflow_convinputgenerator.py::test_fpgadataflow_analytical_characterization_slidingwindow[False-1-0-0-2-dilation1-stride0-2-ifm_dim0-k0-idt0] +FAILED test_fpgadataflow_convinputgenerator.py::test_fpgadataflow_analytical_characterization_slidingwindow[False-1-0-0-2-dilation1-stride0-2-ifm_dim0-k1-idt0] +FAILED test_fpgadataflow_convinputgenerator.py::test_fpgadataflow_analytical_characterization_slidingwindow[False-1-0-0-2-dilation1-stride0-4-ifm_dim0-k0-idt0] +FAILED test_fpgadataflow_convinputgenerator.py::test_fpgadataflow_analytical_characterization_slidingwindow[False-1-0-0-2-dilation1-stride0-4-ifm_dim0-k1-idt0] +FAILED test_fpgadataflow_convinputgenerator.py::test_fpgadataflow_analytical_characterization_slidingwindow[False-1-0-0-2-dilation1-stride1-4-ifm_dim0-k1-idt0] +FAILED 
test_fpgadataflow_convinputgenerator.py::test_fpgadataflow_analytical_characterization_slidingwindow[False-1-0-1-1-dilation0-stride0-4-ifm_dim0-k2-idt0] +FAILED test_fpgadataflow_convinputgenerator.py::test_fpgadataflow_analytical_characterization_slidingwindow[False-1-0-1-1-dilation0-stride0-4-ifm_dim1-k2-idt0] +FAILED test_fpgadataflow_convinputgenerator.py::test_fpgadataflow_analytical_characterization_slidingwindow[False-1-0-1-1-dilation0-stride1-2-ifm_dim0-k0-idt0] +FAILED test_fpgadataflow_convinputgenerator.py::test_fpgadataflow_analytical_characterization_slidingwindow[False-1-0-1-1-dilation0-stride1-4-ifm_dim0-k0-idt0] +FAILED test_fpgadataflow_convinputgenerator.py::test_fpgadataflow_analytical_characterization_slidingwindow[False-1-0-1-1-dilation0-stride1-4-ifm_dim0-k1-idt0] +FAILED test_fpgadataflow_convinputgenerator.py::test_fpgadataflow_analytical_characterization_slidingwindow[False-1-0-1-1-dilation1-stride0-2-ifm_dim0-k1-idt0] +FAILED test_fpgadataflow_convinputgenerator.py::test_fpgadataflow_analytical_characterization_slidingwindow[False-1-0-1-1-dilation1-stride0-4-ifm_dim0-k0-idt0] +FAILED test_fpgadataflow_convinputgenerator.py::test_fpgadataflow_analytical_characterization_slidingwindow[False-1-0-1-1-dilation1-stride0-4-ifm_dim0-k1-idt0] +FAILED test_fpgadataflow_convinputgenerator.py::test_fpgadataflow_analytical_characterization_slidingwindow[False-1-0-1-1-dilation1-stride1-2-ifm_dim0-k1-idt0] +FAILED test_fpgadataflow_convinputgenerator.py::test_fpgadataflow_analytical_characterization_slidingwindow[False-1-0-1-1-dilation1-stride1-4-ifm_dim0-k1-idt0] +FAILED test_fpgadataflow_convinputgenerator.py::test_fpgadataflow_analytical_characterization_slidingwindow[False-1-0-1-2-dilation0-stride0-4-ifm_dim0-k2-idt0] +FAILED test_fpgadataflow_convinputgenerator.py::test_fpgadataflow_analytical_characterization_slidingwindow[False-1-0-1-2-dilation0-stride0-4-ifm_dim1-k2-idt0] +FAILED 
test_fpgadataflow_convinputgenerator.py::test_fpgadataflow_analytical_characterization_slidingwindow[False-1-0-1-2-dilation0-stride1-2-ifm_dim0-k0-idt0] +FAILED test_fpgadataflow_convinputgenerator.py::test_fpgadataflow_analytical_characterization_slidingwindow[False-1-0-1-2-dilation0-stride1-4-ifm_dim0-k0-idt0] +FAILED test_fpgadataflow_convinputgenerator.py::test_fpgadataflow_analytical_characterization_slidingwindow[False-1-0-1-2-dilation0-stride1-4-ifm_dim0-k1-idt0] +FAILED test_fpgadataflow_convinputgenerator.py::test_fpgadataflow_analytical_characterization_slidingwindow[False-1-0-1-2-dilation1-stride0-2-ifm_dim0-k1-idt0] +FAILED test_fpgadataflow_convinputgenerator.py::test_fpgadataflow_analytical_characterization_slidingwindow[False-1-0-1-2-dilation1-stride0-4-ifm_dim0-k0-idt0] +FAILED test_fpgadataflow_convinputgenerator.py::test_fpgadataflow_analytical_characterization_slidingwindow[False-1-0-1-2-dilation1-stride0-4-ifm_dim0-k1-idt0] +FAILED test_fpgadataflow_convinputgenerator.py::test_fpgadataflow_analytical_characterization_slidingwindow[False-1-0-1-2-dilation1-stride1-2-ifm_dim0-k1-idt0] +FAILED test_fpgadataflow_convinputgenerator.py::test_fpgadataflow_analytical_characterization_slidingwindow[False-1-0-1-2-dilation1-stride1-4-ifm_dim0-k1-idt0] +FAILED test_fpgadataflow_convinputgenerator.py::test_fpgadataflow_analytical_characterization_slidingwindow[False-1-1-0-2-dilation0-stride0-2-ifm_dim1-k2-idt0] +FAILED test_fpgadataflow_convinputgenerator.py::test_fpgadataflow_analytical_characterization_slidingwindow[False-1-1-0-2-dilation0-stride1-2-ifm_dim0-k2-idt0] +FAILED test_fpgadataflow_convinputgenerator.py::test_fpgadataflow_analytical_characterization_slidingwindow[False-1-1-0-2-dilation1-stride0-2-ifm_dim0-k0-idt0] +FAILED test_fpgadataflow_convinputgenerator.py::test_fpgadataflow_analytical_characterization_slidingwindow[False-1-1-0-2-dilation1-stride0-2-ifm_dim0-k1-idt0] +FAILED 
test_fpgadataflow_convinputgenerator.py::test_fpgadataflow_analytical_characterization_slidingwindow[False-1-1-1-1-dilation0-stride0-4-ifm_dim0-k2-idt0] +FAILED test_fpgadataflow_convinputgenerator.py::test_fpgadataflow_analytical_characterization_slidingwindow[False-1-1-1-1-dilation0-stride0-4-ifm_dim1-k2-idt0] +FAILED test_fpgadataflow_convinputgenerator.py::test_fpgadataflow_analytical_characterization_slidingwindow[False-1-1-1-1-dilation0-stride1-2-ifm_dim0-k0-idt0] +FAILED test_fpgadataflow_convinputgenerator.py::test_fpgadataflow_analytical_characterization_slidingwindow[False-1-1-1-1-dilation0-stride1-2-ifm_dim0-k2-idt0] +FAILED test_fpgadataflow_convinputgenerator.py::test_fpgadataflow_analytical_characterization_slidingwindow[False-1-1-1-1-dilation0-stride1-4-ifm_dim0-k0-idt0] +FAILED test_fpgadataflow_convinputgenerator.py::test_fpgadataflow_analytical_characterization_slidingwindow[False-1-1-1-1-dilation0-stride1-4-ifm_dim0-k1-idt0] +FAILED test_fpgadataflow_convinputgenerator.py::test_fpgadataflow_analytical_characterization_slidingwindow[False-1-1-1-1-dilation0-stride1-4-ifm_dim0-k2-idt0] +FAILED test_fpgadataflow_convinputgenerator.py::test_fpgadataflow_analytical_characterization_slidingwindow[False-1-1-1-1-dilation1-stride0-2-ifm_dim0-k1-idt0] +FAILED test_fpgadataflow_convinputgenerator.py::test_fpgadataflow_analytical_characterization_slidingwindow[False-1-1-1-1-dilation1-stride0-4-ifm_dim0-k0-idt0] +FAILED test_fpgadataflow_convinputgenerator.py::test_fpgadataflow_analytical_characterization_slidingwindow[False-1-1-1-1-dilation1-stride0-4-ifm_dim0-k1-idt0] +FAILED test_fpgadataflow_convinputgenerator.py::test_fpgadataflow_analytical_characterization_slidingwindow[False-1-1-1-1-dilation1-stride1-2-ifm_dim0-k1-idt0] +FAILED test_fpgadataflow_convinputgenerator.py::test_fpgadataflow_analytical_characterization_slidingwindow[False-1-1-1-1-dilation1-stride1-4-ifm_dim0-k1-idt0] +FAILED 
test_fpgadataflow_convinputgenerator.py::test_fpgadataflow_analytical_characterization_slidingwindow[False-1-1-1-2-dilation0-stride0-4-ifm_dim0-k2-idt0] +FAILED test_fpgadataflow_convinputgenerator.py::test_fpgadataflow_analytical_characterization_slidingwindow[False-1-1-1-2-dilation0-stride0-4-ifm_dim1-k2-idt0] +FAILED test_fpgadataflow_convinputgenerator.py::test_fpgadataflow_analytical_characterization_slidingwindow[False-1-1-1-2-dilation0-stride1-2-ifm_dim0-k0-idt0] +FAILED test_fpgadataflow_convinputgenerator.py::test_fpgadataflow_analytical_characterization_slidingwindow[False-1-1-1-2-dilation0-stride1-2-ifm_dim0-k2-idt0] +FAILED test_fpgadataflow_convinputgenerator.py::test_fpgadataflow_analytical_characterization_slidingwindow[False-1-1-1-2-dilation0-stride1-4-ifm_dim0-k0-idt0] +FAILED test_fpgadataflow_convinputgenerator.py::test_fpgadataflow_analytical_characterization_slidingwindow[False-1-1-1-2-dilation0-stride1-4-ifm_dim0-k1-idt0] +FAILED test_fpgadataflow_convinputgenerator.py::test_fpgadataflow_analytical_characterization_slidingwindow[False-1-1-1-2-dilation0-stride1-4-ifm_dim0-k2-idt0] +FAILED test_fpgadataflow_convinputgenerator.py::test_fpgadataflow_analytical_characterization_slidingwindow[False-1-1-1-2-dilation1-stride0-2-ifm_dim0-k1-idt0] +FAILED test_fpgadataflow_convinputgenerator.py::test_fpgadataflow_analytical_characterization_slidingwindow[False-1-1-1-2-dilation1-stride0-4-ifm_dim0-k0-idt0] +FAILED test_fpgadataflow_convinputgenerator.py::test_fpgadataflow_analytical_characterization_slidingwindow[False-1-1-1-2-dilation1-stride0-4-ifm_dim0-k1-idt0] +FAILED test_fpgadataflow_convinputgenerator.py::test_fpgadataflow_analytical_characterization_slidingwindow[False-1-1-1-2-dilation1-stride1-2-ifm_dim0-k1-idt0] +FAILED test_fpgadataflow_convinputgenerator.py::test_fpgadataflow_analytical_characterization_slidingwindow[False-1-1-1-2-dilation1-stride1-4-ifm_dim0-k1-idt0] +=========== 64 failed, 71 passed, 249 skipped, 141 warnings in 7.15s 
=========== diff --git a/tests/fpgadataflow/test_convert_to_hw_pool_batch.py b/tests/fpgadataflow/test_convert_to_hw_pool_batch.py index e155053b8b..4e174fb941 100644 --- a/tests/fpgadataflow/test_convert_to_hw_pool_batch.py +++ b/tests/fpgadataflow/test_convert_to_hw_pool_batch.py @@ -47,6 +47,7 @@ from finn.transformation.fpgadataflow.prepare_rtlsim import PrepareRTLSim from finn.transformation.fpgadataflow.set_exec_mode import SetExecMode from finn.transformation.fpgadataflow.specialize_layers import SpecializeLayers +from finn.util.test import tree_model_test def make_single_maxpool_modelwrapper(k, stride, pad, ifm_ch, ifm_dim, ofm_dim, idt, use_1d=False): @@ -242,3 +243,96 @@ def test_convert_to_hw_pool(idt, odt, pool_config, ifm_ch, pe, op_type, exec_mod exp_cycles_dict = new_model.analysis(exp_cycles_per_layer) exp_cycles = exp_cycles_dict[node.name] assert np.isclose(exp_cycles, cycles_rtlsim, atol=10) + + +# input datatype +@pytest.mark.parametrize("idt", [DataType["UINT4"]]) +# output datatype +@pytest.mark.parametrize("odt", [DataType["UINT4"]]) +# pool configuration: ( k,stride, pad, ifm_dim ) +# @pytest.mark.parametrize("pool_config", [(7, 7, 0, 7), (3, 2, 1, 5)]) +# @pytest.mark.parametrize("pool_config", [(7, 7, 0, 128), (3, 2, 1, 5)]) +@pytest.mark.parametrize("pool_config", [(2, 1, 0, 512)]) +# input channels +@pytest.mark.parametrize("ifm_ch", [32]) +# number of out channel computed in parallel +@pytest.mark.parametrize("pe", [32]) +# pool type +# @pytest.mark.parametrize("op_type", ["QuantAvgPool2d", "MaxPool", "MaxPool1D"]) +@pytest.mark.parametrize("op_type", ["MaxPool1D"]) +@pytest.mark.fpgadataflow +@pytest.mark.slow +@pytest.mark.vivado +@pytest.mark.node_tree_modeling +def test_analytical_characterization_pool(idt, odt, pool_config, ifm_ch, pe, op_type): + k, stride, pad, ifm_dim = pool_config + + if ifm_ch % pe != 0: + pytest.skip("ifm_ch%pe != 0. Skipping") + + if pad != 0 and idt.signed(): + pytest.skip("No support for pal_val != 0. 
Skipping") + + np.random.seed(0) + + part = "xc7z020clg400-1" + + ofm_dim = int(((ifm_dim + 2 * pad - k) / stride) + 1) + + ishape = (1, ifm_ch, ifm_dim, ifm_dim) + use_1d = False + if op_type == "MaxPool1D": + use_1d = True + ishape = (1, ifm_ch, 1, ifm_dim) + op_type = "MaxPool" + + if op_type == "MaxPool": + if idt != odt: + pytest.skip("Skipping Maxpool with idt != odt") + + model = make_single_maxpool_modelwrapper( + k, stride, pad, ifm_ch, ifm_dim, ofm_dim, idt, use_1d + ) + elif op_type == "QuantAvgPool2d": + if pad != 0: + pytest.skip("No padding support for QuantAvgPool2d. Skipping") + + if idt.signed() != odt.signed(): + pytest.skip("Skipping QuantAvgPool2d with idt.signed() != odt.signed()") + model = make_single_quantavpool_modelwrapper(k, stride, ifm_ch, ifm_dim, ofm_dim, idt, odt) + else: + assert False, "{} is not a supported op_type".format(op_type) + + model = model.transform(to_hw.InferPool()) + + # Folding + for n in model.graph.node: + if n.op_type.startswith("Pool"): + inst = getCustomOp(n) + + ishape = inst.get_folded_input_shape() + oshape = inst.get_folded_output_shape() + + inp = helper.make_tensor_value_info("inp", TensorProto.FLOAT, ishape) + outp = helper.make_tensor_value_info("outp", TensorProto.FLOAT, oshape) + + graph = helper.make_graph(nodes=[n], name="mp_graph", inputs=[inp], outputs=[outp]) + model = qonnx_make_model(graph, producer_name="mp-model") + model = ModelWrapper(model) + model.set_tensor_datatype("inp", idt) + model.set_tensor_datatype("outp", odt) + model = model.transform(InferShapes()) + + inst.set_nodeattr("PE", pe) + model = model.transform(SpecializeLayers(part)) + + node_details = ("Pool", op_type, k, ifm_ch, ifm_dim, ofm_dim, pe, idt) + + target_clk_ns = 4 + + max_allowed_volume_delta = 5000 + max_allowed_length_delta = 5000 + + assert tree_model_test( + model, node_details, part, target_clk_ns, max_allowed_volume_delta, max_allowed_length_delta + ), "characterized TAV does not match RTLsim'd one!" 
diff --git a/tests/fpgadataflow/test_fifosizing.py b/tests/fpgadataflow/test_fifosizing.py index 9b36e1c6f7..dd723972cb 100644 --- a/tests/fpgadataflow/test_fifosizing.py +++ b/tests/fpgadataflow/test_fifosizing.py @@ -61,14 +61,38 @@ def fetch_test_model(topology, wbits=2, abits=2): @pytest.mark.slow @pytest.mark.vivado @pytest.mark.fpgadataflow -@pytest.mark.parametrize("method", ["largefifo_rtlsim", "characterize"]) -@pytest.mark.parametrize("topology", ["tfc", "cnv"]) +@pytest.mark.parametrize( + "method", + [ + "analytic_model_based", + "analytic_rtlsim", + "largefifo_rtlsim", + ], +) +@pytest.mark.parametrize( + "topology", + [ + "tfc", + "cnv", + ], +) def test_fifosizing_linear(method, topology): tmp_output_dir = fetch_test_model(topology) + if method == "analytic_model_based": + auto_fifo_strategy = "analytical" + tav_generation_strategy_key = "tree_model" + elif method == "analytic_rtlsim": + auto_fifo_strategy = "analytical" + tav_generation_strategy_key = "rtlsim" + else: + auto_fifo_strategy = "largefifo_rtlsim" + tav_generation_strategy_key = "rtlsim" + cfg = build_cfg.DataflowBuildConfig( output_dir=tmp_output_dir, auto_fifo_depths=True, - auto_fifo_strategy=method, + auto_fifo_strategy=auto_fifo_strategy, + tav_generation_strategy=tav_generation_strategy_key, target_fps=10000 if topology == "tfc" else 1000, synth_clk_period_ns=10.0, board="Pynq-Z1", @@ -100,7 +124,107 @@ def test_fifosizing_linear(method, topology): model0 = ModelWrapper(tmp_output_dir + "/intermediate_models/step_create_stitched_ip.onnx") model1 = ModelWrapper(tmp_output_dir_cmp + "/intermediate_models/step_create_stitched_ip.onnx") + assert len(model0.graph.node) == len(model1.graph.node) + for i in range(len(model0.graph.node)): + node0 = model0.graph.node[i] + node1 = model1.graph.node[i] + assert node0.op_type == node1.op_type + if node0.op_type == "StreamingFIFO": + node0_inst = getCustomOp(node0) + node1_inst = getCustomOp(node1) + assert node0_inst.get_nodeattr("depth") == 
node1_inst.get_nodeattr("depth") + + shutil.rmtree(tmp_output_dir) + shutil.rmtree(tmp_output_dir_cmp) + + +@pytest.mark.slow +@pytest.mark.vivado +@pytest.mark.fpgadataflow +@pytest.mark.parametrize( + "method", + [ + "analytic_model_based", + "analytic_rtlsim", + # "largefifo_rtlsim_python", + # "largefifo_rtlsim_cpp", + ], +) +@pytest.mark.parametrize( + "topology", + [ + "tfc", + # "cnv" + ], +) +def test_fifosizing_fast(method, topology): + force_python_rtlsim = "python" in method + + tmp_output_dir = fetch_test_model(topology) + if method == "analytic_model_based": + auto_fifo_strategy = "analytical" + tav_generation_strategy_key = "tree_model" + elif method == "characterize_rtlsim": + auto_fifo_strategy = "analytical" + tav_generation_strategy_key = "rtlsim" + else: + auto_fifo_strategy = "largefifo_rtlsim" + tav_generation_strategy_key = "rtlsim" + + cfg = build_cfg.DataflowBuildConfig( + output_dir=tmp_output_dir, + auto_fifo_depths=True, + auto_fifo_strategy=auto_fifo_strategy, + tav_generation_strategy=tav_generation_strategy_key, + target_fps=10000 if topology == "tfc" else 1000, + force_python_rtlsim=force_python_rtlsim, + synth_clk_period_ns=10.0, + steps=[ + "step_qonnx_to_finn", + "step_tidy_up", + "step_streamline", + "step_convert_to_hw", + "step_create_dataflow_partition", + "step_specialize_layers", + "step_target_fps_parallelization", + "step_apply_folding_config", + "step_minimize_bit_width", + "step_generate_estimate_reports", + "step_set_fifo_depths", + ], + board="Pynq-Z1", + rtlsim_batch_size=100 if topology == "tfc" else 2, + shell_flow_type=build_cfg.ShellFlowType.VIVADO_ZYNQ, + generate_outputs=[ + build_cfg.DataflowOutputType.ESTIMATE_REPORTS, + ], + ) + build.build_dataflow_cfg(tmp_output_dir + "/model.onnx", cfg) + + # now run the same build using the generated folding and FIFO config + tmp_output_dir_cmp = fetch_test_model(topology) + cfg_cmp = cfg + cfg_cmp.output_dir = tmp_output_dir_cmp + cfg_cmp.auto_fifo_depths = False + 
cfg_cmp.target_fps = None + cfg_cmp.steps = [ + "step_qonnx_to_finn", + "step_tidy_up", + "step_streamline", + "step_convert_to_hw", + "step_create_dataflow_partition", + "step_specialize_layers", + "step_target_fps_parallelization", + "step_apply_folding_config", + "step_minimize_bit_width", + "step_generate_estimate_reports", + "step_set_fifo_depths", + ] + cfg_cmp.folding_config_file = tmp_output_dir + "/final_hw_config.json" + build.build_dataflow_cfg(tmp_output_dir_cmp + "/model.onnx", cfg_cmp) + model0 = ModelWrapper(tmp_output_dir + "/intermediate_models/step_set_fifo_depths.onnx") + model1 = ModelWrapper(tmp_output_dir_cmp + "/intermediate_models/step_set_fifo_depths.onnx") assert len(model0.graph.node) == len(model1.graph.node) for i in range(len(model0.graph.node)): node0 = model0.graph.node[i] diff --git a/tests/fpgadataflow/test_fpgadataflow_channelwise_ops.py b/tests/fpgadataflow/test_fpgadataflow_channelwise_ops.py index 2ad49ae58b..e0662c0b72 100644 --- a/tests/fpgadataflow/test_fpgadataflow_channelwise_ops.py +++ b/tests/fpgadataflow/test_fpgadataflow_channelwise_ops.py @@ -47,6 +47,7 @@ from finn.transformation.fpgadataflow.prepare_rtlsim import PrepareRTLSim from finn.transformation.fpgadataflow.set_exec_mode import SetExecMode from finn.transformation.fpgadataflow.specialize_layers import SpecializeLayers +from finn.util.test import tree_model_test def make_modelwrapper(C, pe, idt, odt, pdt, func, vecs): @@ -172,3 +173,48 @@ def test_fpgadataflow_channelwise_ops(idt, act, pdt, nf, ich, func, vecs, exec_m exp_cycles = exp_cycles_dict[node.name] assert np.isclose(exp_cycles, cycles_rtlsim, atol=10) assert exp_cycles != 0 + + +# activation: None or DataType +@pytest.mark.parametrize("act", [DataType["INT8"]]) +# input datatype +@pytest.mark.parametrize("idt", [DataType["INT4"]]) +# param datatype +@pytest.mark.parametrize("pdt", [DataType["INT4"]]) +# folding, -1 is maximum possible +@pytest.mark.parametrize("nf", [-1, 2]) +# number of input 
features +@pytest.mark.parametrize("ich", [16]) +# vecs +@pytest.mark.parametrize("vecs", [[1], [1, 7, 7]]) +# function +@pytest.mark.parametrize("func", ["add"]) +@pytest.mark.fpgadataflow +@pytest.mark.vivado +@pytest.mark.slow +@pytest.mark.node_tree_modeling +def test_fpgadataflow_analytical_characterization_channelwise_ops( + idt, act, pdt, nf, ich, func, vecs +): + if nf == -1: + nf = ich + pe = ich // nf + assert ich % pe == 0 + + # generate param data + C = gen_finn_dt_tensor(pdt, (ich)) + + odt = act + + # create model + model = make_modelwrapper(C, pe, idt, odt, pdt, func, vecs) + node_details = ("ChannelWiseOp", C, pe, idt, odt, pdt, func, "hls") + part = "xc7z020clg400-1" + target_clk_ns = 4 + + max_allowed_volume_delta = 14 + max_allowed_length_delta = 14 + + assert tree_model_test( + model, node_details, part, target_clk_ns, max_allowed_volume_delta, max_allowed_length_delta + ), "characterized TAV does not match RTLsim'd one!" diff --git a/tests/fpgadataflow/test_fpgadataflow_convinputgenerator.py b/tests/fpgadataflow/test_fpgadataflow_convinputgenerator.py index 93860b87ed..dc9b6331ee 100644 --- a/tests/fpgadataflow/test_fpgadataflow_convinputgenerator.py +++ b/tests/fpgadataflow/test_fpgadataflow_convinputgenerator.py @@ -48,6 +48,7 @@ from finn.transformation.fpgadataflow.prepare_rtlsim import PrepareRTLSim from finn.transformation.fpgadataflow.set_exec_mode import SetExecMode from finn.transformation.fpgadataflow.specialize_layers import SpecializeLayers +from finn.util.test import tree_model_test def make_single_im2col_modelwrapper(k, ifm_ch, ifm_dim, ofm_dim, stride, dilation, idt, dw): @@ -225,3 +226,243 @@ def test_fpgadataflow_slidingwindow( exp_cycles = exp_cycles_dict[node.name] assert np.isclose(exp_cycles, cycles_rtlsim, atol=10, rtol=1.1) assert exp_cycles != 0 + + +# input datatype +@pytest.mark.parametrize("idt", [DataType["INT2"]]) +# kernel size +# @pytest.mark.parametrize("k", [[2, 2], [3, 3], [1, 5]]) 
+@pytest.mark.parametrize("k", [[1, 1], [2, 2]]) +# input dimension +# @pytest.mark.parametrize("ifm_dim", [[8, 8], [1, 21]]) +@pytest.mark.parametrize("ifm_dim", [[10, 6]]) +# input channels +# @pytest.mark.parametrize("ifm_ch", [2, 4]) +@pytest.mark.parametrize("ifm_ch", [1, 10]) +# Stride +# @pytest.mark.parametrize("stride", [[1, 1]]) +@pytest.mark.parametrize("stride", [[1, 1], [2, 2]]) +# Dilation +# @pytest.mark.parametrize("dilation", [[1, 1]]) +@pytest.mark.parametrize("dilation", [[1, 1], [2, 2]]) +# input channel parallelism ("SIMD") +@pytest.mark.parametrize("simd", [1, 10]) +# depthwise +@pytest.mark.parametrize("dw", [0, 1]) +# parallel_window enable (MMV_out = M*K) +@pytest.mark.parametrize("parallel_window", [0, 1]) +# in/out MMV ("M") +@pytest.mark.parametrize("m", [1]) +# Flip dimensions +@pytest.mark.parametrize("flip", [False]) +@pytest.mark.fpgadataflow +@pytest.mark.slow +@pytest.mark.vivado +@pytest.mark.node_tree_modeling +def test_fpgadataflow_analytical_characterization_slidingwindow( + idt, + k, + ifm_dim, + ifm_ch, + stride, + dilation, + simd, + dw, + parallel_window, + m, + flip, +): + if flip: + if ( + ifm_dim[0] == ifm_dim[1] + and k[0] == k[1] + and stride[0] == stride[1] + and dilation[0] == dilation[1] + ): + pytest.skip("Dimension flip would have no effect") + k = k[::-1] + ifm_dim = ifm_dim[::-1] + stride = stride[::-1] + dilation = dilation[::-1] + + k_h, k_w = k + ifm_dim_h, ifm_dim_w = ifm_dim + stride_h, stride_w = stride + dilation_h, dilation_w = dilation + + kernel_width = (k_w - 1) * dilation_w + 1 # incl. dilation + kernel_height = (k_h - 1) * dilation_h + 1 # incl. 
dilation + + if simd > ifm_ch: + pytest.skip("SIMD cannot be larger than number of input channels") + if ifm_ch % simd != 0: + pytest.skip("SIMD must divide number of input channels") + if kernel_height > ifm_dim_h or stride_h > ifm_dim_h: + pytest.skip("Illegal convolution configuration: kernel or stride > FM dimension") + if kernel_width > ifm_dim_w or stride_w > ifm_dim_w: + pytest.skip("Illegal convolution configuration: kernel or stride > FM dimension") + if (k_h == 1 and dilation_h != 1) or (k_w == 1 and dilation_w != 1): + pytest.skip("Illegal convolution configuration: dilation for unitary kernel dim") + if ((stride_h > k_h) or (stride_w > k_w)) and not (parallel_window or (k_h == 1 and k_w == 1)): + pytest.skip("Not all combinations for stride > k edge case supported in default mode") + if parallel_window and simd != ifm_ch and not (dw or (k_h == 1 and k_w == 1)): + pytest.skip("Parallel window requires SIMD=C for non-depthwise case") + + ofm_dim_h = compute_conv_output_dim(ifm_dim_h, k_h, stride_h, 0, dilation_h) + ofm_dim_w = compute_conv_output_dim(ifm_dim_w, k_w, stride_w, 0, dilation_w) + ofm_dim = [ofm_dim_h, ofm_dim_w] + + model = make_single_im2col_modelwrapper(k, ifm_ch, ifm_dim, ofm_dim, stride, dilation, idt, dw) + + model = model.transform(to_hw.InferConvInpGen()) + model = model.transform(SpecializeLayers("xc7z020clg400-1")) + # set simd + inst = getCustomOp(model.graph.node[0]) + inst.set_nodeattr("SIMD", simd) + optype = model.graph.node[0].op_type + if optype == "ConvolutionInputGenerator_rtl": + inst.set_nodeattr("parallel_window", parallel_window) + inst.set_nodeattr("M", m) + + node_details = ( + "ConvolutionInputGenerator", + ifm_dim, + k, + stride, + dilation, + ifm_ch, + simd, + dw, + parallel_window, + idt, + ofm_dim, + "hls", + ) + part = "xc7z020clg400-1" + target_clk_ns = 4 + max_allowed_volume_delta = 5000 + max_allowed_length_delta = 5000 + + assert tree_model_test( + model, node_details, part, target_clk_ns, 
max_allowed_volume_delta, max_allowed_length_delta + ), "characterized TAV does not match RTLsim'd one!" + + +# input datatype +@pytest.mark.parametrize("idt", [DataType["INT2"]]) +# kernel size +# @pytest.mark.parametrize("k", [[2, 2], [3, 3], [1, 5]]) +@pytest.mark.parametrize("k", [[7, 7]]) +# input dimension +# @pytest.mark.parametrize("ifm_dim", [[8, 8], [1, 21]]) +@pytest.mark.parametrize("ifm_dim", [[7, 7]]) +# input channels +# @pytest.mark.parametrize("ifm_ch", [2, 4]) +@pytest.mark.parametrize("ifm_ch", [1024]) +# Stride +# @pytest.mark.parametrize("stride", [[1, 1]]) +@pytest.mark.parametrize("stride", [[1, 1]]) +# Dilation +# @pytest.mark.parametrize("dilation", [[1, 1]]) +@pytest.mark.parametrize("dilation", [[1, 1]]) +# input channel parallelism ("SIMD") +@pytest.mark.parametrize("simd", [1]) +# depthwise +@pytest.mark.parametrize("dw", [1]) +# parallel_window enable (MMV_out = M*K) +@pytest.mark.parametrize("parallel_window", [0]) +# in/out MMV ("M") +@pytest.mark.parametrize("m", [1]) +# Flip dimensions +@pytest.mark.parametrize("flip", [False]) +@pytest.mark.fpgadataflow +@pytest.mark.slow +@pytest.mark.vivado +@pytest.mark.node_tree_modeling +def test_fpgadataflow_analytical_characterization_slidingwindow_mobilenet( + idt, + k, + ifm_dim, + ifm_ch, + stride, + dilation, + simd, + dw, + parallel_window, + m, + flip, +): + if flip: + if ( + ifm_dim[0] == ifm_dim[1] + and k[0] == k[1] + and stride[0] == stride[1] + and dilation[0] == dilation[1] + ): + pytest.skip("Dimension flip would have no effect") + k = k[::-1] + ifm_dim = ifm_dim[::-1] + stride = stride[::-1] + dilation = dilation[::-1] + + k_h, k_w = k + ifm_dim_h, ifm_dim_w = ifm_dim + stride_h, stride_w = stride + dilation_h, dilation_w = dilation + + kernel_width = (k_w - 1) * dilation_w + 1 # incl. dilation + kernel_height = (k_h - 1) * dilation_h + 1 # incl. 
dilation + + if simd > ifm_ch: + pytest.skip("SIMD cannot be larger than number of input channels") + if ifm_ch % simd != 0: + pytest.skip("SIMD must divide number of input channels") + if kernel_height > ifm_dim_h or stride_h > ifm_dim_h: + pytest.skip("Illegal convolution configuration: kernel or stride > FM dimension") + if kernel_width > ifm_dim_w or stride_w > ifm_dim_w: + pytest.skip("Illegal convolution configuration: kernel or stride > FM dimension") + if (k_h == 1 and dilation_h != 1) or (k_w == 1 and dilation_w != 1): + pytest.skip("Illegal convolution configuration: dilation for unitary kernel dim") + if ((stride_h > k_h) or (stride_w > k_w)) and not (parallel_window or (k_h == 1 and k_w == 1)): + pytest.skip("Not all combinations for stride > k edge case supported in default mode") + if parallel_window and simd != ifm_ch and not (dw or (k_h == 1 and k_w == 1)): + pytest.skip("Parallel window requires SIMD=C for non-depthwise case") + + ofm_dim_h = compute_conv_output_dim(ifm_dim_h, k_h, stride_h, 0, dilation_h) + ofm_dim_w = compute_conv_output_dim(ifm_dim_w, k_w, stride_w, 0, dilation_w) + ofm_dim = [ofm_dim_h, ofm_dim_w] + + model = make_single_im2col_modelwrapper(k, ifm_ch, ifm_dim, ofm_dim, stride, dilation, idt, dw) + + model = model.transform(to_hw.InferConvInpGen()) + model = model.transform(SpecializeLayers("xc7z020clg400-1")) + # set simd + inst = getCustomOp(model.graph.node[0]) + inst.set_nodeattr("SIMD", simd) + optype = model.graph.node[0].op_type + if optype == "ConvolutionInputGenerator_rtl": + inst.set_nodeattr("parallel_window", parallel_window) + inst.set_nodeattr("M", m) + + node_details = ( + "ConvolutionInputGenerator", + ifm_dim, + k, + stride, + dilation, + ifm_ch, + simd, + dw, + parallel_window, + idt, + ofm_dim, + "hls", + ) + part = "xc7z020clg400-1" + target_clk_ns = 4 + max_allowed_volume_delta = 2140 # should change to 20% of peak volume + max_allowed_length_delta = 2140 # should change to 20% of peak volume + + assert 
tree_model_test( + model, node_details, part, target_clk_ns, max_allowed_volume_delta, max_allowed_length_delta + ), "characterized TAV does not match RTLsim'd one!" diff --git a/tests/fpgadataflow/test_fpgadataflow_downsampler.py b/tests/fpgadataflow/test_fpgadataflow_downsampler.py index ce04af74ed..858271e189 100644 --- a/tests/fpgadataflow/test_fpgadataflow_downsampler.py +++ b/tests/fpgadataflow/test_fpgadataflow_downsampler.py @@ -30,6 +30,7 @@ import numpy as np import onnx.parser as oprs +from onnx import TensorProto, helper from qonnx.core.datatype import DataType from qonnx.core.modelwrapper import ModelWrapper from qonnx.custom_op.general.im2col import compute_conv_output_dim @@ -37,7 +38,7 @@ from qonnx.transformation.general import GiveUniqueNodeNames from qonnx.transformation.infer_shapes import InferShapes from qonnx.transformation.lower_convs_to_matmul import LowerConvsToMatMul -from qonnx.util.basic import gen_finn_dt_tensor +from qonnx.util.basic import gen_finn_dt_tensor, qonnx_make_model import finn.transformation.fpgadataflow.convert_to_hw_layers as to_hw from finn.analysis.fpgadataflow.exp_cycles_per_layer import exp_cycles_per_layer @@ -49,6 +50,7 @@ from finn.transformation.fpgadataflow.prepare_rtlsim import PrepareRTLSim from finn.transformation.fpgadataflow.set_exec_mode import SetExecMode from finn.transformation.fpgadataflow.specialize_layers import SpecializeLayers +from finn.util.test import tree_model_test def build_model(is_1d, in_dim, k, stride, dt_in, dt_w, pad_half=0, flip_1d=False): @@ -160,3 +162,53 @@ def test_fpgadataflow_downsampler(is_1d, flip_1d, exec_mode): exp_cycles = exp_cycles - in_dim assert np.isclose(exp_cycles, cycles_rtlsim, atol=10, rtol=1.1) assert exp_cycles != 0 + + +@pytest.mark.parametrize("is_1d", [True, False]) +@pytest.mark.parametrize("flip_1d", [True, False]) +@pytest.mark.slow +@pytest.mark.vivado +@pytest.mark.fpgadataflow +@pytest.mark.node_tree_modeling +def 
test_fpgadataflow_analytical_characterization_downsampler(is_1d, flip_1d):
+    if flip_1d and not is_1d:
+        pytest.skip("flip_1d only applicable for is_1d")
+    in_dim = 32
+    k = 1
+    stride = 2
+    dt_in = DataType["UINT8"]
+    dt_w = DataType["INT2"]
+    model = build_model(is_1d, in_dim, k, stride, dt_in, dt_w, pad_half=0, flip_1d=flip_1d)
+
+    model = model.transform(to_hw.InferConvInpGen())
+
+    # locate the generated ConvolutionInputGenerator node
+    for n in model.graph.node:
+        if n.op_type.startswith("ConvolutionInputGenerator"):
+            inst = getCustomOp(n)
+
+    ishape = inst.get_normal_input_shape()
+    oshape = inst.get_normal_output_shape()
+
+    inp = helper.make_tensor_value_info("inp", TensorProto.FLOAT, ishape)
+    outp = helper.make_tensor_value_info("outp", TensorProto.FLOAT, oshape)
+
+    graph = helper.make_graph(nodes=[n], name="mp_graph", inputs=[inp], outputs=[outp])
+    model = qonnx_make_model(graph, producer_name="mp-model")
+    model = ModelWrapper(model)
+    model.set_tensor_datatype("inp", dt_in)
+    model.set_tensor_datatype("outp", dt_in)
+    model = model.transform(InferShapes())
+
+    node_details = ("Downsampler", is_1d, flip_1d, in_dim, k, stride)
+    part = "xc7z020clg400-1"
+    target_clk_ns = 4
+
+    model = model.transform(SpecializeLayers(part))
+
+    max_allowed_volume_delta = 30
+    max_allowed_length_delta = 30
+
+    assert tree_model_test(
+        model, node_details, part, target_clk_ns, max_allowed_volume_delta, max_allowed_length_delta
+    ), "characterized TAV does not match RTLsim'd one!"
diff --git a/tests/fpgadataflow/test_fpgadataflow_dwc.py b/tests/fpgadataflow/test_fpgadataflow_dwc.py index 6b79a39ed5..a7cf0972a0 100644 --- a/tests/fpgadataflow/test_fpgadataflow_dwc.py +++ b/tests/fpgadataflow/test_fpgadataflow_dwc.py @@ -45,6 +45,7 @@ from finn.transformation.fpgadataflow.prepare_rtlsim import PrepareRTLSim from finn.transformation.fpgadataflow.set_exec_mode import SetExecMode from finn.transformation.fpgadataflow.specialize_layers import SpecializeLayers +from finn.util.test import tree_model_test def make_single_dwc_modelwrapper(shape, inWidth, outWidth, finn_dtype, impl_style): @@ -172,3 +173,40 @@ def test_fpgadataflow_dwc_stitched_rtlsim(config, impl_style): ).all(), """The output values are not the same as the input values anymore.""" assert y.shape == tuple(shape), """The output shape is incorrect.""" + + +@pytest.mark.parametrize( + "config", + [ + ([1, 24], 8, 4, DataType["INT2"]), + ([1, 4], 2, 4, DataType["BIPOLAR"]), + ([1, 4], 4, 2, DataType["INT2"]), + ([1, 2, 8], 4, 4, DataType["INT2"]), + ([1, 2, 8], 8, 16, DataType["INT2"]), + ], +) +@pytest.mark.parametrize("impl_style", ["hls", "rtl"]) +@pytest.mark.fpgadataflow +@pytest.mark.slow +@pytest.mark.vivado +@pytest.mark.node_tree_modeling +def test_fpgadataflow_analytical_characterization_dwc(config, impl_style): + shape, inWidth, outWidth, finn_dtype = config + + part = "xc7z020clg400-1" + model = make_single_dwc_modelwrapper(shape, inWidth, outWidth, finn_dtype, impl_style) + model = model.transform(SpecializeLayers(part)) + # model = model.transform(InferShapes()) + # model = model.transform(SetExecMode(mode)) + + node_details = ("DWC", config, impl_style) + # part = "xc7z020clg400-1" + + target_clk_ns = 4 + + max_allowed_volume_delta = 5 + max_allowed_length_delta = 20 + + assert tree_model_test( + model, node_details, part, target_clk_ns, max_allowed_volume_delta, max_allowed_length_delta + ), "characterized TAV does not match RTLsim'd one!" 
diff --git a/tests/fpgadataflow/test_fpgadataflow_fmpadding.py b/tests/fpgadataflow/test_fpgadataflow_fmpadding.py index 1e9474677f..b76d6c5c99 100644 --- a/tests/fpgadataflow/test_fpgadataflow_fmpadding.py +++ b/tests/fpgadataflow/test_fpgadataflow_fmpadding.py @@ -48,6 +48,7 @@ from finn.transformation.fpgadataflow.set_exec_mode import SetExecMode from finn.transformation.fpgadataflow.specialize_layers import SpecializeLayers from finn.util.basic import pynq_part_map +from finn.util.test import tree_model_test test_pynq_board = "Pynq-Z1" test_fpga_part = pynq_part_map[test_pynq_board] @@ -158,3 +159,43 @@ def test_fpgadataflow_fmpadding(idim, pad, num_ch, simd, idt, mode): exp_cycles = exp_cycles_dict[node.name] assert np.isclose(exp_cycles, cycles_rtlsim, atol=10) assert exp_cycles != 0 + + +# input image dimension +@pytest.mark.parametrize("idim", [[10, 8]]) +# number of rows and number of cols to add +@pytest.mark.parametrize("pad", [[1, 1, 1, 1], [1, 1, 2, 2], [7, 0, 8, 0]]) +# number of channels +@pytest.mark.parametrize("num_ch", [2, 4]) +# Input parallelism +@pytest.mark.parametrize("simd", [1, 2]) +# FINN input datatype +@pytest.mark.parametrize("idt", [DataType["INT2"]]) +# execution mode +@pytest.mark.parametrize("mode", ["rtlsim"]) +# implementation style +@pytest.mark.parametrize("impl_style", ["rtl", "hls"]) +@pytest.mark.fpgadataflow +@pytest.mark.slow +@pytest.mark.vivado +@pytest.mark.node_tree_modeling +def test_fpgadataflow_analytical_characterization_fmpadding( + idim, pad, num_ch, simd, idt, mode, impl_style +): + if num_ch % simd != 0: + pytest.skip(" num_ch % simd != 0, skipping") + + model = make_single_fmpadding_modelwrapper(idim, pad, num_ch, simd, idt) + model = model.transform(InferShapes()) + model = model.transform(SetExecMode(mode)) + + node_details = ("FMPadding", idim, pad, num_ch, simd, idt, mode, impl_style) + part = "xc7z020clg400-1" + target_clk_ns = 4 + + max_allowed_volume_delta = 2 + max_allowed_length_delta = 2 + + assert 
tree_model_test( + model, node_details, part, target_clk_ns, max_allowed_volume_delta, max_allowed_length_delta + ), "characterized TAV does not match RTLsim'd one!" diff --git a/tests/fpgadataflow/test_fpgadataflow_labelselect.py b/tests/fpgadataflow/test_fpgadataflow_labelselect.py index 83ab2ddcaf..a55698bed8 100644 --- a/tests/fpgadataflow/test_fpgadataflow_labelselect.py +++ b/tests/fpgadataflow/test_fpgadataflow_labelselect.py @@ -44,7 +44,7 @@ from finn.transformation.fpgadataflow.prepare_rtlsim import PrepareRTLSim from finn.transformation.fpgadataflow.set_exec_mode import SetExecMode from finn.transformation.fpgadataflow.specialize_layers import SpecializeLayers -from finn.util.test import soft_verify_topk +from finn.util.test import soft_verify_topk, tree_model_test def make_labelselect_modelwrapper(labels, pe, k, idt, impl_style): @@ -136,3 +136,40 @@ def test_fpgadataflow_labelselect(idt, labels, fold, k, exec_mode, impl_style): y = oxe.execute_onnx(model, input_dict)["outp"] assert soft_verify_topk(x, y, k), exec_mode + " failed" + + +# which port to test +@pytest.mark.parametrize("idt", [DataType["UINT8"]]) +# labels +@pytest.mark.parametrize("labels", [10, 100]) +# folding +@pytest.mark.parametrize("fold", [1, 10]) +# number of top labels to select +@pytest.mark.parametrize("k", [1, 5]) +# impl style +@pytest.mark.parametrize("impl_style", ["hls"]) +@pytest.mark.fpgadataflow +@pytest.mark.vivado +@pytest.mark.slow +@pytest.mark.node_tree_modeling +def test_fpgadataflow_analytical_characterization_labelselect(idt, labels, fold, k, impl_style): + np.random.seed(0) + if fold == -1: + pe = 1 + else: + pe = labels // fold + assert labels % pe == 0 + + if k == -1: + k = labels + + model = make_labelselect_modelwrapper(labels, pe, k, idt, impl_style) + node_details = ("LabelSelect", idt, labels, fold, k, impl_style) + part = "xc7z020clg400-1" + target_clk_ns = 4 + max_allowed_volume_delta = 10 + max_allowed_length_delta = 398 # RTLSIM is inconsistent + + 
assert tree_model_test( + model, node_details, part, target_clk_ns, max_allowed_volume_delta, max_allowed_length_delta + ), "characterized TAV does not match RTLsim'd one!" diff --git a/tests/fpgadataflow/test_fpgadataflow_mvau.py b/tests/fpgadataflow/test_fpgadataflow_mvau.py index d079578e72..8144cbde99 100644 --- a/tests/fpgadataflow/test_fpgadataflow_mvau.py +++ b/tests/fpgadataflow/test_fpgadataflow_mvau.py @@ -51,7 +51,6 @@ from finn.core.rtlsim_exec import rtlsim_exec from finn.transformation.fpgadataflow.compile_cppsim import CompileCppSim from finn.transformation.fpgadataflow.create_stitched_ip import CreateStitchedIP -from finn.transformation.fpgadataflow.derive_characteristic import DeriveCharacteristic from finn.transformation.fpgadataflow.hlssynth_ip import HLSSynthIP from finn.transformation.fpgadataflow.minimize_accumulator_width import ( MinimizeAccumulatorWidth, @@ -67,6 +66,7 @@ from finn.transformation.fpgadataflow.specialize_layers import SpecializeLayers from finn.transformation.general import ApplyConfig from finn.util.basic import is_versal +from finn.util.test import tree_model_test def make_single_fclayer_modelwrapper(W, pe, simd, wdt, idt, odt, T=None, tdt=None): @@ -661,84 +661,6 @@ def read_weights(sim): ).all(), "Output of ONNX model not matching output of stitched-IP RTL model!" 
-# mem_mode: internal_embedded or internal_decoupled -@pytest.mark.parametrize("mem_mode", ["internal_decoupled", "internal_embedded"]) -# activation: None or DataType -@pytest.mark.parametrize("act", [None, DataType["INT4"]]) -# weight datatype -@pytest.mark.parametrize("wdt", [DataType["INT4"]]) -# input datatype -@pytest.mark.parametrize("idt", [DataType["INT4"]]) -# neuron folding, -1 is maximum possible -@pytest.mark.parametrize("nf", [8]) -# synapse folding, -1 is maximum possible -@pytest.mark.parametrize("sf", [8]) -# HLS matrix width (input features) -@pytest.mark.parametrize("mw", [32]) -# HLS matrix height (output features) -@pytest.mark.parametrize("mh", [32]) -# Backend -@pytest.mark.parametrize("preferred_impl_style", ["hls", "rtl"]) -@pytest.mark.fpgadataflow -@pytest.mark.vivado -def test_mvau_fifocharacterize_rtlsim( - mem_mode, idt, wdt, act, nf, sf, mw, mh, preferred_impl_style -): - if preferred_impl_style == "rtl" and (mem_mode == "internal_embedded" or act is not None): - pytest.skip("RTL-MVAU doesn't support const mem mode or embedded activations") - if nf == -1: - nf = mh - if sf == -1: - sf = mw - pe = mh // nf - simd = mw // sf - assert mh % pe == 0 - assert mw % sf == 0 - # generate weights - W = gen_finn_dt_tensor(wdt, (mw, mh)) - - # no activation, produce accumulators - T = None - tdt = None - if wdt == DataType["BIPOLAR"] and idt == DataType["BIPOLAR"]: - odt = DataType["UINT32"] - else: - odt = DataType["INT32"] - - model = make_single_fclayer_modelwrapper(W, pe, simd, wdt, idt, odt, T, tdt) - for node in model.graph.node: - # lookup op_type in registry of CustomOps - inst = getCustomOp(node) - inst.set_nodeattr("mem_mode", mem_mode) - inst.set_nodeattr("resType", "auto") - inst.set_nodeattr("preferred_impl_style", preferred_impl_style) - total_fold = nf * sf - exp_total_cycles = int(np.ceil(total_fold * 1.2)) - model = model.transform(SpecializeLayers("xczu7ev-ffvc1156-2-e")) - model = model.transform(MinimizeWeightBitWidth()) - 
model = model.transform(MinimizeAccumulatorWidth()) - model = model.transform(SetExecMode("rtlsim")) - model = model.transform(GiveUniqueNodeNames()) - model = model.transform(PrepareIP("xczu7ev-ffvc1156-2-e", 5)) - model = model.transform(HLSSynthIP()) - model = model.transform(PrepareRTLSim()) - model = model.transform(DeriveCharacteristic(exp_total_cycles)) - node_inst = getCustomOp(model.graph.node[0]) - period_attr = node_inst.get_nodeattr("io_chrc_period") - assert period_attr == exp_total_cycles - chrc_in = node_inst.get_nodeattr("io_chrc_in") - chrc_out = node_inst.get_nodeattr("io_chrc_out") - if mem_mode == "internal_decoupled": - assert chrc_in.shape == (2, 2 * exp_total_cycles) - else: - assert chrc_in.shape == (1, 2 * exp_total_cycles) - assert chrc_out.shape == (1, 2 * exp_total_cycles) - # total number of transactions == 2*SF - assert chrc_in[0, -1] == 2 * sf - # all outputs should be produced within the exp n of cycles - assert chrc_out[0, exp_total_cycles] == nf - - @pytest.mark.parametrize("mh", [18]) @pytest.mark.parametrize("mw", [32]) @pytest.mark.parametrize("pe", [1, 9, 18]) @@ -963,3 +885,69 @@ def test_fpgadataflow_rtl_dynamic_mvau(mh, mw, n_vectors, pe, simd, idt_wdt, par assert ( output_matmul == output_mvau_rtl_stitch ).all(), "Output of ONNX model not matching output of stitched-IP RTL model!" 
+ + +# mem_mode: internal_embedded or internal_decoupled +@pytest.mark.parametrize("mem_mode", ["internal_decoupled", "internal_embedded"]) +# activation: None or DataType +@pytest.mark.parametrize("act", [None]) +# weight datatype +@pytest.mark.parametrize("wdt", [DataType["INT4"]]) +# input datatype +@pytest.mark.parametrize("idt", [DataType["INT4"]]) +# neuron folding, -1 is maximum possible +@pytest.mark.parametrize("nf", [-1, 2, 8]) +# synapse folding, -1 is maximum possible +@pytest.mark.parametrize("sf", [-1, 2, 4]) +# HLS matrix width (input features) +@pytest.mark.parametrize("mw", [32]) +# HLS matrix height (output features) +@pytest.mark.parametrize("mh", [32]) +# Backend +@pytest.mark.parametrize("preferred_impl_style", ["hls", "rtl"]) +@pytest.mark.fpgadataflow +@pytest.mark.vivado +@pytest.mark.node_tree_modeling +def test_fpgadataflow_analytical_characterization_mvau( + mem_mode, idt, wdt, act, nf, sf, mw, mh, preferred_impl_style +): + if preferred_impl_style == "rtl" and (mem_mode == "internal_embedded" or act is not None): + pytest.skip("RTL-MVAU doesn't support const mem mode or embedded activations") + if nf == -1: + nf = mh + if sf == -1: + sf = mw + pe = mh // nf + simd = mw // sf + + assert mh % pe == 0 + assert mw % sf == 0 + # generate weights + W = gen_finn_dt_tensor(wdt, (mw, mh)) + + # no activation, produce accumulators + T = None + tdt = None + if wdt == DataType["BIPOLAR"] and idt == DataType["BIPOLAR"]: + odt = DataType["UINT32"] + else: + odt = DataType["INT32"] + + model = make_single_fclayer_modelwrapper(W, pe, simd, wdt, idt, odt, T, tdt) + for node in model.graph.node: + # lookup op_type in registry of CustomOps + inst = getCustomOp(node) + inst.set_nodeattr("mem_mode", mem_mode) + inst.set_nodeattr("numInputVectors", [16]) + inst.set_nodeattr("resType", "auto") + inst.set_nodeattr("preferred_impl_style", preferred_impl_style) + + node_details = ("MVAU", mem_mode, idt, wdt, act, nf, sf, mw, mh, preferred_impl_style) + part = 
"xc7z020clg400-1" + target_clk_ns = 4 + max_allowed_volume_delta = 20 + max_allowed_length_delta = 26 + + assert tree_model_test( + model, node_details, part, target_clk_ns, max_allowed_volume_delta, max_allowed_length_delta + ), "characterized TAV does not match RTLsim'd one!" diff --git a/tests/fpgadataflow/test_fpgadataflow_thresholding.py b/tests/fpgadataflow/test_fpgadataflow_thresholding.py index d90a080bf2..8152c4139e 100644 --- a/tests/fpgadataflow/test_fpgadataflow_thresholding.py +++ b/tests/fpgadataflow/test_fpgadataflow_thresholding.py @@ -52,6 +52,7 @@ from finn.transformation.fpgadataflow.set_fifo_depths import InsertAndSetFIFODepths from finn.transformation.fpgadataflow.specialize_layers import SpecializeLayers from finn.transformation.streamline.round_thresholds import RoundAndClipThresholds +from finn.util.test import tree_model_test test_fpga_part = "xczu3eg-sbva484-1-e" target_clk_ns = 5 @@ -397,3 +398,141 @@ def test_fpgadataflow_thresholding_stitched_ip( assert ( y_expected == y_produced ).all(), "Output of ONNX model not matching output of stitched-IP RTL model!" 
+
+
+@pytest.mark.parametrize("num_input_channels", [6, 16])
+@pytest.mark.parametrize(
+    "num_input_vecs",
+    [
+        [1],
+        [1, 2, 2],
+    ],
+)
+@pytest.mark.parametrize("activation", [DataType["BIPOLAR"]])
+@pytest.mark.parametrize(
+    "idt_tdt_cfg",
+    [
+        (DataType["INT8"], DataType["INT8"]),
+    ],
+)
+@pytest.mark.parametrize("fold", [-1, 1, 2])
+@pytest.mark.parametrize("narrow", [True, False])
+@pytest.mark.parametrize("per_tensor", [True, False])
+@pytest.mark.parametrize("impl_style", ["rtl"])
+@pytest.mark.parametrize("mem_mode", ["internal_embedded", "internal_decoupled"])
+@pytest.mark.fpgadataflow
+@pytest.mark.vivado
+@pytest.mark.slow
+@pytest.mark.node_tree_modeling
+def test_fpgadataflow_analytical_characterization_thresholding(
+    num_input_channels,
+    num_input_vecs,
+    activation,
+    idt_tdt_cfg,
+    fold,
+    narrow,
+    per_tensor,
+    impl_style,
+    mem_mode,
+):
+    # the mem_mode parameter can only be used for the hls thresholding
+    # so the test will only be executed once for impl_style=rtl and once skipped
+    # when the mem_mode is varied. Otherwise, the same test configuration would always
+    # run twice.
+    if impl_style == "rtl" and mem_mode == "internal_decoupled":
+        pytest.skip(
+            "Skip, because test is identical to impl_style=rtl and mem_mode=internal_embedded"
+        )
+    if narrow and activation == DataType["BIPOLAR"]:
+        pytest.skip("Narrow needs to be false with bipolar activation.")
+    input_data_type, threshold_data_type = idt_tdt_cfg
+    num_steps = activation.get_num_possible_values() - 1
+
+    if fold == -1:
+        fold = num_input_channels
+    pe = num_input_channels // fold
+    if num_input_channels % pe != 0:
+        pytest.skip("Invalid folding configuration. 
Skipping test.") + + output_data_type = activation + if activation == DataType["BIPOLAR"]: + activation_bias = 0 + else: + activation_bias = activation.min() + if narrow and activation.signed(): + activation_bias += 1 + + # Generate random thresholds and sort in ascending order + thresholds = generate_random_threshold_values( + threshold_data_type, num_input_channels, num_steps, narrow, per_tensor + ) + + # provide non-decreasing/ascending thresholds + thresholds = sort_thresholds_increasing(thresholds) + + # Make a Multithreshold graph and convert to thresholding binary search node + model = make_single_multithresholding_modelwrapper( + thresholds, + input_data_type, + threshold_data_type, + output_data_type, + activation_bias, + num_input_vecs, + num_input_channels, + ) + + # calculate reference output + x = gen_finn_dt_tensor(input_data_type, tuple(num_input_vecs + [num_input_channels])) + + input_dict = {model.graph.input[0].name: x} + y_expected = oxe.execute_onnx(model, input_dict)[model.graph.output[0].name] + + if output_data_type == DataType["BIPOLAR"]: + # binary to bipolar + y_expected = 2 * y_expected - 1 + + model = model.transform(InferThresholdingLayer()) + + # Transform to the specified implementation style, either the + # RTL or HLS according to test parameters + node = model.get_nodes_by_op_type(model.graph.node[0].op_type)[0] + inst = getCustomOp(node) + inst.set_nodeattr("preferred_impl_style", impl_style) + model = model.transform(SpecializeLayers(test_fpga_part)) + model = model.transform(InferShapes()) + assert model.graph.node[0].op_type == "Thresholding_" + str(impl_style) + + node = model.get_nodes_by_op_type(model.graph.node[0].op_type)[0] + inst = getCustomOp(node) + inst.set_nodeattr("PE", pe) + + if impl_style == "hls": + inst.set_nodeattr("mem_mode", mem_mode) + + node_details = ( + "Thr", + input_data_type, + threshold_data_type, + output_data_type, + activation_bias, + num_input_vecs, + num_input_channels, + pe, + narrow, + 
per_tensor, + activation, + mem_mode, + impl_style, + ) + + max_allowed_volume_delta = 8 + max_allowed_length_delta = 6 + + assert tree_model_test( + model, + node_details, + test_fpga_part, + target_clk_ns, + max_allowed_volume_delta, + max_allowed_length_delta, + ), "characterized TAV does not match RTLsim'd one!" diff --git a/tests/fpgadataflow/test_fpgadataflow_vvau.py b/tests/fpgadataflow/test_fpgadataflow_vvau.py index 80b64d5e4a..cd8e572b79 100644 --- a/tests/fpgadataflow/test_fpgadataflow_vvau.py +++ b/tests/fpgadataflow/test_fpgadataflow_vvau.py @@ -63,6 +63,7 @@ from finn.transformation.fpgadataflow.set_fifo_depths import InsertAndSetFIFODepths from finn.transformation.fpgadataflow.specialize_layers import SpecializeLayers from finn.transformation.general import ApplyConfig +from finn.util.test import tree_model_test def _infer_sparse_weight_tensor(W_conv, k_h, k_w, channels): @@ -479,3 +480,87 @@ def test_fpgadataflow_vvau_rtl(kernel_size, in_feature_dim, in_chn, idt, wdt, pa assert ( golden_out == output_vvau_stitched ).all(), "Output of ONNX model not matching output of stitched-IP RTL model!" 
+
+
+# input datatype
+@pytest.mark.parametrize("idt", [DataType["BIPOLAR"]])
+# weight datatype
+@pytest.mark.parametrize("wdt", [DataType["BIPOLAR"]])
+# activation: None or DataType
+@pytest.mark.parametrize("act", [DataType["BIPOLAR"], None])
+# PE
+@pytest.mark.parametrize("pe", [1, 3, 6])
+# SIMD
+@pytest.mark.parametrize("simd", [1, 9])
+# Input image shape
+@pytest.mark.parametrize("dim_h", [10])
+@pytest.mark.parametrize("dim_w", [10, 1])
+# Kernel shape
+@pytest.mark.parametrize("k_h", [3])
+@pytest.mark.parametrize("k_w", [3, 1])
+# Number of input and output channels
+@pytest.mark.parametrize("channels", [3])
+# memory mode
+@pytest.mark.parametrize("mem_mode", ["internal_decoupled", "internal_embedded"])
+@pytest.mark.fpgadataflow
+@pytest.mark.slow
+@pytest.mark.vivado
+@pytest.mark.node_tree_modeling
+def test_fpgadataflow_analytical_characterization_vvau(
+    idt, wdt, act, pe, simd, dim_h, dim_w, k_h, k_w, channels, mem_mode
+):
+    if dim_w == 1 and k_w != 1:
+        pytest.skip("1D image requires 1D kernel, skipping.")
+
+    if channels % pe != 0:
+        pytest.skip("Requirement Channels divisible by PE is violated.")
+
+    if (k_h * k_w) % simd != 0:
+        pytest.skip("Requirement kernel (k_h * k_w) divisible by SIMD is violated.")
+
+    # Generate weights in expected shape for ONNX and HLS node
+    W = gen_finn_dt_tensor(wdt, (channels, 1, k_h, k_w))  # shape: [channels, 1, k, k]
+
+    # Generate inputs in expected format for ONNX and HLS node
+    x = gen_finn_dt_tensor(idt, (1, dim_h, dim_w, k_h * k_w * channels))
+    x_vvau = x.reshape(1, dim_h, dim_w, k_h * k_w, channels // pe, pe)
+    x_vvau = x_vvau.transpose(0, 1, 2, 4, 3, 5)
+    x_vvau = x_vvau.reshape(1, dim_h, dim_w, channels * k_h * k_w)
+
+    if act is None:
+        T = None
+        tdt = None
+        if wdt == DataType["BIPOLAR"] and idt == DataType["BIPOLAR"]:
+            odt = DataType["UINT32"]
+        else:
+            odt = DataType["INT32"]
+    else:
+        odt = act
+        (min_v, max_v) = _calculate_dot_prod_range(idt, wdt, k_h * k_w)
+        n_steps = 
act.get_num_possible_values() - 1 + T = np.random.randint(min_v, max_v - 1, (channels, n_steps)).astype(np.float32) + T = np.sort(T, axis=1) + if wdt == DataType["BIPOLAR"] and idt == DataType["BIPOLAR"]: + tdt = DataType["UINT32"] + # bias thresholds to be positive + T = np.ceil((T + (k_h * k_w)) / 2) + assert (T >= 0).all() + else: + tdt = DataType["INT32"] + + model = _make_single_vvau_modelwrapper( + W, pe, simd, k_h, k_w, channels, dim_h, dim_w, wdt, idt, odt, T, tdt, mem_mode + ) + model = model.transform(GiveUniqueNodeNames()) + model = model.transform(GiveReadableTensorNames()) + + node_details = ("VVAU", idt, wdt, act, pe, simd, dim_h, dim_w, k_h, k_w, channels, mem_mode) + part = "xc7z020clg400-1" + target_clk_ns = 4 + + max_allowed_volume_delta = 14 + max_allowed_length_delta = 14 + + assert tree_model_test( + model, node_details, part, target_clk_ns, max_allowed_volume_delta, max_allowed_length_delta + ), "characterized TAV does not match RTLsim'd one!"