Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
82 changes: 82 additions & 0 deletions src/finn/analysis/fpgadataflow/dataflow_performance.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@

from qonnx.custom_op.registry import getCustomOp

from finn.util.basic import decompress_string_to_numpy
from finn.util.fpgadataflow import is_hls_node, is_rtl_node


Expand Down Expand Up @@ -76,3 +77,84 @@ def dataflow_performance(model):
"max_cycles": int(max_cycles),
"max_cycles_node_name": max_node_name,
}


def max_period(model):
    """Extract the maximum characteristic period among all nodes in the graph.

    Preconditions:
    - model consists of HLS/RTL nodes
    - nodes have unique names (see GiveUniqueNodeNames)
    - model has been characteristically derived and contains specific chr periods

    Returns a dictionary with:
    - max_cycles : longest characteristic period (in cycles) over all considered nodes
    """
    # NOTE(review): these op types are deliberately excluded from the search,
    # presumably because their characteristics are handled elsewhere — confirm.
    skipped_op_types = [
        "AddStreams_hls",
        "DuplicateStreams_hls",
        "StreamingFIFO_hls",
        "StreamingFIFO_rtl",
    ]

    max_cycles = 0
    for node in model.graph.node:
        if node is None or node.op_type in skipped_op_types:
            continue
        if not (is_hls_node(node) or is_rtl_node(node)):
            continue
        inst = getCustomOp(node)
        # The period is half the length of the stored (decompressed)
        # characteristic function, for both the input and output side.
        node_cycles_in = (
            len(decompress_string_to_numpy(inst.get_nodeattr("io_chrc_in"))[0]) // 2
        )
        node_cycles_out = (
            len(decompress_string_to_numpy(inst.get_nodeattr("io_chrc_out"))[0]) // 2
        )
        max_cycles = max(max_cycles, node_cycles_in, node_cycles_out)

    return {
        "max_cycles": int(max_cycles),
    }


def max_remaining_period(model, node):
    """Extract the maximum characteristic period among `node` and all nodes
    that follow it in the graph's node list.

    Preconditions:
    - model consists of HLS/RTL nodes
    - nodes have unique names (see GiveUniqueNodeNames)
    - model has been characteristically derived and contains specific chr periods

    Returns a dictionary with:
    - max_cycles : longest characteristic period (in cycles) over the remaining nodes
    """
    # NOTE(review): these op types are deliberately excluded from the search,
    # presumably because their characteristics are handled elsewhere — confirm.
    skipped_op_types = [
        "AddStreams_hls",
        "DuplicateStreams_hls",
        "StreamingFIFO_hls",
        "StreamingFIFO_rtl",
    ]

    max_cycles = 0
    start_index = list(model.graph.node).index(node)
    # Use a distinct loop variable so the `node` argument is not shadowed.
    for succ in model.graph.node[start_index:]:
        if succ is None or succ.op_type in skipped_op_types:
            continue
        if not (is_hls_node(succ) or is_rtl_node(succ)):
            continue
        inst = getCustomOp(succ)
        # Removed the dead read of `io_chrc_period`: its value was computed
        # and then immediately overwritten by the in/out maximum below.
        # The period is half the length of the stored (decompressed)
        # characteristic function, for both the input and output side.
        node_cycles_in = (
            len(decompress_string_to_numpy(inst.get_nodeattr("io_chrc_in"))[0]) // 2
        )
        node_cycles_out = (
            len(decompress_string_to_numpy(inst.get_nodeattr("io_chrc_out"))[0]) // 2
        )
        max_cycles = max(max_cycles, node_cycles_in, node_cycles_out)

    return {
        "max_cycles": int(max_cycles),
    }
48 changes: 46 additions & 2 deletions src/finn/builder/build_dataflow_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,11 +40,30 @@

class AutoFIFOSizingMethod(str, Enum):
    """Select the type of automatic FIFO sizing strategy."""

    # characterization-based sizing
    CHARACTERIZE = "characterize"
    # analytical sizing based on derived characteristic functions
    ANALYTIC = "analytical"
    # empirical sizing using rtlsim with large FIFOs
    LARGEFIFO_RTLSIM = "largefifo_rtlsim"


class TAVGenerationMethod(str, Enum):
    """Select the strategy for constructing token access vectors of an operator."""

    # deduce the token access vectors empirically by RTL-simulating each node
    RTLSIM = "rtlsim"
    # use the operator's tree model when available, avoiding IP core generation
    TREE_MODEL = "tree_model"


class TAVUtilizationMethod(str, Enum):
    """Select the strategy for utilizing token access vectors of an operator
    for buffer sizing."""

    # relax using the worst-case ratio of data rates between consumer and producer
    CONSERVATIVE_RELAXATION = "conservative_relaxation"

    # relax using the average-case ratio of data rates between consumer and producer
    AGGRESSIVE_RELAXATION = "aggressive_relaxation"

    # apply no relaxation; use the token access vectors as-is
    NO_RELAXATION = "no_relaxation"


class ShellFlowType(str, Enum):
"""For builds that produce a bitfile, select the shell flow that will integrate
the FINN-generated accelerator."""
Expand Down Expand Up @@ -278,6 +297,31 @@ class DataflowBuildConfig:
#: setting the FIFO sizes.
auto_fifo_strategy: Optional[AutoFIFOSizingMethod] = AutoFIFOSizingMethod.LARGEFIFO_RTLSIM

#: Which strategy will be used for token access vector generation for FIFO sizing.
#: RTLSIM will result in performing RTLSIM for each node
#: to deduce the token access vectors empirically
    #: TREE_MODEL will use the tree model of an operator if available, avoiding the generation
#: of IP cores.
tav_generation_strategy: Optional[TAVGenerationMethod] = TAVGenerationMethod.RTLSIM

    #: Which strategy will be used for utilizing token access vectors during FIFO sizing.
    #: CONSERVATIVE_RELAXATION relaxes the vectors using the worst-case ratio of data
    #: rates between a consumer and producer, AGGRESSIVE_RELAXATION uses the
    #: average-case ratio, and NO_RELAXATION uses the token access vectors as-is.
tav_utilization_strategy: Optional[
TAVUtilizationMethod
] = TAVUtilizationMethod.CONSERVATIVE_RELAXATION

#: When True, skips the resynthesis steps after fifo sizing. This makes it
#: possible to run the step for rapid fifo size analysis during
#: automatic folding optimizations or as a first approximation.
skip_resynth_during_fifo_sizing: Optional[bool] = False

#: Avoid using C++ rtlsim for auto FIFO sizing and rtlsim throughput test
#: if set to True, always using Python instead
force_python_rtlsim: Optional[bool] = False
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Wasn't this option deprecated on the dev branch?


#: Memory resource type for large FIFOs
#: Only relevant when `auto_fifo_depths = True`
large_fifo_mem_style: Optional[LargeFIFOMemStyle] = LargeFIFOMemStyle.AUTO
Expand Down
105 changes: 90 additions & 15 deletions src/finn/builder/build_dataflow_steps.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,10 @@

import finn.transformation.fpgadataflow.convert_to_hw_layers as to_hw
import finn.transformation.streamline.absorb as absorb
from finn.analysis.fpgadataflow.dataflow_performance import dataflow_performance
from finn.analysis.fpgadataflow.dataflow_performance import (
dataflow_performance,
max_period,
)
from finn.analysis.fpgadataflow.exp_cycles_per_layer import exp_cycles_per_layer
from finn.analysis.fpgadataflow.hls_synth_res_estimation import hls_synth_res_estimation
from finn.analysis.fpgadataflow.op_and_param_counts import (
Expand All @@ -80,8 +83,13 @@
)
from finn.transformation.fpgadataflow.create_stitched_ip import CreateStitchedIP
from finn.transformation.fpgadataflow.derive_characteristic import (
DeriveCharacteristic,
DelayCharacteristicFunctions,
DeriveFIFOSizes,
DeriveTokenAccessVectors,
HandleBranches,
JustInTimeSynthesize,
LocalStretchCharacteristicFunctions,
ProducerDelayCharacteristicFunctions,
)
from finn.transformation.fpgadataflow.hlssynth_ip import HLSSynthIP
from finn.transformation.fpgadataflow.insert_dwc import InsertDWC
Expand All @@ -102,6 +110,7 @@
)
from finn.transformation.fpgadataflow.set_exec_mode import SetExecMode
from finn.transformation.fpgadataflow.set_fifo_depths import (
CapConvolutionFIFODepths,
InsertAndSetFIFODepths,
RemoveShallowFIFOs,
SplitLargeFIFOs,
Expand Down Expand Up @@ -573,29 +582,93 @@ def step_set_fifo_depths(model: ModelWrapper, cfg: DataflowBuildConfig):
"""

if cfg.auto_fifo_depths:
if cfg.auto_fifo_strategy == "characterize":
model = model.transform(InsertDWC())
model = model.transform(SpecializeLayers(cfg._resolve_fpga_part()))
model = model.transform(GiveUniqueNodeNames())
model = model.transform(InsertDWC())
model = model.transform(SpecializeLayers(cfg._resolve_fpga_part()))
model = model.transform(GiveUniqueNodeNames())
model = model.transform(AnnotateCycles())

if cfg.auto_fifo_strategy == "analytical":
if cfg.tav_generation_strategy == "tree_model":
# if we have tree models, only rtlsim nodes for which we dont
only_jit_nodes_without_tree = True
else:
# rtlsim everything by force if not using trees
only_jit_nodes_without_tree = False
model = model.transform(
PrepareIP(cfg._resolve_fpga_part(), cfg._resolve_hls_clk_period())
JustInTimeSynthesize(
cfg._resolve_fpga_part(),
cfg._resolve_hls_clk_period(),
only_jit_nodes_without_tree,
)
)
period = int(model.analysis(dataflow_performance)["max_cycles"])
model = model.transform(
DeriveTokenAccessVectors(
model,
period,
cfg.tav_generation_strategy,
cfg._resolve_fpga_part(),
cfg._resolve_hls_clk_period(),
)
)

period = int(model.analysis(dataflow_performance)["max_cycles"])
model = model.transform(
LocalStretchCharacteristicFunctions(
1,
period,
nodes_to_ignore=[],
)
)
model = model.transform(HLSSynthIP())
model = model.transform(PrepareRTLSim(behav=True))
model = model.transform(AnnotateCycles())
period = model.analysis(dataflow_performance)["max_cycles"] + 10
model = model.transform(DeriveCharacteristic(period))
model = model.transform(DeriveFIFOSizes())

period = int(model.analysis(dataflow_performance)["max_cycles"])

model = model.transform(HandleBranches(model, period))

period = int(model.analysis(dataflow_performance)["max_cycles"])
model = model.transform(
DelayCharacteristicFunctions(
1,
period,
nodes_to_ignore=[],
)
)

period = int(model.analysis(dataflow_performance)["max_cycles"])

model = model.transform(
ProducerDelayCharacteristicFunctions(
1,
period,
nodes_to_ignore=[],
)
)

period = int(model.analysis(max_period)["max_cycles"])

model = model.transform(
DeriveFIFOSizes(
period=period,
nodes_to_ignore=[],
global_offset_correction=True,
tav_utilization_strategy=cfg.tav_utilization_strategy,
)
)

model = model.transform(
InsertFIFO(
vivado_ram_style=cfg.large_fifo_mem_style,
max_qsrl_depth=256,
create_shallow_fifos=True,
)
)

model = model.transform(SpecializeLayers(cfg._resolve_fpga_part()))
model = model.transform(GiveUniqueNodeNames())
model = model.transform(GiveReadableTensorNames())
if cfg.default_swg_exception:
model = model.transform(CapConvolutionFIFODepths(max_qsrl_depth=256))

elif cfg.auto_fifo_strategy == "largefifo_rtlsim":
if cfg.fifosim_save_waveform:
report_dir = cfg.output_dir + "/report"
Expand Down Expand Up @@ -665,8 +738,10 @@ def step_set_fifo_depths(model: ModelWrapper, cfg: DataflowBuildConfig):

# after FIFOs are ready to go, call PrepareIP and HLSSynthIP again
# this will only run for the new nodes (e.g. FIFOs and DWCs)
model = model.transform(PrepareIP(cfg._resolve_fpga_part(), cfg._resolve_hls_clk_period()))
model = model.transform(HLSSynthIP())
if not cfg.skip_resynth_during_fifo_sizing:
model = model.transform(PrepareIP(cfg._resolve_fpga_part(), cfg._resolve_hls_clk_period()))
model = model.transform(HLSSynthIP())

return model


Expand Down
18 changes: 16 additions & 2 deletions src/finn/custom_op/fpgadataflow/addstreams.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@
from qonnx.core.datatype import DataType

from finn.custom_op.fpgadataflow.hwcustomop import HWCustomOp
from finn.util.basic import Characteristic_Node


class AddStreams(HWCustomOp):
Expand Down Expand Up @@ -149,7 +150,17 @@ def execute_node(self, context, graph):
result = inp0_values + inp1_values
context[node.output[0]] = np.asarray(result, dtype=np.float32).reshape(oshape)

def derive_characteristic_fxns(self, period):
def prepare_tree_model(self):
dim = np.prod(self.get_folded_output_shape()[1:-1])

read_write = Characteristic_Node("passing addstreams layer", [(dim, [1, 1])], True)
addstreams_top = Characteristic_Node("compute addstreams", [(1, read_write)], False)

return addstreams_top # top level phase of this node

def derive_token_access_vectors(
self, model, period, strategy, fpga_part, clk_period, op_type, override_dict=None
):
n_inps = np.prod(self.get_folded_input_shape()[:-1])
io_dict = {
"inputs": {
Expand All @@ -158,4 +169,7 @@ def derive_characteristic_fxns(self, period):
},
"outputs": {"out0": []},
}
super().derive_characteristic_fxns(period, override_rtlsim_dict=io_dict)

super().derive_token_access_vectors(
model, period, strategy, fpga_part, clk_period, op_type, override_dict=io_dict
)
11 changes: 11 additions & 0 deletions src/finn/custom_op/fpgadataflow/channelwise_op.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@
from qonnx.util.basic import qonnx_make_model

from finn.custom_op.fpgadataflow.hwcustomop import HWCustomOp
from finn.util.basic import Characteristic_Node

# ONNX i/o tensor shape assumptions for channelwise ops:
# input 0 is the input tensor, shape (..., NumChannels)
Expand Down Expand Up @@ -240,3 +241,13 @@ def execute_node(self, context, graph):
sess = rt.InferenceSession(model_func.SerializeToString())
result = sess.run(None, idict)
context[node.output[0]] = np.asarray(result, dtype=np.float32).reshape(oshape)

def get_tree_model(self):
# key parameters

dim = np.prod(self.get_folded_output_shape()[1:-1])

pass_channelwise = Characteristic_Node("passing channelwise layer", [(dim, [1, 1])], True)
channelwise_top = Characteristic_Node("compute pool", [(1, pass_channelwise)], False)

return channelwise_top # top level phase of this node
Loading