Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
82 changes: 82 additions & 0 deletions src/finn/analysis/fpgadataflow/dataflow_performance.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@

from qonnx.custom_op.registry import getCustomOp

from finn.util.basic import decompress_string_to_numpy
from finn.util.fpgadataflow import is_hls_node, is_rtl_node


Expand Down Expand Up @@ -76,3 +77,84 @@ def dataflow_performance(model):
"max_cycles": int(max_cycles),
"max_cycles_node_name": max_node_name,
}


def max_period(model):
    """Extract the maximum characteristic period among all nodes in the graph.

    Preconditions:
    - model consists of HLS/RTL nodes
    - nodes have unique names (see GiveUniqueNodeNames)
    - model has been characteristically derived and contains specific chr periods

    Returns a dictionary with:
    - max_cycles : longest characteristic period (in cycles) over all considered nodes
    """
    # NOTE(review): these op types are deliberately excluded from the search,
    # presumably because their characteristics are handled elsewhere — confirm.
    skipped_op_types = [
        "AddStreams_hls",
        "DuplicateStreams_hls",
        "StreamingFIFO_hls",
        "StreamingFIFO_rtl",
    ]

    max_cycles = 0
    for node in model.graph.node:
        if node is None or node.op_type in skipped_op_types:
            continue
        if not (is_hls_node(node) or is_rtl_node(node)):
            continue
        inst = getCustomOp(node)
        # The period is half the length of the stored (decompressed)
        # characteristic function, for both the input and output side.
        node_cycles_in = (
            len(decompress_string_to_numpy(inst.get_nodeattr("io_chrc_in"))[0]) // 2
        )
        node_cycles_out = (
            len(decompress_string_to_numpy(inst.get_nodeattr("io_chrc_out"))[0]) // 2
        )
        max_cycles = max(max_cycles, node_cycles_in, node_cycles_out)

    return {
        "max_cycles": int(max_cycles),
    }


def max_remaining_period(model, node):
    """Extract the maximum characteristic period among `node` and all nodes
    that follow it in the graph's node list.

    Preconditions:
    - model consists of HLS/RTL nodes
    - nodes have unique names (see GiveUniqueNodeNames)
    - model has been characteristically derived and contains specific chr periods

    Returns a dictionary with:
    - max_cycles : longest characteristic period (in cycles) over the remaining nodes
    """
    # NOTE(review): these op types are deliberately excluded from the search,
    # presumably because their characteristics are handled elsewhere — confirm.
    skipped_op_types = [
        "AddStreams_hls",
        "DuplicateStreams_hls",
        "StreamingFIFO_hls",
        "StreamingFIFO_rtl",
    ]

    max_cycles = 0
    start_index = list(model.graph.node).index(node)
    # Use a distinct loop variable so the `node` argument is not shadowed.
    for succ in model.graph.node[start_index:]:
        if succ is None or succ.op_type in skipped_op_types:
            continue
        if not (is_hls_node(succ) or is_rtl_node(succ)):
            continue
        inst = getCustomOp(succ)
        # Removed the dead read of `io_chrc_period`: its value was computed
        # and then immediately overwritten by the in/out maximum below.
        # The period is half the length of the stored (decompressed)
        # characteristic function, for both the input and output side.
        node_cycles_in = (
            len(decompress_string_to_numpy(inst.get_nodeattr("io_chrc_in"))[0]) // 2
        )
        node_cycles_out = (
            len(decompress_string_to_numpy(inst.get_nodeattr("io_chrc_out"))[0]) // 2
        )
        max_cycles = max(max_cycles, node_cycles_in, node_cycles_out)

    return {
        "max_cycles": int(max_cycles),
    }
48 changes: 46 additions & 2 deletions src/finn/builder/build_dataflow_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,11 +40,30 @@

class AutoFIFOSizingMethod(str, Enum):
    """Select the type of automatic FIFO sizing strategy."""

    # characterization-based sizing
    CHARACTERIZE = "characterize"
    # analytical sizing based on derived characteristic functions
    ANALYTIC = "analytical"
    # empirical sizing using rtlsim with large FIFOs
    LARGEFIFO_RTLSIM = "largefifo_rtlsim"


class TAVGenerationMethod(str, Enum):
    """Select the strategy for constructing token access vectors of an operator."""

    # deduce the token access vectors empirically by RTL-simulating each node
    RTLSIM = "rtlsim"
    # use the operator's tree model when available, avoiding IP core generation
    TREE_MODEL = "tree_model"


class TAVUtilizationMethod(str, Enum):
    """Select the strategy for utilizing token access vectors of an operator
    for buffer sizing."""

    # relax using the worst-case ratio of data rates between consumer and producer
    CONSERVATIVE_RELAXATION = "conservative_relaxation"

    # relax using the average-case ratio of data rates between consumer and producer
    AGGRESSIVE_RELAXATION = "aggressive_relaxation"

    # apply no relaxation; use the token access vectors as-is
    NO_RELAXATION = "no_relaxation"


class ShellFlowType(str, Enum):
"""For builds that produce a bitfile, select the shell flow that will integrate
the FINN-generated accelerator."""
Expand Down Expand Up @@ -278,6 +297,31 @@ class DataflowBuildConfig:
#: setting the FIFO sizes.
auto_fifo_strategy: Optional[AutoFIFOSizingMethod] = AutoFIFOSizingMethod.LARGEFIFO_RTLSIM

#: Which strategy will be used for token access vector generation for FIFO sizing.
#: RTLSIM will result in performing RTLSIM for each node
#: to deduce the token access vectors empirically
    #: TREE_MODEL will use the tree model of an operator if available, avoiding the generation
#: of IP cores.
tav_generation_strategy: Optional[TAVGenerationMethod] = TAVGenerationMethod.RTLSIM

    #: Which strategy will be used for utilizing token access vectors during FIFO sizing.
    #: CONSERVATIVE_RELAXATION relaxes the vectors using the worst-case ratio of data
    #: rates between a consumer and producer, AGGRESSIVE_RELAXATION uses the
    #: average-case ratio, and NO_RELAXATION uses the token access vectors as-is.
tav_utilization_strategy: Optional[
TAVUtilizationMethod
] = TAVUtilizationMethod.CONSERVATIVE_RELAXATION

#: When True, skips the resynthesis steps after fifo sizing. This makes it
#: possible to run the step for rapid fifo size analysis during
#: automatic folding optimizations or as a first approximation.
skip_resynth_during_fifo_sizing: Optional[bool] = False

#: Avoid using C++ rtlsim for auto FIFO sizing and rtlsim throughput test
#: if set to True, always using Python instead
force_python_rtlsim: Optional[bool] = False
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Wasn't this option deprecated on the dev branch?


#: Memory resource type for large FIFOs
#: Only relevant when `auto_fifo_depths = True`
large_fifo_mem_style: Optional[LargeFIFOMemStyle] = LargeFIFOMemStyle.AUTO
Expand Down
105 changes: 90 additions & 15 deletions src/finn/builder/build_dataflow_steps.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,10 @@

import finn.transformation.fpgadataflow.convert_to_hw_layers as to_hw
import finn.transformation.streamline.absorb as absorb
from finn.analysis.fpgadataflow.dataflow_performance import dataflow_performance
from finn.analysis.fpgadataflow.dataflow_performance import (
dataflow_performance,
max_period,
)
from finn.analysis.fpgadataflow.exp_cycles_per_layer import exp_cycles_per_layer
from finn.analysis.fpgadataflow.hls_synth_res_estimation import hls_synth_res_estimation
from finn.analysis.fpgadataflow.op_and_param_counts import (
Expand All @@ -80,8 +83,13 @@
)
from finn.transformation.fpgadataflow.create_stitched_ip import CreateStitchedIP
from finn.transformation.fpgadataflow.derive_characteristic import (
DeriveCharacteristic,
DelayCharacteristicFunctions,
DeriveFIFOSizes,
DeriveTokenAccessVectors,
HandleBranches,
JustInTimeSynthesize,
LocalStretchCharacteristicFunctions,
ProducerDelayCharacteristicFunctions,
)
from finn.transformation.fpgadataflow.hlssynth_ip import HLSSynthIP
from finn.transformation.fpgadataflow.insert_dwc import InsertDWC
Expand All @@ -102,6 +110,7 @@
)
from finn.transformation.fpgadataflow.set_exec_mode import SetExecMode
from finn.transformation.fpgadataflow.set_fifo_depths import (
CapConvolutionFIFODepths,
InsertAndSetFIFODepths,
RemoveShallowFIFOs,
SplitLargeFIFOs,
Expand Down Expand Up @@ -573,29 +582,93 @@ def step_set_fifo_depths(model: ModelWrapper, cfg: DataflowBuildConfig):
"""

if cfg.auto_fifo_depths:
if cfg.auto_fifo_strategy == "characterize":
model = model.transform(InsertDWC())
model = model.transform(SpecializeLayers(cfg._resolve_fpga_part()))
model = model.transform(GiveUniqueNodeNames())
model = model.transform(InsertDWC())
model = model.transform(SpecializeLayers(cfg._resolve_fpga_part()))
model = model.transform(GiveUniqueNodeNames())
model = model.transform(AnnotateCycles())

if cfg.auto_fifo_strategy == "analytical":
if cfg.tav_generation_strategy == "tree_model":
# if we have tree models, only rtlsim nodes for which we dont
only_jit_nodes_without_tree = True
else:
# rtlsim everything by force if not using trees
only_jit_nodes_without_tree = False
model = model.transform(
PrepareIP(cfg._resolve_fpga_part(), cfg._resolve_hls_clk_period())
JustInTimeSynthesize(
cfg._resolve_fpga_part(),
cfg._resolve_hls_clk_period(),
only_jit_nodes_without_tree,
)
)
period = int(model.analysis(dataflow_performance)["max_cycles"])
model = model.transform(
DeriveTokenAccessVectors(
model,
period,
cfg.tav_generation_strategy,
cfg._resolve_fpga_part(),
cfg._resolve_hls_clk_period(),
)
)

period = int(model.analysis(dataflow_performance)["max_cycles"])
model = model.transform(
LocalStretchCharacteristicFunctions(
1,
period,
nodes_to_ignore=[],
)
)
model = model.transform(HLSSynthIP())
model = model.transform(PrepareRTLSim(behav=True))
model = model.transform(AnnotateCycles())
period = model.analysis(dataflow_performance)["max_cycles"] + 10
model = model.transform(DeriveCharacteristic(period))
model = model.transform(DeriveFIFOSizes())

period = int(model.analysis(dataflow_performance)["max_cycles"])

model = model.transform(HandleBranches(model, period))

period = int(model.analysis(dataflow_performance)["max_cycles"])
model = model.transform(
DelayCharacteristicFunctions(
1,
period,
nodes_to_ignore=[],
)
)

period = int(model.analysis(dataflow_performance)["max_cycles"])

model = model.transform(
ProducerDelayCharacteristicFunctions(
1,
period,
nodes_to_ignore=[],
)
)

period = int(model.analysis(max_period)["max_cycles"])

model = model.transform(
DeriveFIFOSizes(
period=period,
nodes_to_ignore=[],
global_offset_correction=True,
tav_utilization_strategy=cfg.tav_utilization_strategy,
)
)

model = model.transform(
InsertFIFO(
vivado_ram_style=cfg.large_fifo_mem_style,
max_qsrl_depth=256,
create_shallow_fifos=True,
)
)

model = model.transform(SpecializeLayers(cfg._resolve_fpga_part()))
model = model.transform(GiveUniqueNodeNames())
model = model.transform(GiveReadableTensorNames())
if cfg.default_swg_exception:
model = model.transform(CapConvolutionFIFODepths(max_qsrl_depth=256))

elif cfg.auto_fifo_strategy == "largefifo_rtlsim":
if cfg.fifosim_save_waveform:
report_dir = cfg.output_dir + "/report"
Expand Down Expand Up @@ -665,8 +738,10 @@ def step_set_fifo_depths(model: ModelWrapper, cfg: DataflowBuildConfig):

# after FIFOs are ready to go, call PrepareIP and HLSSynthIP again
# this will only run for the new nodes (e.g. FIFOs and DWCs)
model = model.transform(PrepareIP(cfg._resolve_fpga_part(), cfg._resolve_hls_clk_period()))
model = model.transform(HLSSynthIP())
if not cfg.skip_resynth_during_fifo_sizing:
model = model.transform(PrepareIP(cfg._resolve_fpga_part(), cfg._resolve_hls_clk_period()))
model = model.transform(HLSSynthIP())

return model


Expand Down
18 changes: 16 additions & 2 deletions src/finn/custom_op/fpgadataflow/addstreams.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@
from qonnx.core.datatype import DataType

from finn.custom_op.fpgadataflow.hwcustomop import HWCustomOp
from finn.util.basic import Characteristic_Node


class AddStreams(HWCustomOp):
Expand Down Expand Up @@ -149,7 +150,17 @@ def execute_node(self, context, graph):
result = inp0_values + inp1_values
context[node.output[0]] = np.asarray(result, dtype=np.float32).reshape(oshape)

def derive_characteristic_fxns(self, period):
def prepare_tree_model(self):
dim = np.prod(self.get_folded_output_shape()[1:-1])

read_write = Characteristic_Node("passing addstreams layer", [(dim, [1, 1])], True)
addstreams_top = Characteristic_Node("compute addstreams", [(1, read_write)], False)

return addstreams_top # top level phase of this node

def derive_token_access_vectors(
self, model, period, strategy, fpga_part, clk_period, op_type, override_dict=None
):
n_inps = np.prod(self.get_folded_input_shape()[:-1])
io_dict = {
"inputs": {
Expand All @@ -158,4 +169,7 @@ def derive_characteristic_fxns(self, period):
},
"outputs": {"out0": []},
}
super().derive_characteristic_fxns(period, override_rtlsim_dict=io_dict)

super().derive_token_access_vectors(
model, period, strategy, fpga_part, clk_period, op_type, override_dict=io_dict
)
11 changes: 11 additions & 0 deletions src/finn/custom_op/fpgadataflow/channelwise_op.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@
from qonnx.util.basic import qonnx_make_model

from finn.custom_op.fpgadataflow.hwcustomop import HWCustomOp
from finn.util.basic import Characteristic_Node

# ONNX i/o tensor shape assumptions for channelwise ops:
# input 0 is the input tensor, shape (..., NumChannels)
Expand Down Expand Up @@ -240,3 +241,13 @@ def execute_node(self, context, graph):
sess = rt.InferenceSession(model_func.SerializeToString())
result = sess.run(None, idict)
context[node.output[0]] = np.asarray(result, dtype=np.float32).reshape(oshape)

def get_tree_model(self):
# key parameters

dim = np.prod(self.get_folded_output_shape()[1:-1])

pass_channelwise = Characteristic_Node("passing channelwise layer", [(dim, [1, 1])], True)
channelwise_top = Characteristic_Node("compute pool", [(1, pass_channelwise)], False)

return channelwise_top # top level phase of this node
Loading