diff --git a/requirements.txt b/requirements.txt index e6a6ec0521..6ebd27b319 100644 --- a/requirements.txt +++ b/requirements.txt @@ -19,3 +19,4 @@ sigtools==4.0.1 toposort==1.7.0 vcdvcd==1.0.5 wget==3.2 +wrapdisc==2.5.0 diff --git a/setup.cfg b/setup.cfg index c9ce06b962..adf39a6c44 100644 --- a/setup.cfg +++ b/setup.cfg @@ -83,6 +83,7 @@ docs = sphinx_rtd_theme==0.5.0 torchvision torch + wrapdisc qonnx@git+https://github.com/fastmachinelearning/qonnx@main#egg=qonnx brevitas@git+https://github.com/Xilinx/brevitas@master#egg=brevitas_examples diff --git a/src/finn/builder/build_dataflow_config.py b/src/finn/builder/build_dataflow_config.py index 3bc2c46794..7864f11a87 100644 --- a/src/finn/builder/build_dataflow_config.py +++ b/src/finn/builder/build_dataflow_config.py @@ -197,6 +197,50 @@ class DataflowBuildConfig: #: useful for decreasing the latency (even though throughput won't increase). folding_two_pass_relaxation: Optional[bool] = True + #: (Optional) Control the maximum width of the per-PE MVAU stream while + #: exploring the parallelization attributes to reach target_fps + #: Only relevant if target_fps is specified. + #: Set this to a large value (e.g. 10000) if targeting full unfolding or + #: very high performance. + mvau_wwidth_max: Optional[int] = 1024 + + # (Optional) which SetFolding optimizer to use (naive, optimized) + folding_style: Optional[str] = "naive" + + # (Optional) How much padding to allow for enabling more fine-grain folding + # parameters (generally, more than 6 is unnecessary) + # Enabling this flag requires the generalized datawidthconverter in your finn branch + folding_maximum_padding: Optional[int] = 0 + + # (Optional) Whether to allow padding IO nodes during folding + # If set to True, the model IO npy arrays would also need to be + # padded by the user on host side! 
+ folding_pad_io_nodes: Optional[bool] = False + + # (Optional) Heuristic to consider dwc LUT cost when performing folding + # this will make the folding optimizer avoid mismatching stream widths + enable_folding_dwc_heuristic: Optional[bool] = True + + # (Optional) Heuristic to consider FIFO sizing cost when performing folding + # this heuristic might help with not over-sizing fifos + # Highly recommended to NOT enable this flag unless analytic fifo sizing is + # also being used and so RTLSIM is never called for folding + enable_folding_fifo_heuristic: Optional[bool] = False + + # (Optional) How much effort to put into automatic folding + # minimizer function. Typical ranges are between 50 and 500 + folding_effort: Optional[int] = 50 + + # (Optional) How many times to attempt to optimize throughput + # (binary search steps) + # 1: only attempts the target throughput + # >1: attempt to increase the throughput to the maximum possible + # for a given device. Increasing the value by one doubles the + # precision towards reaching maximal throughput possible + # 2 attempts: at worst half of the maximum throughput + # 6 attempts: at worst 93.75% of maximum throughput + folding_max_attempts: Optional[int] = 1 + #: (Optional) At which steps the generated intermediate output model #: will be verified. See documentation of VerificationStepType for #: available options. @@ -227,12 +271,10 @@ class DataflowBuildConfig: #: to the design: e.g. Customer signature, application signature, version signature: Optional[List[int]] = None - #: (Optional) Control the maximum width of the per-PE MVAU stream while - #: exploring the parallelization attributes to reach target_fps - #: Only relevant if target_fps is specified. - #: Set this to a large value (e.g. 10000) if targeting full unfolding or - #: very high performance. 
- mvau_wwidth_max: Optional[int] = 36 + # (Optional) Flag for generating a hw config json in set_fifo_sizes + # this should be turned off during setFolding optimization's call + # to the set_fifo_sizes step + extract_hw_config: Optional[bool] = True #: (Optional) Whether thresholding layers (which implement quantized #: activations in FINN) will be implemented as stand-alone HW layers, diff --git a/src/finn/builder/build_dataflow_steps.py b/src/finn/builder/build_dataflow_steps.py index 20d7f00be9..8fd11d3619 100644 --- a/src/finn/builder/build_dataflow_steps.py +++ b/src/finn/builder/build_dataflow_steps.py @@ -421,6 +421,15 @@ def step_target_fps_parallelization(model: ModelWrapper, cfg: DataflowBuildConfi target_cycles_per_frame, mvau_wwidth_max=cfg.mvau_wwidth_max, two_pass_relaxation=cfg.folding_two_pass_relaxation, + style=cfg.folding_style, + folding_maximum_padding=cfg.folding_maximum_padding, + enable_folding_dwc_heuristic=cfg.enable_folding_dwc_heuristic, + enable_folding_fifo_heuristic=cfg.enable_folding_fifo_heuristic, + folding_effort=cfg.folding_effort, + folding_max_attempts=cfg.folding_max_attempts, + folding_pad_io_nodes=cfg.folding_pad_io_nodes, + platform=cfg.board, + auto_fifo_strategy=cfg.auto_fifo_strategy, ) ) # extract the suggested configuration and save it as json diff --git a/src/finn/transformation/fpgadataflow/set_folding.py b/src/finn/transformation/fpgadataflow/set_folding.py index 531b666ad6..a85b3c4ba6 100644 --- a/src/finn/transformation/fpgadataflow/set_folding.py +++ b/src/finn/transformation/fpgadataflow/set_folding.py @@ -27,30 +27,1330 @@ # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
def parameter_whitelist(padding_input):
    """
    Build the table of node attributes the SetFolding transformation may tune.

    The result is a two-level dictionary indexed as
    ``table[<parameter>][<op_type>]`` with ``<parameter>`` being ``"SIMD"``
    or ``"PE"``. Every entry is a fresh three-element list:

    0. padding allowed for this op type / parameter pair
       (``padding_input`` or ``0``),
    1. whether folding of the parameter is allowed at all,
    2. name of the node attribute associated with the parameter
       (e.g. ``"MW"`` for the SIMD of an MVAU).

    ram_style / resType / ram_style_thresholds are currently not part of
    the whitelist.
    """
    pad = padding_input

    simd = {
        "DownSampler_hls": [pad, True, "NumChannels"],
        "FMPadding_hls": [pad, True, "NumChannels"],
        "FMPadding_rtl": [pad, True, "NumChannels"],
        "FMPadding_Pixel_hls": [pad, True, "NumChannels"],
        "ConvolutionInputGenerator_hls": [pad, False, "IFMChannels"],
        "ConvolutionInputGenerator_rtl": [pad, False, "IFMChannels"],
        # only supported for rtl variant, need to add exceptions
        # so that only if every condition to create a dsp variant is met,
        # to then allow folding this parameter
        "VVAU_hls": [0, False, "Kernel"],
        "VVAU_rtl": [0, True, "Kernel"],
        "MVAU_hls": [pad, True, "MW"],
        "MVAU_rtl": [pad, True, "MW"],
    }

    pe = {
        "AddStreams_hls": [pad, True, "NumChannels"],
        "ChannelwiseOp_hls": [pad, True, "NumChannels"],
        "DuplicateStreams_hls": [pad, True, "NumChannels"],
        "GlobalAccPool_hls": [0, True, "NumChannels"],
        "Thresholding_hls": [pad, True, "NumChannels"],
        "Thresholding_rtl": [pad, True, "NumChannels"],
        "StreamingMaxPool_hls": [pad, True, "NumChannels"],
        "StreamingMaxPool_rtl": [pad, True, "NumChannels"],
        # Pool nodes are always optimized in tandem with a producer SWG
        "Pool_hls": [0, True, "Channels"],
        "VVAU_hls": [pad, True, "Channels"],
        "VVAU_rtl": [pad, True, "Channels"],
        "MVAU_hls": [pad, True, "MH"],
        "MVAU_rtl": [pad, True, "MH"],
        # we do not fold LabelSelect due to it potentially ruining fmax
        # (TODO: heuristic for when it is safe to? Like a certain topk to
        # label ratio which routes without issues? Or bring back once
        # LabelSelect has been improved / an RTL variant was added)
        "LabelSelect_hls": [0, False, "Labels"],
    }

    return {"SIMD": simd, "PE": pe}
def divisors(self, num=None):
    """
    Yield every positive divisor of ``num`` in ascending order.

    The leading ``self`` argument is unused by the computation. To stay
    compatible with the older single-argument call ``divisors(num)``, a
    lone positional argument is interpreted as ``num``.
    """
    if num is None:
        # legacy call style: divisors(num)
        num = self
    for x in range(1, num + 1):
        if (num % x) == 0:
            yield x


def allowed_divisors(cap, bounding_value_exponent=1, max_padding_count=0, skip_folding=False):
    """
    Compute all possible folding factors for a given upper bound variable.

    max_padding_count allows generating values with the assumption
    that the bounding variable could be padded by up to that many
    elements, which dramatically increases the possible folding
    parameters with even a small amount of extra values.

    bounding_value_exponent is the power the (possibly padded) bounding
    value is raised to before the divisibility test, e.g. 2 accepts every
    factor that divides the square of the bounding value.

    skip_folding short-circuits the search and only offers a factor of 1
    for the unpadded bound.

    Returns an iterator (``zip``) which unpacks into two equally long
    tuples ``(folding_factors, bounding_values)``, sorted by folding
    factor; ``bounding_values[i]`` is the padded bound that makes
    ``folding_factors[i]`` legal. When no factor exists (e.g. ``cap <= 0``)
    the iterator is empty.
    """
    if skip_folding:
        pairs = [(1, cap)]
    else:
        pairs = []
        # sets give O(1) membership tests inside the inner loop
        seen_divs = set()
        seen_factors = set()
        for bound in range(cap, cap + max_padding_count + 1):
            # loop-invariant for the inner loop, so compute it once
            powered = bound**bounding_value_exponent
            # factors above cap are never accepted and bound >= cap,
            # so candidates only need to run up to cap
            for div in range(1, cap + 1):
                if powered % div != 0 or div in seen_divs:
                    continue
                factor = bound // div
                if factor in seen_factors:
                    continue
                seen_divs.add(div)
                seen_factors.add(factor)
                pairs.append((div, bound))

    return zip(*sorted(pairs))
class Parameter:
    """
    One concrete node attribute assignment considered by the folding optimizer.

    A Parameter couples the attribute being optimized (``target_value_name``
    set to ``target_value``, e.g. SIMD or PE) with the optional bounding
    attribute (``bound_name`` set to ``bound_value``, e.g. MW of an MVAU)
    which may have to change when the node is padded. The ``update_*`` flags
    select which tensors of the node are adjusted when the value is applied
    with ``final=True``.
    """

    def __init__(
        self,
        name=None,  # SWU_SIMD, MVAU_SIMD, MVAU_PE etc
        target_value_name=None,
        target_value=None,
        bound_name=None,
        bound_value=None,
        bound_value_last=None,
        update_threshold_input=False,
        update_weights_input=False,
        update_input_tensor_shape=False,
        update_output_tensor_shape=False,
        node=None,  # node instance!
        node_index=None,
        op_type=None,
        model=None,
    ):
        self.name = name
        self.target_value_name = target_value_name
        self.target_value = target_value
        self.bound_name = bound_name
        self.bound_value = bound_value
        self.bound_value_last = bound_value_last
        self.update_threshold_input = update_threshold_input
        self.update_weights_input = update_weights_input
        self.update_input_tensor_shape = update_input_tensor_shape
        self.update_output_tensor_shape = update_output_tensor_shape
        self.node = node
        self.node_index = node_index
        self.op_type = op_type
        self.model = model

    def _onnx_node(self):
        """Return the ONNX graph node at ``node_index`` of the model."""
        return self.model.graph.node[self.node_index]

    def update_threshold_tensor(self):
        """
        Resize the threshold initializer to the node's current channel count.

        The first dimension of the threshold tensor is padded with zeros or
        cropped so that it matches NumChannels (Thresholding), Channels
        (VVAU) or MH (MVAU). VVAU / MVAU nodes without a third input are
        left untouched.

        Raises ValueError for op types which carry no threshold tensor.
        """
        if self.op_type in ("Thresholding_hls", "Thresholding_rtl"):
            input_index = 1
            dim0 = self.node.get_nodeattr("NumChannels")
        elif self.op_type in ("VVAU_hls", "VVAU_rtl"):
            input_index = 2
            dim0 = self.node.get_nodeattr("Channels")
        elif self.op_type in ("MVAU_hls", "MVAU_rtl"):
            input_index = 2
            dim0 = self.node.get_nodeattr("MH")
        else:
            # previously fell through and failed on an unbound local
            raise ValueError(f"No threshold tensor handling for op type {self.op_type}")

        onnx_node = self._onnx_node()
        if input_index == 2 and len(onnx_node.input) < 3:
            # if the VVAU / MVAU doesn't have a threshold input, just skip
            return

        # thresholding nodes have a threshold matrix which needs to be
        # adjusted if padding or cropping were introduced
        # MVAU and VVAU nodes can also have it so we stay flexible
        tensor_name = onnx_node.input[input_index]
        T = self.model.get_initializer(tensor_name)
        adt = self.model.get_tensor_datatype(tensor_name)

        # allocate a tensor for the datatype and clear it, then copy over
        # the rows which exist in both the old and the new shape
        T_new = gen_finn_dt_tensor(adt, (dim0, T.shape[1]))
        T_new[...] = 0
        rows = min(dim0, T.shape[0])
        T_new[:rows, :] = T[:rows, :]

        self.model.set_initializer(tensor_name, T_new)
        self.model.set_tensor_shape(tensor_name, T_new.shape)

    def update_weight_tensor(self):
        """
        Resize the weight initializer (input 1) to the node's current shape.

        MVAU weights are brought to (MW, MH); VVAU weights to
        (Channels, <unchanged>, Kernel[0], Kernel[1]). New elements are zero,
        surplus elements are cropped.

        Returns False when the existing tensor already has the required
        shape, True when the initializer was replaced.
        Raises ValueError for op types which carry no weight tensor.
        """
        is_vvau = self.op_type in ("VVAU_hls", "VVAU_rtl")
        is_mvau = self.op_type in ("MVAU_hls", "MVAU_rtl")

        if is_vvau:
            dim0 = self.node.get_nodeattr("Channels")
            dim1 = self.node.get_nodeattr("Kernel")
        elif is_mvau:
            dim0 = self.node.get_nodeattr("MW")
            dim1 = self.node.get_nodeattr("MH")
        else:
            # previously fell through and failed on an unbound local
            raise ValueError(f"No weight tensor handling for op type {self.op_type}")

        tensor_name = self._onnx_node().input[1]
        W = self.model.get_initializer(tensor_name)

        # nothing to do if the shape is already consistent
        if is_mvau and (dim0, dim1) == W.shape:
            return False
        if is_vvau and W.shape[0] == dim0 and W.shape[-2:] == tuple(dim1):
            return False

        wdt = self.model.get_tensor_datatype(tensor_name)

        if is_mvau:
            W_new = gen_finn_dt_tensor(wdt, (dim0, dim1))
            W_new[...] = 0
            rows = min(dim0, W.shape[0])
            cols = min(dim1, W.shape[1])
            W_new[:rows, :cols] = W[:rows, :cols]
        else:
            W_new = gen_finn_dt_tensor(wdt, (dim0, W.shape[1], dim1[0], dim1[1]))
            W_new[...] = 0
            ch = min(dim0, W.shape[0])
            k0 = min(dim1[0], W.shape[2])
            k1 = min(dim1[1], W.shape[3])
            W_new[:ch, :, :k0, :k1] = W[:ch, :, :k0, :k1]

        self.model.set_initializer(tensor_name, W_new)
        self.model.set_tensor_shape(tensor_name, W_new.shape)

        return True

    def apply_value(self, final=True):
        """
        Write this parameter into its node.

        Always sets the target attribute and, if present, the bounding
        attribute. With ``final=True`` the tensors selected by the
        ``update_*`` flags are adjusted as well so that the model stays
        functionally consistent.
        """
        # update the target value being optimized
        self.node.set_nodeattr(self.target_value_name, self.target_value)

        # if the bounding value has changed (i.e. MW of an MVAU) as
        # a result of padding the node, update it as well
        if self.bound_name is not None:
            self.node.set_nodeattr(self.bound_name, self.bound_value)

        # make certain parallel window is set right
        if self.bound_name == "IFMChannels" and self.target_value < self.bound_value:
            self.node.set_nodeattr("parallel_window", 0)

        # if this is the end of the minimizer routine, we update the tensor
        # shapes as well to retain functional correctness
        if final:
            onnx_node = self._onnx_node()

            # first the io tensors only
            if self.update_input_tensor_shape:
                new_shape = self.node.get_normal_input_shape()
                self.model.set_tensor_shape(onnx_node.input[0], new_shape)

            if self.update_output_tensor_shape:
                new_shape = self.node.get_normal_output_shape()
                self.model.set_tensor_shape(onnx_node.output[0], new_shape)

            if self.update_threshold_input:
                self.update_threshold_tensor()

            if self.update_weights_input:
                self.update_weight_tensor()
class MetaParameter:
    """
    A meta parameter defines a single optimizable integer value (meta_value)
    which translates into a set of finn-onnx graph node attributes
    which are tightly linked together (called values)

    Examples:
    -SIMD and PE values of a VVAU + SIMD of the SWU if necessary
    -SIMD value of an SWU and the PE and SIMD values of an MVAU (convolution)
    -SWU and Pool layer SIMD values (max pooling using SWU)

    - NOTE that MVAU PE and SIMD values are optimized independently, since
    - both 1-2 and 2-1 SIMD-PE combinations would have the same meta value
    - while having different resource characteristics

    All possible (legal) combinations of real values are stored in a list and an
    address translation is performed to map each meta_value to a set
    of real values when applying them
    """

    def __init__(
        self,
        name=None,
        meta_value=None,  # current value
        possible_values=None,  # all possible values
        real_values=None,  # list of real values for each possible value
        model=None,
        node_index=None,
    ):
        # None stands for "no values"; avoids mutable default arguments
        possible_values = [] if possible_values is None else possible_values
        real_values = [] if real_values is None else real_values

        self.name = name
        # NOTE(review): the meta_value argument is not stored; the current
        # value always starts out as None until update_value() is called.
        # Confirm whether this is intended.
        self.meta_value = None
        assert len(real_values) == len(possible_values)
        self.model = model
        self.updated = False
        self.index = 0
        self.node_index = node_index

        # keep possible values and their real values aligned while sorting
        # by the possible value
        pairs = sorted(zip(possible_values, real_values), key=lambda pair: pair[0])
        self.possible_values = [pair[0] for pair in pairs]
        self.real_values = [pair[1] for pair in pairs]

        # we build up a list of unique nodes related to this meta parameter
        # for future cycle calculations; the first entry of real_values is
        # used as the reference, an empty meta parameter has no nodes
        self.unique_nodes = []
        if len(real_values) > 0:
            for val in real_values[0]:
                if val.node not in self.unique_nodes:
                    self.unique_nodes.append(val.node)

    def update_value(self, value):
        """Store ``value`` as the current meta value and flag whether it changed."""
        if self.meta_value == value:
            self.updated = False
        else:
            self.meta_value = value
            self.updated = True

    def apply_value(self, final=False, filter=("PE", "SIMD", "parallel_window")):
        """
        Apply the real values mapped to the current meta value.

        Only values whose target attribute name is contained in ``filter``
        are applied. The meta value has to be set through update_value()
        beforehand, otherwise the lookup of the current value fails.
        """
        # make sure to run this once before minimizing
        self.index = self.possible_values.index(self.meta_value)
        for val in self.real_values[self.index]:
            if val.target_value_name in filter:
                val.apply_value(final)

    def get_cycles(self):
        """
        Return the largest expected cycle count among the involved nodes.

        This function assumes all parameters in the unique nodes are
        updated.
        """
        return max(n.get_exp_cycles() for n in self.unique_nodes)
class ParameterSet:
    """
    Ordered collection of meta parameters plus the subset currently optimized.

    ``parameters`` holds all meta parameters, ``index_list`` the positions
    inside ``parameters`` which are currently active and ``nodes`` the
    unique nodes touched by the active parameters.
    """

    def __init__(self):
        self.parameters = []
        self.index_list = []
        self.nodes = []

    def filter(self, params_to_filter):
        # filter parameters we want to use in the set
        # useful for multi-pass optimization
        self.parameters = [x for x in self.parameters if x.name in params_to_filter]

    def get_max_cycles(self):
        """Return the largest expected cycle count among the involved nodes."""
        return max(n.get_exp_cycles() for n in self.nodes)

    def get_vals(self):
        """Return the current meta value of every parameter in the set."""
        # the elements expose their current value as `meta_value`
        # (see update_value below); there is no `value` attribute
        return [p.meta_value for p in self.parameters]

    def get_min_vals(self):
        # get minimum possible folding values in the set
        return [p.possible_values[0] for p in self.parameters]

    def get_max_vals(self):
        # get maximum possible folding values in the set
        return [p.possible_values[-1] for p in self.parameters]

    def add_all_params_to_index_list(self):
        """Mark every parameter of the set as active."""
        self.index_list = list(range(len(self.parameters)))

    def set_values(self, values):
        """Assign ``values[i]`` to the i-th active parameter."""
        for position, param_index in enumerate(self.index_list):
            self.parameters[param_index].update_value(values[position])

    def apply_updates(self, final=False, filter=()):
        """Apply the current value of every active parameter to the graph."""
        for i in self.index_list:
            self.parameters[i].apply_value(final, filter)

    def assign_involved_nodes(self):
        """Collect the unique nodes of all active parameters into ``nodes``."""
        nodes = []
        for param_index in self.index_list:
            nodes.extend(self.parameters[param_index].unique_nodes)
        # de-duplicate while keeping first-seen order so that the node
        # order is deterministic between runs
        self.nodes = list(dict.fromkeys(nodes))
+ """ + + def __init__( + self, + model, + name, + targets, + hard_constraint_target="max_cycles", + target_cycles_per_frame=1, + padding=0, + maxfun_per_parameter=100, + fpgapart="xc7z020clg400-1", + parameters_to_apply=["SIMD", "PE", "ram_style", "resType", "ram_style_thresholds"], + enable_folding_dwc_heuristic=True, + verbose=False, + mvau_wwidth_max=1024, + value_to_minimize_relaxation=0.98, + max_parameters_per_partition=4, + init_run=False, + maxiter=200, + accept=-0.5, + pad_io_nodes=False, + optimization_parameters=["SIMD", "PE", "ram_style", "resType", "ram_style_thresholds"], + ): + self.params = None + self.targets = targets + self.updated_nodes = [] + self.param_indexes = [] # this might require insertion!!! + self.param_ranges = [] + self.all_nodes = [] + self.target_cycles_per_frame = target_cycles_per_frame + self.padding = padding + self.mvau_wwidth_max = mvau_wwidth_max + self.model = model + self.pad_io_nodes = pad_io_nodes + self.name = name + self.fpgapart = fpgapart + self.metrics = None + self.init_run = init_run + self.maxiter = maxiter + self.accept = accept + + # 0-100, relax whether we MUST hit the required bounding value, + # for example max_cycles + self.value_to_minimize_relaxation = value_to_minimize_relaxation + self.max_parameters_per_partition = max_parameters_per_partition + self.maxfun_per_parameter = maxfun_per_parameter + + self.hard_constraint_target = hard_constraint_target + self.parameters_to_apply = parameters_to_apply + self.enable_folding_dwc_heuristic = enable_folding_dwc_heuristic + self.verbose = verbose + self.optimization_parameters = optimization_parameters + + # total number of nodes which got padded + self.total_paddings = 0 + + def cleanup_pass(self): + # some corrections that may be necessary + pass + + def compute_hls_dwc_cost(self, model, nodes, lut_capacity, hls_dwc_cost_penalty=8): + # Given a set of nodes and a model, + # consider the stream widths between all adjacent nodes + # and apply a cost penalty if 
def compute_hls_dwc_cost(self, model, nodes, lut_capacity, hls_dwc_cost_penalty=8):
    """
    Estimate the relative LUT cost of the DataWidthConverters a folding implies.

    Given a set of nodes and a model, consider the stream widths between
    each node and its producer and apply a cost penalty if the shapes
    mismatch, relative to the cost of introducing a DataWidthConverter.

    This heuristic is critical for preventing overuse of DWCs with
    enormous resource costs.

    model: model used to look up the producer of each node
    nodes: custom op instances of the nodes to inspect
    lut_capacity: LUT budget the cost is normalized with
    hls_dwc_cost_penalty: rough heuristic for how much more an HLS DWC
        consumes in LUTs compared to the cheap variant

    Returns the accumulated cost (0 if no DWC is required).
    """
    cost = 0
    for node in nodes:
        prod = model.find_producer(node.onnx_node.input[0])

        # the first node of a model has no producer, hence no DWC
        if prod is None:
            continue

        output_name = prod.output[0]
        prod_inst = getCustomOp(prod)

        # a DWC converts from the producer's output stream width to the
        # consumer's input stream width (the consumer width was previously
        # read from the producer as well)
        in_width = prod_inst.get_outstream_width()
        out_width = node.get_instream_width()

        n0_out_shape = prod_inst.get_folded_output_shape()

        # mvau has a special case with external memory
        # where we have to consider a different input
        if (
            node.onnx_node.op_type.startswith("MVAU")
            and node.get_nodeattr("mem_mode") == "external"
        ) or (node.onnx_node.op_type.startswith("StreamingConcat")):
            # get input idx
            in_idx = None
            for idx, n_input in enumerate(node.onnx_node.input):
                if output_name == n_input:
                    in_idx = idx
            assert in_idx is not None, "Malformed model"
            n1_in_shape = node.get_folded_input_shape(in_idx)
        else:
            # use default folded input shape
            n1_in_shape = node.get_folded_input_shape()

        # dwcs cannot be inserted between mvau/vvau and pool/swg
        # so we only run it for other combinations
        if prod.name.startswith(("ConvolutionInputGenerator", "Pool")) and (
            node.onnx_node.name.startswith(("Pool", "MVAU", "VVAU"))
        ):
            continue

        # NOTE(review): this resets the shape to the default input and
        # thereby discards the per-input shape selected above; kept as
        # is, confirm whether that is intended
        n1_in_shape = node.get_folded_input_shape()

        # check if we need a DWC
        elems_mismatch = np.prod(n0_out_shape) != np.prod(n1_in_shape)
        if not (elems_mismatch or n0_out_shape[-1] != n1_in_shape[-1]):
            continue

        widths_not_divisible = max(in_width, out_width) % min(in_width, out_width) != 0
        if widths_not_divisible or elems_mismatch:
            # HLS DWC needed, expensive
            cost += ((in_width + out_width) * hls_dwc_cost_penalty) / lut_capacity
        else:
            # RTL DWC can be used cheaply
            cost += (in_width + out_width) / lut_capacity

    return cost
def cost_model(self, param_guess, opt):
    """
    Rate how 'good' a folding configuration is with respect to the targets.

    param_guess: candidate meta values for the active parameters
    opt: optimizer whose parameter set, targets and model are evaluated

    The candidate is applied to the graph (without the final tensor
    updates), then every target contributes ``metric / target`` to the
    cost. The hard constraint target contributes nothing unless its
    relaxed metric exceeds the target, in which case the cost becomes
    infinite. Any additional heuristics on the effect of folding on the
    final model belong here.
    """
    param_set = opt.params

    # 1. apply the folding parameters
    param_set.set_values(param_guess)
    param_set.apply_updates(final=False, filter=self.parameters_to_apply)

    # 2. compute results
    metrics = {"max_cycles": param_set.get_max_cycles()}
    metrics.update(self.get_resources(param_set.nodes))

    # 3. update cost based on all minimizable targets
    # the hard constraint (usually max_cycles) enforces
    # which target MUST be met.
    total = 0
    for target_name in opt.targets:
        limit = opt.targets[target_name]
        if target_name == opt.hard_constraint_target:
            if metrics[target_name] * self.value_to_minimize_relaxation > limit:
                total = np.inf
            continue
        total += metrics[target_name] / limit

    # 4. Add additional heuristic costs

    # 4.1 DWC heuristic to decrease the use of HLS DWCs
    # which can have massive LUT resource consumption
    # increases. All pairs are considered because
    # we optimize partitions left to right and consider
    # the DWC between a node and its left neighbor
    if self.enable_folding_dwc_heuristic:
        total += self.compute_hls_dwc_cost(opt.model, param_set.nodes, opt.targets["LUT"])

    return total
def execute_minimizer(self, discrete_args, init_guess):
    """
    Run the minimizer for a single parameter set.

    discrete_args: variable descriptions the wrap library uses to apply
        the argument bounds
    init_guess: starting values, one per variable

    Returns the optimized values as a numpy array; if the wrapped
    objective has no bounds the initial guess is returned unchanged.
    """
    objective = Objective(self.cost_model, variables=discrete_args)
    search_bounds = objective.bounds

    # nothing to search over
    if len(search_bounds) == 0:
        return np.array(init_guess)

    start_point = objective.encode(init_guess)

    result = scipy.optimize.dual_annealing(
        func=objective,
        x0=start_point,
        bounds=search_bounds,
        # this optimizer is handed to cost_model as its `opt` argument
        args=(self,),
        maxiter=self.maxiter,
        accept=self.accept,
        visit=2.0,
        maxfun=self.maxfun_per_parameter * len(init_guess),
    )

    # translate the minimizer result back into discrete values
    return np.array(objective.decode(result.x))
Split parameters into partitions to optimize locally + + # calculate number of partitions if not set to 1 + param_count = len(self.params.parameters) + if param_count > self.max_parameters_per_partition and partitions != 1: + partitions = param_count // self.max_parameters_per_partition + + if partitions == 1: + self.params.add_all_params_to_index_list() + + indexes = self.params.index_list = [x for x in range(len(self.params.parameters))] + + if initial_guess == "min": + init_guess = self.params.get_min_vals() + elif initial_guess == "max": + init_guess = self.params.get_max_vals() + self.params.set_values(init_guess) + + self.params.apply_updates(filter=target_parameters) + self.params.assign_involved_nodes() + params = self.params.parameters + + # node-based partitioning + partitions = 0 + old_node_index = 0 + index_partitions = [] + init_guess_partitions = [] + params_partitions = [] + + tmp_index_partitions = [] + tmp_init_guess_partitions = [] + tmp_params_partitions = [] + + i = 0 + nodes_in_partition = 1 + for param in params: + if param.name in target_parameters: + new_node_index = param.node_index + + if new_node_index != old_node_index: + nodes_in_partition += 1 + + if nodes_in_partition > max_nodes_in_partition: + # store set and start a new one + if len(tmp_index_partitions) > 0: + index_partitions.append(tmp_index_partitions) + init_guess_partitions.append(tmp_init_guess_partitions) + params_partitions.append(tmp_params_partitions) + tmp_index_partitions = [] + tmp_init_guess_partitions = [] + tmp_params_partitions = [] + partitions += 1 + nodes_in_partition = 1 + if nodes_in_partition <= max_nodes_in_partition: + tmp_index_partitions.append(indexes[i]) + tmp_init_guess_partitions.append(init_guess[i]) + tmp_params_partitions.append(params[i]) + + old_node_index = new_node_index + i += 1 + + # add remaining lefover tail partition + if len(tmp_index_partitions) > 0: + if len(tmp_index_partitions) > 0: + index_partitions.append(tmp_index_partitions) + 
def get_resources(self, nodes):
    """Return the aggregated resource estimates of ``nodes`` for ``self.fpgapart``."""
    per_node = {node: node.node_res_estimation(self.fpgapart) for node in nodes}
    return aggregate_dict_keys(per_node)
node_indx = 0 + node_count = len(graph.node) + skips = 0 + + for node_indx in range(0, node_count): + if skips > 0: + skips -= 1 + continue + node = graph.node[node_indx] + + maximum_padding = self.padding + + if node is None: + continue + + if node.op_type == "StreamingDataWidthConverter": + continue + + if not (is_hls_node(node) or is_rtl_node(node)): + continue + + # restrict padding if applicable + if self.pad_io_nodes is not True: + if node_indx == 0 or node_indx == len(graph.node) - 1: + # do not allow padding IO nodes + maximum_padding = 0 + + if node.op_type in ["ConvolutionInputGenerator_hls", "ConvolutionInputGenerator_rtl"]: + # a convolution input generator is always followed by a node which is tied to it + # we have to handle these cases with a larger meta-parameter(s) + + node_inst = getCustomOp(node) + second_node = graph.node[node_indx + 1] + second_node_inst = getCustomOp(second_node) + # SWU should only be consumed by pool, vvau or mvau nodes + assert second_node.op_type in [ + "Pool_rtl", + "Pool_hls", + "MVAU_hls", + "MVAU_rtl", + "VVAU_hls", + "VVAU_rtl", + ] + + # start extracting the SWU parameters we will need + bound_swu = node_inst.get_nodeattr("IFMChannels") + kernel_size = np.prod(node_inst.get_nodeattr("ConvKernelDim")) + parallel_window = node_inst.get_nodeattr("parallel_window") + + padding_internal_swu = np.min([whitelist["SIMD"][node.op_type][0], maximum_padding]) + + possible_values_swu, bounding_values_swu = allowed_divisors( + bound_swu, 1, padding_internal_swu + ) + + if second_node.op_type in ["Pool_rtl", "Pool_hls"]: + # SWU->Pool pair, we are optimizing SWU_SIMD, POOL_PE + # SIMD values identical, SWU_IFMChannels == Pool_ChannelNum when padding + assert node_inst.get_nodeattr("depthwise") == 1 + # a single meta value (1 -> max SIMD) + values_simd = [] + for i, ifmchannels_new in enumerate(list(bounding_values_swu)): + simd = possible_values_swu[i] + value_swu_simd = Parameter( + name="SWU_SIMD", + target_value_name="SIMD", + 
target_value=simd, + bound_name="IFMChannels", + bound_value=ifmchannels_new, + update_threshold_input=False, + update_weights_input=False, + update_input_tensor_shape=True, + update_output_tensor_shape=True, + node=node_inst, + node_index=node_indx, + op_type=node.op_type, + model=self.model, + ) + + value_pool_pe = Parameter( + name="Pool_PE", + target_value_name="PE", + target_value=simd, + bound_name="Channels", + bound_value=ifmchannels_new, + update_threshold_input=False, + update_weights_input=False, + update_input_tensor_shape=True, + update_output_tensor_shape=True, + node=second_node_inst, + node_index=node_indx + 1, + op_type=second_node.op_type, + model=self.model, + ) + + values_simd.append([value_swu_simd, value_pool_pe]) + + # construct meta parameter + + meta_parameter_simd = MetaParameter( + name="SIMD", + meta_value=possible_values_swu[0], + possible_values=possible_values_swu, + real_values=values_simd, + model=self.model, + node_index=node_indx, + ) + pset.append(meta_parameter_simd) + skips = 1 + + pass + elif second_node.op_type in ["MVAU_hls", "MVAU_rtl"]: + # SWU->MVAU pair, we are optimizing SWU_SIMD, MVAU_SIMD, MVAU_PE + # MVAU_SIMD is linked to SWU_SIMD, parallel window to push MVAU_SIMD > SWU_SIMD + # two meta values (1 -> max SIMD and 1 -> max PE) + skips = 1 + second_node_inst = getCustomOp(second_node) + values_simd = [] + all_possible_values = [] + factors = [] + + ww = second_node_inst.get_weight_datatype().bitwidth() + # limit to unique bounds + bounding_values_swu = list(set(bounding_values_swu)) + for i, ifmchannels_new in enumerate(bounding_values_swu): + mw_new = kernel_size * ifmchannels_new + possible_values_mvau_simd, bounding_values_mw = allowed_divisors( + mw_new, 1, 0 + ) + + for i, simd in enumerate(possible_values_mvau_simd): + bound_value_mw = bounding_values_mw[i] + + if ( + ifmchannels_new % simd == 0 + and simd not in all_possible_values + and bound_value_mw // simd not in factors + and simd <= ifmchannels_new + and 
(ww * simd) < self.mvau_wwidth_max + and simd > (bound_value_mw / 1024) + ): + all_possible_values.append(simd) + factors.append(bound_value_mw // simd) + if simd < ifmchannels_new: + simd_swu = simd + parallel_window = 0 + else: + simd_swu = ifmchannels_new + parallel_window = 1 + + value_swu_parallel_window = Parameter( + name="SWU_parallel_window", + target_value_name="parallel_window", + target_value=parallel_window, + bound_name=None, + bound_value=None, + update_threshold_input=False, + update_weights_input=False, + update_input_tensor_shape=True, + update_output_tensor_shape=True, + node=node_inst, + node_index=node_indx, + op_type=node.op_type, + model=self.model, + ) + + value_swu_simd = Parameter( + name="SWU_SIMD", + target_value_name="SIMD", + target_value=simd_swu, + bound_name="IFMChannels", + bound_value=ifmchannels_new, + update_threshold_input=False, + update_weights_input=False, + update_input_tensor_shape=True, + update_output_tensor_shape=True, + node=node_inst, + node_index=node_indx, + op_type=node.op_type, + model=self.model, + ) + + value_mvau_simd = Parameter( + name="MVAU_SIMD", + target_value_name="SIMD", + target_value=simd, + bound_name="MW", + bound_value=bound_value_mw, + update_threshold_input=True, + update_weights_input=True, + update_input_tensor_shape=True, + update_output_tensor_shape=True, + node=second_node_inst, + node_index=node_indx + 1, + op_type=second_node.op_type, + model=self.model, + ) + + values_simd.append( + [value_swu_simd, value_swu_parallel_window, value_mvau_simd] + ) + meta_parameter_simd = MetaParameter( + name="SIMD", + meta_value=possible_values_mvau_simd[0], + possible_values=all_possible_values, + real_values=values_simd, + model=self.model, + node_index=node_indx, + ) + pset.append(meta_parameter_simd) + values_pe = [] + factors = [] + mh = second_node_inst.get_nodeattr("MH") + padding_internal_mvau_mh = ( + 0 # do not allow independent padding of an mvau used as a conv layer + ) + + 
possible_values_mvau_pe, bounding_values_mh = allowed_divisors( + mh, 1, padding_internal_mvau_mh + ) + for i, value_mvau_pe in enumerate(possible_values_mvau_pe): + bound_mvau_mh = bounding_values_mh[i] + + factor = bound_mvau_mh // value_mvau_pe + if factor not in factors: + factors.append(factor) + value_mvau_pe = Parameter( + name="MVAU_PE", + target_value_name="PE", + target_value=value_mvau_pe, + bound_name="MH", + bound_value=bound_mvau_mh, + update_threshold_input=True, + update_weights_input=True, + update_input_tensor_shape=True, + update_output_tensor_shape=True, + node=second_node_inst, + node_index=node_indx + 1, + op_type=second_node.op_type, + model=self.model, + ) + values_pe.append([value_mvau_pe]) + + meta_parameter_pe = MetaParameter( + name="PE", + meta_value=possible_values_mvau_pe[0], + possible_values=possible_values_mvau_pe, + real_values=values_pe, + model=self.model, + node_index=node_indx, + ) + pset.append(meta_parameter_pe) + + pass + elif second_node.op_type in ["VVAU_hls", "VVAU_rtl"]: + # SWU->VVAU pair, we are optimizing SWU_SIMD, VVAU_SIMD, VVAU_PE + # VVAU_PE is linked to SWU_SIMD, VVAU_SIMD relies on SWU_parallel_window + + # one meta value (1 -> (max SIMD * max PE)) + skips = 1 + # make sure the SWU is depth-wise + assert node_inst.get_nodeattr("depthwise") == 1 + + # SIMD of the VVAU and SWU depend on the IFM channel count + + second_node_inst = getCustomOp(second_node) + # simd + values_swu_vvau = [] + all_possible_values = [] + factors_pe = [] + + ww = second_node_inst.get_weight_datatype().bitwidth() + # limit to unique bounds + bounding_values_swu = list(set(bounding_values_swu)) + for i, ifmchannels_new in enumerate(bounding_values_swu): + (kernel_dim0, kernel_dim1) = node_inst.get_nodeattr("ConvKernelDim") + + # we cant pad the simd of the VVAU, since this is a kernel size + possible_values_vvau_pe, bounding_values_pe = allowed_divisors( + ifmchannels_new, 1, 0 + ) + + for i, pe in enumerate(possible_values_vvau_pe): + 
pe_bound = bounding_values_pe[i] + factor = bounding_values_pe[i] // pe + if factor not in factors_pe: + factors_pe.append(factor) + bound_value_kernel = (kernel_dim0, kernel_dim1) + + if pe < ifmchannels_new: + simd_limit = 1 + parallel_window = 0 + else: + simd_limit = np.prod(bound_value_kernel) + parallel_window = 1 + + possible_values_mvau_simd, bounding_values_mw = allowed_divisors( + simd_limit, 1, 0 + ) + factors_simd = [] + for i, simd in enumerate(possible_values_mvau_simd): + if ( + simd * pe not in all_possible_values + and np.prod(bound_value_kernel) // simd not in factors_simd + ): + all_possible_values.append(simd * pe) + + factors_simd.append(np.prod(bound_value_kernel) // simd) + + value_swu_parallel_window = Parameter( + name="SWU_parallel_window", + target_value_name="parallel_window", + target_value=parallel_window, + bound_name=None, + bound_value=None, + update_threshold_input=False, + update_weights_input=False, + update_input_tensor_shape=True, + update_output_tensor_shape=True, + node=node_inst, + node_index=node_indx, + op_type=node.op_type, + model=self.model, + ) + + value_swu_simd = Parameter( + name="SWU_SIMD", + target_value_name="SIMD", + target_value=pe, + bound_name="IFMChannels", + bound_value=ifmchannels_new, + update_threshold_input=False, + update_weights_input=False, + update_input_tensor_shape=True, + update_output_tensor_shape=True, + node=node_inst, + node_index=node_indx, + op_type=node.op_type, + model=self.model, + ) + + value_vvau_simd = Parameter( + name="VVAU_SIMD", + target_value_name="SIMD", + target_value=simd, + bound_name="Kernel", + bound_value=(kernel_dim0, kernel_dim1), + update_threshold_input=True, + update_weights_input=True, + update_input_tensor_shape=True, + update_output_tensor_shape=True, + node=second_node_inst, + node_index=node_indx + 1, + op_type=second_node.op_type, + model=self.model, + ) + + value_vvau_pe = Parameter( + name="VVAU_PE", + target_value_name="PE", + target_value=pe, + 
bound_name="Channels", + bound_value=pe_bound, + update_threshold_input=True, + update_weights_input=True, + update_input_tensor_shape=True, + update_output_tensor_shape=True, + node=second_node_inst, + node_index=node_indx + 1, + op_type=second_node.op_type, + model=self.model, + ) + + values_swu_vvau.append( + [ + value_swu_simd, + value_swu_parallel_window, + value_vvau_simd, + value_vvau_pe, + ] + ) + + meta_parameter_swu_vvau = MetaParameter( + name="SIMD", + meta_value=all_possible_values[0], + possible_values=all_possible_values, + real_values=values_swu_vvau, + model=self.model, + node_index=node_indx, + ) + pset.append(meta_parameter_swu_vvau) + else: + # simple singular node with one parameter + skips = 0 + # one meta value (either 1 -> max SIMD or 1 -> max PE) + # or none at all, then we skip + op_type = node.op_type + + node_inst = getCustomOp(node) + for p in self.optimization_parameters: + if p in whitelist: + if op_type in whitelist[p]: + (padding_internal, fold, bounding_parameter_name) = whitelist[p][ + op_type + ] + padding_internal = min(padding_internal, maximum_padding) + + factors = [] + possible_values_final = [] + + bound = node_inst.get_nodeattr(bounding_parameter_name) + possible_values, bounding_values = allowed_divisors( + bound, 1, padding_internal, skip_folding=False + ) + possible_values = list(possible_values) + + values = [] + + if op_type in [ + "Thresholding_rtl", + "Thresholding_hls", + "MVAU_hls", + "MVAU_rtl", + "VVAU_hls", + "VVAU_rtl", + ]: + update_threshold_input = True + else: + update_threshold_input = False + + if op_type in ["MVAU_hls", "MVAU_rtl", "VVAU_hls", "VVAU_rtl"]: + update_weights_input = True + else: + update_weights_input = False + + for i, value in enumerate(possible_values): + bounding_value = bounding_values[i] + + if op_type in ["VVAU_rtl", "VVAU_hls" and p == "SIMD"]: + bounding_value = node_inst.get_nodeattr( + "Kernel" + ) # dont mess with kernel sizes + + factor = np.prod(bounding_value) // value + + 
def insert_and_size_fifos(
    model_dir, model, board, fpga_part, consider_dwc_costs, auto_fifo_strategy
):
    """
    Run the FIFO-depth build step on an already folded model.

    This is used after folding to observe the resource consumption and
    throughput changes that FIFO sizing introduces. The original author notes
    that this pass must be performed using tree-based TAV generation,
    otherwise it will take an extremely long amount of time.

    Arguments:

    * model_dir: accepted for interface compatibility, not read here
    * model: the model to size; InsertDWC is applied to it first when
      consider_dwc_costs is falsy
    * board, fpga_part, auto_fifo_strategy: forwarded into the
      DataflowBuildConfig handed to step_set_fifo_depths
    * consider_dwc_costs: when truthy, the DWC insertion is skipped here

    Returns the result of step_set_fifo_depths.
    """
    # only insert the DWCs here when the flag says they were not considered yet
    model_to_size = model if consider_dwc_costs else model.transform(InsertDWC())

    # minimal build configuration: only the FIFO depth step, no outputs
    fifo_sizing_options = {
        "output_dir": "",
        "auto_fifo_depths": True,
        "split_large_fifos": True,
        "auto_fifo_strategy": auto_fifo_strategy,
        "folding_config_file": None,
        "synth_clk_period_ns": 5.0,
        "fpga_part": fpga_part,
        "steps": ["step_set_fifo_depths"],
        "generate_outputs": [],
        "board": board,
        "extract_hw_config": False,
    }
    fifo_cfg = DataflowBuildConfig(**fifo_sizing_options)

    return step_set_fifo_depths(model_to_size, fifo_cfg)
def __init__(
    self,
    target_cycles_per_frame=1000,
    mvau_wwidth_max=1024,
    two_pass_relaxation=True,
    style="optimizer",
    folding_maximum_padding=0,
    folding_max_attempts=1,
    platform="Pynq-Z1",
    folding_effort=250,
    enable_folding_dwc_heuristic=0,
    enable_folding_fifo_heuristic=0,
    folding_pad_io_nodes=False,
    devices=1,
    verbose=False,
    max_parameters_per_partition=4,
    # analytic FIFO sizing is the intended strategy here; the original
    # author warns that RTLSIM-based sizing makes the folding time explode
    auto_fifo_strategy="analytic",
):
    """
    Store the folding configuration on the transformation instance.

    Every argument is kept as an instance attribute (some under a shorter
    name, e.g. folding_max_attempts -> self.max_attempts and
    folding_maximum_padding -> self.padding). The FPGA part is looked up
    from part_map using the platform name, so an unknown platform raises
    KeyError. The remaining attributes set here are fixed internal switches
    and lists that are not exposed as arguments.
    """
    super().__init__()

    # --- throughput target and naive-folding knobs -----------------------
    self.target_cycles_per_frame = target_cycles_per_frame
    self.mvau_wwidth_max = mvau_wwidth_max
    self.two_pass_relaxation = two_pass_relaxation

    # --- folding style selection: either "naive" or "optimizer" ----------
    self.style = style

    # --- target device ----------------------------------------------------
    self.platform = platform
    self.fpgapart = part_map[platform]
    self.devices = devices

    # --- optimizer search settings -----------------------------------------
    self.max_attempts = folding_max_attempts
    self.padding = folding_maximum_padding
    self.pad_io_nodes = folding_pad_io_nodes
    self.max_parameters_per_partition = max_parameters_per_partition
    self.verbose = verbose

    # maximum function calls per parameter; the original note recommends
    # 50-200 depending on the network size and how long the user is willing
    # to wait (~20 parameters with <30 possible values each @ 200 effort = <30s)
    self.effort = folding_effort

    # node attributes the optimizer may touch
    # (a reduced alternative would be ["SIMD", "PE"])
    self.optimization_parameters = [
        "SIMD",
        "PE",
        "parallel_window",
        "ram_style",
        "resType",
        "ram_style_thresholds",
    ]
    # resource kinds compared against the device budget
    self.target_resources = ["LUT", "BRAM_18K", "DSP", "URAM"]
    self.hard_constraint_target = "max_cycles"

    # --- fixed internal switches -------------------------------------------
    self.optimize_folding = True
    self.optimize_resource_types = False
    self.insert_dwcs = False
    self.consider_dwc_costs = True

    # --- heuristics ----------------------------------------------------------
    self.enable_folding_dwc_heuristic = enable_folding_dwc_heuristic
    # WARNING: enabling the FIFO heuristic can enormously increase the
    # runtime of this transformation, on the order of the set_fifo_depths
    # runtime times (folding_max_attempts - 1). Recommended only together
    # with analytic FIFO sizing (experimental feature).
    self.enable_folding_fifo_heuristic = enable_folding_fifo_heuristic
    self.auto_fifo_strategy = auto_fifo_strategy
target function + to find optimal folding values for each node in the FINN graph, + by default minimizing resource consumption while making sure to meet + the target max_cycles (throughput) rate + """ + + model = model.transform(GiveUniqueNodeNames()) + model = model.transform(AnnotateCycles()) + + targets = {} + targets["max_cycles"] = self.target_cycles_per_frame + current_throughput_target = self.target_cycles_per_frame + + # fetch all parameters and bounds from the model by + # running the optimizer once without minimizing cost + + init_model = copy.deepcopy(model) + opt1 = Optimizer( + init_model, + "defaultOPT_for_parameter_extraction", + targets, + self.hard_constraint_target, + target_cycles_per_frame=self.target_cycles_per_frame, + padding=0, + fpgapart=self.fpgapart, + maxfun_per_parameter=self.effort, + parameters_to_apply=["SIMD", "PE"], + enable_folding_dwc_heuristic=self.enable_folding_dwc_heuristic, + verbose=self.verbose, + mvau_wwidth_max=self.mvau_wwidth_max, + init_run=True, + pad_io_nodes=self.pad_io_nodes, + optimization_parameters=self.optimization_parameters, + max_parameters_per_partition=self.max_parameters_per_partition, + ) + + opt1.targets = targets + opt1.generate_parameter_set() # generate full param list + param_set_default = opt1.params + + param_values_min = param_set_default.get_min_vals() + param_values_max = param_set_default.get_max_vals() + + param_set_default.add_all_params_to_index_list() + + # create copies of the minimum and maximum parameters + # for folding to use as upper and lower bounds for + # optimization + + param_set_min = copy.deepcopy(param_set_default) + param_set_min.set_values(param_values_min) + + param_set_max = copy.deepcopy(param_set_default) + param_set_max.set_values(param_values_max) + + # run once to initialize all the lists and objects + param_set_min.apply_updates(self.optimization_parameters) + param_set_max.apply_updates(self.optimization_parameters) + + param_set_min.assign_involved_nodes() + 
param_set_max.assign_involved_nodes() + + # assign maximum throughput achievable + opt1.optimize(max_nodes_in_partition=1, target_parameters=["SIMD", "PE", "parallel_window"]) + init_model = init_model.transform(AnnotateCycles()) + maximum_achievable_throughput = init_model.analysis(dataflow_performance)["max_cycles"] + + limits = DEFAULT_RES_LIMITS + self.max_luts = limits[0] * sum( + [r["LUT"] for r in platforms[self.platform](self.devices).resource_count_dict.values()] + ) + self.max_bram = limits[2] * sum( + [ + r["BRAM_18K"] + for r in platforms[self.platform](self.devices).resource_count_dict.values() + ] + ) + self.max_uram = limits[3] * sum( + [r["URAM"] for r in platforms[self.platform](self.devices).resource_count_dict.values()] + ) + self.max_dsp = limits[4] * sum( + [r["DSP"] for r in platforms[self.platform](self.devices).resource_count_dict.values()] + ) + + targets["LUT"] = max(self.max_luts, 0.001) + targets["BRAM_18K"] = max(self.max_bram, 0.001) + targets["DSP"] = max(self.max_dsp, 0.001) + targets["URAM"] = max(self.max_uram, 0.001) + + opt2 = Optimizer( + model, + "padded OPT", + targets, + self.hard_constraint_target, + target_cycles_per_frame=current_throughput_target, + padding=self.padding, + fpgapart=self.fpgapart, + maxfun_per_parameter=self.effort, + parameters_to_apply=self.optimization_parameters, + enable_folding_dwc_heuristic=self.enable_folding_dwc_heuristic, + verbose=self.verbose, + mvau_wwidth_max=self.mvau_wwidth_max, + init_run=False, + pad_io_nodes=self.pad_io_nodes, + optimization_parameters=self.optimization_parameters, + max_parameters_per_partition=self.max_parameters_per_partition, + ) + + opt2.targets = targets + opt2.generate_parameter_set() # generate full param list + + # First pass which deals with folding factors only + optimization_attempts = 0 + last_limited = False + last_successful_throughput_target = self.target_cycles_per_frame + + current_step = 1 + min_step = 0.05 + + opt2_tmp = copy.deepcopy(opt2) + 
last_good_model = copy.deepcopy(opt2.model) + + while current_step > min_step and optimization_attempts < self.max_attempts: + targets["max_cycles"] = current_throughput_target + opt2 = copy.deepcopy(opt2_tmp) + opt2.targets = targets + opt2.generate_parameter_set() # generate full param list + + # dont optimize if throughput request is impossible to meet + opt2.target_cycles_per_frame = current_throughput_target + if self.optimize_folding is True: + opt2.optimize(max_nodes_in_partition=3, target_parameters=["SIMD", "PE"]) + + # Second pass which adjusts ram style for memory and resource types for compute + if self.optimize_resource_types is True: + opt2.optimize( + max_nodes_in_partition=min(len(model.graph.node), 8), + target_parameters=["ram_style", "resType", "ram_style_thresholds"], + ) + + model = opt2.model + + # generate extra model with fifos and dwcs for the final estimate + + if self.consider_dwc_costs: + model = model.transform(InsertDWC()) + model = model.transform(SpecializeLayers(self.fpgapart)) + + if self.enable_folding_fifo_heuristic and self.max_attempts != 1: + # store model to use in the builder + model_dir = "folded_model.onnx" + + model = insert_and_size_fifos( + model_dir, + model, + self.platform, + self.fpgapart, + self.consider_dwc_costs, + self.auto_fifo_strategy, + ) + model = model.transform(SpecializeLayers(self.fpgapart)) + + resources = {} + for n in opt2.model.graph.node: + node_inst = getCustomOp(n) + resources[node_inst] = node_inst.node_res_estimation(self.fpgapart) + metrics = aggregate_dict_keys(resources) + + # extract costs + overshot = False + for resource in self.target_resources: + if metrics[resource] > targets[resource]: + overshot = True + + if overshot: + # if we overshot, we try again, but with half the step size + current_throughput_target = int( + last_successful_throughput_target + - last_successful_throughput_target * current_step + ) + if self.max_attempts == 1: + last_good_model = copy.deepcopy(opt2.model) + 
else: + if last_limited: + current_step /= 2 + + else: + for resource in self.target_resources: + budget_left = 1 - (metrics[resource] / targets[resource]) + current_step = min(current_step, budget_left) + + new_throughput = int( + last_successful_throughput_target + - last_successful_throughput_target * current_step + ) + last_good_model = copy.deepcopy(opt2.model) + last_successful_throughput_target = copy.copy(current_throughput_target) + current_throughput_target = new_throughput + if current_throughput_target < maximum_achievable_throughput: + last_limited = True + + current_throughput_target = maximum_achievable_throughput + else: + last_limited = False + optimization_attempts += 1 + + model = last_good_model + + if self.insert_dwcs: + # In case future steps do not insert DWCs + model = model.transform(InsertDWC()) + model = model.transform(SpecializeLayers(self.fpgapart)) + + # necessary final transformation + model = model.transform(GiveUniqueNodeNames()) + model = model.transform(AnnotateCycles()) + + # perform input and output tensor shape adjustment + # this is only going to have effects if padding was performed + # The tensor shape change on the IOs will require modification + # of the host, with either padding of the input or cropping of the output + # to get an equivalent result. 
def optimize_attribute_val(self, node_inst, max_val, attr_name):
    """
    Raise a single parallelism attribute until the cycle target is met.

    The attribute attr_name of node_inst is first reset to 1. Then every
    value produced by self.divisors(max_val) is tried in order, and the
    search stops at the first value for which node_inst.get_exp_cycles() is
    strictly below self.target_cycles_per_frame. If no value meets the
    target, the attribute keeps the last value that was tried.
    """
    # start from the fully folded configuration
    node_inst.set_nodeattr(attr_name, 1)
    for candidate in self.divisors(max_val):
        node_inst.set_nodeattr(attr_name, candidate)
        if node_inst.get_exp_cycles() < self.target_cycles_per_frame:
            # target met, keep this candidate
            return
def apply(self, model):
    """
    Run the folding implementation selected by self.style.

    A style of "naive" dispatches to apply_naive_folding; every other value
    dispatches to apply_optimized_folding. The result of the chosen method
    is returned unchanged.
    """
    folding_pass = (
        self.apply_naive_folding if self.style == "naive" else self.apply_optimized_folding
    )
    return folding_pass(model)