diff --git a/fetch-repos.sh b/fetch-repos.sh
index 8aad454d4f..ef99d38eaf 100755
--- a/fetch-repos.sh
+++ b/fetch-repos.sh
@@ -32,7 +32,7 @@ QONNX_COMMIT="f5c9819bd00f01f41e70639b8461c8e4b39432f7"
 FINN_EXP_COMMIT="0724be21111a21f0d81a072fccc1c446e053f851"
 BREVITAS_COMMIT="aad4d5a293db6f2ec622a92a5d3278e47072453e"
 CNPY_COMMIT="8c82362372ce600bbd1cf11d64661ab69d38d7de"
-HLSLIB_COMMIT="a2cd3e6ce653a03e59af6bcb9fbeaa71618d160e"
+HLSLIB_COMMIT="120c46293fdf534415a6a47973a8f712fca6d900"
 OMX_COMMIT="a5d48f93309b235fdd21556d16e86e6ef5db6e2e"
 AVNET_BDF_COMMIT="2d49cfc25766f07792c0b314489f21fe916b639b"
 XIL_BDF_COMMIT="8cf4bb674a919ac34e3d99d8d71a9e60af93d14e"
diff --git a/src/finn/custom_op/fpgadataflow/__init__.py b/src/finn/custom_op/fpgadataflow/__init__.py
index b74bbf538d..40ee90878e 100644
--- a/src/finn/custom_op/fpgadataflow/__init__.py
+++ b/src/finn/custom_op/fpgadataflow/__init__.py
@@ -1,5 +1,5 @@
 # Copyright (C) 2020-2022, Xilinx, Inc.
-# Copyright (C) 2023-2024, Advanced Micro Devices, Inc.
+# Copyright (C) 2022-2024, Advanced Micro Devices, Inc.
 # All rights reserved.
 #
 # Redistribution and use in source and binary forms, with or without
@@ -59,6 +59,7 @@ def register_custom_op(cls):
     ConvolutionInputGenerator,
 )
 from finn.custom_op.fpgadataflow.crop import Crop
+from finn.custom_op.fpgadataflow.deconvolution import Deconvolution
 from finn.custom_op.fpgadataflow.duplicatestreams import DuplicateStreams
 from finn.custom_op.fpgadataflow.fmpadding import FMPadding
 from finn.custom_op.fpgadataflow.fmpadding_pixel import FMPadding_Pixel
@@ -96,6 +97,7 @@ def register_custom_op(cls):
 custom_op["AddStreams"] = AddStreams
 custom_op["ChannelwiseOp"] = ChannelwiseOp
 custom_op["ConvolutionInputGenerator"] = ConvolutionInputGenerator
+custom_op["Deconvolution"] = Deconvolution
 custom_op["Crop"] = Crop
 custom_op["DuplicateStreams"] = DuplicateStreams
 custom_op["FMPadding"] = FMPadding
diff --git a/src/finn/custom_op/fpgadataflow/deconvolution.py b/src/finn/custom_op/fpgadataflow/deconvolution.py
new file mode 100644
index 0000000000..ad7a0bda1e
--- /dev/null
+++ b/src/finn/custom_op/fpgadataflow/deconvolution.py
@@ -0,0 +1,173 @@
+# Copyright (C) 2024, Advanced Micro Devices, Inc.
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of FINN nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+import warnings
+from qonnx.core.datatype import DataType
+
+from finn.custom_op.fpgadataflow.hwcustomop import HWCustomOp
+
+
+class Deconvolution(HWCustomOp):
+    """Abstraction layer for HW implementation of Deconvolution"""
+
+    def __init__(self, onnx_node, **kwargs):
+        super().__init__(onnx_node, **kwargs)
+
+    def get_nodeattr_types(self):
+        my_attrs = {
+            "KernelDim": ("ints", True, []),  # [H, W] = [Y, X]
+            "IFMChannels": ("i", True, 0),
+            "OFMChannels": ("i", True, 0),
+            "IFMDim": ("ints", True, []),  # [H, W] = [Y, X]
+            "PE": ("i", True, 0),
+            "SIMD": ("i", True, 0),
+            "Stride": ("ints", True, [1, 1]),  # [H, W] = [Y, X]
+            "Padding": ("ints", True, []),  # [H, W] = [Y, X]
+            # FINN DataTypes for inputs, weights, outputs
+            "inputDataType": ("s", True, ""),
+            "weightDataType": ("s", True, ""),
+            "outputDataType": ("s", True, ""),
+        }
+        my_attrs.update(super().get_nodeattr_types())
+        return my_attrs
+
+    def get_normal_input_shape(self, ind=0):
+        if ind == 0:
+            ifm_dim_h, ifm_dim_w = self.get_nodeattr("IFMDim")
+            ifm_ch = self.get_nodeattr("IFMChannels")
+            ishape = (1, ifm_dim_h, ifm_dim_w, ifm_ch)
+        else:
+            ifm_ch = self.get_nodeattr("IFMChannels")
+            ofm_ch = self.get_nodeattr("OFMChannels")
+            k_h, k_w = self.get_nodeattr("KernelDim")
+            ishape = (ofm_ch, k_h, k_w, ifm_ch)
+        return ishape
+
+    def get_folded_input_shape(self, ind=0):
+        if ind == 0:
+            ifm_dim_h, ifm_dim_w = self.get_nodeattr("IFMDim")
+            ifm_ch = self.get_nodeattr("IFMChannels")
+            simd = self.get_nodeattr("SIMD")
+            assert ifm_ch % simd == 0, "SIMD must divide IFMChannels"
+            fold = int(ifm_ch / simd)
+            folded_ishape = (1, ifm_dim_h, ifm_dim_w, fold, simd)
+        else:
+            folded_ishape = self.get_normal_input_shape(ind)
+        return folded_ishape
+
+    def get_normal_output_shape(self, ind=0):
+        idim_h, idim_w = self.get_nodeattr("IFMDim")
+        stride_h, stride_w = self.get_nodeattr("Stride")
+        k_h, k_w = self.get_nodeattr("KernelDim")
+        ofm_ch = self.get_nodeattr("OFMChannels")
+        pad_h, pad_w = self.get_nodeattr("Padding")
+        odim_h = (idim_h - 1) * stride_h - 2 * pad_h + (k_h - 1) + 1
+        odim_w = (idim_w - 1) * stride_w - 2 * pad_w + (k_w - 1) + 1
+        oshape = (1, odim_h, odim_w, ofm_ch)
+        return oshape
+
+    def get_folded_output_shape(self, ind=0):
+        normal_oshape = self.get_normal_output_shape()
+        odim_h = normal_oshape[1]
+        odim_w = normal_oshape[2]
+        ofm_ch = normal_oshape[3]
+        pe = self.get_nodeattr("PE")
+        fold = int(ofm_ch / pe)
+        folded_oshape = (1, odim_h, odim_w, fold, pe)
+        return folded_oshape
+
+    def make_shape_compatible_op(self, model):
+        exp_ishape = self.get_normal_input_shape()
+        oshape = self.get_normal_output_shape()
+        ishape = tuple(model.get_tensor_shape(self.onnx_node.input[0]))
+        assert ishape == exp_ishape, "Unexpected input shape for Deconv."
+        # implement tensor with correct shape
+        return super().make_const_shape_op(oshape)
+
+    def infer_node_datatype(self, model):
+        node = self.onnx_node
+        idt = model.get_tensor_datatype(node.input[0])
+        if idt != self.get_input_datatype():
+            warn_str = "inputDataType changing for %s: %s -> %s " % (
+                node.name,
+                str(self.get_input_datatype()),
+                str(idt),
+            )
+            warnings.warn(warn_str)
+        self.set_nodeattr("inputDataType", idt.name)
+        # set output datatype from property
+        odt = self.get_output_datatype()
+        model.set_tensor_datatype(node.output[0], odt)
+
+    def verify_node(self):
+        pass
+
+    def get_input_datatype(self, ind=0):
+        """Returns FINN DataType of input."""
+        return DataType[self.get_nodeattr("inputDataType")]
+
+    def get_weight_datatype(self):
+        """Returns FINN DataType of weights."""
+        return DataType[self.get_nodeattr("weightDataType")]
+
+    def get_output_datatype(self, ind=0):
+        """Returns FINN DataType of output."""
+        return DataType[self.get_nodeattr("outputDataType")]
+
+    def get_instream_width(self, ind=0):
+        """Returns stream width, input and output stream width are equal for
+        the sliding window function"""
+        if ind == 0:
+            ibits = self.get_input_datatype().bitwidth()
+            simd = self.get_nodeattr("SIMD")
+            ifm_ch = self.get_nodeattr("IFMChannels")
+            assert ifm_ch % simd == 0, "SIMD must divide IFMChannels"
+            in_width = simd * ibits
+        else:
+            in_width = 0
+        return in_width
+
+    def get_outstream_width(self, ind=0):
+        o_bits = self.get_output_datatype().bitwidth()
+        out_width = o_bits * self.get_nodeattr("PE")
+        return out_width
+
+    def get_exp_cycles(self):
+        return 0
+
+    def bram_estimation(self):
+        return 0
+
+    def lut_estimation(self):
+        return 0
+
+    def uram_estimation(self):
+        return 0
+
+    def execute_node(self, context, graph):
+        pass
diff --git a/src/finn/custom_op/fpgadataflow/hls/__init__.py b/src/finn/custom_op/fpgadataflow/hls/__init__.py
index e80a581b57..b75656a758 100644
--- a/src/finn/custom_op/fpgadataflow/hls/__init__.py
+++ b/src/finn/custom_op/fpgadataflow/hls/__init__.py
@@ -58,6 +58,7 @@ def register_custom_op(cls):
 from finn.custom_op.fpgadataflow.hls.checksum_hls import CheckSum_hls
 from finn.custom_op.fpgadataflow.hls.concat_hls import StreamingConcat_hls
 from finn.custom_op.fpgadataflow.hls.crop_hls import Crop_hls
+from finn.custom_op.fpgadataflow.hls.deconvolution_hls import Deconvolution_hls
 from finn.custom_op.fpgadataflow.hls.duplicatestreams_hls import DuplicateStreams_hls
 from finn.custom_op.fpgadataflow.hls.fmpadding_pixel_hls import FMPadding_Pixel_hls
 from finn.custom_op.fpgadataflow.hls.globalaccpool_hls import GlobalAccPool_hls
@@ -84,6 +85,7 @@ def register_custom_op(cls):
 custom_op["AddStreams_hls"] = AddStreams_hls
 custom_op["ChannelwiseOp_hls"] = ChannelwiseOp_hls
 custom_op["CheckSum_hls"] = CheckSum_hls
+custom_op["Deconvolution_hls"] = Deconvolution_hls
 custom_op["Crop_hls"] = Crop_hls
 custom_op["DuplicateStreams_hls"] = DuplicateStreams_hls
 custom_op["FMPadding_Pixel_hls"] = FMPadding_Pixel_hls
diff --git a/src/finn/custom_op/fpgadataflow/hls/deconvolution_hls.py b/src/finn/custom_op/fpgadataflow/hls/deconvolution_hls.py
new file mode 100644
index 0000000000..d0e90661e3
--- /dev/null
+++ b/src/finn/custom_op/fpgadataflow/hls/deconvolution_hls.py
@@ -0,0 +1,213 @@
+# Copyright (C) 2024, Advanced Micro Devices, Inc.
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of FINN nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+import numpy as np
+from qonnx.util.basic import interleave_matrix_outer_dim_from_partitions
+
+from finn.custom_op.fpgadataflow.deconvolution import Deconvolution
+from finn.custom_op.fpgadataflow.hlsbackend import HLSBackend
+from finn.util.data_packing import numpy_to_hls_code
+
+
+class Deconvolution_hls(Deconvolution, HLSBackend):
+    """Corresponds to finn-hlslib deconv function."""
+
+    def __init__(self, onnx_node, **kwargs):
+        super().__init__(onnx_node, **kwargs)
+
+    def get_nodeattr_types(self):
+        my_attrs = {}
+        my_attrs.update(Deconvolution.get_nodeattr_types(self))
+        my_attrs.update(HLSBackend.get_nodeattr_types(self))
+        return my_attrs
+
+    def calc_wmem(self):
+        """Calculates and returns WMEM."""
+        ifm_ch = self.get_nodeattr("IFMChannels")
+        ofm_ch = self.get_nodeattr("OFMChannels")
+        kernel_2 = np.prod(self.get_nodeattr("KernelDim"))
+        pe = self.get_nodeattr("PE")
+        simd = self.get_nodeattr("SIMD")
+        assert ofm_ch % pe == 0, "Requirement output channels divisable by PE is violated."
+        assert ifm_ch % simd == 0, "Requirement input channels divisable by SIMD is violated."
+        wmem = (ofm_ch / pe) * kernel_2 * (ifm_ch / simd)
+        return int(wmem)
+
+    def generate_params(self, model, path):
+        code_gen_dir = path
+        # weights, if not external
+        weights = model.get_initializer(self.onnx_node.input[1])
+        # save hlslib-compatible weights in params.h
+        weight_filename = "{}/params.h".format(code_gen_dir)
+        self.make_weight_file(weights, "hls_header", weight_filename)
+
+    def make_weight_file(self, weights, weight_file_mode, weight_file_name):
+        """Produce a file containing given weights in appropriate format for this
+        layer. This file can be used for either synthesis or run-time reconfig
+        of weights.
+
+        Arguments:
+
+        * weights : numpy array with weights to be put into the file
+        * weight_file_mode : one of {hls_header, decoupled_verilog_dat,
+          decoupled_runtime}
+        * weight_file_name : filename for the weight file to be generated
+
+        """
+        # convert weights into hlslib-compatible format
+        weight_tensor = self.get_hw_compatible_weight_tensor(weights)
+        export_wdt = self.get_weight_datatype()
+        if weight_file_mode == "hls_header":
+            weight_hls_code = numpy_to_hls_code(weight_tensor, export_wdt, "weights", False, True)
+            # remove framing {}
+            weight_hls_code = weight_hls_code[1:-2] + ";"
+            # write weights into C++ header file as dictated by finn-hlslib
+            f_weights = open(weight_file_name, "w")
+            f_weights.write(
+                "static {} const weights[{}][{}][{}] = ".format(
+                    export_wdt.get_hls_datatype_str(),
+                    self.calc_wmem(),
+                    self.get_nodeattr("PE"),
+                    self.get_nodeattr("SIMD"),
+                )
+            )
+            f_weights.write(weight_hls_code)
+            f_weights.close()
+
+    def get_hw_compatible_weight_tensor(self, orig_weight_matrix):
+        """Convert the original numpy weight matrix orig_weight_matrix into
+        a form suitable for passing to the hlslib call:
+        * ensure OCH % PE == 0 and ICH % SIMD == 0
+        * interleave rows between PEs
+        * reshape into (1, PE, WMEM, SIMD) and return
+        """
+        k_h, k_w = self.get_nodeattr("KernelDim")
+        ifm_ch = self.get_nodeattr("IFMChannels")
+        ofm_ch = self.get_nodeattr("OFMChannels")
+        pe = self.get_nodeattr("PE")
+        simd = self.get_nodeattr("SIMD")
+        wmem = self.calc_wmem()
+        assert orig_weight_matrix.shape == (
+            ofm_ch,
+            k_h,
+            k_w,
+            ifm_ch,
+        ), """Weights matrix doesn't
+        have expected shape (ofm_ch, k_h, k_w, ifm_ch)"""
+        assert ofm_ch % pe == 0, "Requirement output channels divisable by PE is violated."
+        assert ifm_ch % simd == 0, "Requirement input channels divisable by SIMD is violated."
+        # interleave rows between PEs and reshape
+        # distribute rows between PEs
+        ret = orig_weight_matrix
+        ret = ret.reshape(ofm_ch, k_h * k_w * ifm_ch)
+        ret = interleave_matrix_outer_dim_from_partitions(ret, pe)
+        # create SIMD as innermost dimension
+        ret = ret.reshape(1, pe, wmem, simd)
+        ret = ret.transpose(0, 2, 1, 3)
+        return ret
+
+    def global_includes(self):
+        self.code_gen_dict["$GLOBALS$"] = ['#include "deconv.hpp"']
+
+    def defines(self, var):
+        ifm_dim = self.get_nodeattr("IFMDim")
+        self.code_gen_dict["$DEFINES$"] = [
+            """constexpr unsigned Kernel = {};\n constexpr unsigned Stride = {};\n
+            constexpr unsigned Padding = {};\n constexpr unsigned IFMH = {};\n
+            constexpr unsigned IFMW = {};\n constexpr unsigned ICH = {};\n
+            constexpr unsigned OCH = {};\n constexpr unsigned SIMD1 = {};\n
+            constexpr unsigned PE1 = {};""".format(
+                self.get_nodeattr("KernelDim")[0],
+                self.get_nodeattr("Stride")[0],
+                self.get_nodeattr("Padding")[0],
+                ifm_dim[0],
+                ifm_dim[1],
+                self.get_nodeattr("IFMChannels"),
+                self.get_nodeattr("OFMChannels"),
+                self.get_nodeattr("SIMD"),
+                self.get_nodeattr("PE"),
+            )
+        ]
+
+    def docompute(self):
+        self.code_gen_dict["$DOCOMPUTE$"] = []
+        self.code_gen_dict["$DOCOMPUTE$"].append(
+            """deconv<Kernel, Stride, Padding, IFMH, IFMW, OCH, ICH, PE1, SIMD1>
+            (weights, in0_V, out0_V);"""
+        )
+
+    def blackboxfunction(self):
+        input_elem_hls_type = self.get_input_datatype().get_hls_datatype_str()
+        output_elem_hls_type = self.get_output_datatype().get_hls_datatype_str()
+        simd = self.get_nodeattr("SIMD")
+        pe = self.get_nodeattr("PE")
+        in_stream = "hls::stream<hls::vector<%s, %d>> &in0_V" % (
+            input_elem_hls_type,
+            simd,
+        )
+        out_stream = "hls::stream<hls::vector<%s, %d>> &out0_V" % (
+            output_elem_hls_type,
+            pe,
+        )
+        blackbox_hls = "void %s(%s, %s)" % (self.onnx_node.name, in_stream, out_stream)
+        self.code_gen_dict["$BLACKBOXFUNCTION$"] = [blackbox_hls]
+
+    def pragmas(self):
+        self.code_gen_dict["$PRAGMAS$"] = ["#pragma HLS INTERFACE axis port=in0_V"]
+        self.code_gen_dict["$PRAGMAS$"].append("#pragma HLS INTERFACE axis port=out0_V")
+        self.code_gen_dict["$PRAGMAS$"].append("#pragma HLS INTERFACE ap_ctrl_none port=return")
+
+        self.code_gen_dict["$PRAGMAS$"].append('#include "params.h"')
+        # the weight tensor is ap_uint<simd*prec> [PE][WMEM]
+        # partition for parallel access along the PE dimension (dim 1)
+        # self.code_gen_dict["$PRAGMAS$"].append(
+        #    ("#pragma HLS ARRAY_PARTITION variable=weights.m_weights " "complete dim=1")
+        # )
+
+    def execute_node(self, context, graph):
+        HLSBackend.execute_node(self, context, graph)
+
+    def timeout_value(self):
+        """Set timeout value for HLS functions defined for one clock cycle"""
+        simd = self.get_nodeattr("SIMD")
+        i_ch = self.get_nodeattr("IFMChannels")
+        k_h, k_w = self.get_nodeattr("KernelDim")
+        s_h, s_w = self.get_nodeattr("Stride")
+        i_h, i_w = self.get_nodeattr("IFMDim")
+        p_h, p_w = self.get_nodeattr("Padding")
+        if p_w >= k_w - s_w:
+            padup = 0
+        else:
+            padup = (k_w - p_w - 1) / s_w
+        crop = s_w * padup - ((k_w - s_w) - p_w)
+        sf = i_ch / simd
+        w_eff = padup + i_w + padup
+        wo_eff = (w_eff - 1) * s_w + k_w
+        self.code_gen_dict["$TIMEOUT_VALUE$"] = [
+            "%s" % (wo_eff * (crop + 1) * ((k_w / s_w) ** 2) * 4 * sf + 50)
+        ]
diff --git a/tests/fpgadataflow/test_fpgadataflow_deconv.py b/tests/fpgadataflow/test_fpgadataflow_deconv.py
index 400ed3b6e3..6cd60e07ad 100644
--- a/tests/fpgadataflow/test_fpgadataflow_deconv.py
+++ b/tests/fpgadataflow/test_fpgadataflow_deconv.py
@@ -63,12 +63,11 @@
 target_clk_ns = 10
 
 
-def set_up_reference_model(idt, wdt, k, idim, ifm_ch, ofm_ch, stride, padding):
+def set_up_reference_model(idt, wdt, odt, k, idim, ifm_ch, ofm_ch, stride, padding):
     idim_h, idim_w = idim
     stride_h, stride_w = stride
     odim_h = (idim_h - 1) * stride_h - 2 * padding + (k - 1) + 1
     odim_w = (idim_w - 1) * stride_w - 2 * padding + (k - 1) + 1
-    odt = DataType["INT32"]
 
     inp = helper.make_tensor_value_info(
         "inp",
@@ -119,6 +118,73 @@ def set_up_reference_model(idt, wdt, k, idim, ifm_ch, ofm_ch, stride, padding):
 
     model = model.transform(InferShapes())
 
+    return model, w_tensor
+
+
+def create_deconv_node(idt, wdt, odt, k, idim, ifm_ch, ofm_ch, stride, padding, w_tensor):
+    idim_h, idim_w = idim
+    stride_h, stride_w = stride
+    odim_h = (idim_h - 1) * stride_h - 2 * padding + (k - 1) + 1
+    odim_w = (idim_w - 1) * stride_w - 2 * padding + (k - 1) + 1
+
+    inp = helper.make_tensor_value_info(
+        "inp",
+        TensorProto.FLOAT,
+        [
+            1,
+            idim_h,
+            idim_w,
+            ifm_ch,
+        ],
+    )
+    outp = helper.make_tensor_value_info("outp", TensorProto.FLOAT, [1, odim_h, odim_w, ofm_ch])
+    W = helper.make_tensor_value_info("W", TensorProto.FLOAT, [ofm_ch, k, k, ifm_ch])
+
+    Deconv = helper.make_node(
+        "Deconvolution_hls",
+        ["inp", "W"],
+        ["outp"],
+        domain="finn.custom_op.fpgadataflow.hls",
+        backend="fpgadataflow",
+        KernelDim=[k, k],
+        IFMChannels=ifm_ch,
+        OFMChannels=ofm_ch,
+        IFMDim=idim,
+        Stride=[stride_h, stride_w],
+        Padding=[padding, padding],
+        PE=1,
+        SIMD=1,
+        inputDataType=idt.name,
+        weightDataType=wdt.name,
+        outputDataType=odt.name,
+        cpp_interface="hls_vector",
+        hls_style="freerunning",
+    )
+
+    node_list = [Deconv]
+    value_info = [W]
+
+    graph = helper.make_graph(
+        nodes=node_list,
+        name="convtranspose_graph",
+        inputs=[inp],
+        outputs=[outp],
+        value_info=value_info,
+    )
+
+    model = qonnx_make_model(graph, producer_name="convtranspose-model")
+    model = ModelWrapper(model)
+
+    # initialize model
+    model.set_tensor_datatype("inp", idt)
+    model.set_tensor_datatype(model.graph.output[0].name, odt)
+    model.set_tensor_datatype("W", wdt)
+
+    w_tensor = w_tensor.transpose(1, 2, 3, 0)
+    model.set_initializer("W", w_tensor)
+
+    model = model.transform(InferShapes())
+
     return model
 
 
@@ -143,13 +209,16 @@ def set_up_reference_model(idt, wdt, k, idim, ifm_ch, ofm_ch, stride, padding):
 @pytest.mark.fpgadataflow
 @pytest.mark.slow
 @pytest.mark.vivado
-def test_fpgadataflow_deconv(idim, stride, ifm_ch, ofm_ch, simd, pe, k, padding, exec_mode):
+def test_fpgadataflow_deconv_pixel_pad(
+    idim, stride, ifm_ch, ofm_ch, simd, pe, k, padding, exec_mode
+):
     idt = wdt = DataType["INT4"]
     wdt = idt
+    odt = DataType["INT32"]
     idim_h, idim_w = idim
     stride_h, stride_w = stride
 
-    ref_model = set_up_reference_model(idt, wdt, k, idim, ifm_ch, ofm_ch, stride, padding)
+    ref_model = set_up_reference_model(idt, wdt, odt, k, idim, ifm_ch, ofm_ch, stride, padding)[0]
 
     odim_h = (idim_h - 1) * stride_h - 2 * padding + (k - 1) + 1
     odim_w = (idim_w - 1) * stride_w - 2 * padding + (k - 1) + 1
@@ -208,3 +277,93 @@ def test_fpgadataflow_deconv(idim, stride, ifm_ch, ofm_ch, simd, pe, k, padding,
         exp_cycles = exp_cycles_dict[node.name]
         assert np.isclose(exp_cycles, cycles_rtlsim, atol=10)
         assert exp_cycles != 0
+
+
+# input image dimension
+@pytest.mark.parametrize("idim", [[8, 8]])
+# number of rows and number of cols to add
+@pytest.mark.parametrize("stride", [[2, 2]])
+# number of channels
+@pytest.mark.parametrize("ifm_ch", [2])
+# number of channels
+@pytest.mark.parametrize("ofm_ch", [3])
+# Input parallelism
+@pytest.mark.parametrize("simd", [1])
+# PE
+@pytest.mark.parametrize("pe", [1])
+# kernel size
+@pytest.mark.parametrize("k", [4])
+# padding
+@pytest.mark.parametrize("padding", [1])
+# exec mode
+@pytest.mark.parametrize("exec_mode", ["cppsim"])
+@pytest.mark.fpgadataflow
+@pytest.mark.slow
+@pytest.mark.vivado
+def test_fpgadataflow_deconv_revd2(idim, stride, ifm_ch, ofm_ch, simd, pe, k, padding, exec_mode):
+    idt = wdt = DataType["INT8"]
+    wdt = idt
+    odt = DataType["INT32"]
+    idim_h, idim_w = idim
+    stride_h, stride_w = stride
+
+    ref_model, w_tensor = set_up_reference_model(
+        idt, wdt, odt, k, idim, ifm_ch, ofm_ch, stride, padding
+    )
+    model = create_deconv_node(idt, wdt, odt, k, idim, ifm_ch, ofm_ch, stride, padding, w_tensor)
+
+    odim_h = (idim_h - 1) * stride_h - 2 * padding + (k - 1) + 1
+    odim_w = (idim_w - 1) * stride_w - 2 * padding + (k - 1) + 1
+
+    input_tensor = gen_finn_dt_tensor(idt, [1, ifm_ch, idim_h, idim_w])
+    input_dict = {"inp": input_tensor}
+
+    y_expected = oxe.execute_onnx(ref_model, input_dict)["outp"]
+
+    # model = model.transform(InferShapes())
+    # model = model.transform(GiveUniqueNodeNames())
+    input_tensor_nhwc = input_tensor.transpose(0, 2, 3, 1)
+    input_dict_nhwc = {"inp": input_tensor_nhwc}
+    # y_produced = oxe.execute_onnx(model, input_dict_nhwc)["outp"]
+    # assert (y_produced == y_expected).all()
+
+    # model = model.transform(SpecializeLayers(test_fpga_part))
+    # model = model.transform(MinimizeAccumulatorWidth())
+
+    for n in model.graph.node:
+        if n.op_type.startswith("Deconvolution_hls"):
+            deconv_node = getCustomOp(n)
+            deconv_node.set_nodeattr("PE", pe)
+            deconv_node.set_nodeattr("SIMD", simd)
+
+    expected_oshape = (1, odim_h, odim_w, ofm_ch)
+    # model.save("deconv.onnx")
+    # cppsim
+    if exec_mode == "cppsim":
+        model = model.transform(GiveUniqueNodeNames())
+        model = model.transform(PrepareCppSim())
+        model = model.transform(CompileCppSim())
+        model = model.transform(SetExecMode("cppsim"))
+        # breakpoint()
+
+    # rtlsim
+    else:
+        model = model.transform(GiveUniqueNodeNames())
+        model = model.transform(PrepareIP(test_fpga_part, target_clk_ns))
+        model = model.transform(HLSSynthIP())
+        model = model.transform(PrepareRTLSim())
+        model = model.transform(SetExecMode("rtlsim"))
+
+    y_produced = oxe.execute_onnx(model, input_dict_nhwc)["outp"]
+    assert y_produced.shape == expected_oshape
+    y_produced = y_produced.transpose(0, 3, 1, 2)
+    assert (y_produced == y_expected).all()
+
+    if exec_mode == "rtlsim":
+        node = model.get_nodes_by_op_type("Deconvolution_hls")[0]
+        inst = getCustomOp(node)
+        cycles_rtlsim = inst.get_nodeattr("cycles_rtlsim")
+        exp_cycles_dict = model.analysis(exp_cycles_per_layer)
+        exp_cycles = exp_cycles_dict[node.name]
+        assert np.isclose(exp_cycles, cycles_rtlsim, atol=10)
+        assert exp_cycles != 0