# Singleton setup to safely load this module in multithreading contexts
def get_fastpack():
    """Build (on first use in this process) and return the fastpack C library.

    The first call compiles ``fast_pack.c`` into ``fastpack.so`` with gcc,
    loads the result through ctypes and declares the signature of
    ``array_to_hexstring_binary``. The loaded handle is cached in a
    module-level variable, so later calls only take the lock and return it.

    Returns:
        The ``ctypes.CDLL`` handle of the compiled library.

    Raises:
        RuntimeError: if no ``fastpack.so`` exists after the build attempt.
    """
    # Function-scope imports: the module header does not import these.
    import subprocess
    import warnings

    global __fastpack_so
    with __fastpack_load_lock:
        if __fastpack_so is None:
            # Compile. Drop any previous build first so that a failed
            # compilation is normally detected by the existence check below.
            try:
                os.remove(fastpack_lib)
            except OSError:
                # Nothing to delete, or the file cannot be removed (e.g. a
                # read-only install); the checks below cover both cases.
                pass

            # Argument list instead of a shell string: paths containing
            # spaces or shell metacharacters are passed through verbatim.
            compile_cmd = ["gcc", "-shared", "-O3", "-fpic", fastpack_source, "-o", fastpack_lib]
            try:
                build = subprocess.run(compile_cmd, capture_output=True, text=True, check=False)
                build_ok = build.returncode == 0
                build_log = build.stderr
            except OSError as err:
                # gcc itself could not be started (e.g. it is not installed)
                build_ok = False
                build_log = str(err)

            if not os.path.isfile(fastpack_lib):
                raise RuntimeError(
                    "Could not find fastpack.so. Did compilation fail?\n" + build_log
                )
            if not build_ok:
                # A library file is present although the build failed, so it
                # must be one that could not be removed above. Keep using it,
                # but do not hide the failed build.
                warnings.warn(
                    "Compiling fast_pack.c failed, using the existing fastpack.so:\n" + build_log
                )

            # Load
            fastpack = ct.CDLL(fastpack_lib)
            fastpack_floatarray = ndpointer(ct.c_float, flags="C_CONTIGUOUS")
            fastpack.array_to_hexstring_binary.argtypes = (
                fastpack_floatarray,
                ct.c_uint,
                ct.c_uint,
                ct.c_char_p,
            )
            fastpack.array_to_hexstring_binary.restype = ct.c_bool
            __fastpack_so = fastpack
        return __fastpack_so
#include <stdbool.h>
#include <stddef.h>
#include <string.h>

/***
 * Packs an array of floats holding FINN BINARY values (each element must be
 * exactly 0 or 1) into a lowercase hex string without any prefix.
 *
 * values:      input array; the element at the highest index becomes the
 *              least significant bit of the result
 * elements:    number of entries in values
 * padded_bits: total width of the result in bits. Must be a multiple of 4 and
 *              at least `elements` rounded up to the next multiple of 4.
 *              Leading hex digits not covered by the input are written as '0'.
 * out:         caller-provided buffer with room for padded_bits / 4
 *              characters plus the terminating '\0'
 *
 * The function returns false on an error (NULL pointer, invalid padded_bits,
 * or an element that is neither 0 nor 1) and true in case of success. On
 * failure the contents of out are unspecified.
 */
bool array_to_hexstring_binary(float* values, unsigned int elements, unsigned int padded_bits, char* out) {
    static const char hex_digits[] = "0123456789abcdef";

    if (out == NULL || (values == NULL && elements > 0)) {
        return false;
    }

    // Minimum number of bits required: elements rounded up to a multiple of 4.
    // Computed in a wider type so the rounding cannot wrap around.
    unsigned long long min_bits = ((unsigned long long) elements + 3ULL) / 4ULL * 4ULL;

    // Padded bits must be at least min_bits and divisible by 4 for hex repr
    if (min_bits > padded_bits || padded_bits % 4 != 0) {
        return false;
    }

    unsigned int total_digits = padded_bits / 4;
    unsigned int value_digits = (unsigned int) (min_bits / 4);
    unsigned int prefix_digits = total_digits - value_digits;

    // Leading zero padding and string terminator
    memset(out, '0', prefix_digits);
    out[total_digits] = '\0';

    // Walk the input from the last element (least significant bit) to the
    // first, filling the hex digits from right to left.
    unsigned int nibble = 0;
    unsigned int shift = 0;
    unsigned int pos = total_digits;
    for (unsigned int remaining = elements; remaining > 0; remaining--) {
        unsigned int index = remaining - 1;
        float bit = values[index];

        // Only exact 0 and 1 are valid; this also rejects NaN and negative
        // values, for which a float -> unsigned cast would be undefined.
        if (bit == 1.0f) {
            nibble |= 1u << shift;
        } else if (bit != 0.0f) {
            return false;
        }

        // Emit a hex digit when 4 bits are collected or the input is exhausted
        if (shift == 3 || index == 0) {
            out[--pos] = hex_digits[nibble];
            nibble = 0;
            shift = 0;
        } else {
            shift++;
        }
    }
    return true;
}
@pytest.mark.util
@pytest.mark.parametrize("tensorshape", [(1, 2, 16384, 64), (1, 1024, 2048)])
def test_pack_innermost_dim_to_hexstring_fast(tensorshape: tuple[int, ...]):
    """Check that the fastpack path of pack_innermost_dim_as_hex_string matches
    the pure Python path on random BINARY tensors, and record the wall-clock
    time of both paths in a text file next to this test."""
    tensor_count = 5
    assert tensorshape[-1] % 4 == 0, "Innermost tensorshape dimension must be divisible by 4"
    pad_to_nbits = tensorshape[-1] * 2

    # Create random binary tensors by simply rounding random tensors. The seed
    # is fixed so that a failure can be reproduced with the same inputs.
    rng = np.random.default_rng(0)
    tensors = [np.round(rng.random(tensorshape)).astype(np.float32) for _ in range(tensor_count)]

    def run_packing(use_fastpack):
        # Pack every tensor once; return the results and the elapsed seconds
        start = time.perf_counter()
        results = [
            pack_innermost_dim_as_hex_string(
                tensor,
                DataType["BINARY"],
                pad_to_nbits,
                reverse_inner=False,
                prefix="0x",
                use_fastpack=use_fastpack,
            )
            for tensor in tensors
        ]
        return results, time.perf_counter() - start

    # Warm-up: the first fastpack call compiles and loads the C library, which
    # must not be attributed to the packing time measured below.
    pack_innermost_dim_as_hex_string(
        np.zeros((1, 4), dtype=np.float32), DataType["BINARY"], 8, use_fastpack=True
    )

    # Test C impl
    results_c, c_time = run_packing(use_fastpack=True)

    # Test python impl
    results_python, python_time = run_packing(use_fastpack=False)

    assert np.array_equal(np.array(results_python), np.array(results_c))

    # Write timing results
    benchmark_file = os.path.join(
        os.path.dirname(__file__),
        "fastpack_benchmark" + "_".join(map(str, tensorshape)) + ".txt",
    )
    with open(benchmark_file, "w") as f:
        f.write("Pack_innermost_dim_to_hexstring benchmark test results\n")
        f.write("Shape: " + str(tensorshape) + "\n")
        f.write(f"Ran {tensor_count} times\n")
        f.write(
            f"Python: {python_time}s overall | {python_time / tensor_count}s on avg. per sample\n"
        )
        f.write(f"C: {c_time}s overall | {c_time / tensor_count}s on avg. per sample\n")