Skip to content
Merged
Show file tree
Hide file tree
Changes from 3 commits
Commits
Show all changes
25 commits
Select commit Hold shift + click to select a range
d69685a
Add bindings and tests for FixedShapeTensorType and Array
AlenkaF Apr 4, 2023
a6292f8
Fix linter error
AlenkaF Apr 5, 2023
1bdba1d
Add pa.fixedshapetensor factory function and update docstring examples
AlenkaF Apr 5, 2023
7c395b0
Apply suggestions from code review - Joris
AlenkaF Apr 5, 2023
d27d48f
Use pa.FixedSizeListArray.from_arrays(..) in from_numpy_ndarray()
AlenkaF Apr 5, 2023
8e790b4
Change fixedshapetensor to fixed_shape_tensor
AlenkaF Apr 5, 2023
64e0cd0
Add tests for all the custom attributes
AlenkaF Apr 5, 2023
48cbeb3
Add test for numpy F-contiguous
AlenkaF Apr 5, 2023
d9ca165
Correct dim_names() to return list of strings, not bytes
AlenkaF Apr 5, 2023
d3530af
Correct dim_names and permutation methods to return None and not empt…
AlenkaF Apr 5, 2023
e2ce8ba
Replace FixedShapeTensorType with fixed_shape_tensor in FixedShapeTen…
AlenkaF Apr 5, 2023
ee5d25c
Update from_numpy_ndarray docstrings
AlenkaF Apr 5, 2023
f5a5c0c
Update public-api.pxi
AlenkaF Apr 5, 2023
52f9e7e
Update python/pyarrow/types.pxi
AlenkaF Apr 5, 2023
f9dee9e
Merge branch 'main' into python-binding-tensor-extension-type
AlenkaF Apr 5, 2023
b171d00
Use ravel instead of flatten and raise ValueError if numpy array is no…
AlenkaF Apr 5, 2023
c0ec94c
Remove CFixedShapeTensorType binding in libarrow
AlenkaF Apr 5, 2023
f2d9fe7
Fix doctest failure
AlenkaF Apr 6, 2023
8b5dc93
Add explanation of permutation from the spec to the docstring of fixe…
AlenkaF Apr 6, 2023
570f086
from_numpy_ndarray should be a static method
AlenkaF Apr 6, 2023
3dbbe20
Apply suggestions from code review
AlenkaF Apr 6, 2023
223968a
Apply suggestions from code review
AlenkaF Apr 6, 2023
dd8fd31
Update to_numpy_ndarray docstring
AlenkaF Apr 7, 2023
1ebb829
Add a check for non-trivial permutation in to_numpy_ndarray
AlenkaF Apr 11, 2023
b2d0453
Update python/pyarrow/array.pxi
AlenkaF Apr 11, 2023
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 3 additions & 2 deletions python/pyarrow/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -170,6 +170,7 @@ def print_entry(label, value):
union, sparse_union, dense_union,
dictionary,
run_end_encoded,
fixedshapetensor,
field,
type_for_alias,
DataType, DictionaryType, StructType,
Expand All @@ -178,7 +179,7 @@ def print_entry(label, value):
TimestampType, Time32Type, Time64Type, DurationType,
FixedSizeBinaryType, Decimal128Type, Decimal256Type,
BaseExtensionType, ExtensionType,
RunEndEncodedType,
RunEndEncodedType, FixedShapeTensorType,
PyExtensionType, UnknownExtensionType,
register_extension_type, unregister_extension_type,
DictionaryMemo,
Expand Down Expand Up @@ -209,7 +210,7 @@ def print_entry(label, value):
Time32Array, Time64Array, DurationArray,
MonthDayNanoIntervalArray,
Decimal128Array, Decimal256Array, StructArray, ExtensionArray,
RunEndEncodedArray,
RunEndEncodedArray, FixedShapeTensorArray,
scalar, NA, _NULL as NULL, Scalar,
NullScalar, BooleanScalar,
Int8Scalar, Int16Scalar, Int32Scalar, Int64Scalar,
Expand Down
65 changes: 65 additions & 0 deletions python/pyarrow/array.pxi
Original file line number Diff line number Diff line change
Expand Up @@ -3090,6 +3090,71 @@ cdef class ExtensionArray(Array):
return self.storage.to_numpy(**kwargs)


class FixedShapeTensorArray(ExtensionArray):
    """
    Concrete class for fixed shape tensor extension arrays.

    Examples
    --------
    Define the extension type for tensor array

    >>> import pyarrow as pa
    >>> tensor_type = pa.FixedShapeTensorType(pa.int32(), [2, 2])

    Create an extension array

    >>> arr = [[1, 2, 3, 4], [10, 20, 30, 40], [100, 200, 300, 400]]
    >>> storage = pa.array(arr, pa.list_(pa.int32(), 4))
    >>> pa.ExtensionArray.from_storage(tensor_type, storage)
    <pyarrow.lib.FixedShapeTensorArray object at ...>
    [
      [
        1,
        2,
        3,
        4
      ],
      [
        10,
        20,
        30,
        40
      ],
      [
        100,
        200,
        300,
        400
      ]
    ]
    """

    def to_numpy_ndarray(self):
        """
        Convert fixed shape tensor extension array to a numpy ndarray.

        The result has one more dimension than the tensor elements: the
        first dimension is the length of this array, followed by the
        dimensions given by ``self.type.shape``.

        NOTE(review): the reshape is always done in C order; a non-trivial
        ``self.type.permutation`` is NOT applied here — reordering the
        dimensions afterwards is left to the caller.
        """
        # Reshaping a 1-D array to n-D is zero-copy for the contiguous
        # storage values.
        np_flat = np.asarray(self.storage.values)
        numpy_tensor = np_flat.reshape((len(self),) + tuple(self.type.shape),
                                       order='C')
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Does reshape incur a copy? Perhaps we should throw if permutation[0] != 0 (as that would mean array can't be chunked)?
Also shouldn't we take permutation into account when reshaping?

Copy link
Member

@jorisvandenbossche jorisvandenbossche Apr 6, 2023

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It doesn't guarantee to be zero-copy in all cases, but in this simple case of reshaping a 1D array to an nD array, this is always zero copy AFAIK

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

What happens in cases where permutation doesn't give C style layout?

Copy link
Member

@jorisvandenbossche jorisvandenbossche Apr 6, 2023

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

A permutation would happen after the reshape, I think? (so the initial reshape is always C contiguous) And that is left to the user for now in this method (which we should document, or add)

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Right, but permutation would not roundtrip to numpy object and back. I'd prefer to throw NotImplemented if permutation is not trivial (C).

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ah, actually I need to error if permutation[0] != 0. And if the permutation is for example [0, 2, 1] the user could still use the information about the permutation to rearrange the ordering after the reshape in to_numpy_ndarray. Please correct me if I am wrong =)

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ah, actually I need to error if permutation[0] != 0.

No, the self.type.permutation is only that of the individual tensor elements (given that you have a FixedShapeTensorArray, by definition the first dimension of the n+1 dim ndarray is always the length of the array, and can't be permuted)

And if the permutation is for example [0, 2, 1] the user could still use the information about the permutation to rearrange the ordering after the reshape in to_numpy_ndarray.

Yes, and as mentioned before, I think we should certainly add examples how the user can do this (if we decide to not automatically do it in to_numpy_ndarray)

Do we want to add a warning for dim_names not None also?

I wouldn't raise a warning for that, since it's a fact of life that numpy arrays don't have dimension names, so that's just a consequence of calling this method. We can mention that in the docstring, though, to be explicit.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

No, the self.type.permutation is only that of the individual tensor elements (given that you have a FixedShapeTensorArray, by definition the first dimension of the n+1 dim ndarray is always the length of the array, and can't be permuted)

Yes, thank you for confirming the thought I was currently struggling with!
Then I do not see the need for raising an error (as it would be raised for every permutation that is not None) but maybe make it explicit in the docstrings that the "0" dimension is by default not permutable and is fixed (see the examples and description I added in the docs PR for this binding: https://github.com/apache/arrow/pull/34957/files)

Yes, and as mentioned before, I think we should certainly add examples how the user can do this (if we decide to not automatically do it in to_numpy_ndarray)

Done: https://github.com/apache/arrow/pull/34957/files

I wouldn't raise a warning for that, since it's a fact of life that numpy arrays don't have dimension names, so that's just a consequence of calling this method. We can mention that in the docstring, though, to be explicit.

Agree, will make it explicit in the docstrings.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

After couple of days to think it over, I would agree with all the comments from Joris at the beginning of the thread.

  1. The use of permutation to reorder data should (for now) be left to the user to decide if they want to do that or not. I have added a note in the docstrings: dd8fd31
  2. to_numpy_ndarray is reshaping 1-D array so there is no need to check the layout style as 1-D array is always both C-contiguous and F-contiguous.
  3. We cannot create a F-contiguous tensor in PyArrow and there is a check for row layout in from_numpy_ndarray so there is no option, currently, to have a F-contiguous tensors in the tensor array anyways.

With this I propose to leave the code as is and maybe open an issue later to discuss changes/features further.

I have also added a PR for the documentation of the fixed_shape_tensor binding here: #34957. After the binding PR is merged I will run the code again, build the docs locally to check the html and then mark the PR as ready for review.

Copy link
Member

@rok rok Apr 10, 2023

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

  1. to_numpy_ndarray is reshaping 1-D array so there is no need to check the layout style as 1-D array is always both C-contiguous and F-contiguous.

While that's true for the physical layout it's not necessarily true for the logical layout. If permutation is non-trivial the tensor will not be laid out in memory in a C-contiguous way. I propose (#34883 (review)) we block converting tensors with non-trivial permutations and add correct logic to handle those later. (Another option is to go via array.ToTensor().to_numpy() once FromTensor/ToTensor is merged.)


return numpy_tensor

def from_numpy_ndarray(obj):
"""
Convert numpy tensors (ndarrays) to a fixed shape tensor extension array.
"""
numpy_type = obj.flatten().dtype
arrow_type = from_numpy_dtype(numpy_type)
shape = obj.shape[1:]
size = obj.size / obj.shape[0]

return ExtensionArray.from_storage(
FixedShapeTensorType(arrow_type, shape),
array([t.flatten() for t in obj],
list_(arrow_type, size))
)


cdef dict _array_classes = {
_Type_NA: NullArray,
_Type_BOOL: BooleanArray,
Expand Down
25 changes: 25 additions & 0 deletions python/pyarrow/includes/libarrow.pxd
Original file line number Diff line number Diff line change
Expand Up @@ -2619,6 +2619,31 @@ cdef extern from "arrow/extension_type.h" namespace "arrow":
shared_ptr[CArray] storage()


# Declarations for the C++ fixed shape tensor canonical extension type
# ("arrow.fixed_shape_tensor").
cdef extern from "arrow/extension/fixed_shape_tensor.h" namespace "arrow::extension":
    cdef cppclass CFixedShapeTensorType \
            " arrow::extension::FixedShapeTensorType"(CExtensionType):

        # Factory for the extension type: tensors of value_type with the
        # given shape; permutation and dim_names parametrize the layout.
        @staticmethod
        CResult[shared_ptr[CDataType]] Make(const shared_ptr[CDataType]& value_type,
                                            const vector[int64_t]& shape,
                                            const vector[int64_t]& permutation,
                                            const vector[c_string]& dim_names)

        # Reconstructs the type from serialized metadata (IPC round-trip).
        CResult[shared_ptr[CDataType]] Deserialize(const shared_ptr[CDataType] storage_type,
                                                   const c_string& serialized_data) const

        # Serializes the type's parameters into the metadata string.
        c_string Serialize() const

        # Accessors for the type's parameters.
        const shared_ptr[CDataType] value_type()
        const vector[int64_t] shape()
        const vector[int64_t] permutation()
        const vector[c_string] dim_names()

        CFixedShapeTensorType(shared_ptr[CDataType]& value_type, int32_t& size,
                              vector[int64_t]& shape, vector[int64_t]& permutation,
                              vector[c_string]& dim_names)


cdef extern from "arrow/util/compression.h" namespace "arrow" nogil:
cdef enum CCompressionType" arrow::Compression::type":
CCompressionType_UNCOMPRESSED" arrow::Compression::UNCOMPRESSED"
Expand Down
5 changes: 5 additions & 0 deletions python/pyarrow/lib.pxd
Original file line number Diff line number Diff line change
Expand Up @@ -199,6 +199,11 @@ cdef class ExtensionType(BaseExtensionType):
const CPyExtensionType* cpy_ext_type


# Python-level wrapper class for the fixed shape tensor extension type;
# tensor_ext_type holds a pointer to the underlying C++ type instance.
cdef class FixedShapeTensorType(BaseExtensionType):
    cdef:
        const CFixedShapeTensorType* tensor_ext_type


cdef class PyExtensionType(ExtensionType):
pass

Expand Down
3 changes: 3 additions & 0 deletions python/pyarrow/public-api.pxi
Original file line number Diff line number Diff line change
Expand Up @@ -76,6 +76,7 @@ cdef api object pyarrow_wrap_data_type(
cdef:
const CExtensionType* ext_type
const CPyExtensionType* cpy_ext_type
c_string tensor_name = tobytes("arrow.fixed_shape_tensor")
DataType out

if type.get() == NULL:
Expand Down Expand Up @@ -118,6 +119,8 @@ cdef api object pyarrow_wrap_data_type(
cpy_ext_type = dynamic_cast[_CPyExtensionTypePtr](ext_type)
if cpy_ext_type != nullptr:
return cpy_ext_type.GetInstance()
elif ext_type.extension_name() == tensor_name:
out = FixedShapeTensorType.__new__(FixedShapeTensorType)
else:
out = BaseExtensionType.__new__(BaseExtensionType)
else:
Expand Down
83 changes: 83 additions & 0 deletions python/pyarrow/tests/test_extension_type.py
Original file line number Diff line number Diff line change
Expand Up @@ -1127,3 +1127,86 @@ def test_cpp_extension_in_python(tmpdir):
reconstructed_array = batch.column(0)
assert reconstructed_array.type == uuid_type
assert reconstructed_array == array


def test_tensor_type():
    """Basic construction: extension name and backing storage type."""
    ttype = pa.FixedShapeTensorType(pa.int8(), (2, 3))
    assert ttype.extension_name == "arrow.fixed_shape_tensor"
    assert ttype.storage_type == pa.list_(pa.int8(), 6)


def test_tensor_class_methods():
    """Round-trip between FixedShapeTensorArray and numpy ndarrays."""
    ttype = pa.FixedShapeTensorType(pa.float32(), (2, 3))
    flat_values = [[1, 2, 3, 4, 5, 6], [1, 2, 3, 4, 5, 6]]
    storage = pa.array(flat_values, pa.list_(pa.float32(), 6))
    ext_arr = pa.ExtensionArray.from_storage(ttype, storage)

    expected = np.array(
        [[[1, 2, 3], [4, 5, 6]], [[1, 2, 3], [4, 5, 6]]], dtype=np.float32)
    np.testing.assert_array_equal(ext_arr.to_numpy_ndarray(), expected)

    from_np = pa.FixedShapeTensorArray.from_numpy_ndarray(expected)
    assert isinstance(from_np.type, pa.FixedShapeTensorType)
    assert from_np.type.value_type == pa.float32()
    assert from_np.type.shape == [2, 3]


@pytest.mark.parametrize("tensor_type", (
    pa.FixedShapeTensorType(pa.int8(), (2, 2, 3)),
    pa.FixedShapeTensorType(pa.int8(), (2, 2, 3), permutation=[0, 2, 1]),
    pa.FixedShapeTensorType(pa.int8(), (2, 2, 3), dim_names=['C', 'H', 'W'])
))
def test_tensor_type_ipc(tensor_type):
    """IPC round-trip preserves the extension type, its parameters and the
    concrete array class, for each type parametrization."""
    storage = pa.array([[1, 2, 3, 4, 5, 6, 1, 2, 3, 4, 5, 6]], pa.list_(pa.int8(), 12))
    arr = pa.ExtensionArray.from_storage(tensor_type, storage)
    batch = pa.RecordBatch.from_arrays([arr], ["ext"])

    # check the built array has exactly the expected class
    tensor_class = tensor_type.__arrow_ext_class__()
    assert type(arr) == tensor_class

    buf = ipc_write_batch(batch)
    del batch
    batch = ipc_read_batch(buf)

    result = batch.column(0)
    # check the deserialized array class is the expected one
    assert type(result) == tensor_class
    assert result.type.extension_name == "arrow.fixed_shape_tensor"
    assert arr.storage.to_pylist() == [[1, 2, 3, 4, 5, 6, 1, 2, 3, 4, 5, 6]]

    # we get back an actual TensorType
    assert isinstance(result.type, pa.FixedShapeTensorType)
    assert result.type.value_type == pa.int8()
    assert result.type.shape == [2, 2, 3]

    # using different parametrization as how it was registered
    tensor_type_uint = tensor_type.__class__(pa.uint8(), (2, 3))
    assert tensor_type_uint.extension_name == "arrow.fixed_shape_tensor"
    assert tensor_type_uint.value_type == pa.uint8()
    assert tensor_type_uint.shape == [2, 3]

    storage = pa.array([[1, 2, 3, 4, 5, 6], [1, 2, 3, 4, 5, 6]],
                       pa.list_(pa.uint8(), 6))
    arr = pa.ExtensionArray.from_storage(tensor_type_uint, storage)
    batch = pa.RecordBatch.from_arrays([arr], ["ext"])

    buf = ipc_write_batch(batch)
    del batch
    batch = ipc_read_batch(buf)
    result = batch.column(0)
    assert isinstance(result.type, pa.FixedShapeTensorType)
    assert result.type.value_type == pa.uint8()
    assert result.type.shape == [2, 3]
    assert type(result) == tensor_class


def test_tensor_type_equality():
    """Equality depends on both the value type and the shape."""
    base = pa.FixedShapeTensorType(pa.int8(), (2, 2, 3))
    assert base.extension_name == "arrow.fixed_shape_tensor"

    same_params = pa.FixedShapeTensorType(pa.int8(), (2, 2, 3))
    other_value_type = pa.FixedShapeTensorType(pa.uint8(), (2, 2, 3))
    assert base == same_params
    # deliberately exercise __eq__ (not !=) returning False
    assert not base == other_value_type
Loading