Skip to content
Merged
Show file tree
Hide file tree
Changes from 3 commits
Commits
Show all changes
25 commits
Select commit Hold shift + click to select a range
d69685a
Add bindings and tests for FixedShapeTensorType and Array
AlenkaF Apr 4, 2023
a6292f8
Fix linter error
AlenkaF Apr 5, 2023
1bdba1d
Add pa.fixedshapetensor factory function and update docstring examples
AlenkaF Apr 5, 2023
7c395b0
Apply suggestions from code review - Joris
AlenkaF Apr 5, 2023
d27d48f
Use pa.FixedSizeListArray.from_arrays(..) in from_numpy_ndarray()
AlenkaF Apr 5, 2023
8e790b4
Change fixedshapetensor to fixed_shape_tensor
AlenkaF Apr 5, 2023
64e0cd0
Add tests for all the custom attributes
AlenkaF Apr 5, 2023
48cbeb3
Add test for numpy F-contiguous
AlenkaF Apr 5, 2023
d9ca165
Correct dim_names() to return list of strings, not bytes
AlenkaF Apr 5, 2023
d3530af
Correct dim_names and permutation methods to return None and not empt…
AlenkaF Apr 5, 2023
e2ce8ba
Replace FixedShapeTensorType with fixed_shape_tensor in FixedShapeTen…
AlenkaF Apr 5, 2023
ee5d25c
Update from_numpy_ndarray docstrings
AlenkaF Apr 5, 2023
f5a5c0c
Update public-api.pxi
AlenkaF Apr 5, 2023
52f9e7e
Update python/pyarrow/types.pxi
AlenkaF Apr 5, 2023
f9dee9e
Merge branch 'main' into python-binding-tensor-extension-type
AlenkaF Apr 5, 2023
b171d00
Use ravel instead of flatten and raise ValueError if numpy array is no…
AlenkaF Apr 5, 2023
c0ec94c
Remove CFixedShapeTensorType binding in libarrow
AlenkaF Apr 5, 2023
f2d9fe7
Fix doctest failure
AlenkaF Apr 6, 2023
8b5dc93
Add explanation of permutation from the spec to the docstring of fixe…
AlenkaF Apr 6, 2023
570f086
from_numpy_ndarray should be a static method
AlenkaF Apr 6, 2023
3dbbe20
Apply suggestions from code review
AlenkaF Apr 6, 2023
223968a
Apply suggestions from code review
AlenkaF Apr 6, 2023
dd8fd31
Update to_numpy_ndarray docstring
AlenkaF Apr 7, 2023
1ebb829
Add a check for non-trivial permutation in to_numpy_ndarray
AlenkaF Apr 11, 2023
b2d0453
Update python/pyarrow/array.pxi
AlenkaF Apr 11, 2023
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 3 additions & 2 deletions python/pyarrow/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -170,6 +170,7 @@ def print_entry(label, value):
union, sparse_union, dense_union,
dictionary,
run_end_encoded,
fixedshapetensor,
field,
type_for_alias,
DataType, DictionaryType, StructType,
Expand All @@ -178,7 +179,7 @@ def print_entry(label, value):
TimestampType, Time32Type, Time64Type, DurationType,
FixedSizeBinaryType, Decimal128Type, Decimal256Type,
BaseExtensionType, ExtensionType,
RunEndEncodedType,
RunEndEncodedType, FixedShapeTensorType,
PyExtensionType, UnknownExtensionType,
register_extension_type, unregister_extension_type,
DictionaryMemo,
Expand Down Expand Up @@ -209,7 +210,7 @@ def print_entry(label, value):
Time32Array, Time64Array, DurationArray,
MonthDayNanoIntervalArray,
Decimal128Array, Decimal256Array, StructArray, ExtensionArray,
RunEndEncodedArray,
RunEndEncodedArray, FixedShapeTensorArray,
scalar, NA, _NULL as NULL, Scalar,
NullScalar, BooleanScalar,
Int8Scalar, Int16Scalar, Int32Scalar, Int64Scalar,
Expand Down
65 changes: 65 additions & 0 deletions python/pyarrow/array.pxi
Original file line number Diff line number Diff line change
Expand Up @@ -3090,6 +3090,71 @@ cdef class ExtensionArray(Array):
return self.storage.to_numpy(**kwargs)


class FixedShapeTensorArray(ExtensionArray):
    """
    Concrete class for fixed shape tensor extension arrays.

    Examples
    --------
    Define the extension type for tensor array

    >>> import pyarrow as pa
    >>> tensor_type = pa.FixedShapeTensorType(pa.int32(), [2, 2])

    Create an extension array

    >>> arr = [[1, 2, 3, 4], [10, 20, 30, 40], [100, 200, 300, 400]]
    >>> storage = pa.array(arr, pa.list_(pa.int32(), 4))
    >>> pa.ExtensionArray.from_storage(tensor_type, storage)
    <pyarrow.lib.FixedShapeTensorArray object at ...>
    [
      [
        1,
        2,
        3,
        4
      ],
      [
        10,
        20,
        30,
        40
      ],
      [
        100,
        200,
        300,
        400
      ]
    ]
    """

    def to_numpy_ndarray(self):
        """
        Convert fixed shape tensor extension array to a numpy ndarray.

        The result has one more dimension than the tensor elements: the
        first dimension is the length of this array, followed by the
        dimensions given by ``self.type.shape``.

        NOTE(review): the reshape is always done in C order; a non-trivial
        ``self.type.permutation`` is NOT applied here — reordering the
        dimensions afterwards is left to the caller.
        """
        # Reshaping a 1-D array to n-D is zero-copy for the contiguous
        # storage values.
        np_flat = np.asarray(self.storage.values)
        numpy_tensor = np_flat.reshape((len(self),) + tuple(self.type.shape),
                                       order='C')
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Does reshape incur a copy? Perhaps we should throw if permutation[0] != 0 (as that would mean array can't be chunked)?
Also shouldn't we take permutation into account when reshaping?

Copy link
Member

@jorisvandenbossche jorisvandenbossche Apr 6, 2023

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It doesn't guarantee to be zero-copy in all cases, but in this simple case of reshaping a 1D array to an nD array, this is always zero copy AFAIK

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

What happens in cases where permutation doesn't give C style layout?

Copy link
Member

@jorisvandenbossche jorisvandenbossche Apr 6, 2023

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

A permutation would happen after the reshape, I think? (so the initial reshape is always C contiguous) And that is left to the user for now in this method (which we should document, or add)

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Right, but permutation would not roundtrip to numpy object and back. I'd prefer to throw NotImplemented if permutation is not trivial (C).

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ah, actually I need to error if permutation[0] != 0. And if the permutation is for example [0, 2, 1] the user could still use the information about the permutation to rearrange the ordering after the reshape in to_numpy_ndarray. Please correct me if I am wrong =)

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ah, actually I need to error if permutation[0] != 0.

No, the self.type.permutation is only that of the individual tensor elements (given that you have a FixedShapeTensorArray, by definition the first dimension of the n+1 dim ndarray is always the length of the array, and can't be permuted)

And if the permutation is for example [0, 2, 1] the user could still use the information about the permutation to rearrange the ordering after the reshape in to_numpy_ndarray.

Yes, and as mentioned before, I think we should certainly add examples how the user can do this (if we decide to not automatically do it in to_numpy_ndarray)

Do we want to add a warning for dim_names not None also?

I wouldn't raise a warning for that, since it's a fact of life that numpy arrays don't have dimension names, so that's just a consequence of calling this method. We can mention that in the docstring, though, to be explicit.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

No, the self.type.permutation is only that of the individual tensor elements (given that you have a FixedShapeTensorArray, by definition the first dimension of the n+1 dim ndarray is always the length of the array, and can't be permuted)

Yes, thank you for confirming the thought I was currently struggling with!
Then I do not see the need for raising an error (as it would be raised for every permutation that is not None) but maybe make it explicit in the docstrings that the "0" dimension is by default not permutable and is fixed (see the examples and description I added in the docs PR for this binding: https://github.com/apache/arrow/pull/34957/files)

Yes, and as mentioned before, I think we should certainly add examples how the user can do this (if we decide to not automatically do it in to_numpy_ndarray)

Done: https://github.com/apache/arrow/pull/34957/files

I wouldn't raise a warning for that, since it's a fact of life that numpy arrays don't have dimension names, so that's just a consequence of calling this method. We can mention that in the docstring, though, to be explicit.

Agree, will make it explicit in the docstrings.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

After couple of days to think it over, I would agree with all the comments from Joris at the beginning of the thread.

  1. The use of permutation to reorder data should (for now) be left to the user to decide if they want to do that or not. I have added a note in the docstrings: dd8fd31
  2. to_numpy_ndarray is reshaping 1-D array so there is no need to check the layout style as 1-D array is always both C-contiguous and F-contiguous.
  3. We cannot create a F-contiguous tensor in PyArrow and there is a check for row layout in from_numpy_ndarray so there is no option, currently, to have a F-contiguous tensors in the tensor array anyways.

With this I propose to leave the code as is and maybe open an issue later to discuss changes/features further.

I have also added a PR for the documentation of the fixed_shape_tensor binding here: #34957. After the binding PR is merged I will run the code again, build the docs locally to check the html and then mark the PR as ready for review.

Copy link
Member

@rok rok Apr 10, 2023

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

  1. to_numpy_ndarray is reshaping 1-D array so there is no need to check the layout style as 1-D array is always both C-contiguous and F-contiguous.

While that's true for the physical layout it's not necessarily true for the logical layout. If permutation is non-trivial the tensor will not be laid out in memory in a C-contiguous way. I propose (#34883 (review)) we block converting tensors with non-trivial permutations and add correct logic to handle those later. (Another option is to go via array.ToTensor().to_numpy() once FromTensor/ToTensor is merged.)


return numpy_tensor

def from_numpy_ndarray(obj):
"""
Convert numpy tensors (ndarrays) to a fixed shape tensor extension array.
"""
numpy_type = obj.flatten().dtype
arrow_type = from_numpy_dtype(numpy_type)
shape = obj.shape[1:]
size = obj.size / obj.shape[0]

return ExtensionArray.from_storage(
FixedShapeTensorType(arrow_type, shape),
array([t.flatten() for t in obj],
list_(arrow_type, size))
)


cdef dict _array_classes = {
_Type_NA: NullArray,
_Type_BOOL: BooleanArray,
Expand Down
25 changes: 25 additions & 0 deletions python/pyarrow/includes/libarrow.pxd
Original file line number Diff line number Diff line change
Expand Up @@ -2619,6 +2619,31 @@ cdef extern from "arrow/extension_type.h" namespace "arrow":
shared_ptr[CArray] storage()


# Declarations for the C++ fixed shape tensor canonical extension type
# ("arrow.fixed_shape_tensor").
cdef extern from "arrow/extension/fixed_shape_tensor.h" namespace "arrow::extension":
    cdef cppclass CFixedShapeTensorType \
            " arrow::extension::FixedShapeTensorType"(CExtensionType):

        # Factory for the extension type: tensors of value_type with the
        # given shape; permutation and dim_names parametrize the layout.
        @staticmethod
        CResult[shared_ptr[CDataType]] Make(const shared_ptr[CDataType]& value_type,
                                            const vector[int64_t]& shape,
                                            const vector[int64_t]& permutation,
                                            const vector[c_string]& dim_names)

        # Reconstructs the type from serialized metadata (IPC round-trip).
        CResult[shared_ptr[CDataType]] Deserialize(const shared_ptr[CDataType] storage_type,
                                                   const c_string& serialized_data) const

        # Serializes the type's parameters into the metadata string.
        c_string Serialize() const

        # Accessors for the type's parameters.
        const shared_ptr[CDataType] value_type()
        const vector[int64_t] shape()
        const vector[int64_t] permutation()
        const vector[c_string] dim_names()

        CFixedShapeTensorType(shared_ptr[CDataType]& value_type, int32_t& size,
                              vector[int64_t]& shape, vector[int64_t]& permutation,
                              vector[c_string]& dim_names)


cdef extern from "arrow/util/compression.h" namespace "arrow" nogil:
cdef enum CCompressionType" arrow::Compression::type":
CCompressionType_UNCOMPRESSED" arrow::Compression::UNCOMPRESSED"
Expand Down
5 changes: 5 additions & 0 deletions python/pyarrow/lib.pxd
Original file line number Diff line number Diff line change
Expand Up @@ -199,6 +199,11 @@ cdef class ExtensionType(BaseExtensionType):
const CPyExtensionType* cpy_ext_type


# Python-level wrapper class for the fixed shape tensor extension type;
# tensor_ext_type holds a pointer to the underlying C++ type instance.
cdef class FixedShapeTensorType(BaseExtensionType):
    cdef:
        const CFixedShapeTensorType* tensor_ext_type


cdef class PyExtensionType(ExtensionType):
pass

Expand Down
3 changes: 3 additions & 0 deletions python/pyarrow/public-api.pxi
Original file line number Diff line number Diff line change
Expand Up @@ -76,6 +76,7 @@ cdef api object pyarrow_wrap_data_type(
cdef:
const CExtensionType* ext_type
const CPyExtensionType* cpy_ext_type
c_string tensor_name = tobytes("arrow.fixed_shape_tensor")
DataType out

if type.get() == NULL:
Expand Down Expand Up @@ -118,6 +119,8 @@ cdef api object pyarrow_wrap_data_type(
cpy_ext_type = dynamic_cast[_CPyExtensionTypePtr](ext_type)
if cpy_ext_type != nullptr:
return cpy_ext_type.GetInstance()
elif ext_type.extension_name() == tensor_name:
out = FixedShapeTensorType.__new__(FixedShapeTensorType)
else:
out = BaseExtensionType.__new__(BaseExtensionType)
else:
Expand Down
83 changes: 83 additions & 0 deletions python/pyarrow/tests/test_extension_type.py
Original file line number Diff line number Diff line change
Expand Up @@ -1127,3 +1127,86 @@ def test_cpp_extension_in_python(tmpdir):
reconstructed_array = batch.column(0)
assert reconstructed_array.type == uuid_type
assert reconstructed_array == array


def test_tensor_type():
    """Basic construction: extension name and backing storage type."""
    ttype = pa.FixedShapeTensorType(pa.int8(), (2, 3))
    assert ttype.extension_name == "arrow.fixed_shape_tensor"
    assert ttype.storage_type == pa.list_(pa.int8(), 6)


def test_tensor_class_methods():
    """Round-trip between FixedShapeTensorArray and numpy ndarrays."""
    ttype = pa.FixedShapeTensorType(pa.float32(), (2, 3))
    flat_values = [[1, 2, 3, 4, 5, 6], [1, 2, 3, 4, 5, 6]]
    storage = pa.array(flat_values, pa.list_(pa.float32(), 6))
    ext_arr = pa.ExtensionArray.from_storage(ttype, storage)

    expected = np.array(
        [[[1, 2, 3], [4, 5, 6]], [[1, 2, 3], [4, 5, 6]]], dtype=np.float32)
    np.testing.assert_array_equal(ext_arr.to_numpy_ndarray(), expected)

    from_np = pa.FixedShapeTensorArray.from_numpy_ndarray(expected)
    assert isinstance(from_np.type, pa.FixedShapeTensorType)
    assert from_np.type.value_type == pa.float32()
    assert from_np.type.shape == [2, 3]


@pytest.mark.parametrize("tensor_type", (
    pa.FixedShapeTensorType(pa.int8(), (2, 2, 3)),
    pa.FixedShapeTensorType(pa.int8(), (2, 2, 3), permutation=[0, 2, 1]),
    pa.FixedShapeTensorType(pa.int8(), (2, 2, 3), dim_names=['C', 'H', 'W'])
))
def test_tensor_type_ipc(tensor_type):
    """IPC round-trip preserves the extension type, its parameters and the
    concrete array class, for each type parametrization."""
    storage = pa.array([[1, 2, 3, 4, 5, 6, 1, 2, 3, 4, 5, 6]], pa.list_(pa.int8(), 12))
    arr = pa.ExtensionArray.from_storage(tensor_type, storage)
    batch = pa.RecordBatch.from_arrays([arr], ["ext"])

    # check the built array has exactly the expected class
    tensor_class = tensor_type.__arrow_ext_class__()
    assert type(arr) == tensor_class

    buf = ipc_write_batch(batch)
    del batch
    batch = ipc_read_batch(buf)

    result = batch.column(0)
    # check the deserialized array class is the expected one
    assert type(result) == tensor_class
    assert result.type.extension_name == "arrow.fixed_shape_tensor"
    assert arr.storage.to_pylist() == [[1, 2, 3, 4, 5, 6, 1, 2, 3, 4, 5, 6]]

    # we get back an actual TensorType
    assert isinstance(result.type, pa.FixedShapeTensorType)
    assert result.type.value_type == pa.int8()
    assert result.type.shape == [2, 2, 3]

    # using different parametrization as how it was registered
    tensor_type_uint = tensor_type.__class__(pa.uint8(), (2, 3))
    assert tensor_type_uint.extension_name == "arrow.fixed_shape_tensor"
    assert tensor_type_uint.value_type == pa.uint8()
    assert tensor_type_uint.shape == [2, 3]

    storage = pa.array([[1, 2, 3, 4, 5, 6], [1, 2, 3, 4, 5, 6]],
                       pa.list_(pa.uint8(), 6))
    arr = pa.ExtensionArray.from_storage(tensor_type_uint, storage)
    batch = pa.RecordBatch.from_arrays([arr], ["ext"])

    buf = ipc_write_batch(batch)
    del batch
    batch = ipc_read_batch(buf)
    result = batch.column(0)
    assert isinstance(result.type, pa.FixedShapeTensorType)
    assert result.type.value_type == pa.uint8()
    assert result.type.shape == [2, 3]
    assert type(result) == tensor_class


def test_tensor_type_equality():
    """Equality depends on both the value type and the shape."""
    base = pa.FixedShapeTensorType(pa.int8(), (2, 2, 3))
    assert base.extension_name == "arrow.fixed_shape_tensor"

    same_params = pa.FixedShapeTensorType(pa.int8(), (2, 2, 3))
    other_value_type = pa.FixedShapeTensorType(pa.uint8(), (2, 2, 3))
    assert base == same_params
    # deliberately exercise __eq__ (not !=) returning False
    assert not base == other_value_type
Loading