Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
52 changes: 43 additions & 9 deletions arrow-data/src/transform/list_view.rs
Original file line number Diff line number Diff line change
Expand Up @@ -27,21 +27,55 @@ pub(super) fn build_extend<T: ArrowNativeType + Integer + CheckedAdd>(
let offsets = array.buffer::<T>(0);
let sizes = array.buffer::<T>(1);
Box::new(
move |mutable: &mut _MutableArrayData, _index: usize, start: usize, len: usize| {
move |mutable: &mut _MutableArrayData, index: usize, start: usize, len: usize| {
let offset_buffer = &mut mutable.buffer1;
let sizes_buffer = &mut mutable.buffer2;

for &offset in &offsets[start..start + len] {
offset_buffer.push(offset);
}
// MutableArrayData builds a new independent array, so we must copy the child values
// that the source elements reference. Since ListView allows a
// non-contiguous offset layout we can do this efficiently by finding the
// bounding child range [child_min, child_max) that covers all elements being copied.
// We then bulk copy that entire range into the output child array, and
// remap each element's offset to point to its data at the new
// location. Sizes are unchanged.
//
// The copied range may include unreferenced child values in gaps
// between elements, but that is fine — ListView allows non-contiguous
// offsets, so the (offset, size) pairs precisely identify each
// element's data regardless of what else sits in the child array.

// sizes
for &size in &sizes[start..start + len] {
sizes_buffer.push(size);
// Find the bounding child range
let mut child_min = usize::MAX;
let mut child_max = 0usize;
for i in start..start + len {
let size = sizes[i].as_usize();
if size > 0 {
let offset = offsets[i].as_usize();
child_min = child_min.min(offset);
child_max = child_max.max(offset + size);
}
}

// the beauty of views is that we don't need to copy child_data, we just splat
// the offsets and sizes.
// Copy the child range
let child_base = if child_max > child_min {
let base = mutable.child_data[0].len();
mutable.child_data[0].extend(index, child_min, child_max);
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

why not slice?

Copy link
Contributor Author

@vegarsti vegarsti Mar 16, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Are you referring to extend_with_slice?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

No, probably the slice operation!

base
} else {
0
};

// Remap offsets
for i in start..start + len {
let size = sizes[i].as_usize();
let new_offset = if size > 0 {
offsets[i].as_usize() - child_min + child_base
} else {
0
};
offset_buffer.push(T::from_usize(new_offset).expect("offset overflow"));
sizes_buffer.push(sizes[i]);
}
},
)
}
Expand Down
2 changes: 1 addition & 1 deletion arrow-data/src/transform/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -719,7 +719,7 @@ impl<'a> MutableArrayData<'a> {
/// Extends the in progress array with a region of the input arrays
///
/// # Arguments
/// * `index` - the index of array that you what to copy values from
/// * `index` - the index of array that you want to copy values from
/// * `start` - the start index of the chunk (inclusive)
/// * `end` - the end index of the chunk (exclusive)
///
Expand Down
55 changes: 52 additions & 3 deletions arrow/tests/array_transform.rs
Original file line number Diff line number Diff line change
Expand Up @@ -17,9 +17,9 @@

use arrow::array::{
Array, ArrayRef, BooleanArray, Decimal128Array, DictionaryArray, FixedSizeBinaryArray,
FixedSizeListBuilder, Int16Array, Int32Array, Int64Array, Int64Builder, ListArray, ListBuilder,
MapBuilder, NullArray, StringArray, StringBuilder, StringDictionaryBuilder, StructArray,
UInt8Array, UInt16Array, UInt16Builder, UnionArray,
FixedSizeListBuilder, GenericListBuilder, Int16Array, Int32Array, Int64Array, Int64Builder,
ListArray, ListBuilder, ListViewArray, MapBuilder, NullArray, StringArray, StringBuilder,
StringDictionaryBuilder, StructArray, UInt8Array, UInt16Array, UInt16Builder, UnionArray,
};
use arrow::datatypes::Int16Type;
use arrow_array::StringViewArray;
Expand Down Expand Up @@ -1151,3 +1151,52 @@ fn test_fixed_size_list_append() {
.unwrap();
assert_eq!(finished, expected_fixed_size_list_data);
}

#[test]
fn test_list_view_mutable_array_data_extend() {
// Repro: MutableArrayData's extend for ListView copies offsets/sizes
// but does not copy the child data, resulting in an invalid array.
//
// Build a ListViewArray: [[1, 2], null, [3]]
let mut builder = GenericListBuilder::<i32, _>::new(Int64Builder::new());
builder.values().append_value(1);
builder.values().append_value(2);
builder.append(true);
builder.append(false);
builder.values().append_value(3);
builder.append(true);
let list: ListViewArray = builder.finish().into();

let data = list.to_data();
let mut mutable = MutableArrayData::new(vec![&data], false, 3);

// Extend with all 3 elements
mutable.extend(0, 0, 3);

let result = mutable.freeze();
let result_array = arrow_array::make_array(result);
let result_list_view = result_array
.as_any()
.downcast_ref::<ListViewArray>()
.unwrap();

// The result should have length 3 and match the original.
assert_eq!(result_list_view.len(), 3);

// Verify the child values were actually copied.
assert_eq!(result_list_view.values().len(), list.values().len());

// Collect the list elements to confirm they are accessible and correct.
let collected: Vec<Option<Vec<i64>>> = (0..result_list_view.len())
.map(|i| {
if result_list_view.is_null(i) {
None
} else {
let arr = result_list_view.value(i);
let int_arr = arr.as_any().downcast_ref::<Int64Array>().unwrap();
Some(int_arr.values().to_vec())
}
})
.collect();
assert_eq!(collected, vec![Some(vec![1, 2]), None, Some(vec![3])]);
}
Loading