diff --git a/arrow-data/src/transform/list_view.rs b/arrow-data/src/transform/list_view.rs index 9b66a6a6abb1..19a90e921d0b 100644 --- a/arrow-data/src/transform/list_view.rs +++ b/arrow-data/src/transform/list_view.rs @@ -27,21 +27,55 @@ pub(super) fn build_extend( let offsets = array.buffer::(0); let sizes = array.buffer::(1); Box::new( - move |mutable: &mut _MutableArrayData, _index: usize, start: usize, len: usize| { + move |mutable: &mut _MutableArrayData, index: usize, start: usize, len: usize| { let offset_buffer = &mut mutable.buffer1; let sizes_buffer = &mut mutable.buffer2; - for &offset in &offsets[start..start + len] { - offset_buffer.push(offset); - } + // MutableArrayData builds a new independent array, so we must copy the child values + // that the source elements reference. Since ListView allows a + // non-contiguous offset layout we can do this efficiently by finding the + // bounding child range [child_min, child_max) that covers all elements being copied. + // We then bulk copy that entire range into the output child array, and + // remap each element's offset to point to its data at the new + // location. Sizes are unchanged. + // + // The copied range may include unreferenced child values in gaps + // between elements, but that is fine — ListView allows non-contiguous + // offsets, so the (offset, size) pairs precisely identify each + // element's data regardless of what else sits in the child array. - // sizes - for &size in &sizes[start..start + len] { - sizes_buffer.push(size); + // Find the bounding child range + let mut child_min = usize::MAX; + let mut child_max = 0usize; + for i in start..start + len { + let size = sizes[i].as_usize(); + if size > 0 { + let offset = offsets[i].as_usize(); + child_min = child_min.min(offset); + child_max = child_max.max(offset + size); + } } - // the beauty of views is that we don't need to copy child_data, we just splat - // the offsets and sizes. + // Copy the child range + let child_base = if child_max > child_min { + let base = mutable.child_data[0].len(); + mutable.child_data[0].extend(index, child_min, child_max); + base + } else { + 0 + }; + + // Remap offsets + for i in start..start + len { + let size = sizes[i].as_usize(); + let new_offset = if size > 0 { + offsets[i].as_usize() - child_min + child_base + } else { + 0 + }; + offset_buffer.push(T::from_usize(new_offset).expect("offset overflow")); + sizes_buffer.push(sizes[i]); + } }, ) } diff --git a/arrow-data/src/transform/mod.rs b/arrow-data/src/transform/mod.rs index c6052817bfb6..07439bbd5bd6 100644 --- a/arrow-data/src/transform/mod.rs +++ b/arrow-data/src/transform/mod.rs @@ -719,7 +719,7 @@ impl<'a> MutableArrayData<'a> { /// Extends the in progress array with a region of the input arrays /// /// # Arguments - /// * `index` - the index of array that you what to copy values from + /// * `index` - the index of array that you want to copy values from /// * `start` - the start index of the chunk (inclusive) /// * `end` - the end index of the chunk (exclusive) /// diff --git a/arrow/tests/array_transform.rs b/arrow/tests/array_transform.rs index 511dc1e8bfcd..63e5db91de60 100644 --- a/arrow/tests/array_transform.rs +++ b/arrow/tests/array_transform.rs @@ -17,9 +17,9 @@ use arrow::array::{ Array, ArrayRef, BooleanArray, Decimal128Array, DictionaryArray, FixedSizeBinaryArray, - FixedSizeListBuilder, Int16Array, Int32Array, Int64Array, Int64Builder, ListArray, ListBuilder, - MapBuilder, NullArray, StringArray, StringBuilder, StringDictionaryBuilder, StructArray, - UInt8Array, UInt16Array, UInt16Builder, UnionArray, + FixedSizeListBuilder, GenericListBuilder, Int16Array, Int32Array, Int64Array, Int64Builder, + ListArray, ListBuilder, ListViewArray, MapBuilder, NullArray, StringArray, StringBuilder, + StringDictionaryBuilder, StructArray, UInt8Array, UInt16Array, UInt16Builder, UnionArray, }; use arrow::datatypes::Int16Type; use arrow_array::StringViewArray; @@ -1151,3 +1151,52 @@ fn test_fixed_size_list_append() { .unwrap(); assert_eq!(finished, expected_fixed_size_list_data); } + +#[test] +fn test_list_view_mutable_array_data_extend() { + // Repro: MutableArrayData's extend for ListView copies offsets/sizes + // but does not copy the child data, resulting in an invalid array. + // + // Build a ListViewArray: [[1, 2], null, [3]] + let mut builder = GenericListBuilder::::new(Int64Builder::new()); + builder.values().append_value(1); + builder.values().append_value(2); + builder.append(true); + builder.append(false); + builder.values().append_value(3); + builder.append(true); + let list: ListViewArray = builder.finish().into(); + + let data = list.to_data(); + let mut mutable = MutableArrayData::new(vec![&data], false, 3); + + // Extend with all 3 elements + mutable.extend(0, 0, 3); + + let result = mutable.freeze(); + let result_array = arrow_array::make_array(result); + let result_list_view = result_array + .as_any() + .downcast_ref::() + .unwrap(); + + // The result should have length 3 and match the original. + assert_eq!(result_list_view.len(), 3); + + // Verify the child values were actually copied. + assert_eq!(result_list_view.values().len(), list.values().len()); + + // Collect the list elements to confirm they are accessible and correct. + let collected: Vec>> = (0..result_list_view.len()) + .map(|i| { + if result_list_view.is_null(i) { + None + } else { + let arr = result_list_view.value(i); + let int_arr = arr.as_any().downcast_ref::().unwrap(); + Some(int_arr.values().to_vec()) + } + }) + .collect(); + assert_eq!(collected, vec![Some(vec![1, 2]), None, Some(vec![3])]); +}