diff --git a/arrow-select/src/interleave.rs b/arrow-select/src/interleave.rs
index 711e816f70d5..1f9773422920 100644
--- a/arrow-select/src/interleave.rs
+++ b/arrow-select/src/interleave.rs
@@ -108,6 +108,8 @@ pub fn interleave(
         DataType::Struct(fields) => interleave_struct(fields, values, indices),
         DataType::List(field) => interleave_list::<i32>(values, indices, field),
         DataType::LargeList(field) => interleave_list::<i64>(values, indices, field),
+        DataType::ListView(field) => interleave_list_view::<i32>(values, indices, field),
+        DataType::LargeListView(field) => interleave_list_view::<i64>(values, indices, field),
         _ => interleave_fallback(values, indices)
     }
 }
@@ -411,6 +413,64 @@ fn interleave_list(
     Ok(Arc::new(list_array))
 }
 
+/// Interleaves [`GenericListViewArray`] rows selected by `indices`.
+///
+/// Each `(array_idx, row_idx)` pair picks one row of `values[array_idx]`. The
+/// referenced child ranges are gathered with a recursive [`interleave`] call and
+/// laid out back to back in output order, one range per output row.
+///
+/// Returns [`ArrowError::OffsetOverflowError`] if a view's starting offset does
+/// not fit in `O`.
+fn interleave_list_view<O: OffsetSizeTrait>(
+    values: &[&dyn Array],
+    indices: &[(usize, usize)],
+    field: &FieldRef,
+) -> Result<ArrayRef, ArrowError> {
+    let interleaved = Interleave::<'_, GenericListViewArray<O>>::new(values, indices);
+
+    // Build the output offsets/sizes: views are packed back to back in output
+    // order, so each offset is the running total of the preceding sizes
+    let mut capacity = 0usize;
+    let mut offsets = Vec::with_capacity(indices.len());
+    let mut sizes = Vec::with_capacity(indices.len());
+    for &(array_idx, row_idx) in indices {
+        let size = interleaved.arrays[array_idx].sizes()[row_idx];
+        offsets.push(
+            O::from_usize(capacity).ok_or_else(|| ArrowError::OffsetOverflowError(capacity))?,
+        );
+        // The size is already an `O`; copy it as-is rather than round-tripping via usize
+        sizes.push(size);
+        capacity += size.as_usize();
+    }
+
+    // Build child indices for recursive interleave of child values
+    let mut child_indices = Vec::with_capacity(capacity);
+    for &(array_idx, row_idx) in indices {
+        let list = interleaved.arrays[array_idx];
+        let start = list.offsets()[row_idx].as_usize();
+        let size = list.sizes()[row_idx].as_usize();
+        child_indices.extend((start..start + size).map(|i| (array_idx, i)));
+    }
+
+    let child_arrays: Vec<&dyn Array> = interleaved
+        .arrays
+        .iter()
+        .map(|list| list.values().as_ref())
+        .collect();
+
+    let interleaved_values = interleave(&child_arrays, &child_indices)?;
+
+    let list_view_array = GenericListViewArray::<O>::new(
+        field.clone(),
+        offsets.into(),
+        sizes.into(),
+        interleaved_values,
+        interleaved.nulls,
+    );
+
+    Ok(Arc::new(list_view_array))
+}
+
 /// Fallback implementation of interleave using [`MutableArrayData`]
 fn interleave_fallback(
     values: &[&dyn Array],
@@ -580,7 +640,9 @@ pub fn interleave_record_batch(
 mod tests {
     use super::*;
     use arrow_array::Int32RunArray;
-    use arrow_array::builder::{GenericListBuilder, Int32Builder, PrimitiveRunBuilder};
+    use arrow_array::builder::{
+        GenericListBuilder, Int32Builder, Int64Builder, PrimitiveRunBuilder,
+    };
     use arrow_array::types::Int8Type;
     use arrow_schema::Field;
 
@@ -769,6 +831,61 @@ mod tests {
         test_interleave_lists::<i64>();
     }
 
+    fn test_interleave_list_views<O: OffsetSizeTrait>() {
+        // [[1, 2], null, [3]]
+        let mut a = GenericListBuilder::<O, _>::new(Int32Builder::new());
+        a.values().append_value(1);
+        a.values().append_value(2);
+        a.append(true);
+        a.append(false);
+        a.values().append_value(3);
+        a.append(true);
+        let a: GenericListViewArray<O> = a.finish().into();
+
+        // [[4], null, [5, 6, null]]
+        let mut b = GenericListBuilder::<O, _>::new(Int32Builder::new());
+        b.values().append_value(4);
+        b.append(true);
+        b.append(false);
+        b.values().append_value(5);
+        b.values().append_value(6);
+        b.values().append_null();
+        b.append(true);
+        let b: GenericListViewArray<O> = b.finish().into();
+
+        let values = interleave(&[&a, &b], &[(0, 2), (0, 1), (1, 0), (1, 2), (1, 1)]).unwrap();
+        let v = values
+            .as_any()
+            .downcast_ref::<GenericListViewArray<O>>()
+            .unwrap();
+
+        // [[3], null, [4], [5, 6, null], null]
+        let mut expected = GenericListBuilder::<O, _>::new(Int32Builder::new());
+        expected.values().append_value(3);
+        expected.append(true);
+        expected.append(false);
+        expected.values().append_value(4);
+        expected.append(true);
+        expected.values().append_value(5);
+        expected.values().append_value(6);
+        expected.values().append_null();
+        expected.append(true);
+        expected.append(false);
+        let expected: GenericListViewArray<O> = expected.finish().into();
+
+        assert_eq!(v, &expected);
+    }
+
+    #[test]
+    fn test_list_views() {
+        test_interleave_list_views::<i32>();
+    }
+
+    #[test]
+    fn test_large_list_views() {
+        test_interleave_list_views::<i64>();
+    }
+
     #[test]
     fn test_struct_without_nulls() {
         let fields = Fields::from(vec![
@@ -1489,4 +1606,51 @@ mod tests {
             Err(ArrowError::OffsetOverflowError(_))
         ));
     }
+
+    /// Regression test for ListView interleaving: the non-native path, which fell
+    /// through to the MutableArrayData fallback, is broken for ListView arrays.
+    #[test]
+    fn test_list_view_interleave_fallback_regression() {
+        // Array a: [[1, 2], null, [3]]
+        let mut builder = GenericListBuilder::<i32, _>::new(Int64Builder::new());
+        builder.values().append_value(1);
+        builder.values().append_value(2);
+        builder.append(true);
+        builder.append(false);
+        builder.values().append_value(3);
+        builder.append(true);
+        let a: ListViewArray = builder.finish().into();
+
+        // Array b: [[4], null, [5, 6, 7]]
+        let mut builder = GenericListBuilder::<i32, _>::new(Int64Builder::new());
+        builder.values().append_value(4);
+        builder.append(true);
+        builder.append(false);
+        builder.values().append_value(5);
+        builder.values().append_value(6);
+        builder.values().append_value(7);
+        builder.append(true);
+        let b: ListViewArray = builder.finish().into();
+
+        let indices = &[(0, 2), (1, 0), (0, 0), (1, 2)];
+        let result = interleave(&[&a as _, &b as _], indices).unwrap();
+        let result_list_view = result.as_any().downcast_ref::<ListViewArray>().unwrap();
+
+        // Build expected: [[3], [4], [1, 2], [5, 6, 7]]
+        let mut expected_builder = GenericListBuilder::<i32, _>::new(Int64Builder::new());
+        expected_builder.values().append_value(3);
+        expected_builder.append(true);
+        expected_builder.values().append_value(4);
+        expected_builder.append(true);
+        expected_builder.values().append_value(1);
+        expected_builder.values().append_value(2);
+        expected_builder.append(true);
+        expected_builder.values().append_value(5);
+        expected_builder.values().append_value(6);
+        expected_builder.values().append_value(7);
+        expected_builder.append(true);
+        let expected: ListViewArray = expected_builder.finish().into();
+
+        assert_eq!(&expected, result_list_view);
+    }
 }
diff --git a/arrow/benches/interleave_kernels.rs b/arrow/benches/interleave_kernels.rs
index 8daf42a14414..182f48f5a646 100644
--- a/arrow/benches/interleave_kernels.rs
+++ b/arrow/benches/interleave_kernels.rs
@@ -121,6 +121,11 @@ fn add_benchmark(c: &mut Criterion) {
     let list_i64_no_nulls =
         create_primitive_list_array_with_seed::<i32, Int64Type>(8192, 0.0, 0.0, 20, 42);
 
+    let list_view_i64: ListViewArray =
+        create_primitive_list_array_with_seed::<i32, Int64Type>(8192, 0.1, 0.1, 20, 42).into();
+    let list_view_i64_no_nulls: ListViewArray =
+        create_primitive_list_array_with_seed::<i32, Int64Type>(8192, 0.0, 0.0, 20, 42).into();
+
     let cases: &[(&str, &dyn Array)] = &[
         ("i32(0.0)", &i32),
         ("i32(0.5)", &i32_opt),
@@ -143,6 +148,8 @@ fn add_benchmark(c: &mut Criterion) {
         ),
         ("list(0.1,0.1,20)", &list_i64),
         ("list(0.0,0.0,20)", &list_i64_no_nulls),
+        ("list_view(0.1,0.1,20)", &list_view_i64),
+        ("list_view(0.0,0.0,20)", &list_view_i64_no_nulls),
     ];
 
     for (prefix, base) in cases {