Skip to content
Merged
Show file tree
Hide file tree
Changes from 5 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions arrow-array/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -92,3 +92,7 @@ harness = false
[[bench]]
name = "record_batch"
harness = false

[[bench]]
name = "boolean_array"
harness = false
77 changes: 77 additions & 0 deletions arrow-array/benches/boolean_array.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,77 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

use arrow_array::BooleanArray;
use criterion::*;
use std::hint;

fn criterion_benchmark(c: &mut Criterion) {
for len in [64, 1024, 65536] {
// All true (no nulls)
let all_true = BooleanArray::from(vec![true; len]);
c.bench_function(&format!("true_count(all_true, {len})"), |b| {
b.iter(|| hint::black_box(&all_true).true_count());
});
c.bench_function(&format!("has_true(all_true, {len})"), |b| {
b.iter(|| hint::black_box(&all_true).has_true());
});
c.bench_function(&format!("has_false(all_true, {len})"), |b| {
b.iter(|| hint::black_box(&all_true).has_false());
});

// All false (no nulls)
let all_false = BooleanArray::from(vec![false; len]);
c.bench_function(&format!("true_count(all_false, {len})"), |b| {
b.iter(|| hint::black_box(&all_false).true_count());
});
c.bench_function(&format!("has_true(all_false, {len})"), |b| {
b.iter(|| hint::black_box(&all_false).has_true());
});
c.bench_function(&format!("has_false(all_false, {len})"), |b| {
b.iter(|| hint::black_box(&all_false).has_false());
});

// Mixed: first element differs (best-case short-circuit)
let mut mixed_early: Vec<bool> = vec![true; len];
mixed_early[0] = false;
let mixed_early = BooleanArray::from(mixed_early);
c.bench_function(&format!("true_count(mixed_early, {len})"), |b| {
b.iter(|| hint::black_box(&mixed_early).true_count());
});
c.bench_function(&format!("has_false(mixed_early, {len})"), |b| {
b.iter(|| hint::black_box(&mixed_early).has_false());
});

// With nulls: all valid values true
let with_nulls: Vec<Option<bool>> = (0..len)
.map(|i| if i % 10 == 0 { None } else { Some(true) })
.collect();
let with_nulls = BooleanArray::from(with_nulls);
c.bench_function(&format!("true_count(nulls_all_true, {len})"), |b| {
b.iter(|| hint::black_box(&with_nulls).true_count());
});
c.bench_function(&format!("has_true(nulls_all_true, {len})"), |b| {
b.iter(|| hint::black_box(&with_nulls).has_true());
});
c.bench_function(&format!("has_false(nulls_all_true, {len})"), |b| {
b.iter(|| hint::black_box(&with_nulls).has_false());
});
}
}

criterion_group!(benches, criterion_benchmark);
criterion_main!(benches);
159 changes: 157 additions & 2 deletions arrow-array/src/array/boolean_array.rs
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ use crate::array::print_long_array;
use crate::builder::BooleanBuilder;
use crate::iterator::BooleanIter;
use crate::{Array, ArrayAccessor, ArrayRef, Scalar};
use arrow_buffer::bit_chunk_iterator::UnalignedBitChunk;
use arrow_buffer::{BooleanBuffer, Buffer, MutableBuffer, NullBuffer, bit_util};
use arrow_data::{ArrayData, ArrayDataBuilder};
use arrow_schema::DataType;
Expand Down Expand Up @@ -156,7 +157,8 @@ impl BooleanArray {
&self.values
}

/// Returns the number of non null, true values within this array
/// Returns the number of non null, true values within this array.
/// If you only need to check if there is at least one true value, consider using `has_true()` which can short-circuit and be more efficient.
pub fn true_count(&self) -> usize {
match self.nulls() {
Some(nulls) => {
Expand All @@ -171,11 +173,89 @@ impl BooleanArray {
}
}

/// Returns the number of non null, false values within this array
/// Counts the values that are both non-null and `false`.
///
/// If you only need to know whether any `false` value exists, prefer
/// `has_false()`, which can short-circuit and be more efficient.
pub fn false_count(&self) -> usize {
    // Whatever is neither null nor true must be a valid false.
    let valid_count = self.len() - self.null_count();
    valid_count - self.true_count()
}

/// Returns whether there is at least one non-null `true` value in this array.
///
/// This is more efficient than `true_count() > 0` because it can short-circuit
/// as soon as a `true` value is found, without counting all set bits.
///
/// Null values are not counted as `true`. Returns `false` for empty arrays.
pub fn has_true(&self) -> bool {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We may want to add this API to BooleanBuffer as well (as a follow on PR)

match self.nulls() {
Some(nulls) => {
let null_chunks = nulls.inner().bit_chunks().iter_padded();
let value_chunks = self.values().bit_chunks().iter_padded();
null_chunks.zip(value_chunks).any(|(n, v)| (n & v) != 0)
}
None => {
let bit_chunks = UnalignedBitChunk::new(
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Shouldn't you be able to use BitChunkIterator here?

self.values().values(),
self.values().offset(),
self.len(),
);
bit_chunks.prefix().unwrap_or(0) != 0
|| bit_chunks
.chunks()
.chunks(64)
.any(|block| block.iter().fold(0u64, |acc, &c| acc | c) != 0)
|| bit_chunks.suffix().unwrap_or(0) != 0
}
}
}

/// Returns whether there is at least one non-null `false` value in this array.
///
/// This is more efficient than `false_count() > 0` because it can short-circuit
/// as soon as a `false` value is found, without counting all set bits.
///
/// Null values are not counted as `false`. Returns `false` for empty arrays.
pub fn has_false(&self) -> bool {
    match self.nulls() {
        Some(nulls) => {
            // A valid `false` is a slot whose validity bit is set while its
            // value bit is clear.
            let null_chunks = nulls.inner().bit_chunks().iter_padded();
            let value_chunks = self.values().bit_chunks().iter_padded();
            null_chunks.zip(value_chunks).any(|(n, v)| (n & !v) != 0)
        }
        None => {
            let bit_chunks = UnalignedBitChunk::new(
                self.values().values(),
                self.values().offset(),
                self.len(),
            );
            // UnalignedBitChunk zeros padding bits; fill them with 1s so
            // they don't appear as false values.
            let lead_mask = !((1u64 << bit_chunks.lead_padding()) - 1);
            let trail_mask = if bit_chunks.trailing_padding() == 0 {
                u64::MAX
            } else {
                (1u64 << (64 - bit_chunks.trailing_padding())) - 1
            };
            // Leading padding always lives in the prefix. Trailing padding lives
            // in the suffix when there is one, otherwise in the prefix.
            let (prefix_fill, suffix_fill) = match (bit_chunks.prefix(), bit_chunks.suffix()) {
                (Some(_), Some(_)) => (!lead_mask, !trail_mask),
                (Some(_), None) => (!lead_mask | !trail_mask, 0),
                // A suffix without a prefix still carries the trailing padding.
                // Leaving it unfilled would report the zeroed padding bits as
                // `false` values for an all-`true` array.
                (None, Some(_)) => (0, !trail_mask),
                (None, None) => (0, 0),
            };
            bit_chunks
                .prefix()
                .is_some_and(|v| (v | prefix_fill) != u64::MAX)
                // AND together up to 64 words at a time, so the early exit is
                // checked once per block rather than once per word.
                || bit_chunks
                    .chunks()
                    .chunks(64)
                    .any(|block| block.iter().fold(u64::MAX, |acc, &c| acc & c) != u64::MAX)
                || bit_chunks
                    .suffix()
                    .is_some_and(|v| (v | suffix_fill) != u64::MAX)
        }
    }
}

/// Returns the boolean value at index `i`.
///
/// Note: This method does not check for nulls and the value is arbitrary
Expand Down Expand Up @@ -854,4 +934,79 @@ mod tests {
assert!(sliced.is_valid(1));
assert!(!sliced.value(1));
}

/// An array holding only `true` values has a `true` and no `false`.
#[test]
fn test_has_true_has_false_all_true() {
    let only_true = BooleanArray::from(vec![true; 3]);
    assert!(only_true.has_true());
    assert!(!only_true.has_false());
}

/// An array holding only `false` values has a `false` and no `true`.
#[test]
fn test_has_true_has_false_all_false() {
    let only_false = BooleanArray::from(vec![false; 3]);
    assert!(!only_false.has_true());
    assert!(only_false.has_false());
}

/// An array with both `true` and `false` values reports both.
#[test]
fn test_has_true_has_false_mixed() {
    let values = vec![true, false, true];
    let mixed = BooleanArray::from(values);
    assert!(mixed.has_true());
    assert!(mixed.has_false());
}

/// An empty array has neither a `true` nor a `false`.
#[test]
fn test_has_true_has_false_empty() {
    let no_values: Vec<bool> = Vec::new();
    let empty = BooleanArray::from(no_values);
    assert!(!empty.has_true());
    assert!(!empty.has_false());
}

/// With a null in the middle and every valid value `true`, the null must not
/// be reported as a `false`.
#[test]
fn test_has_true_has_false_nulls_all_valid_true() {
    let values = vec![Some(true), None, Some(true)];
    let nullable = BooleanArray::from(values);
    assert!(nullable.has_true());
    assert!(!nullable.has_false());
}

/// With a null in the middle and every valid value `false`, the null must not
/// be reported as a `true`.
#[test]
fn test_has_true_has_false_nulls_all_valid_false() {
    let values = vec![Some(false), None, Some(false)];
    let nullable = BooleanArray::from(values);
    assert!(!nullable.has_true());
    assert!(nullable.has_false());
}

/// An array made only of nulls has neither a `true` nor a `false`.
#[test]
fn test_has_true_has_false_all_null() {
    let only_nulls = BooleanArray::new_null(5);
    assert!(!only_nulls.has_true());
    assert!(!only_nulls.has_false());
}

/// 65 `true` values: one element more than fits in a single 64-bit word, so
/// the unused bits after the last element must not be seen as `false`.
#[test]
fn test_has_false_non_aligned_all_true() {
    let spilled = BooleanArray::from(vec![true; 64 + 1]);
    assert!(spilled.has_true());
    assert!(!spilled.has_false());
}

/// 64 `true` values followed by a single `false`: the lone `false` sits just
/// past the first 64-bit word and must still be found.
#[test]
fn test_has_false_non_aligned_last_false() {
    let values: Vec<bool> = std::iter::repeat(true)
        .take(64)
        .chain(std::iter::once(false))
        .collect();
    let trailing_false = BooleanArray::from(values);
    assert!(trailing_false.has_true());
    assert!(trailing_false.has_false());
}

/// Exactly 64 `true` values: the length fills one 64-bit word with nothing
/// left over.
#[test]
fn test_has_false_exact_64_all_true() {
    let one_word = BooleanArray::from(vec![true; 64]);
    assert!(one_word.has_true());
    assert!(!one_word.has_false());
}
}
Loading