diff --git a/src/iceberg/CMakeLists.txt b/src/iceberg/CMakeLists.txt index ada9b473a..464ac2f54 100644 --- a/src/iceberg/CMakeLists.txt +++ b/src/iceberg/CMakeLists.txt @@ -109,6 +109,7 @@ set(ICEBERG_SOURCES util/property_util.cc util/snapshot_util.cc util/string_util.cc + util/struct_like_set.cc util/temporal_util.cc util/timepoint.cc util/transform_util.cc diff --git a/src/iceberg/iceberg_export.h b/src/iceberg/iceberg_export.h index 64ed7dff9..8d7a35f27 100644 --- a/src/iceberg/iceberg_export.h +++ b/src/iceberg/iceberg_export.h @@ -27,8 +27,31 @@ # else # define ICEBERG_EXPORT __declspec(dllimport) # endif -#else // Not Windows + +# define ICEBERG_TEMPLATE_EXPORT ICEBERG_EXPORT + +// For template class declarations. Empty on MSVC: dllexport on a class template +// declaration combined with extern template triggers C4910. +# if defined(_MSC_VER) +# define ICEBERG_TEMPLATE_CLASS_EXPORT +# else +# define ICEBERG_TEMPLATE_CLASS_EXPORT ICEBERG_EXPORT +# endif + +// For extern template declarations. Empty when building the DLL on MSVC: +// `extern` + `dllexport` is contradictory and triggers C4910. 
+# if defined(_MSC_VER) && defined(ICEBERG_EXPORTING) && !defined(ICEBERG_STATIC) +# define ICEBERG_EXTERN_TEMPLATE_CLASS_EXPORT +# else +# define ICEBERG_EXTERN_TEMPLATE_CLASS_EXPORT ICEBERG_TEMPLATE_EXPORT +# endif + +#else // Non-Windows # ifndef ICEBERG_EXPORT # define ICEBERG_EXPORT __attribute__((visibility("default"))) # endif + +# define ICEBERG_TEMPLATE_EXPORT +# define ICEBERG_TEMPLATE_CLASS_EXPORT ICEBERG_EXPORT +# define ICEBERG_EXTERN_TEMPLATE_CLASS_EXPORT ICEBERG_TEMPLATE_EXPORT #endif diff --git a/src/iceberg/meson.build b/src/iceberg/meson.build index 81af8dc30..b81bf32b8 100644 --- a/src/iceberg/meson.build +++ b/src/iceberg/meson.build @@ -127,6 +127,7 @@ iceberg_sources = files( 'util/property_util.cc', 'util/snapshot_util.cc', 'util/string_util.cc', + 'util/struct_like_set.cc', 'util/temporal_util.cc', 'util/timepoint.cc', 'util/transform_util.cc', diff --git a/src/iceberg/test/CMakeLists.txt b/src/iceberg/test/CMakeLists.txt index 768e0507e..d46d9bfcd 100644 --- a/src/iceberg/test/CMakeLists.txt +++ b/src/iceberg/test/CMakeLists.txt @@ -116,6 +116,7 @@ add_iceberg_test(util_test formatter_test.cc location_util_test.cc string_util_test.cc + struct_like_set_test.cc transform_util_test.cc truncate_util_test.cc url_encoder_test.cc diff --git a/src/iceberg/test/meson.build b/src/iceberg/test/meson.build index df2d5db8e..833bbee51 100644 --- a/src/iceberg/test/meson.build +++ b/src/iceberg/test/meson.build @@ -91,6 +91,7 @@ iceberg_tests = { 'formatter_test.cc', 'location_util_test.cc', 'string_util_test.cc', + 'struct_like_set_test.cc', 'transform_util_test.cc', 'truncate_util_test.cc', 'url_encoder_test.cc', diff --git a/src/iceberg/test/struct_like_set_test.cc b/src/iceberg/test/struct_like_set_test.cc new file mode 100644 index 000000000..7f7535ebc --- /dev/null +++ b/src/iceberg/test/struct_like_set_test.cc @@ -0,0 +1,389 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. 
See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +#include "iceberg/util/struct_like_set.h" + +#include +#include +#include + +#include + +#include "iceberg/schema_field.h" +#include "iceberg/test/matchers.h" +#include "iceberg/type.h" + +namespace iceberg { + +/// \brief Test fixture: a StructLike backed by an in-memory vector of Scalars. +/// +/// GetField returns NotFound for an out-of-range position. SetField overwrites a +/// slot in place and performs no bounds check. +class SimpleStructLike : public StructLike { + public: + explicit SimpleStructLike(std::vector fields) : fields_(std::move(fields)) {} + + Result GetField(size_t pos) const override { + if (pos >= fields_.size()) { + return NotFound("field position {} out of range [0, {})", pos, fields_.size()); + } + return fields_[pos]; + } + + size_t num_fields() const override { return fields_.size(); } + + void SetField(size_t pos, Scalar value) { fields_[pos] = std::move(value); } + + private: + std::vector fields_; +}; + +/// \brief Test fixture: an ArrayLike backed by an in-memory vector of Scalars. +/// +/// GetElement returns NotFound for an out-of-range position. +class SimpleArrayLike : public ArrayLike { + public: + explicit SimpleArrayLike(std::vector elements) + : elements_(std::move(elements)) {} + + Result GetElement(size_t pos) const override { + if (pos >= elements_.size()) { + return NotFound("element position {} out of range [0, {})", pos, elements_.size()); + } + return elements_[pos]; + } + + size_t size() const override { return elements_.size(); } + + private: + std::vector elements_; +}; + +/// \brief Test fixture: a MapLike backed by parallel key and value vectors. +/// +/// size() reports the number of keys; the constructor does not check that both +/// vectors have the same length. +class SimpleMapLike : public MapLike { + public: + SimpleMapLike(std::vector keys, std::vector values) + : 
keys_(std::move(keys)), values_(std::move(values)) {} + + Result GetKey(size_t pos) const override { + if (pos >= keys_.size()) { + return NotFound("key position {} out of range [0, {})", pos, keys_.size()); + } + return keys_[pos]; + } + + Result GetValue(size_t pos) const override { + if (pos >= values_.size()) { + return NotFound("value position {} out of range [0, {})", pos, values_.size()); + } + return values_[pos]; + } + + size_t size() const override { return keys_.size(); } + + private: + std::vector keys_; + std::vector values_; +}; + +/** \brief Test fixture: a StructLike whose GetField always fails with NotFound, used to check that set operations propagate field-access errors. */ +class FailingStructLike : public StructLike { + public: + explicit FailingStructLike(size_t num_fields) : num_fields_(num_fields) {} + + Result GetField(size_t pos) const override { + return NotFound("boom at field {}", pos); + } + + size_t num_fields() const override { return num_fields_; } + + private: + size_t num_fields_; +}; + +/** \brief Builds a StructType from (name, type) pairs; every field is optional and field ids are assigned sequentially starting at 1. */ +StructType MakeStructType( + std::vector>> fields) { + std::vector schema_fields; + schema_fields.reserve(fields.size()); + int32_t id = 1; + for (auto& [name, type] : fields) { + schema_fields.push_back(SchemaField::MakeOptional(id++, name, std::move(type))); + } + return StructType(std::move(schema_fields)); +} + +TEST(StructLikeSetTest, EmptySet) { + auto type = MakeStructType({{"id", int32()}}); + StructLikeSet set(type); + + EXPECT_TRUE(set.IsEmpty()); + EXPECT_EQ(set.Size(), 0); + + SimpleStructLike row({Scalar{int32_t{1}}}); + EXPECT_THAT(set.Contains(row), HasValue(::testing::Eq(false))); +} + +TEST(StructLikeSetTest, InsertAndContains) { + auto type = MakeStructType({{"id", int32()}, {"name", string()}}); + StructLikeSet set(type); + + std::string name1 = "alice"; + std::string name2 = "bob"; + + SimpleStructLike row1({Scalar{int32_t{1}}, Scalar{std::string_view(name1)}}); + SimpleStructLike row2({Scalar{int32_t{2}}, Scalar{std::string_view(name2)}}); + + ASSERT_THAT(set.Insert(row1), IsOk()); + ASSERT_THAT(set.Insert(row2), IsOk()); + + EXPECT_EQ(set.Size(), 2); + 
EXPECT_FALSE(set.IsEmpty()); + EXPECT_THAT(set.Contains(row1), HasValue(::testing::Eq(true))); + EXPECT_THAT(set.Contains(row2), HasValue(::testing::Eq(true))); + + // Row not in the set + std::string name3 = "charlie"; + SimpleStructLike row3({Scalar{int32_t{3}}, Scalar{std::string_view(name3)}}); + EXPECT_THAT(set.Contains(row3), HasValue(::testing::Eq(false))); +} + +TEST(StructLikeSetTest, DuplicateInsert) { + auto type = MakeStructType({{"id", int32()}}); + StructLikeSet set(type); + + SimpleStructLike row({Scalar{int32_t{42}}}); + ASSERT_THAT(set.Insert(row), IsOk()); + EXPECT_EQ(set.Size(), 1); + + // Duplicate insertion should not increase size + ASSERT_THAT(set.Insert(row), IsOk()); + EXPECT_EQ(set.Size(), 1); +} + +TEST(StructLikeSetTest, FieldsWithNulls) { + auto type = MakeStructType({{"id", int32()}, {"data", int64()}}); + StructLikeSet set(type); + + // Row with null in second field + SimpleStructLike row1({Scalar{int32_t{1}}, Scalar{std::monostate{}}}); + SimpleStructLike row2({Scalar{int32_t{2}}, Scalar{std::monostate{}}}); + + ASSERT_THAT(set.Insert(row1), IsOk()); + ASSERT_THAT(set.Insert(row2), IsOk()); + + EXPECT_EQ(set.Size(), 2); + EXPECT_THAT(set.Contains(row1), HasValue(::testing::Eq(true))); + EXPECT_THAT(set.Contains(row2), HasValue(::testing::Eq(true))); + + // Same key as row1 — should match + SimpleStructLike row1_copy({Scalar{int32_t{1}}, Scalar{std::monostate{}}}); + EXPECT_THAT(set.Contains(row1_copy), HasValue(::testing::Eq(true))); +} + +TEST(StructLikeSetTest, StringFieldOwnership) { + auto type = MakeStructType({{"name", std::make_shared()}}); + StructLikeSet set(type); + + // Insert with a temporary string that will be destroyed + { + std::string temp = "temporary_string_data"; + SimpleStructLike row({Scalar{std::string_view(temp)}}); + ASSERT_THAT(set.Insert(row), IsOk()); + } + // temp is destroyed here — arena should hold the copy + + EXPECT_EQ(set.Size(), 1); + + // Look up with a new string that has the same content + 
std::string lookup = "temporary_string_data"; + SimpleStructLike lookup_row({Scalar{std::string_view(lookup)}}); + EXPECT_THAT(set.Contains(lookup_row), HasValue(::testing::Eq(true))); +} + +TEST(StructLikeSetTest, MultipleTypes) { + auto type = MakeStructType({{"b", boolean()}, + {"i", int32()}, + {"l", int64()}, + {"f", float32()}, + {"d", float64()}, + {"s", string()}, + {"dt", date()}}); + StructLikeSet set(type); + + std::string str = "hello"; + SimpleStructLike row({Scalar{true}, Scalar{int32_t{1}}, Scalar{int64_t{2}}, + Scalar{1.0f}, Scalar{2.0}, Scalar{std::string_view(str)}, + Scalar{int32_t{19000}}}); + ASSERT_THAT(set.Insert(row), IsOk()); + EXPECT_THAT(set.Contains(row), HasValue(::testing::Eq(true))); + + // Different values → not found + SimpleStructLike row2({Scalar{false}, Scalar{int32_t{1}}, Scalar{int64_t{2}}, + Scalar{1.0f}, Scalar{2.0}, Scalar{std::string_view(str)}, + Scalar{int32_t{19000}}}); + EXPECT_THAT(set.Contains(row2), HasValue(::testing::Eq(false))); +} + +TEST(StructLikeSetTest, NestedStruct) { + auto inner_type = struct_({SchemaField::MakeOptional(10, "x", int32()), + SchemaField::MakeOptional(11, "y", string())}); + auto outer_type = MakeStructType({{"id", int32()}, {"nested", inner_type}}); + StructLikeSet set(outer_type); + + // Create nested StructLike + std::string inner_str = "nested_value"; + auto inner = std::make_shared( + std::vector{Scalar{int32_t{10}}, Scalar{std::string_view(inner_str)}}); + + SimpleStructLike row({Scalar{int32_t{1}}, Scalar{std::shared_ptr(inner)}}); + ASSERT_THAT(set.Insert(row), IsOk()); + EXPECT_EQ(set.Size(), 1); + + // Look up with same nested content (different object) + std::string inner_str2 = "nested_value"; + auto inner2 = std::make_shared( + std::vector{Scalar{int32_t{10}}, Scalar{std::string_view(inner_str2)}}); + SimpleStructLike lookup( + {Scalar{int32_t{1}}, Scalar{std::shared_ptr(inner2)}}); + EXPECT_THAT(set.Contains(lookup), HasValue(::testing::Eq(true))); + + // Different nested 
content → not found + std::string inner_str3 = "different"; + auto inner3 = std::make_shared( + std::vector{Scalar{int32_t{10}}, Scalar{std::string_view(inner_str3)}}); + SimpleStructLike different( + {Scalar{int32_t{1}}, Scalar{std::shared_ptr(inner3)}}); + EXPECT_THAT(set.Contains(different), HasValue(::testing::Eq(false))); +} + +TEST(StructLikeSetTest, NestedStructOwnership) { + auto inner_type = struct_({SchemaField::MakeOptional(10, "s", string())}); + auto outer_type = MakeStructType({{"nested", inner_type}}); + StructLikeSet set(outer_type); + + // Insert with temporary inner data + { + std::string temp = "will_be_destroyed"; + auto inner = std::make_shared( + std::vector{Scalar{std::string_view(temp)}}); + SimpleStructLike row({Scalar{std::shared_ptr(inner)}}); + ASSERT_THAT(set.Insert(row), IsOk()); + } + // temp and inner are destroyed here. Arena should hold copies. + + EXPECT_EQ(set.Size(), 1); + + // Look up with new identical content + std::string lookup_str = "will_be_destroyed"; + auto inner2 = std::make_shared( + std::vector{Scalar{std::string_view(lookup_str)}}); + SimpleStructLike lookup({Scalar{std::shared_ptr(inner2)}}); + EXPECT_THAT(set.Contains(lookup), HasValue(::testing::Eq(true))); +} + +TEST(StructLikeSetTest, AllNullRow) { + auto type = MakeStructType({{"a", int32()}, {"b", string()}}); + StructLikeSet set(type); + + SimpleStructLike null_row({Scalar{std::monostate{}}, Scalar{std::monostate{}}}); + ASSERT_THAT(set.Insert(null_row), IsOk()); + EXPECT_EQ(set.Size(), 1); + EXPECT_THAT(set.Contains(null_row), HasValue(::testing::Eq(true))); + + // Duplicate null row + SimpleStructLike null_row2({Scalar{std::monostate{}}, Scalar{std::monostate{}}}); + ASSERT_THAT(set.Insert(null_row2), IsOk()); + EXPECT_EQ(set.Size(), 1); +} + +TEST(StructLikeSetTest, ContainsPropagatesFieldAccessError) { + auto type = MakeStructType({{"id", int32()}}); + StructLikeSet set(type); + + FailingStructLike row(1); + EXPECT_THAT(set.Contains(row), 
IsError(ErrorKind::kNotFound)); +} + +TEST(StructLikeSetTest, InsertPropagatesFieldAccessError) { + auto type = MakeStructType({{"id", int32()}}); + StructLikeSet set(type); + + FailingStructLike row(1); + EXPECT_THAT(set.Insert(row), IsError(ErrorKind::kNotFound)); +} + +TEST(StructLikeSetTest, InsertRejectsFieldCountMismatch) { + auto type = MakeStructType({{"id", int32()}, {"name", string()}}); + StructLikeSet set(type); + + SimpleStructLike row({Scalar{int32_t{1}}}); + EXPECT_THAT(set.Insert(row), IsError(ErrorKind::kInvalidArgument)); +} + +TEST(StructLikeSetTest, ContainsRejectsFieldTypeMismatch) { + auto type = MakeStructType({{"id", int32()}}); + StructLikeSet set(type); + + SimpleStructLike row({Scalar{std::string_view("not_an_int")}}); + EXPECT_THAT(set.Contains(row), IsError(ErrorKind::kInvalidArgument)); +} + +TEST(StructLikeSetTest, FloatAndDoubleFollowJavaEqualitySemantics) { + auto type = MakeStructType({{"f", float32()}, {"d", float64()}}); + StructLikeSet set(type); + + float float_nan = std::numeric_limits::quiet_NaN(); + double double_nan = std::numeric_limits::quiet_NaN(); + SimpleStructLike nan_row({Scalar{float_nan}, Scalar{double_nan}}); + ASSERT_THAT(set.Insert(nan_row), IsOk()); + + float another_float_nan = std::numeric_limits::signaling_NaN(); + double another_double_nan = std::numeric_limits::signaling_NaN(); + SimpleStructLike lookup_nan({Scalar{another_float_nan}, Scalar{another_double_nan}}); + EXPECT_THAT(set.Contains(lookup_nan), HasValue(::testing::Eq(true))); + + SimpleStructLike neg_zero({Scalar{-0.0f}, Scalar{-0.0}}); + SimpleStructLike pos_zero({Scalar{0.0f}, Scalar{0.0}}); + ASSERT_THAT(set.Insert(neg_zero), IsOk()); + EXPECT_THAT(set.Contains(pos_zero), HasValue(::testing::Eq(false))); + ASSERT_THAT(set.Insert(pos_zero), IsOk()); + EXPECT_EQ(set.Size(), 3); +} + +TEST(StructLikeSetTest, NestedMapIsHashedAndComparedRecursively) { + auto map_type = + std::make_shared(SchemaField::MakeRequired(10, "key", string()), + 
SchemaField::MakeOptional(11, "value", int32())); + auto type = MakeStructType({{"m", map_type}}); + StructLikeSet set(type); + + std::string key1 = "a"; + auto map1 = + std::make_shared(std::vector{Scalar{std::string_view(key1)}}, + std::vector{Scalar{int32_t{7}}}); + SimpleStructLike row({Scalar{std::shared_ptr(map1)}}); + ASSERT_THAT(set.Insert(row), IsOk()); + + std::string key2 = "a"; + auto map2 = + std::make_shared(std::vector{Scalar{std::string_view(key2)}}, + std::vector{Scalar{int32_t{7}}}); + SimpleStructLike same({Scalar{std::shared_ptr(map2)}}); + EXPECT_THAT(set.Contains(same), HasValue(::testing::Eq(true))); + + std::string key3 = "b"; + auto map3 = + std::make_shared(std::vector{Scalar{std::string_view(key3)}}, + std::vector{Scalar{int32_t{7}}}); + SimpleStructLike different({Scalar{std::shared_ptr(map3)}}); + EXPECT_THAT(set.Contains(different), HasValue(::testing::Eq(false))); +} + +} // namespace iceberg diff --git a/src/iceberg/util/struct_like_set.cc b/src/iceberg/util/struct_like_set.cc new file mode 100644 index 000000000..168704579 --- /dev/null +++ b/src/iceberg/util/struct_like_set.cc @@ -0,0 +1,623 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +#include "iceberg/util/struct_like_set.h" + +#include +#include +#include +#include +#include + +#include "iceberg/result.h" +#include "iceberg/type.h" +#include "iceberg/util/checked_cast.h" +#include "iceberg/util/macros.h" + +namespace iceberg { + +namespace { + +/// \brief Helper for std::visit with multiple lambdas. +/// +/// Inherits the call operator of every lambda it is built from, so one visitor +/// object exposes one overload per variant alternative. +template +struct Overloaded : Ts... { + using Ts::operator()...; +}; +template +Overloaded(Ts...) -> Overloaded; + +/// \brief A StructLike that owns its field values in a vector of Scalars. +/// +/// The vector is taken by value and moved into the object. Only the Scalars are +/// stored: whatever a string_view or nested pointer inside them refers to is not +/// copied by this class. GetField rejects an out-of-range position through +/// ICEBERG_PRECHECK instead of indexing past the end. +class ArenaStructLike : public StructLike { + public: + explicit ArenaStructLike(std::pmr::vector fields) + : fields_(std::move(fields)) {} + + Result GetField(size_t pos) const override { + ICEBERG_PRECHECK(pos < fields_.size(), "field position {} out of range [0, {})", pos, + fields_.size()); + return fields_[pos]; + } + + size_t num_fields() const override { return fields_.size(); } + + private: + std::pmr::vector fields_; +}; + +/// \brief An ArrayLike that owns its element values in a vector of Scalars. +/// +/// Same storage and bounds-check behavior as ArenaStructLike, for list elements. +class ArenaArrayLike : public ArrayLike { + public: + explicit ArenaArrayLike(std::pmr::vector elements) + : elements_(std::move(elements)) {} + + Result GetElement(size_t pos) const override { + ICEBERG_PRECHECK(pos < elements_.size(), "element position {} out of range [0, {})", + pos, elements_.size()); + return elements_[pos]; + } + + size_t size() const override { return elements_.size(); } + + private: + std::pmr::vector elements_; +}; + +/// \brief A MapLike that owns its key/value data in vectors of Scalars. 
+class ArenaMapLike : public MapLike { + public: + ArenaMapLike(std::pmr::vector keys, std::pmr::vector values) + : keys_(std::move(keys)), values_(std::move(values)) {} + + Result GetKey(size_t pos) const override { + ICEBERG_PRECHECK(pos < keys_.size(), "key position {} out of range [0, {})", pos, + keys_.size()); + return keys_[pos]; + } + + Result GetValue(size_t pos) const override { + ICEBERG_PRECHECK(pos < values_.size(), "value position {} out of range [0, {})", pos, + values_.size()); + return values_[pos]; + } + + /** Reports the number of keys; the constructor does not check that keys and values have the same length. */ + size_t size() const override { return keys_.size(); } + + private: + std::pmr::vector keys_; + std::pmr::vector values_; +}; + +/** Bit patterns that every NaN is mapped to before hashing or comparing, so all NaN payloads count as one value. Non-NaN inputs keep their raw bits, which keeps -0.0 and +0.0 distinct. */ +constexpr uint32_t kCanonicalFloatNaNBits = 0x7fc00000U; +constexpr uint64_t kCanonicalDoubleNaNBits = 0x7ff8000000000000ULL; + +uint32_t CanonicalFloatBits(float value) { + if (std::isnan(value)) { + return kCanonicalFloatNaNBits; + } + return std::bit_cast(value); +} + +uint64_t CanonicalDoubleBits(double value) { + if (std::isnan(value)) { + return kCanonicalDoubleNaNBits; + } + return std::bit_cast(value); +} + +Result HashScalar(const Scalar& scalar); + +/// \brief Hash the bytes of a string_view with a 31-multiplier polynomial seeded at 177. +/// +/// NOTE(review): java.lang.String.hashCode() starts from 0 and walks UTF-16 code +/// units in 32-bit int arithmetic, so this is not that algorithm. The 177 seed +/// presumably mirrors the CharSequence hash in Iceberg Java (JavaHashes) - confirm. +/// The loop walks bytes and accumulates in size_t, so results are not +/// bit-compatible with a Java int hash, in particular for non-ASCII text. +size_t HashStringView(std::string_view sv) { + size_t result = 177; + for (unsigned char ch : sv) { + result = 31 * result + ch; + } + return result; +} + +/// \brief Hash a StructLike using Java's StructLikeHash algorithm +/// +/// Seed 97, then the field count and each field hash are folded in with +/// multiplier 41. A failing GetField or nested hash is returned to the caller. +Result HashStructLike(const StructLike& s) { + size_t result = 97; + size_t len = s.num_fields(); + result = 41 * result + len; + for (size_t i = 0; i < len; ++i) { + ICEBERG_ASSIGN_OR_RAISE(auto field_hash, s.GetField(i).and_then(HashScalar)); + result = 41 * result + field_hash; + } + return result; +} + +/// \brief Hash an ArrayLike using Java's ListHash algorithm +/// +/// Seed 17, then the size and each element hash are folded in with multiplier 37. +Result HashArrayLike(const ArrayLike& a) { + size_t result = 17; + size_t len = a.size(); + result = 37 * result + len; + for (size_t i = 0; i < len; ++i) { + ICEBERG_ASSIGN_OR_RAISE(auto elem, 
a.GetElement(i).and_then(HashScalar)); + result = 37 * result + elem; + } + return result; +} + +/** \brief Hash a MapLike: seed 17, fold in the size, then each key hash followed by its value hash in positional order (multiplier 37). Order-sensitive: the same entries in a different order produce a different hash. */ +Result HashMapLike(const MapLike& m) { + size_t result = 17; + size_t len = m.size(); + result = 37 * result + len; + for (size_t i = 0; i < len; ++i) { + ICEBERG_ASSIGN_OR_RAISE(auto key_hash, m.GetKey(i).and_then(HashScalar)); + ICEBERG_ASSIGN_OR_RAISE(auto value_hash, m.GetValue(i).and_then(HashScalar)); + result = 37 * result + key_hash; + result = 37 * result + value_hash; + } + return result; +} + +/** \brief noexcept wrapper around HashStructLike for rows that were already validated. A failure trips ICEBERG_DCHECK; if execution continues past it (presumably in builds where the check is compiled out - confirm) the hash falls back to 0. */ +size_t HashStructLikeUnchecked(const StructLike& s) noexcept { + auto result = HashStructLike(s); + ICEBERG_DCHECK(result.has_value(), "Validated StructLike hash must not fail"); + return result.value_or(0); +} + +/** \brief Hash one Scalar by dispatching on its active alternative. Null (monostate) and null nested pointers hash to 0. float and double hash their canonical bit patterns, so every NaN hashes alike while -0.0 and +0.0 differ. Any error from a nested GetField, GetElement, GetKey or GetValue is returned to the caller. */ +Result HashScalar(const Scalar& scalar) { + return std::visit( + Overloaded{ + [](std::monostate) -> Result { return 0; }, + [](bool v) -> Result { return std::hash{}(v); }, + [](int32_t v) -> Result { return std::hash{}(v); }, + [](int64_t v) -> Result { return std::hash{}(v); }, + [](float v) -> Result { + return static_cast(CanonicalFloatBits(v)); + }, + [](double v) -> Result { + uint64_t bits = CanonicalDoubleBits(v); + return static_cast(bits ^ (bits >> 32)); + }, + [](std::string_view v) -> Result { return HashStringView(v); }, + [](const Decimal& v) -> Result { + return std::hash{}(v.low()) ^ (std::hash{}(v.high()) << 1); + }, + [](const std::shared_ptr& v) -> Result { + return v ? HashStructLike(*v) : Result{0}; + }, + [](const std::shared_ptr& v) -> Result { + return v ? HashArrayLike(*v) : Result{0}; + }, + [](const std::shared_ptr& v) -> Result { + return v ? 
HashMapLike(*v) : Result{0}; + }, + }, + scalar); +} + +Result ScalarEqual(const Scalar& lhs, const Scalar& rhs); + +/** \brief Short label for the active Scalar alternative; used only to build validation error messages. */ +std::string_view ScalarTypeName(const Scalar& scalar) { + return std::visit( + Overloaded{ + [](std::monostate) -> std::string_view { return "null"; }, + [](bool) -> std::string_view { return "boolean"; }, + [](int32_t) -> std::string_view { return "int32"; }, + [](int64_t) -> std::string_view { return "int64"; }, + [](float) -> std::string_view { return "float"; }, + [](double) -> std::string_view { return "double"; }, + [](std::string_view) -> std::string_view { return "string_view"; }, + [](const Decimal&) -> std::string_view { return "decimal"; }, + [](const std::shared_ptr&) -> std::string_view { return "struct"; }, + [](const std::shared_ptr&) -> std::string_view { return "list"; }, + [](const std::shared_ptr&) -> std::string_view { return "map"; }, + }, + scalar); +} + +Status ValidateScalarAgainstType(const Scalar& scalar, const Type& type); + +/** \brief Check that row has exactly as many fields as type and that every field validates against the type of the field at the same position. */ +Status ValidateStructLikeAgainstType(const StructLike& row, const StructType& type) { + ICEBERG_PRECHECK(row.num_fields() == type.fields().size(), + "StructLike row has {} fields but expected {}", row.num_fields(), + type.fields().size()); + for (size_t i = 0; i < row.num_fields(); ++i) { + ICEBERG_ASSIGN_OR_RAISE(auto scalar, row.GetField(i)); + ICEBERG_RETURN_UNEXPECTED( + ValidateScalarAgainstType(scalar, *type.fields()[i].type())); + } + return {}; +} + +/** \brief Validate every element of array against the element type of the list type. */ +Status ValidateArrayLikeAgainstType(const ArrayLike& array, const ListType& type) { + for (size_t i = 0; i < array.size(); ++i) { + ICEBERG_ASSIGN_OR_RAISE(auto scalar, array.GetElement(i)); + ICEBERG_RETURN_UNEXPECTED(ValidateScalarAgainstType(scalar, *type.element().type())); + } + return {}; +} + +/** \brief Validate every key and every value of map against the key and value types of the map type. */ +Status ValidateMapLikeAgainstType(const MapLike& map, const MapType& type) { + for (size_t i = 0; i < map.size(); ++i) { + ICEBERG_ASSIGN_OR_RAISE(auto key, map.GetKey(i)); + ICEBERG_ASSIGN_OR_RAISE(auto value, map.GetValue(i)); + 
ICEBERG_RETURN_UNEXPECTED(ValidateScalarAgainstType(key, *type.key().type())); + ICEBERG_RETURN_UNEXPECTED(ValidateScalarAgainstType(value, *type.value().type())); + } + return {}; +} + +/** \brief Check that the Scalar alternative matches what type expects. The leading holds_alternative check returns success before the type is inspected (its template argument is not visible in this hunk; presumably std::monostate, i.e. null), so that value passes for every type, optional or required. int and date share one case, as do long, time, timestamp and timestamptz, and string and binary. fixed and uuid additionally check the byte length of the value. Struct, list and map reject a null pointer and then validate recursively. NOTE(review): the fixed case downcasts with static_cast while the nested cases use internal::checked_cast - consider using checked_cast there too. */ +Status ValidateScalarAgainstType(const Scalar& scalar, const Type& type) { + if (std::holds_alternative(scalar)) { + return {}; + } + + switch (type.type_id()) { + case TypeId::kBoolean: + ICEBERG_PRECHECK(std::holds_alternative(scalar), + "Expected boolean but got {}", ScalarTypeName(scalar)); + return {}; + case TypeId::kInt: + case TypeId::kDate: + ICEBERG_PRECHECK(std::holds_alternative(scalar), "Expected {} but got {}", + type.ToString(), ScalarTypeName(scalar)); + return {}; + case TypeId::kLong: + case TypeId::kTime: + case TypeId::kTimestamp: + case TypeId::kTimestampTz: + ICEBERG_PRECHECK(std::holds_alternative(scalar), "Expected {} but got {}", + type.ToString(), ScalarTypeName(scalar)); + return {}; + case TypeId::kFloat: + ICEBERG_PRECHECK(std::holds_alternative(scalar), "Expected float but got {}", + ScalarTypeName(scalar)); + return {}; + case TypeId::kDouble: + ICEBERG_PRECHECK(std::holds_alternative(scalar), + "Expected double but got {}", ScalarTypeName(scalar)); + return {}; + case TypeId::kDecimal: + ICEBERG_PRECHECK(std::holds_alternative(scalar), + "Expected decimal but got {}", ScalarTypeName(scalar)); + return {}; + case TypeId::kString: + case TypeId::kBinary: + ICEBERG_PRECHECK(std::holds_alternative(scalar), + "Expected {} but got {}", type.ToString(), ScalarTypeName(scalar)); + return {}; + case TypeId::kFixed: { + ICEBERG_PRECHECK(std::holds_alternative(scalar), + "Expected fixed but got {}", ScalarTypeName(scalar)); + const auto& fixed = static_cast(type); + auto value = std::get(scalar); + ICEBERG_PRECHECK(value.size() == fixed.length(), + "Expected fixed({}) but got byte length {}", fixed.length(), + value.size()); + return {}; + } + case TypeId::kUuid: { + ICEBERG_PRECHECK(std::holds_alternative(scalar), + "Expected uuid but got {}", 
ScalarTypeName(scalar)); + auto value = std::get(scalar); + ICEBERG_PRECHECK(value.size() == 16, "Expected uuid byte length 16 but got {}", + value.size()); + return {}; + } + case TypeId::kStruct: { + ICEBERG_PRECHECK(std::holds_alternative>(scalar), + "Expected struct but got {}", ScalarTypeName(scalar)); + const auto& row = std::get>(scalar); + ICEBERG_PRECHECK(row, "Expected struct but got null"); + return ValidateStructLikeAgainstType( + *row, internal::checked_cast(type)); + } + case TypeId::kList: { + ICEBERG_PRECHECK(std::holds_alternative>(scalar), + "Expected list but got {}", ScalarTypeName(scalar)); + const auto& array = std::get>(scalar); + ICEBERG_PRECHECK(array, "Expected ArrayLike but got null"); + return ValidateArrayLikeAgainstType(*array, + internal::checked_cast(type)); + } + case TypeId::kMap: { + ICEBERG_PRECHECK(std::holds_alternative>(scalar), + "Expected map but got {}", ScalarTypeName(scalar)); + const auto& map = std::get>(scalar); + ICEBERG_PRECHECK(map, "Expected MapLike but got null"); + return ValidateMapLikeAgainstType(*map, + internal::checked_cast(type)); + } + } + + /* NOTE(review): reached only if type_id() is a value the switch does not list; std::unreachable() makes that undefined behavior - confirm the switch covers every TypeId. */ + std::unreachable(); +} + +/** \brief Top-level row check: the field count must equal field_types.size() (InvalidArgument otherwise) and each field must validate against the type at the same index. */ +Status ValidateRowAgainstTypes(const StructLike& row, + std::span> field_types) { + if (row.num_fields() != field_types.size()) { + return InvalidArgument("StructLike row has {} fields but expected {}", + row.num_fields(), field_types.size()); + } + for (size_t i = 0; i < field_types.size(); ++i) { + ICEBERG_ASSIGN_OR_RAISE(auto scalar, row.GetField(i)); + ICEBERG_RETURN_UNEXPECTED(ValidateScalarAgainstType(scalar, *field_types[i])); + } + return {}; +} + +/** \brief Field-by-field equality of two StructLikes; a differing field count is simply unequal. Field-access errors are returned rather than treated as inequality. */ +Result StructLikeEqual(const StructLike& lhs, const StructLike& rhs) { + if (lhs.num_fields() != rhs.num_fields()) { + return false; + } + for (size_t i = 0; i < lhs.num_fields(); ++i) { + ICEBERG_ASSIGN_OR_RAISE(auto fa, lhs.GetField(i)); + ICEBERG_ASSIGN_OR_RAISE(auto fb, rhs.GetField(i)); + ICEBERG_ASSIGN_OR_RAISE(auto equal, ScalarEqual(fa, fb)); + if (!equal) { + return false; + } + } + 
return true; +} + +/** \brief Element-by-element equality of two ArrayLikes; differing sizes are unequal. */ +Result ArrayLikeEqual(const ArrayLike& lhs, const ArrayLike& rhs) { + if (lhs.size() != rhs.size()) { + return false; + } + for (size_t i = 0; i < lhs.size(); ++i) { + ICEBERG_ASSIGN_OR_RAISE(auto ea, lhs.GetElement(i)); + ICEBERG_ASSIGN_OR_RAISE(auto eb, rhs.GetElement(i)); + ICEBERG_ASSIGN_OR_RAISE(auto equal, ScalarEqual(ea, eb)); + if (!equal) { + return false; + } + } + return true; +} + +/** \brief Positional equality of two MapLikes: entry i of lhs must equal entry i of rhs in both key and value, so the same entries in a different order compare unequal. */ +Result MapLikeEqual(const MapLike& lhs, const MapLike& rhs) { + if (lhs.size() != rhs.size()) { + return false; + } + for (size_t i = 0; i < lhs.size(); ++i) { + ICEBERG_ASSIGN_OR_RAISE(auto ka, lhs.GetKey(i)); + ICEBERG_ASSIGN_OR_RAISE(auto kb, rhs.GetKey(i)); + ICEBERG_ASSIGN_OR_RAISE(auto va, lhs.GetValue(i)); + ICEBERG_ASSIGN_OR_RAISE(auto vb, rhs.GetValue(i)); + ICEBERG_ASSIGN_OR_RAISE(auto keys_equal, ScalarEqual(ka, kb)); + if (!keys_equal) { + return false; + } + ICEBERG_ASSIGN_OR_RAISE(auto values_equal, ScalarEqual(va, vb)); + if (!values_equal) { + return false; + } + } + return true; +} + +/** \brief noexcept wrapper around StructLikeEqual for validated rows; a failure trips ICEBERG_DCHECK and otherwise falls back to false. */ +bool StructLikeEqualUnchecked(const StructLike& lhs, const StructLike& rhs) noexcept { + auto result = StructLikeEqual(lhs, rhs); + ICEBERG_DCHECK(result.has_value(), "Validated StructLike equality must not fail"); + return result.value_or(false); +} + +/** \brief Equality of two Scalars. Different active alternatives are never equal. float and double compare canonical bit patterns, so any NaN equals any NaN and -0.0 differs from +0.0. Nested pointers are equal when both are null, unequal when only one is, and otherwise compared recursively. NOTE(review): both variants are visited and the second lambda parameter is a Scalar, so each rhs alternative looks like it is converted back into a temporary Scalar before std::get - visiting lhs alone would avoid that; confirm. */ +Result ScalarEqual(const Scalar& lhs, const Scalar& rhs) { + if (lhs.index() != rhs.index()) { + return false; + } + + return std::visit( + Overloaded{ + [](std::monostate, const Scalar&) -> Result { return true; }, + [](bool v, const Scalar& other) -> Result { + return v == std::get(other); + }, + [](int32_t v, const Scalar& other) -> Result { + return v == std::get(other); + }, + [](int64_t v, const Scalar& other) -> Result { + return v == std::get(other); + }, + [](float v, const Scalar& other) -> Result { + return CanonicalFloatBits(v) == CanonicalFloatBits(std::get(other)); + }, + [](double v, const Scalar& other) -> Result { + return CanonicalDoubleBits(v) == CanonicalDoubleBits(std::get(other)); + }, + 
[](std::string_view v, const Scalar& other) -> Result { + return v == std::get(other); + }, + [](const Decimal& v, const Scalar& other) -> Result { + return v == std::get(other); + }, + [](const std::shared_ptr& l, const Scalar& other) -> Result { + const auto& r = std::get>(other); + if (!l && !r) return true; + if (!l || !r) return false; + return StructLikeEqual(*l, *r); + }, + [](const std::shared_ptr& l, const Scalar& other) -> Result { + const auto& r = std::get>(other); + if (!l && !r) return true; + if (!l || !r) return false; + return ArrayLikeEqual(*l, *r); + }, + [](const std::shared_ptr& l, const Scalar& other) -> Result { + const auto& r = std::get>(other); + if (!l && !r) return true; + if (!l || !r) return false; + return MapLikeEqual(*l, *r); + }, + }, + lhs, rhs); +} + +} // namespace + +/// \brief Record the type of each top-level field of type, in order, in field_types_. +template +StructLikeSet::StructLikeSet(const StructType& type) { + field_types_.reserve(type.fields().size()); + for (const auto& field : type.fields()) { + field_types_.push_back(field.type()); + } +} + +template +StructLikeSet::~StructLikeSet() = default; + +/// \brief Copy the bytes of src into arena_ and return a view over the copy. +/// +/// An empty src returns a default-constructed view without allocating. The +/// buffer is requested with alignment 1 and no terminator byte is appended. +template +std::string_view StructLikeSet::CopyToArena(std::string_view src) const { + if (src.empty()) { + return {}; + } + auto buf = static_cast(arena_.allocate(src.size(), 1)); + std::memcpy(buf, src.data(), src.size()); + return {buf, src.size()}; +} + +/// \brief Produce a Scalar that no longer refers to caller-owned memory. +/// +/// Primitive alternatives and Decimal are copied by value, string_view bytes go +/// through CopyToArena, and struct, list and map values are rebuilt recursively +/// into new wrapper objects (presumably the Arena* classes in this file; the +/// make_shared template arguments are not visible in this hunk) whose vectors +/// allocate from arena_. +/// +/// NOTE(review): a null struct, list or map pointer is rejected here, whereas +/// HashScalar and ScalarEqual accept null pointers - confirm that is intended. +/// The wrapper objects themselves come from std::make_shared, so only their +/// element vectors and string bytes live in arena_. +template +Result StructLikeSet::DeepCopyScalar(const Scalar& scalar) const { + return std::visit( + Overloaded{ + [](std::monostate v) -> Result { return Scalar{v}; }, + [](bool v) -> Result { return Scalar{v}; }, + [](int32_t v) -> Result { return Scalar{v}; }, + [](int64_t v) -> Result { return Scalar{v}; }, + [](float v) -> Result { return Scalar{v}; }, + [](double v) -> Result { return Scalar{v}; }, + [this](std::string_view sv) -> Result { + return Scalar{CopyToArena(sv)}; + }, + [](const Decimal& v) -> Result { return Scalar{v}; }, + [this](const std::shared_ptr& s) -> Result { + ICEBERG_PRECHECK(s, "StructLike scalar must not be null"); + 
std::pmr::vector fields(&arena_); + fields.resize(s->num_fields()); + for (size_t i = 0; i < s->num_fields(); ++i) { + ICEBERG_ASSIGN_OR_RAISE(auto field, s->GetField(i)); + ICEBERG_ASSIGN_OR_RAISE(fields[i], DeepCopyScalar(field)); + } + return Scalar{std::make_shared(std::move(fields))}; + }, + [this](const std::shared_ptr& a) -> Result { + ICEBERG_PRECHECK(a, "ArrayLike scalar must not be null"); + std::pmr::vector elements(&arena_); + elements.resize(a->size()); + for (size_t i = 0; i < a->size(); ++i) { + ICEBERG_ASSIGN_OR_RAISE(auto element, a->GetElement(i)); + ICEBERG_ASSIGN_OR_RAISE(elements[i], DeepCopyScalar(element)); + } + return Scalar{std::make_shared(std::move(elements))}; + }, + [this](const std::shared_ptr& m) -> Result { + ICEBERG_PRECHECK(m, "MapLike scalar must not be null"); + std::pmr::vector keys(&arena_); + std::pmr::vector values(&arena_); + keys.resize(m->size()); + values.resize(m->size()); + for (size_t i = 0; i < m->size(); ++i) { + ICEBERG_ASSIGN_OR_RAISE(auto key, m->GetKey(i)); + ICEBERG_ASSIGN_OR_RAISE(auto value, m->GetValue(i)); + ICEBERG_ASSIGN_OR_RAISE(keys[i], DeepCopyScalar(key)); + ICEBERG_ASSIGN_OR_RAISE(values[i], DeepCopyScalar(value)); + } + return Scalar{ + std::make_shared(std::move(keys), std::move(values))}; + }, + }, + scalar); +} + +/** \brief Deep-copy the first field_types_.size() fields of row into an arena-backed row. The loop bound is the schema width, not row.num_fields(), so a row that is too short surfaces as whatever error its GetField returns. */ +template +Result> StructLikeSet::MakeArenaRow( + const StructLike& row) const { + std::pmr::vector fields(&arena_); + fields.resize(field_types_.size()); + for (size_t i = 0; i < field_types_.size(); ++i) { + ICEBERG_ASSIGN_OR_RAISE(auto scalar, row.GetField(i)); + ICEBERG_ASSIGN_OR_RAISE(fields[i], DeepCopyScalar(scalar)); + } + return std::make_unique(std::move(fields)); +} + +template +Status StructLikeSet::Insert(const StructLike& row) { + if constexpr (kValidate) { + ICEBERG_RETURN_UNEXPECTED(ValidateRowAgainstTypes(row, field_types_)); + } + if (set_.contains(row)) { + return {}; + } + ICEBERG_ASSIGN_OR_RAISE(auto arena_row, MakeArenaRow(row)); + set_.insert(std::move(arena_row)); 
+ return {}; +} + +template +Result StructLikeSet::Contains(const StructLike& row) const { + if constexpr (kValidate) { + ICEBERG_RETURN_UNEXPECTED(ValidateRowAgainstTypes(row, field_types_)); + } + return set_.find(row) != set_.end(); +} + +template +bool StructLikeSet::IsEmpty() const { + return set_.empty(); +} + +template +size_t StructLikeSet::Size() const { + return set_.size(); +} + +// --- KeyHash --- + +template +size_t StructLikeSet::KeyHash::operator()( + const std::unique_ptr& p) const noexcept { + return HashStructLikeUnchecked(*p); +} + +template +size_t StructLikeSet::KeyHash::operator()(const StructLike& s) const noexcept { + return HashStructLikeUnchecked(s); +} + +// --- KeyEqual --- + +template +bool StructLikeSet::KeyEqual::operator()( + const std::unique_ptr& lhs, + const std::unique_ptr& rhs) const noexcept { + return StructLikeEqualUnchecked(*lhs, *rhs); +} + +template +bool StructLikeSet::KeyEqual::operator()( + const StructLike& lhs, const std::unique_ptr& rhs) const noexcept { + return StructLikeEqualUnchecked(lhs, *rhs); +} + +template +bool StructLikeSet::KeyEqual::operator()( + const std::unique_ptr& lhs, const StructLike& rhs) const noexcept { + return StructLikeEqualUnchecked(*lhs, rhs); +} + +template class ICEBERG_TEMPLATE_EXPORT StructLikeSet; +template class ICEBERG_TEMPLATE_EXPORT StructLikeSet; + +} // namespace iceberg diff --git a/src/iceberg/util/struct_like_set.h b/src/iceberg/util/struct_like_set.h new file mode 100644 index 000000000..943766759 --- /dev/null +++ b/src/iceberg/util/struct_like_set.h @@ -0,0 +1,108 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +#pragma once + +/// \file iceberg/util/struct_like_set.h +/// A hash set implementation for StructLike rows. + +#include +#include +#include +#include + +#include "iceberg/iceberg_export.h" +#include "iceberg/result.h" +#include "iceberg/row/struct_like.h" +#include "iceberg/type_fwd.h" + +namespace iceberg { + +/// \brief A set of StructLike rows with type-aware hashing and equality. +/// +/// As StructLike uses view semantics, this set makes deep copies of inserted rows +/// into an internal arena to ensure ownership and lifetime safety. Lookups are +/// transparent and do not require temporary allocation. +/// +/// \tparam kValidate When true (default), Insert and Contains validate that each +/// row's scalar types match the schema passed to the constructor. Set to false +/// only when the caller guarantees schema conformance and the validation +/// overhead must be avoided. +template +class ICEBERG_TEMPLATE_CLASS_EXPORT StructLikeSet { + public: + /// \brief Create a StructLikeSet for the given struct type. + explicit StructLikeSet(const StructType& type); + + ~StructLikeSet(); + + /// \brief Insert a row into the set. + Status Insert(const StructLike& row); + + /// \brief Check if the set contains a row. + Result Contains(const StructLike& row) const; + + /// \brief Check if the set is empty. + bool IsEmpty() const; + + /// \brief Get the number of elements in the set. + size_t Size() const; + + private: + /// \brief Transparent hash functor operating on StructLike. 
+ struct KeyHash { + using is_transparent = void; + size_t operator()(const std::unique_ptr& p) const noexcept; + size_t operator()(const StructLike& s) const noexcept; + }; + + /// \brief Transparent equality functor operating on StructLike. + struct KeyEqual { + using is_transparent = void; + bool operator()(const std::unique_ptr& lhs, + const std::unique_ptr& rhs) const noexcept; + bool operator()(const StructLike& lhs, + const std::unique_ptr& rhs) const noexcept; + bool operator()(const std::unique_ptr& lhs, + const StructLike& rhs) const noexcept; + }; + + /// \brief Create an arena-owned deep copy of a StructLike row. + Result> MakeArenaRow(const StructLike& row) const; + + /// \brief Deep copy a scalar value, copying strings into arena and + /// recursively materializing nested types. + Result DeepCopyScalar(const Scalar& scalar) const; + + /// \brief Copy string data into the arena and return a view into it. + std::string_view CopyToArena(std::string_view src) const; + + std::vector> field_types_; + mutable std::pmr::monotonic_buffer_resource arena_{64 * 1024}; + std::unordered_set, KeyHash, KeyEqual> set_; +}; + +/// \brief Type alias for StructLikeSet without schema validation, for callers +/// that guarantee schema conformance. +using UncheckedStructLikeSet = StructLikeSet; + +extern template class ICEBERG_EXTERN_TEMPLATE_CLASS_EXPORT StructLikeSet; +extern template class ICEBERG_EXTERN_TEMPLATE_CLASS_EXPORT StructLikeSet; + +} // namespace iceberg