diff --git a/src/iceberg/schema.cc b/src/iceberg/schema.cc
index c31554ea..0b67dfb0 100644
--- a/src/iceberg/schema.cc
+++ b/src/iceberg/schema.cc
@@ -20,12 +20,57 @@
 #include "iceberg/schema.h"
 
 #include <format>
+#include <functional>
 
 #include "iceberg/type.h"
 #include "iceberg/util/formatter.h"  // IWYU pragma: keep
+#include "iceberg/util/macros.h"
+#include "iceberg/util/visit_type.h"
 
 namespace iceberg {
 
+/// \brief Visitor that indexes every field reachable from a type by its field id.
+class IdToFieldVisitor {
+ public:
+  explicit IdToFieldVisitor(
+      std::unordered_map<int32_t, std::reference_wrapper<const SchemaField>>&
+          id_to_field);
+  Status Visit(const PrimitiveType& type);
+  Status Visit(const NestedType& type);
+
+ private:
+  std::unordered_map<int32_t, std::reference_wrapper<const SchemaField>>& id_to_field_;
+};
+
+/// \brief Visitor that maps canonical and short field paths to field ids.
+class NameToIdVisitor {
+ public:
+  explicit NameToIdVisitor(
+      std::unordered_map<std::string, int32_t, StringHash, std::equal_to<>>& name_to_id,
+      bool case_sensitive = true,
+      std::function<std::string(std::string_view)> quoting_func = {});
+  Status Visit(const ListType& type, const std::string& path,
+               const std::string& short_path);
+  Status Visit(const MapType& type, const std::string& path,
+               const std::string& short_path);
+  Status Visit(const StructType& type, const std::string& path,
+               const std::string& short_path);
+  Status Visit(const PrimitiveType& type, const std::string& path,
+               const std::string& short_path);
+  /// \brief Merge short names into the target map without overriding canonical names.
+  void Finish();
+
+ private:
+  std::string BuildPath(std::string_view prefix, std::string_view field_name,
+                        bool case_sensitive);
+
+ private:
+  bool case_sensitive_;
+  std::unordered_map<std::string, int32_t, StringHash, std::equal_to<>>& name_to_id_;
+  std::unordered_map<std::string, int32_t, StringHash, std::equal_to<>> short_name_to_id_;
+  std::function<std::string(std::string_view)> quoting_func_;
+};
+
 Schema::Schema(std::vector<SchemaField> fields, std::optional<int32_t> schema_id)
     : StructType(std::move(fields)), schema_id_(schema_id) {}
 
@@ -44,4 +89,187 @@ bool Schema::Equals(const Schema& other) const {
   return schema_id_ == other.schema_id_ && fields_ == other.fields_;
 }
 
+Result<std::optional<std::reference_wrapper<const SchemaField>>> Schema::FindFieldByName(
+    std::string_view name, bool case_sensitive) const {
+  if (case_sensitive) {
+    ICEBERG_RETURN_UNEXPECTED(InitNameToIdMap());
+    auto it = name_to_id_.find(name);
+    if (it == name_to_id_.end()) return std::nullopt;
+    return FindFieldById(it->second);
+  }
+  ICEBERG_RETURN_UNEXPECTED(InitLowerCaseNameToIdMap());
+  auto it = lowercase_name_to_id_.find(StringUtils::ToLower(name));
+  if (it == lowercase_name_to_id_.end()) return std::nullopt;
+  return FindFieldById(it->second);
+}
+
+Status Schema::InitIdToFieldMap() const {
+  if (!id_to_field_.empty()) {
+    return {};
+  }
+  // Populate a scratch map and publish it only on success. Filling the member in place
+  // would leave a partial cache behind on failure, which later calls would trust.
+  decltype(id_to_field_) id_to_field;
+  IdToFieldVisitor visitor(id_to_field);
+  ICEBERG_RETURN_UNEXPECTED(VisitTypeInline(*this, &visitor));
+  id_to_field_ = std::move(id_to_field);
+  return {};
+}
+
+Status Schema::InitNameToIdMap() const {
+  if (!name_to_id_.empty()) {
+    return {};
+  }
+  // Build into a scratch map so that a failed visit cannot poison the cache.
+  decltype(name_to_id_) name_to_id;
+  NameToIdVisitor visitor(name_to_id, /*case_sensitive=*/true);
+  ICEBERG_RETURN_UNEXPECTED(
+      VisitTypeInline(*this, &visitor, /*path=*/"", /*short_path=*/""));
+  visitor.Finish();
+  name_to_id_ = std::move(name_to_id);
+  return {};
+}
+
+Status Schema::InitLowerCaseNameToIdMap() const {
+  if (!lowercase_name_to_id_.empty()) {
+    return {};
+  }
+  // Build into a scratch map so that a failed visit cannot poison the cache.
+  decltype(lowercase_name_to_id_) lowercase_name_to_id;
+  NameToIdVisitor visitor(lowercase_name_to_id, /*case_sensitive=*/false);
+  ICEBERG_RETURN_UNEXPECTED(
+      VisitTypeInline(*this, &visitor, /*path=*/"", /*short_path=*/""));
+  visitor.Finish();
+  lowercase_name_to_id_ = std::move(lowercase_name_to_id);
+  return {};
+}
+
+Result<std::optional<std::reference_wrapper<const SchemaField>>> Schema::FindFieldById(
+    int32_t field_id) const {
+  ICEBERG_RETURN_UNEXPECTED(InitIdToFieldMap());
+  auto it = id_to_field_.find(field_id);
+  if (it == id_to_field_.end()) {
+    return std::nullopt;
+  }
+  return it->second;
+}
+
+IdToFieldVisitor::IdToFieldVisitor(
+    std::unordered_map<int32_t, std::reference_wrapper<const SchemaField>>& id_to_field)
+    : id_to_field_(id_to_field) {}
+
+Status IdToFieldVisitor::Visit(const PrimitiveType& /*type*/) { return {}; }
+
+Status IdToFieldVisitor::Visit(const NestedType& type) {
+  for (const auto& field : type.fields()) {
+    auto it = id_to_field_.try_emplace(field.field_id(), std::cref(field));
+    if (!it.second) {
+      return InvalidSchema("Duplicate field id found: {}", field.field_id());
+    }
+    ICEBERG_RETURN_UNEXPECTED(VisitTypeInline(*field.type(), this));
+  }
+  return {};
+}
+
+NameToIdVisitor::NameToIdVisitor(
+    std::unordered_map<std::string, int32_t, StringHash, std::equal_to<>>& name_to_id,
+    bool case_sensitive, std::function<std::string(std::string_view)> quoting_func)
+    : case_sensitive_(case_sensitive),
+      name_to_id_(name_to_id),
+      quoting_func_(std::move(quoting_func)) {}
+
+Status NameToIdVisitor::Visit(const ListType& type, const std::string& path,
+                              const std::string& short_path) {
+  const auto& field = type.fields()[0];
+  std::string new_path = BuildPath(path, field.name(), case_sensitive_);
+  std::string new_short_path;
+  // A struct element is addressable without the element segment in its short name.
+  if (field.type()->type_id() == TypeId::kStruct) {
+    new_short_path = short_path;
+  } else {
+    new_short_path = BuildPath(short_path, field.name(), case_sensitive_);
+  }
+  auto it = name_to_id_.try_emplace(new_path, field.field_id());
+  if (!it.second) {
+    return InvalidSchema("Duplicate path found: {}, prev id: {}, curr id: {}",
+                         it.first->first, it.first->second, field.field_id());
+  }
+  short_name_to_id_.try_emplace(new_short_path, field.field_id());
+  ICEBERG_RETURN_UNEXPECTED(
+      VisitTypeInline(*field.type(), this, new_path, new_short_path));
+  return {};
+}
+
+Status NameToIdVisitor::Visit(const MapType& type, const std::string& path,
+                              const std::string& short_path) {
+  std::string new_path, new_short_path;
+  const auto& fields = type.fields();
+  for (const auto& field : fields) {
+    new_path = BuildPath(path, field.name(), case_sensitive_);
+    // A struct value is addressable without the value segment in its short name.
+    if (field.name() == MapType::kValueName &&
+        field.type()->type_id() == TypeId::kStruct) {
+      new_short_path = short_path;
+    } else {
+      new_short_path = BuildPath(short_path, field.name(), case_sensitive_);
+    }
+    auto it = name_to_id_.try_emplace(new_path, field.field_id());
+    if (!it.second) {
+      return InvalidSchema("Duplicate path found: {}, prev id: {}, curr id: {}",
+                           it.first->first, it.first->second, field.field_id());
+    }
+    short_name_to_id_.try_emplace(new_short_path, field.field_id());
+    ICEBERG_RETURN_UNEXPECTED(
+        VisitTypeInline(*field.type(), this, new_path, new_short_path));
+  }
+  return {};
+}
+
+Status NameToIdVisitor::Visit(const StructType& type, const std::string& path,
+                              const std::string& short_path) {
+  const auto& fields = type.fields();
+  std::string new_path, new_short_path;
+  for (const auto& field : fields) {
+    new_path = BuildPath(path, field.name(), case_sensitive_);
+    new_short_path = BuildPath(short_path, field.name(), case_sensitive_);
+    auto it = name_to_id_.try_emplace(new_path, field.field_id());
+    if (!it.second) {
+      return InvalidSchema("Duplicate path found: {}, prev id: {}, curr id: {}",
+                           it.first->first, it.first->second, field.field_id());
+    }
+    short_name_to_id_.try_emplace(new_short_path, field.field_id());
+    ICEBERG_RETURN_UNEXPECTED(
+        VisitTypeInline(*field.type(), this, new_path, new_short_path));
+  }
+  return {};
+}
+
+Status NameToIdVisitor::Visit(const PrimitiveType& /*type*/,
+                              const std::string& /*path*/,
+                              const std::string& /*short_path*/) {
+  return {};
+}
+
+std::string NameToIdVisitor::BuildPath(std::string_view prefix,
+                                       std::string_view field_name, bool case_sensitive) {
+  std::string quoted_name;
+  if (!quoting_func_) {
+    quoted_name = std::string(field_name);
+  } else {
+    quoted_name = quoting_func_(field_name);
+  }
+  if (case_sensitive) {
+    return prefix.empty() ? quoted_name : std::string(prefix) + "." + quoted_name;
+  }
+  return prefix.empty() ? StringUtils::ToLower(quoted_name)
+                        : std::string(prefix) + "." + StringUtils::ToLower(quoted_name);
+}
+
+void NameToIdVisitor::Finish() {
+  // Canonical names win: a short name is only added when it does not collide.
+  for (const auto& [short_name, field_id] : short_name_to_id_) {
+    name_to_id_.try_emplace(short_name, field_id);
+  }
+}
+
 }  // namespace iceberg
diff --git a/src/iceberg/schema.h b/src/iceberg/schema.h
index 490acb6d..1de829c8 100644
--- a/src/iceberg/schema.h
+++ b/src/iceberg/schema.h
@@ -29,8 +29,10 @@
 #include <vector>
 
 #include "iceberg/iceberg_export.h"
+#include "iceberg/result.h"
 #include "iceberg/schema_field.h"
 #include "iceberg/type.h"
+#include "iceberg/util/string_util.h"
 
 namespace iceberg {
 
@@ -54,13 +56,53 @@ class ICEBERG_EXPORT Schema : public StructType {
 
   [[nodiscard]] std::string ToString() const override;
 
+  /// \brief Find the SchemaField by field name.
+  ///
+  /// Short names for maps and lists are included for any name that does not conflict
+  /// with a canonical name. For example, a list 'l' of structs with field 'x' produces
+  /// the short name 'l.x' in addition to the canonical name 'l.element.x'. Likewise, a
+  /// map 'm' whose value is a struct with field 'x' produces the short name 'm.x' in
+  /// addition to the canonical name 'm.value.x'.
+  ///
+  /// FIXME: Currently only handles ASCII lowercase conversion; extend to support
+  /// non-ASCII characters (e.g., using std::towlower or ICU)
+  ///
+  /// \param name Canonical or short dotted path of the field.
+  /// \param case_sensitive Whether the path is matched case-sensitively.
+  /// \return The field, std::nullopt if no field has that name, or an error if the
+  /// schema contains duplicate paths or field ids.
+  [[nodiscard]] Result<std::optional<std::reference_wrapper<const SchemaField>>>
+  FindFieldByName(std::string_view name, bool case_sensitive = true) const;
+
+  /// \brief Find the SchemaField by field id.
+  ///
+  /// \return The field, std::nullopt if no field has that id, or an error if the
+  /// schema contains duplicate field ids.
+  [[nodiscard]] Result<std::optional<std::reference_wrapper<const SchemaField>>>
+  FindFieldById(int32_t field_id) const;
+
   friend bool operator==(const Schema& lhs, const Schema& rhs) { return lhs.Equals(rhs); }
 
  private:
   /// \brief Compare two schemas for equality.
   [[nodiscard]] bool Equals(const Schema& other) const;
 
+  // TODO(nullccxsy): Address potential concurrency issues in lazy initialization (e.g.,
+  // use std::call_once)
+  Status InitIdToFieldMap() const;
+  Status InitNameToIdMap() const;
+  Status InitLowerCaseNameToIdMap() const;
+
   const std::optional<int32_t> schema_id_;
+  /// Mapping from field id to field.
+  mutable std::unordered_map<int32_t, std::reference_wrapper<const SchemaField>>
+      id_to_field_;
+  /// Mapping from field name to field id.
+  mutable std::unordered_map<std::string, int32_t, StringHash, std::equal_to<>>
+      name_to_id_;
+  /// Mapping from lowercased field name to field id.
+  mutable std::unordered_map<std::string, int32_t, StringHash, std::equal_to<>>
+      lowercase_name_to_id_;
 };
 
 }  // namespace iceberg
diff --git a/src/iceberg/util/macros.h b/src/iceberg/util/macros.h
index 4d687bf5..3519c9a6 100644
--- a/src/iceberg/util/macros.h
+++ b/src/iceberg/util/macros.h
@@ -19,10 +19,15 @@
 
 #pragma once
 
-#define ICEBERG_RETURN_UNEXPECTED(result)          \
-  if (!result) [[unlikely]] {                      \
-    return std::unexpected<Error>(result.error()); \
-  }
+// Evaluates `result` exactly once and returns its error from the enclosing function
+// if it holds one. Expands to a single statement; the caller supplies the `;`.
+#define ICEBERG_RETURN_UNEXPECTED(result)                 \
+  do {                                                    \
+    auto&& result_name = (result);                        \
+    if (!result_name) [[unlikely]] {                      \
+      return std::unexpected<Error>(result_name.error()); \
+    }                                                     \
+  } while (false)
 
 #define ICEBERG_ASSIGN_OR_RAISE_IMPL(result_name, lhs, rexpr) \
   auto&& result_name = (rexpr);                               \
diff --git a/src/iceberg/util/string_util.h b/src/iceberg/util/string_util.h
index 558fc293..a0fccfd3 100644
--- a/src/iceberg/util/string_util.h
+++ b/src/iceberg/util/string_util.h
@@ -46,4 +46,17 @@ class ICEBERG_EXPORT StringUtils {
   }
 };
 
+/// \brief Transparent hash function that supports std::string_view as lookup key
+///
+/// Enables std::unordered_map to directly accept std::string_view lookup keys
+/// without creating temporary std::string objects, using C++20's transparent lookup.
+struct ICEBERG_EXPORT StringHash {
+  using hash_type = std::hash<std::string_view>;
+  using is_transparent = void;
+
+  std::size_t operator()(std::string_view str) const { return hash_type{}(str); }
+  std::size_t operator()(const char* str) const { return hash_type{}(str); }
+  std::size_t operator()(const std::string& str) const { return hash_type{}(str); }
+};
+
 }  // namespace iceberg
diff --git a/test/schema_test.cc b/test/schema_test.cc
index 93239597..d282cea9 100644
--- a/test/schema_test.cc
+++ b/test/schema_test.cc
@@ -81,3 +81,419 @@ TEST(SchemaTest, Equality) {
   ASSERT_EQ(schema1, schema5);
   ASSERT_EQ(schema5, schema1);
 }
+
+class BasicShortNameTest : public ::testing::Test {
+ protected:
+  void SetUp() override {
+    field1_ = std::make_unique<iceberg::SchemaField>(1, "Foo", iceberg::int32(), true);
+    field2_ = std::make_unique<iceberg::SchemaField>(2, "Bar", iceberg::string(), true);
+    field3_ = std::make_unique<iceberg::SchemaField>(3, "Foobar", iceberg::int32(), true);
+
+    auto structtype = std::make_shared<iceberg::StructType>(
+        std::vector<iceberg::SchemaField>{*field1_, *field2_, *field3_});
+
+    field4_ = std::make_unique<iceberg::SchemaField>(4, "element", structtype, false);
+
+    auto listype = std::make_shared<iceberg::ListType>(*field4_);
+
+    field5_ = std::make_unique<iceberg::SchemaField>(5, "key", iceberg::int32(), false);
+    field6_ = std::make_unique<iceberg::SchemaField>(6, "value", listype, false);
+
+    auto maptype = std::make_shared<iceberg::MapType>(*field5_, *field6_);
+
+    field7_ = std::make_unique<iceberg::SchemaField>(7, "Value", maptype, false);
+
+    schema_ =
+        std::make_unique<iceberg::Schema>(std::vector<iceberg::SchemaField>{*field7_}, 1);
+  }
+
+  std::unique_ptr<iceberg::Schema> schema_;
+  std::unique_ptr<iceberg::SchemaField> field1_;
+  std::unique_ptr<iceberg::SchemaField> field2_;
+  std::unique_ptr<iceberg::SchemaField> field3_;
+  std::unique_ptr<iceberg::SchemaField> field4_;
+  std::unique_ptr<iceberg::SchemaField> field5_;
+  std::unique_ptr<iceberg::SchemaField> field6_;
+  std::unique_ptr<iceberg::SchemaField> field7_;
+};
+
+TEST_F(BasicShortNameTest, TestFindById) {
+  ASSERT_THAT(schema_->FindFieldById(7), ::testing::Optional(*field7_));
+  ASSERT_THAT(schema_->FindFieldById(6), ::testing::Optional(*field6_));
+  ASSERT_THAT(schema_->FindFieldById(5), ::testing::Optional(*field5_));
+  ASSERT_THAT(schema_->FindFieldById(4), ::testing::Optional(*field4_));
+  ASSERT_THAT(schema_->FindFieldById(3), ::testing::Optional(*field3_));
+  ASSERT_THAT(schema_->FindFieldById(2), ::testing::Optional(*field2_));
+  ASSERT_THAT(schema_->FindFieldById(1), ::testing::Optional(*field1_));
+
+  ASSERT_THAT(schema_->FindFieldById(10), ::testing::Optional(std::nullopt));
+}
+
+TEST_F(BasicShortNameTest, TestFindByName) {
+  ASSERT_THAT(schema_->FindFieldByName("Value"), ::testing::Optional(*field7_));
+  ASSERT_THAT(schema_->FindFieldByName("Value.value"), ::testing::Optional(*field6_));
+  ASSERT_THAT(schema_->FindFieldByName("Value.key"), ::testing::Optional(*field5_));
+  ASSERT_THAT(schema_->FindFieldByName("Value.value.element"),
+              ::testing::Optional(*field4_));
+  ASSERT_THAT(schema_->FindFieldByName("Value.value.element.Foobar"),
+              ::testing::Optional(*field3_));
+  ASSERT_THAT(schema_->FindFieldByName("Value.value.element.Bar"),
+              ::testing::Optional(*field2_));
+  ASSERT_THAT(schema_->FindFieldByName("Value.value.element.Foo"),
+              ::testing::Optional(*field1_));
+
+  ASSERT_THAT(schema_->FindFieldByName("Value.value.element.FoO"),
+              ::testing::Optional(std::nullopt));
+}
+
+TEST_F(BasicShortNameTest, TestFindByNameCaseInsensitive) {
+  ASSERT_THAT(schema_->FindFieldByName("vALue", false), ::testing::Optional(*field7_));
+  ASSERT_THAT(schema_->FindFieldByName("vALue.VALUE", false),
+              ::testing::Optional(*field6_));
+  ASSERT_THAT(schema_->FindFieldByName("valUe.kEy", false),
+              ::testing::Optional(*field5_));
+  ASSERT_THAT(schema_->FindFieldByName("vaLue.vAlue.elEment", false),
+              ::testing::Optional(*field4_));
+  ASSERT_THAT(schema_->FindFieldByName("vaLue.vAlue.eLement.fOObar", false),
+              ::testing::Optional(*field3_));
+  ASSERT_THAT(schema_->FindFieldByName("valUe.vaLUe.elemEnt.Bar", false),
+              ::testing::Optional(*field2_));
+  ASSERT_THAT(schema_->FindFieldByName("valUe.valUe.ELEMENT.FOO", false),
+              ::testing::Optional(*field1_));
+  ASSERT_THAT(schema_->FindFieldByName("valUe.valUe.ELEMENT.FO", false),
+              ::testing::Optional(std::nullopt));
+}
+
+TEST_F(BasicShortNameTest, TestFindByShortNameCaseInsensitive) {
+  ASSERT_THAT(schema_->FindFieldByName("vaLue.value.FOO", false),
+              ::testing::Optional(*field1_));
+  ASSERT_THAT(schema_->FindFieldByName("Value.value.Bar", false),
+              ::testing::Optional(*field2_));
+  ASSERT_THAT(schema_->FindFieldByName("Value.value.FooBAR", false),
+              ::testing::Optional(*field3_));
+  ASSERT_THAT(schema_->FindFieldByName("Value.value.FooBAR.a", false),
+              ::testing::Optional(std::nullopt));
+}
+
+class ComplexShortNameTest : public ::testing::Test {
+ protected:
+  void SetUp() override {
+    field1_ = std::make_unique<iceberg::SchemaField>(1, "Foo", iceberg::int32(), true);
+    field2_ = std::make_unique<iceberg::SchemaField>(2, "Bar", iceberg::string(), true);
+    field3_ = std::make_unique<iceberg::SchemaField>(3, "Foobar", iceberg::int32(), true);
+
+    auto structtype = std::make_shared<iceberg::StructType>(
+        std::vector<iceberg::SchemaField>{*field1_, *field2_, *field3_});
+
+    field4_ = std::make_unique<iceberg::SchemaField>(4, "element", structtype, false);
+
+    auto listype = std::make_shared<iceberg::ListType>(*field4_);
+
+    field5_ =
+        std::make_unique<iceberg::SchemaField>(5, "First_child", iceberg::int32(), false);
+    field6_ = std::make_unique<iceberg::SchemaField>(6, "Second_child", listype, false);
+
+    auto structtype2 = std::make_shared<iceberg::StructType>(
+        std::vector<iceberg::SchemaField>{*field5_, *field6_});
+
+    field7_ = std::make_unique<iceberg::SchemaField>(7, "key", iceberg::int32(), false);
+    field8_ = std::make_unique<iceberg::SchemaField>(8, "value", structtype2, false);
+
+    auto maptype = std::make_shared<iceberg::MapType>(*field7_, *field8_);
+
+    field9_ = std::make_unique<iceberg::SchemaField>(9, "Map", maptype, false);
+
+    schema_ =
+        std::make_unique<iceberg::Schema>(std::vector<iceberg::SchemaField>{*field9_}, 1);
+  }
+
+  std::unique_ptr<iceberg::Schema> schema_;
+  std::unique_ptr<iceberg::SchemaField> field1_;
+  std::unique_ptr<iceberg::SchemaField> field2_;
+  std::unique_ptr<iceberg::SchemaField> field3_;
+  std::unique_ptr<iceberg::SchemaField> field4_;
+  std::unique_ptr<iceberg::SchemaField> field5_;
+  std::unique_ptr<iceberg::SchemaField> field6_;
+  std::unique_ptr<iceberg::SchemaField> field7_;
+  std::unique_ptr<iceberg::SchemaField> field8_;
+  std::unique_ptr<iceberg::SchemaField> field9_;
+};
+
+TEST_F(ComplexShortNameTest, TestFindById) {
+  ASSERT_THAT(schema_->FindFieldById(9), ::testing::Optional(*field9_));
+  ASSERT_THAT(schema_->FindFieldById(8), ::testing::Optional(*field8_));
+  ASSERT_THAT(schema_->FindFieldById(7), ::testing::Optional(*field7_));
+  ASSERT_THAT(schema_->FindFieldById(6), ::testing::Optional(*field6_));
+  ASSERT_THAT(schema_->FindFieldById(5), ::testing::Optional(*field5_));
+  ASSERT_THAT(schema_->FindFieldById(4), ::testing::Optional(*field4_));
+  ASSERT_THAT(schema_->FindFieldById(3), ::testing::Optional(*field3_));
+  ASSERT_THAT(schema_->FindFieldById(2), ::testing::Optional(*field2_));
+  ASSERT_THAT(schema_->FindFieldById(1), ::testing::Optional(*field1_));
+
+  ASSERT_THAT(schema_->FindFieldById(0), ::testing::Optional(std::nullopt));
+}
+
+TEST_F(ComplexShortNameTest, TestFindByName) {
+  ASSERT_THAT(schema_->FindFieldByName("Map"), ::testing::Optional(*field9_));
+  ASSERT_THAT(schema_->FindFieldByName("Map.value"), ::testing::Optional(*field8_));
+  ASSERT_THAT(schema_->FindFieldByName("Map.key"), ::testing::Optional(*field7_));
+  ASSERT_THAT(schema_->FindFieldByName("Map.value.Second_child"),
+              ::testing::Optional(*field6_));
+  ASSERT_THAT(schema_->FindFieldByName("Map.value.First_child"),
+              ::testing::Optional(*field5_));
+  ASSERT_THAT(schema_->FindFieldByName("Map.value.Second_child.element"),
+              ::testing::Optional(*field4_));
+  ASSERT_THAT(schema_->FindFieldByName("Map.value.Second_child.element.Foobar"),
+              ::testing::Optional(*field3_));
+  ASSERT_THAT(schema_->FindFieldByName("Map.value.Second_child.element.Bar"),
+              ::testing::Optional(*field2_));
+  ASSERT_THAT(schema_->FindFieldByName("Map.value.Second_child.element.Foo"),
+              ::testing::Optional(*field1_));
+  ASSERT_THAT(schema_->FindFieldByName("Map.value.Second_child.element.Fooo"),
+              ::testing::Optional(std::nullopt));
+}
+
+TEST_F(ComplexShortNameTest, TestFindByNameCaseInsensitive) {
+  ASSERT_THAT(schema_->FindFieldByName("map", false), ::testing::Optional(*field9_));
+  ASSERT_THAT(schema_->FindFieldByName("map.vALUE", false),
+              ::testing::Optional(*field8_));
+  ASSERT_THAT(schema_->FindFieldByName("map.Key", false), ::testing::Optional(*field7_));
+  ASSERT_THAT(schema_->FindFieldByName("map.Value.second_Child", false),
+              ::testing::Optional(*field6_));
+  ASSERT_THAT(schema_->FindFieldByName("map.Value.first_chIld", false),
+              ::testing::Optional(*field5_));
+  ASSERT_THAT(schema_->FindFieldByName("map.Value.second_child.Element", false),
+              ::testing::Optional(*field4_));
+  ASSERT_THAT(schema_->FindFieldByName("map.Value.second_child.Element.foobar", false),
+              ::testing::Optional(*field3_));
+  ASSERT_THAT(schema_->FindFieldByName("map.VaLue.second_child.Element.bar", false),
+              ::testing::Optional(*field2_));
+  ASSERT_THAT(schema_->FindFieldByName("map.value.Second_child.Element.foo", false),
+              ::testing::Optional(*field1_));
+  ASSERT_THAT(schema_->FindFieldByName("map.value.Second_child.Element.fooo", false),
+              ::testing::Optional(std::nullopt));
+}
+
+TEST_F(ComplexShortNameTest, TestFindByShortName) {
+  ASSERT_THAT(schema_->FindFieldByName("Map.Second_child"),
+              ::testing::Optional(*field6_));
+  ASSERT_THAT(schema_->FindFieldByName("Map.First_child"), ::testing::Optional(*field5_));
+  ASSERT_THAT(schema_->FindFieldByName("Map.Second_child.Foobar"),
+              ::testing::Optional(*field3_));
+  ASSERT_THAT(schema_->FindFieldByName("Map.Second_child.Bar"),
+              ::testing::Optional(*field2_));
+  ASSERT_THAT(schema_->FindFieldByName("Map.Second_child.Foo"),
+              ::testing::Optional(*field1_));
+  ASSERT_THAT(schema_->FindFieldByName("Map.Second_child.aaa"),
+              ::testing::Optional(std::nullopt));
+}
+
+TEST_F(ComplexShortNameTest, TestFindByShortNameCaseInsensitive) {
+  ASSERT_THAT(schema_->FindFieldByName("map.second_child", false),
+              ::testing::Optional(*field6_));
+  ASSERT_THAT(schema_->FindFieldByName("map.first_child", false),
+              ::testing::Optional(*field5_));
+  ASSERT_THAT(schema_->FindFieldByName("map.second_child.foobar", false),
+              ::testing::Optional(*field3_));
+  ASSERT_THAT(schema_->FindFieldByName("map.second_child.bar", false),
+              ::testing::Optional(*field2_));
+  ASSERT_THAT(schema_->FindFieldByName("map.second_child.foo", false),
+              ::testing::Optional(*field1_));
+  ASSERT_THAT(schema_->FindFieldByName("Map.Second_child.aaa", false),
+              ::testing::Optional(std::nullopt));
+}
+
+class ComplexMapStructShortNameTest : public ::testing::Test {
+ protected:
+  void SetUp() override {
+    exp_inner_key_key_ =
+        std::make_unique<iceberg::SchemaField>(10, "inner_key", iceberg::int32(), false);
+    exp_inner_key_value_ = std::make_unique<iceberg::SchemaField>(
+        11, "inner_value", iceberg::int32(), false);
+    auto inner_struct_type_key_ = std::make_shared<iceberg::StructType>(
+        std::vector<iceberg::SchemaField>{*exp_inner_key_key_, *exp_inner_key_value_});
+
+    exp_inner_value_k_ =
+        std::make_unique<iceberg::SchemaField>(12, "inner_k", iceberg::int32(), false);
+    exp_inner_value_v_ =
+        std::make_unique<iceberg::SchemaField>(13, "inner_v", iceberg::int32(), false);
+    auto inner_struct_type_value_ = std::make_shared<iceberg::StructType>(
+        std::vector<iceberg::SchemaField>{*exp_inner_value_k_, *exp_inner_value_v_});
+
+    exp_key_struct_key_ =
+        std::make_unique<iceberg::SchemaField>(14, "key", iceberg::int32(), false);
+    exp_key_struct_value_ = std::make_unique<iceberg::SchemaField>(
+        15, "value", inner_struct_type_key_, false);
+    auto key_struct_type_ = std::make_shared<iceberg::StructType>(
+        std::vector<iceberg::SchemaField>{*exp_key_struct_key_, *exp_key_struct_value_});
+
+    exp_value_struct_key_ =
+        std::make_unique<iceberg::SchemaField>(16, "key", iceberg::int32(), false);
+    exp_value_struct_value_ = std::make_unique<iceberg::SchemaField>(
+        17, "value", inner_struct_type_value_, false);
+    auto value_struct_type_ =
+        std::make_shared<iceberg::StructType>(std::vector<iceberg::SchemaField>{
+            *exp_value_struct_key_, *exp_value_struct_value_});
+
+    exp_map_key_ =
+        std::make_unique<iceberg::SchemaField>(18, "key", key_struct_type_, false);
+    exp_map_value_ =
+        std::make_unique<iceberg::SchemaField>(19, "value", value_struct_type_, false);
+    auto map_type_ = std::make_shared<iceberg::MapType>(*exp_map_key_, *exp_map_value_);
+
+    exp_field_a_ = std::make_unique<iceberg::SchemaField>(20, "a", map_type_, false);
+
+    schema_ = std::make_unique<iceberg::Schema>(
+        std::vector<iceberg::SchemaField>{*exp_field_a_}, 1);
+  }
+
+  std::unique_ptr<iceberg::Schema> schema_;
+  std::unique_ptr<iceberg::SchemaField> exp_inner_key_key_;
+  std::unique_ptr<iceberg::SchemaField> exp_inner_key_value_;
+  std::unique_ptr<iceberg::SchemaField> exp_inner_value_k_;
+  std::unique_ptr<iceberg::SchemaField> exp_inner_value_v_;
+  std::unique_ptr<iceberg::SchemaField> exp_key_struct_key_;
+  std::unique_ptr<iceberg::SchemaField> exp_key_struct_value_;
+  std::unique_ptr<iceberg::SchemaField> exp_value_struct_key_;
+  std::unique_ptr<iceberg::SchemaField> exp_value_struct_value_;
+  std::unique_ptr<iceberg::SchemaField> exp_map_key_;
+  std::unique_ptr<iceberg::SchemaField> exp_map_value_;
+  std::unique_ptr<iceberg::SchemaField> exp_field_a_;
+};
+
+TEST_F(ComplexMapStructShortNameTest, TestFindById) {
+  ASSERT_THAT(schema_->FindFieldById(20), ::testing::Optional(*exp_field_a_));
+  ASSERT_THAT(schema_->FindFieldById(19), ::testing::Optional(*exp_map_value_));
+  ASSERT_THAT(schema_->FindFieldById(18), ::testing::Optional(*exp_map_key_));
+  ASSERT_THAT(schema_->FindFieldById(17), ::testing::Optional(*exp_value_struct_value_));
+  ASSERT_THAT(schema_->FindFieldById(16), ::testing::Optional(*exp_value_struct_key_));
+  ASSERT_THAT(schema_->FindFieldById(15), ::testing::Optional(*exp_key_struct_value_));
+  ASSERT_THAT(schema_->FindFieldById(14), ::testing::Optional(*exp_key_struct_key_));
+  ASSERT_THAT(schema_->FindFieldById(13), ::testing::Optional(*exp_inner_value_v_));
+  ASSERT_THAT(schema_->FindFieldById(12), ::testing::Optional(*exp_inner_value_k_));
+  ASSERT_THAT(schema_->FindFieldById(11), ::testing::Optional(*exp_inner_key_value_));
+  ASSERT_THAT(schema_->FindFieldById(10), ::testing::Optional(*exp_inner_key_key_));
+}
+
+TEST_F(ComplexMapStructShortNameTest, TestFindByName) {
+  ASSERT_THAT(schema_->FindFieldByName("a"), ::testing::Optional(*exp_field_a_));
+  ASSERT_THAT(schema_->FindFieldByName("a.key"), ::testing::Optional(*exp_map_key_));
+  ASSERT_THAT(schema_->FindFieldByName("a.value"), ::testing::Optional(*exp_map_value_));
+  ASSERT_THAT(schema_->FindFieldByName("a.key.key"),
+              ::testing::Optional(*exp_key_struct_key_));
+  ASSERT_THAT(schema_->FindFieldByName("a.key.value"),
+              ::testing::Optional(*exp_key_struct_value_));
+  ASSERT_THAT(schema_->FindFieldByName("a.key.value.inner_key"),
+              ::testing::Optional(*exp_inner_key_key_));
+  ASSERT_THAT(schema_->FindFieldByName("a.key.value.inner_value"),
+              ::testing::Optional(*exp_inner_key_value_));
+  ASSERT_THAT(schema_->FindFieldByName("a.value.key"),
+              ::testing::Optional(*exp_value_struct_key_));
+  ASSERT_THAT(schema_->FindFieldByName("a.value.value"),
+              ::testing::Optional(*exp_value_struct_value_));
+  ASSERT_THAT(schema_->FindFieldByName("a.value.value.inner_k"),
+              ::testing::Optional(*exp_inner_value_k_));
+  ASSERT_THAT(schema_->FindFieldByName("a.value.value.inner_v"),
+              ::testing::Optional(*exp_inner_value_v_));
+}
+
+TEST_F(ComplexMapStructShortNameTest, TestFindByNameCaseInsensitive) {
+  ASSERT_THAT(schema_->FindFieldByName("A", false), ::testing::Optional(*exp_field_a_));
+  ASSERT_THAT(schema_->FindFieldByName("A.KEY", false),
+              ::testing::Optional(*exp_map_key_));
+  ASSERT_THAT(schema_->FindFieldByName("A.VALUE", false),
+              ::testing::Optional(*exp_map_value_));
+  ASSERT_THAT(schema_->FindFieldByName("A.KEY.KEY", false),
+              ::testing::Optional(*exp_key_struct_key_));
+  ASSERT_THAT(schema_->FindFieldByName("A.KEY.VALUE", false),
+              ::testing::Optional(*exp_key_struct_value_));
+  ASSERT_THAT(schema_->FindFieldByName("A.KEY.VALUE.INNER_KEY", false),
+              ::testing::Optional(*exp_inner_key_key_));
+  ASSERT_THAT(schema_->FindFieldByName("A.KEY.VALUE.INNER_VALUE", false),
+              ::testing::Optional(*exp_inner_key_value_));
+  ASSERT_THAT(schema_->FindFieldByName("A.VALUE.KEY", false),
+              ::testing::Optional(*exp_value_struct_key_));
+  ASSERT_THAT(schema_->FindFieldByName("A.VALUE.VALUE", false),
+              ::testing::Optional(*exp_value_struct_value_));
+  ASSERT_THAT(schema_->FindFieldByName("A.VALUE.VALUE.INNER_K", false),
+              ::testing::Optional(*exp_inner_value_k_));
+  ASSERT_THAT(schema_->FindFieldByName("A.VALUE.VALUE.INNER_V", false),
+              ::testing::Optional(*exp_inner_value_v_));
+}
+
+TEST_F(ComplexMapStructShortNameTest, TestInvalidPaths) {
+  ASSERT_THAT(schema_->FindFieldByName("a.invalid"), ::testing::Optional(std::nullopt));
+  ASSERT_THAT(schema_->FindFieldByName("a.key.invalid"),
+              ::testing::Optional(std::nullopt));
+  ASSERT_THAT(schema_->FindFieldByName("a.value.invalid"),
+              ::testing::Optional(std::nullopt));
+  ASSERT_THAT(schema_->FindFieldByName("A.KEY.VALUE.INVALID", false),
+              ::testing::Optional(std::nullopt));
+}
+
+TEST(SchemaTest, DuplicatePathErrorCaseSensitive) {
+  auto nested_b = std::make_unique<iceberg::SchemaField>(2, "b", iceberg::int32(), false);
+  auto nested_struct =
+      std::make_shared<iceberg::StructType>(std::vector<iceberg::SchemaField>{*nested_b});
+  auto a = std::make_unique<iceberg::SchemaField>(1, "a", nested_struct, false);
+  auto duplicate_ab =
+      std::make_unique<iceberg::SchemaField>(3, "a.b", iceberg::int32(), false);
+  auto schema = std::make_unique<iceberg::Schema>(
+      std::vector<iceberg::SchemaField>{*a, *duplicate_ab}, 1);
+
+  auto result = schema->FindFieldByName("a.b", /*case_sensitive=*/true);
+  ASSERT_FALSE(result.has_value());
+  EXPECT_EQ(result.error().kind, iceberg::ErrorKind::kInvalidSchema);
+  EXPECT_THAT(result.error().message,
+              ::testing::HasSubstr("Duplicate path found: a.b, prev id: 2, curr id: 3"));
+
+  // A retry must fail the same way rather than hit a partially built name index.
+  ASSERT_FALSE(schema->FindFieldByName("a.b").has_value());
+}
+
+TEST(SchemaTest, DuplicatePathErrorCaseInsensitive) {
+  auto nested_b = std::make_unique<iceberg::SchemaField>(2, "B", iceberg::int32(), false);
+  auto nested_struct =
+      std::make_shared<iceberg::StructType>(std::vector<iceberg::SchemaField>{*nested_b});
+  auto a = std::make_unique<iceberg::SchemaField>(1, "A", nested_struct, false);
+  auto duplicate_ab =
+      std::make_unique<iceberg::SchemaField>(3, "a.b", iceberg::int32(), false);
+  auto schema = std::make_unique<iceberg::Schema>(
+      std::vector<iceberg::SchemaField>{*a, *duplicate_ab}, 1);
+
+  auto result = schema->FindFieldByName("A.B", /*case_sensitive=*/false);
+  ASSERT_FALSE(result.has_value());
+  EXPECT_EQ(result.error().kind, iceberg::ErrorKind::kInvalidSchema);
+  EXPECT_THAT(result.error().message,
+              ::testing::HasSubstr("Duplicate path found: a.b, prev id: 2, curr id: 3"));
+}
+
+TEST(SchemaTest, NestedDuplicateFieldIdError) {
+  // Outer struct with field ID 1
+  auto outer_field =
+      std::make_unique<iceberg::SchemaField>(1, "outer", iceberg::int32(), true);
+
+  // Inner struct with duplicate field ID 1
+  auto inner_field =
+      std::make_unique<iceberg::SchemaField>(1, "inner", iceberg::string(), true);
+  auto inner_struct = std::make_shared<iceberg::StructType>(
+      std::vector<iceberg::SchemaField>{*inner_field});
+
+  // Nested field with inner struct
+  auto nested_field =
+      std::make_unique<iceberg::SchemaField>(2, "nested", inner_struct, true);
+
+  // Schema with outer and nested fields
+  auto schema = std::make_unique<iceberg::Schema>(
+      std::vector<iceberg::SchemaField>{*outer_field, *nested_field}, 1);
+
+  // Attempt to find a field, which should trigger duplicate ID detection
+  auto result = schema->FindFieldById(1);
+  ASSERT_FALSE(result.has_value());
+  EXPECT_EQ(result.error().kind, iceberg::ErrorKind::kInvalidSchema);
+  EXPECT_THAT(result.error().message,
+              ::testing::HasSubstr("Duplicate field id found: 1"));
+
+  // A retry must fail the same way rather than hit a partially built id index.
+  ASSERT_FALSE(schema->FindFieldById(1).has_value());
+}