Skip to content
Open
221 changes: 221 additions & 0 deletions src/iceberg/schema.cc
Original file line number Diff line number Diff line change
Expand Up @@ -19,13 +19,57 @@

#include "iceberg/schema.h"

#include <algorithm>
#include <format>
#include <functional>

#include "iceberg/type.h"
#include "iceberg/util/formatter.h" // IWYU pragma: keep
#include "iceberg/util/macros.h"
#include "iceberg/util/visit_type.h"

namespace iceberg {

class IdToFieldVisitor {
public:
explicit IdToFieldVisitor(
std::unordered_map<int32_t, std::reference_wrapper<const SchemaField>>&
id_to_field);
Status Visit(const Type& type);
Status VisitNestedType(const Type& type);

private:
std::unordered_map<int32_t, std::reference_wrapper<const SchemaField>>& id_to_field_;
};

class NameToIdVisitor {
public:
explicit NameToIdVisitor(
std::unordered_map<std::string, int32_t, string_hash, std::equal_to<>>& name_to_id,
bool case_sensitive = true,
std::function<std::string(std::string_view)> quoting_func = {});
Status Visit(const ListType& type, const std::string& path,
const std::string& short_path);
Status Visit(const MapType& type, const std::string& path,
const std::string& short_path);
Status Visit(const StructType& type, const std::string& path,
const std::string& short_path);
Status Visit(const PrimitiveType& type, const std::string& path,
const std::string& short_path);
void Finish();

private:
std::string BuildPath(std::string_view prefix, std::string_view field_name,
bool case_sensitive);

private:
bool case_sensitive_;
std::unordered_map<std::string, int32_t, string_hash, std::equal_to<>>& name_to_id_;
std::unordered_map<std::string, int32_t, string_hash, std::equal_to<>>
short_name_to_id_;
std::function<std::string(std::string_view)> quoting_func_;
};

/// \brief Construct a schema from its top-level fields and an optional schema id.
///
/// The field vector is moved into the StructType base; the id is stored as given.
Schema::Schema(std::vector<SchemaField> fields, std::optional<int32_t> schema_id)
: StructType(std::move(fields)), schema_id_(schema_id) {}

Expand All @@ -44,4 +88,181 @@ bool Schema::Equals(const Schema& other) const {
return schema_id_ == other.schema_id_ && fields_ == other.fields_;
}

/// \brief Look up a field by its (possibly dotted) name.
///
/// The matching name index is built on first use. A case-insensitive lookup lower-cases
/// `name` and consults the lower-cased index instead of the exact one.
///
/// \return The field, std::nullopt when the name is unknown, or the error produced while
/// building one of the lookup indexes.
Result<std::optional<std::reference_wrapper<const SchemaField>>> Schema::FindFieldByName(
    std::string_view name, bool case_sensitive) const {
  if (!case_sensitive) {
    ICEBERG_RETURN_UNEXPECTED(InitLowerCaseNameToIdMap());
    const auto entry = lowercase_name_to_id_.find(StringUtils::ToLower(name));
    if (entry != lowercase_name_to_id_.end()) {
      return FindFieldById(entry->second);
    }
    return std::nullopt;
  }

  ICEBERG_RETURN_UNEXPECTED(InitNameToIdMap());
  const auto entry = name_to_id_.find(name);
  if (entry != name_to_id_.end()) {
    return FindFieldById(entry->second);
  }
  return std::nullopt;
}

/// \brief Build the field-id -> field index on first use.
///
/// Does nothing when the index is already populated. Otherwise the whole schema is
/// traversed and the index is published only if the traversal succeeds.
///
/// \return Success, or the error reported by the traversal (e.g. a duplicate field id).
Status Schema::InitIdToFieldMap() const {
  if (!id_to_field_.empty()) {
    return {};
  }
  // Collect into a local map first. Filling the member directly would leave it
  // half-populated (and therefore non-empty) when the traversal fails part-way, and the
  // emptiness check above would then treat that partial index as complete and hide the
  // error on every later call.
  std::unordered_map<int32_t, std::reference_wrapper<const SchemaField>> staged;
  IdToFieldVisitor visitor(staged);
  ICEBERG_RETURN_UNEXPECTED(VisitTypeInline(*this, &visitor));
  id_to_field_ = std::move(staged);
  return {};
}

/// \brief Build the case-sensitive name -> field-id index on first use.
///
/// Does nothing when the index is already populated. Otherwise the whole schema is
/// traversed, short names are merged in, and the index is published only if the
/// traversal succeeds.
///
/// \return Success, or the error reported by the traversal (e.g. a duplicate path).
Status Schema::InitNameToIdMap() const {
  if (!name_to_id_.empty()) {
    return {};
  }
  // Collect into a local map first. Filling the member directly would leave it
  // half-populated (and therefore non-empty) when the traversal fails part-way, and the
  // emptiness check above would then treat that partial index as complete and hide the
  // error on every later call.
  std::unordered_map<std::string, int32_t, string_hash, std::equal_to<>> staged;
  NameToIdVisitor visitor(staged, /*case_sensitive=*/true);
  ICEBERG_RETURN_UNEXPECTED(
      VisitTypeInline(*this, &visitor, /*path=*/"", /*short_path=*/""));
  visitor.Finish();
  name_to_id_ = std::move(staged);
  return {};
}

/// \brief Build the lower-cased name -> field-id index on first use.
///
/// Does nothing when the index is already populated. Otherwise the whole schema is
/// traversed with case-insensitive path building, short names are merged in, and the
/// index is published only if the traversal succeeds.
///
/// \return Success, or the error reported by the traversal (e.g. two paths that collide
/// once lower-cased).
Status Schema::InitLowerCaseNameToIdMap() const {
  if (!lowercase_name_to_id_.empty()) {
    return {};
  }
  // Collect into a local map first. Filling the member directly would leave it
  // half-populated (and therefore non-empty) when the traversal fails part-way, and the
  // emptiness check above would then treat that partial index as complete and hide the
  // error on every later call.
  std::unordered_map<std::string, int32_t, string_hash, std::equal_to<>> staged;
  NameToIdVisitor visitor(staged, /*case_sensitive=*/false);
  ICEBERG_RETURN_UNEXPECTED(
      VisitTypeInline(*this, &visitor, /*path=*/"", /*short_path=*/""));
  visitor.Finish();
  lowercase_name_to_id_ = std::move(staged);
  return {};
}

/// \brief Look up a field by its field id.
///
/// The id index is built on first use.
///
/// \return The field, std::nullopt when no field has this id, or the error produced
/// while building the index.
Result<std::optional<std::reference_wrapper<const SchemaField>>> Schema::FindFieldById(
    int32_t field_id) const {
  ICEBERG_RETURN_UNEXPECTED(InitIdToFieldMap());
  if (const auto entry = id_to_field_.find(field_id); entry != id_to_field_.end()) {
    return entry->second;
  }
  return std::nullopt;
}

IdToFieldVisitor::IdToFieldVisitor(
std::unordered_map<int32_t, std::reference_wrapper<const SchemaField>>& id_to_field)
: id_to_field_(id_to_field) {}

/// \brief Visit a type: nested types have their fields indexed, any other type is
/// accepted without adding entries.
Status IdToFieldVisitor::Visit(const Type& type) {
  if (!type.is_nested()) {
    return {};
  }
  ICEBERG_RETURN_UNEXPECTED(VisitNestedType(type));
  return {};
}

/// \brief Index every field of a nested type by id, then descend into each field's type.
///
/// \return InvalidSchema when a field id is already present in the map, otherwise the
/// first error from a recursive visit, otherwise success.
Status IdToFieldVisitor::VisitNestedType(const Type& type) {
  const auto& nested_type = iceberg::internal::checked_cast<const NestedType&>(type);
  for (const auto& child : nested_type.fields()) {
    // try_emplace leaves an existing entry untouched, so `inserted == false` means the
    // id was already claimed by a previously visited field.
    const bool inserted =
        id_to_field_.try_emplace(child.field_id(), std::cref(child)).second;
    if (!inserted) {
      return InvalidSchema("Duplicate field id found: {}", child.field_id());
    }
    ICEBERG_RETURN_UNEXPECTED(Visit(*child.type()));
  }
  return {};
}

/// \brief Create a visitor that writes name -> id entries into `name_to_id`.
///
/// \param name_to_id Destination map; only a reference is kept.
/// \param case_sensitive When false, path segments are lower-cased as they are built.
/// \param quoting_func Optional per-name transformation; may be empty.
NameToIdVisitor::NameToIdVisitor(
    std::unordered_map<std::string, int32_t, string_hash, std::equal_to<>>& name_to_id,
    bool case_sensitive, std::function<std::string(std::string_view)> quoting_func)
    // Initializers are listed in member declaration order (case_sensitive_ is declared
    // before name_to_id_): that is the order the compiler actually uses, and listing
    // them differently triggers -Wreorder.
    : case_sensitive_(case_sensitive),
      name_to_id_(name_to_id),
      quoting_func_(std::move(quoting_func)) {}

/// \brief Register the element field of a list and descend into the element type.
///
/// The canonical path always gains the element field's name. The short path gains it
/// only when the element is not a struct, so that fields of a struct element become
/// reachable directly under the list's own short path.
///
/// \return InvalidSchema when the canonical path is already registered, otherwise the
/// result of visiting the element type.
Status NameToIdVisitor::Visit(const ListType& type, const std::string& path,
                              const std::string& short_path) {
  const auto& element = type.fields()[0];
  const std::string element_path = BuildPath(path, element.name(), case_sensitive_);
  const std::string element_short_path =
      element.type()->type_id() == TypeId::kStruct
          ? short_path
          : BuildPath(short_path, element.name(), case_sensitive_);

  const auto [entry, inserted] =
      name_to_id_.try_emplace(element_path, element.field_id());
  if (!inserted) {
    return InvalidSchema("Duplicate path found: {}, prev id: {}, curr id: {}",
                         entry->first, entry->second, element.field_id());
  }
  // Short names are best-effort: an already registered short path is kept as is.
  short_name_to_id_.try_emplace(element_short_path, element.field_id());

  ICEBERG_RETURN_UNEXPECTED(
      VisitTypeInline(*element.type(), this, element_path, element_short_path));
  return {};
}

/// \brief Register the fields of a map and descend into their types.
///
/// The canonical path always gains the field's name. The short path gains it unless the
/// field is the map value and that value is a struct, so that fields of a struct value
/// become reachable directly under the map's own short path.
///
/// \return InvalidSchema when a canonical path is already registered, otherwise the
/// first error from a recursive visit, otherwise success.
Status NameToIdVisitor::Visit(const MapType& type, const std::string& path,
                              const std::string& short_path) {
  for (const auto& entry_field : type.fields()) {
    const std::string field_path = BuildPath(path, entry_field.name(), case_sensitive_);
    const bool is_struct_value = entry_field.name() == MapType::kValueName &&
                                 entry_field.type()->type_id() == TypeId::kStruct;
    const std::string field_short_path =
        is_struct_value ? short_path
                        : BuildPath(short_path, entry_field.name(), case_sensitive_);

    const auto [entry, inserted] =
        name_to_id_.try_emplace(field_path, entry_field.field_id());
    if (!inserted) {
      return InvalidSchema("Duplicate path found: {}, prev id: {}, curr id: {}",
                           entry->first, entry->second, entry_field.field_id());
    }
    // Short names are best-effort: an already registered short path is kept as is.
    short_name_to_id_.try_emplace(field_short_path, entry_field.field_id());

    ICEBERG_RETURN_UNEXPECTED(
        VisitTypeInline(*entry_field.type(), this, field_path, field_short_path));
  }
  return {};
}

/// \brief Register every field of a struct and descend into the field types.
///
/// Both the canonical and the short path are extended with the field's name.
///
/// \return InvalidSchema when a canonical path is already registered, otherwise the
/// first error from a recursive visit, otherwise success.
Status NameToIdVisitor::Visit(const StructType& type, const std::string& path,
                              const std::string& short_path) {
  for (const auto& member : type.fields()) {
    const std::string member_path = BuildPath(path, member.name(), case_sensitive_);
    const std::string member_short_path =
        BuildPath(short_path, member.name(), case_sensitive_);

    const auto [entry, inserted] =
        name_to_id_.try_emplace(member_path, member.field_id());
    if (!inserted) {
      return InvalidSchema("Duplicate path found: {}, prev id: {}, curr id: {}",
                           entry->first, entry->second, member.field_id());
    }
    // Short names are best-effort: an already registered short path is kept as is.
    short_name_to_id_.try_emplace(member_short_path, member.field_id());

    ICEBERG_RETURN_UNEXPECTED(
        VisitTypeInline(*member.type(), this, member_path, member_short_path));
  }
  return {};
}

/// \brief Visiting a primitive type registers nothing and always succeeds; none of the
/// arguments are used.
Status NameToIdVisitor::Visit([[maybe_unused]] const PrimitiveType& type,
                              [[maybe_unused]] const std::string& path,
                              [[maybe_unused]] const std::string& short_path) {
  return {};
}

/// \brief Build the dotted path of a field located under `prefix`.
///
/// The field name is passed through the quoting function when one was supplied, and is
/// lower-cased when `case_sensitive` is false. `prefix` itself is appended to verbatim.
///
/// \param prefix Path of the enclosing field; empty at the top level.
/// \param field_name Name of the field to append.
/// \param case_sensitive When false, the appended segment is lower-cased.
/// \return The segment alone when `prefix` is empty, otherwise "<prefix>.<segment>".
std::string NameToIdVisitor::BuildPath(std::string_view prefix,
                                       std::string_view field_name, bool case_sensitive) {
  std::string segment =
      quoting_func_ ? quoting_func_(field_name) : std::string(field_name);
  if (!case_sensitive) {
    // Lower-casing happens after quoting, so the quoted form is what gets folded.
    segment = StringUtils::ToLower(segment);
  }
  if (prefix.empty()) {
    return segment;
  }
  return std::string(prefix) + "." + segment;
}

/// \brief Merge the collected short names into the destination map.
///
/// try_emplace does not overwrite, so a short name that equals an already registered key
/// is dropped and the existing mapping is kept.
void NameToIdVisitor::Finish() {
  for (const auto& [short_name, field_id] : short_name_to_id_) {
    name_to_id_.try_emplace(short_name, field_id);
  }
}

} // namespace iceberg
35 changes: 35 additions & 0 deletions src/iceberg/schema.h
Original file line number Diff line number Diff line change
Expand Up @@ -29,8 +29,10 @@
#include <vector>

#include "iceberg/iceberg_export.h"
#include "iceberg/result.h"
#include "iceberg/schema_field.h"
#include "iceberg/type.h"
#include "iceberg/util/string_util.h"

namespace iceberg {

Expand All @@ -54,13 +56,46 @@ class ICEBERG_EXPORT Schema : public StructType {

[[nodiscard]] std::string ToString() const override;

/// \brief Find the SchemaField by field name.
///
/// Short names for maps and lists are included for any name that does not conflict with
/// a canonical name. For example, a list 'l' of structs with field 'x' will produce the
/// short name 'l.x' in addition to the canonical name 'l.element.x'. A map 'm' whose
/// value is a struct with field 'x' will produce the short name 'm.x' in addition to the
/// canonical name 'm.value.x'.
/// FIXME: Currently only handles ASCII lowercase conversion; extend to support
/// non-ASCII characters (e.g., using std::towlower or ICU)
[[nodiscard]] Result<std::optional<std::reference_wrapper<const SchemaField>>>
FindFieldByName(std::string_view name, bool case_sensitive = true) const;

/// \brief Find the SchemaField by field id.
[[nodiscard]] Result<std::optional<std::reference_wrapper<const SchemaField>>>
FindFieldById(int32_t field_id) const;

friend bool operator==(const Schema& lhs, const Schema& rhs) { return lhs.Equals(rhs); }

private:
/// Mapping from field id to field.
mutable std::unordered_map<int32_t, std::reference_wrapper<const SchemaField>>
id_to_field_;
/// Mapping from field name to field id.
mutable std::unordered_map<std::string, int32_t, string_hash, std::equal_to<>>
name_to_id_;
/// Mapping from lowercased field name to field id
mutable std::unordered_map<std::string, int32_t, string_hash, std::equal_to<>>
lowercase_name_to_id_;

private:
/// \brief Compare two schemas for equality.
[[nodiscard]] bool Equals(const Schema& other) const;

const std::optional<int32_t> schema_id_;

// TODO(nullccxsy): Address potential concurrency issues in lazy initialization (e.g.,
// use std::call_once)
Status InitIdToFieldMap() const;
Status InitNameToIdMap() const;
Status InitLowerCaseNameToIdMap() const;
};

} // namespace iceberg
11 changes: 7 additions & 4 deletions src/iceberg/util/macros.h
Original file line number Diff line number Diff line change
Expand Up @@ -19,10 +19,13 @@

#pragma once

// Early-return helper: when `result` tests false, return std::unexpected<Error> built
// from result.error() out of the enclosing function.
// NOTE(review): `result` is pasted into the expansion twice and without parentheses, so
// an argument that is a call expression is evaluated once for the test and again on the
// failure path.
#define ICEBERG_RETURN_UNEXPECTED(result) \
if (!result) [[unlikely]] { \
return std::unexpected<Error>(result.error()); \
}
// Early-return helper: evaluates `result` exactly once, binds it to a local, and when it
// tests false returns std::unexpected<Error> built from its error() out of the enclosing
// function. The do/while(false) wrapper keeps the local scoped to the expansion.
// NOTE(review): the expansion already ends in ';', so `ICEBERG_RETURN_UNEXPECTED(x);`
// leaves an extra empty statement and the macro cannot be the unbraced body of an `if`
// that has an `else` -- confirm whether the trailing ';' is intentional.
#define ICEBERG_RETURN_UNEXPECTED(result) \
do { \
auto&& iceberg_temp_result = (result); \
if (!iceberg_temp_result) [[unlikely]] { \
return std::unexpected<Error>(iceberg_temp_result.error()); \
} \
} while (false);

#define ICEBERG_ASSIGN_OR_RAISE_IMPL(result_name, lhs, rexpr) \
auto&& result_name = (rexpr); \
Expand Down
11 changes: 11 additions & 0 deletions src/iceberg/util/string_util.h
Original file line number Diff line number Diff line change
Expand Up @@ -46,4 +46,15 @@ class ICEBERG_EXPORT StringUtils {
}
};

/// \brief String hasher that opts in to heterogeneous (transparent) lookup.
///
/// Used together with std::equal_to<> as the hash of a std::unordered_map keyed by
/// std::string, it lets C++20 lookups take a std::string_view directly instead of first
/// materializing a temporary std::string.
struct ICEBERG_EXPORT string_hash {
  /// Marker type that enables the heterogeneous lookup overloads.
  using is_transparent = void;
  /// Underlying hasher; every key is hashed through its std::string_view form.
  using hash_type = std::hash<std::string_view>;

  std::size_t operator()(std::string_view str) const {
    const hash_type hasher{};
    return hasher(str);
  }
};

} // namespace iceberg
Loading
Loading