From f1c6d91999186ffabf9b7be390ccfcaf9e2de768 Mon Sep 17 00:00:00 2001 From: Innocent Date: Sun, 1 Feb 2026 17:22:16 -0700 Subject: [PATCH 1/3] feat: add json serde for expressions --- src/iceberg/expression/json_serde.cc | 457 ++++++++++++++++--- src/iceberg/expression/json_serde_internal.h | 75 ++- src/iceberg/expression/predicate.h | 10 +- src/iceberg/test/expression_json_test.cc | 385 +++++++++++++++- src/iceberg/type_fwd.h | 3 + 5 files changed, 841 insertions(+), 89 deletions(-) diff --git a/src/iceberg/expression/json_serde.cc b/src/iceberg/expression/json_serde.cc index 0fd7dd01d..e11237871 100644 --- a/src/iceberg/expression/json_serde.cc +++ b/src/iceberg/expression/json_serde.cc @@ -17,47 +17,116 @@ * under the License. */ -#include #include #include -#include #include #include #include "iceberg/expression/json_serde_internal.h" #include "iceberg/expression/literal.h" +#include "iceberg/expression/predicate.h" +#include "iceberg/expression/term.h" +#include "iceberg/transform.h" #include "iceberg/util/checked_cast.h" #include "iceberg/util/json_util_internal.h" #include "iceberg/util/macros.h" +#include "iceberg/util/transform_util.h" namespace iceberg { namespace { +// JSON field names +constexpr std::string_view kType = "type"; +constexpr std::string_view kTerm = "term"; +constexpr std::string_view kTransform = "transform"; +constexpr std::string_view kValue = "value"; +constexpr std::string_view kValues = "values"; +constexpr std::string_view kLeft = "left"; +constexpr std::string_view kRight = "right"; +constexpr std::string_view kChild = "child"; // Expression type strings -constexpr std::string_view kTypeTrue = "true"; -constexpr std::string_view kTypeFalse = "false"; -constexpr std::string_view kTypeEq = "eq"; -constexpr std::string_view kTypeAnd = "and"; -constexpr std::string_view kTypeOr = "or"; -constexpr std::string_view kTypeNot = "not"; -constexpr std::string_view kTypeIn = "in"; -constexpr std::string_view kTypeNotIn = "not-in"; -constexpr 
std::string_view kTypeLt = "lt"; -constexpr std::string_view kTypeLtEq = "lt-eq"; -constexpr std::string_view kTypeGt = "gt"; -constexpr std::string_view kTypeGtEq = "gt-eq"; -constexpr std::string_view kTypeNotEq = "not-eq"; -constexpr std::string_view kTypeStartsWith = "starts-with"; -constexpr std::string_view kTypeNotStartsWith = "not-starts-with"; -constexpr std::string_view kTypeIsNull = "is-null"; -constexpr std::string_view kTypeNotNull = "not-null"; -constexpr std::string_view kTypeIsNan = "is-nan"; -constexpr std::string_view kTypeNotNan = "not-nan"; -constexpr std::string_view kTypeCount = "count"; -constexpr std::string_view kTypeCountNull = "count-null"; -constexpr std::string_view kTypeCountStar = "count-star"; -constexpr std::string_view kTypeMin = "min"; -constexpr std::string_view kTypeMax = "max"; +constexpr std::string_view kTrue = "true"; +constexpr std::string_view kFalse = "false"; +constexpr std::string_view kEq = "eq"; +constexpr std::string_view kAnd = "and"; +constexpr std::string_view kOr = "or"; +constexpr std::string_view kNot = "not"; +constexpr std::string_view kIn = "in"; +constexpr std::string_view kNotIn = "not-in"; +constexpr std::string_view kLt = "lt"; +constexpr std::string_view kLtEq = "lt-eq"; +constexpr std::string_view kGt = "gt"; +constexpr std::string_view kGtEq = "gt-eq"; +constexpr std::string_view kNotEq = "not-eq"; +constexpr std::string_view kStartsWith = "starts-with"; +constexpr std::string_view kNotStartsWith = "not-starts-with"; +constexpr std::string_view kIsNull = "is-null"; +constexpr std::string_view kNotNull = "not-null"; +constexpr std::string_view kIsNan = "is-nan"; +constexpr std::string_view kNotNan = "not-nan"; +constexpr std::string_view kCount = "count"; +constexpr std::string_view kCountNull = "count-null"; +constexpr std::string_view kCountStar = "count-star"; +constexpr std::string_view kMin = "min"; +constexpr std::string_view kMax = "max"; +constexpr std::string_view kLiteral = "literal"; 
+constexpr std::string_view kReference = "reference"; + +/// Helper to build the transform JSON object shared by Unbound/BoundTransform +nlohmann::json MakeTransformJson(std::string_view transform_str, + std::string_view ref_name) { + nlohmann::json json; + json[kType] = kTransform; + json[kTransform] = transform_str; + json[kTerm] = ref_name; + return json; +} + +/// Helper to check if a JSON term represents a transform +bool IsTransformTerm(const nlohmann::json& json) { + return json.is_object() && json.contains(kType) && + json[kType].get() == kTransform && json.contains(kTerm); +} + +/// Template helper to create predicates from JSON with the appropriate term type +template +Result> PredicateFromJson( + Expression::Operation op, std::shared_ptr> term, + const nlohmann::json& json) { + if (IsUnaryOperation(op)) { + if (json.contains(kValue)) [[unlikely]] { + return JsonParseError("Unary predicate has invalid 'value' field: {}", + SafeDumpJson(json)); + } + if (json.contains(kValues)) [[unlikely]] { + return JsonParseError("Unary predicate has invalid 'values' field: {}", + SafeDumpJson(json)); + } + return UnboundPredicateImpl::Make(op, std::move(term)); + } + + if (IsSetOperation(op)) { + std::vector literals; + if (!json.contains(kValues) || !json[kValues].is_array() || json.contains(kValue)) + [[unlikely]] { + return JsonParseError("Missing or invalid 'values' field for set operation: {}", + SafeDumpJson(json)); + } + for (const auto& val : json[kValues]) { + ICEBERG_ASSIGN_OR_RAISE(auto lit, LiteralFromJson(val)); + literals.push_back(std::move(lit)); + } + return UnboundPredicateImpl::Make(op, std::move(term), std::move(literals)); + } + + // Literal predicate + if (!json.contains(kValue) || json.contains(kValues)) [[unlikely]] { + return JsonParseError("Missing 'value' field for literal predicate: {}", + SafeDumpJson(json)); + } + ICEBERG_ASSIGN_OR_RAISE(auto literal, LiteralFromJson(json[kValue])); + return UnboundPredicateImpl::Make(op, std::move(term), 
std::move(literal)); +} } // namespace bool IsUnaryOperation(Expression::Operation op) { @@ -83,67 +152,333 @@ bool IsSetOperation(Expression::Operation op) { } Result OperationTypeFromJson(const nlohmann::json& json) { - if (!json.is_string()) { + if (!json.is_string()) [[unlikely]] { return JsonParseError("Unable to create operation. Json value is not a string"); } auto typeStr = json.get(); - if (typeStr == kTypeTrue) return Expression::Operation::kTrue; - if (typeStr == kTypeFalse) return Expression::Operation::kFalse; - if (typeStr == kTypeAnd) return Expression::Operation::kAnd; - if (typeStr == kTypeOr) return Expression::Operation::kOr; - if (typeStr == kTypeNot) return Expression::Operation::kNot; - if (typeStr == kTypeEq) return Expression::Operation::kEq; - if (typeStr == kTypeNotEq) return Expression::Operation::kNotEq; - if (typeStr == kTypeLt) return Expression::Operation::kLt; - if (typeStr == kTypeLtEq) return Expression::Operation::kLtEq; - if (typeStr == kTypeGt) return Expression::Operation::kGt; - if (typeStr == kTypeGtEq) return Expression::Operation::kGtEq; - if (typeStr == kTypeIn) return Expression::Operation::kIn; - if (typeStr == kTypeNotIn) return Expression::Operation::kNotIn; - if (typeStr == kTypeIsNull) return Expression::Operation::kIsNull; - if (typeStr == kTypeNotNull) return Expression::Operation::kNotNull; - if (typeStr == kTypeIsNan) return Expression::Operation::kIsNan; - if (typeStr == kTypeNotNan) return Expression::Operation::kNotNan; - if (typeStr == kTypeStartsWith) return Expression::Operation::kStartsWith; - if (typeStr == kTypeNotStartsWith) return Expression::Operation::kNotStartsWith; - if (typeStr == kTypeCount) return Expression::Operation::kCount; - if (typeStr == kTypeCountNull) return Expression::Operation::kCountNull; - if (typeStr == kTypeCountStar) return Expression::Operation::kCountStar; - if (typeStr == kTypeMin) return Expression::Operation::kMin; - if (typeStr == kTypeMax) return 
Expression::Operation::kMax; + if (typeStr == kTrue) return Expression::Operation::kTrue; + if (typeStr == kFalse) return Expression::Operation::kFalse; + if (typeStr == kAnd) return Expression::Operation::kAnd; + if (typeStr == kOr) return Expression::Operation::kOr; + if (typeStr == kNot) return Expression::Operation::kNot; + if (typeStr == kEq) return Expression::Operation::kEq; + if (typeStr == kNotEq) return Expression::Operation::kNotEq; + if (typeStr == kLt) return Expression::Operation::kLt; + if (typeStr == kLtEq) return Expression::Operation::kLtEq; + if (typeStr == kGt) return Expression::Operation::kGt; + if (typeStr == kGtEq) return Expression::Operation::kGtEq; + if (typeStr == kIn) return Expression::Operation::kIn; + if (typeStr == kNotIn) return Expression::Operation::kNotIn; + if (typeStr == kIsNull) return Expression::Operation::kIsNull; + if (typeStr == kNotNull) return Expression::Operation::kNotNull; + if (typeStr == kIsNan) return Expression::Operation::kIsNan; + if (typeStr == kNotNan) return Expression::Operation::kNotNan; + if (typeStr == kStartsWith) return Expression::Operation::kStartsWith; + if (typeStr == kNotStartsWith) return Expression::Operation::kNotStartsWith; + if (typeStr == kCount) return Expression::Operation::kCount; + if (typeStr == kCountNull) return Expression::Operation::kCountNull; + if (typeStr == kCountStar) return Expression::Operation::kCountStar; + if (typeStr == kMin) return Expression::Operation::kMin; + if (typeStr == kMax) return Expression::Operation::kMax; return JsonParseError("Unknown expression type: {}", typeStr); } -nlohmann::json ToJson(Expression::Operation op) { +Result ToJson(Expression::Operation op) { std::string json(ToString(op)); std::ranges::transform(json, json.begin(), [](unsigned char c) -> char { return (c == '_') ? 
'-' : static_cast(std::tolower(c)); }); + return nlohmann::json(std::move(json)); +} + +Result ToJson(const NamedReference& ref) { + return nlohmann::json(ref.name()); +} + +Result> NamedReferenceFromJson( + const nlohmann::json& json) { + if (json.is_object() && json.contains(kType) && + json[kType].get() == kReference && json.contains(kTerm)) { + return NamedReference::Make(json[kTerm].get()); + } + if (!json.is_string()) [[unlikely]] { + return JsonParseError("Expected string for named reference"); + } + return NamedReference::Make(json.get()); +} + +Result ToJson(const UnboundTransform& transform) { + auto& mut = const_cast(transform); + return MakeTransformJson(transform.transform()->ToString(), mut.reference()->name()); +} + +Result ToJson(const BoundReference& ref) { + return nlohmann::json(ref.name()); +} + +Result ToJson(const BoundTransform& transform) { + auto& mut = const_cast(transform); + return MakeTransformJson(transform.transform()->ToString(), mut.reference()->name()); +} + +Result> UnboundTransformFromJson( + const nlohmann::json& json) { + if (IsTransformTerm(json)) { + ICEBERG_ASSIGN_OR_RAISE(auto transform_str, + GetJsonValue(json, kTransform)); + ICEBERG_ASSIGN_OR_RAISE(auto transform, TransformFromString(transform_str)); + ICEBERG_ASSIGN_OR_RAISE(auto ref, NamedReferenceFromJson(json[kTerm])); + return UnboundTransform::Make(std::move(ref), std::move(transform)); + } + return JsonParseError("Invalid unbound transform json: {}", SafeDumpJson(json)); +} + +Result ToJson(const Literal& literal) { + if (literal.IsNull()) { + return nlohmann::json(nullptr); + } + + const auto type_id = literal.type()->type_id(); + const auto& value = literal.value(); + + switch (type_id) { + case TypeId::kBoolean: + return nlohmann::json(std::get(value)); + case TypeId::kInt: + return nlohmann::json(std::get(value)); + case TypeId::kDate: + return nlohmann::json(TransformUtil::HumanDay(std::get(value))); + case TypeId::kLong: + return 
nlohmann::json(std::get(value)); + case TypeId::kTime: + return nlohmann::json(TransformUtil::HumanTime(std::get(value))); + case TypeId::kTimestamp: + return nlohmann::json(TransformUtil::HumanTimestamp(std::get(value))); + case TypeId::kTimestampTz: + return nlohmann::json( + TransformUtil::HumanTimestampWithZone(std::get(value))); + case TypeId::kFloat: + return nlohmann::json(std::get(value)); + case TypeId::kDouble: + return nlohmann::json(std::get(value)); + case TypeId::kString: + return nlohmann::json(std::get(value)); + case TypeId::kBinary: + case TypeId::kFixed: { + // base 16 encoding for binary data + const auto& bytes = std::get>(value); + std::string hex; + hex.reserve(bytes.size() * 2); + for (uint8_t byte : bytes) { + hex += std::format("{:02X}", byte); + } + return nlohmann::json(std::move(hex)); + } + case TypeId::kDecimal: { + return nlohmann::json(literal.ToString()); + } + case TypeId::kUuid: + return nlohmann::json(std::get(value).ToString()); + default: + return NotSupported("Unsupported literal type for JSON serialization: {}", + literal.type()->ToString()); + } +} + +Result LiteralFromJson(const nlohmann::json& json) { + // Unwrap {"type": "literal", "value": } wrapper + if (json.is_object() && json.contains(kType) && + json[kType].get() == kLiteral && json.contains(kValue)) { + return LiteralFromJson(json[kValue]); + } + if (json.is_null()) { + return Literal::Null(nullptr); + } + if (json.is_boolean()) { + return Literal::Boolean(json.get()); + } + if (json.is_number_integer()) { + return Literal::Long(json.get()); + } + if (json.is_number_float()) { + return Literal::Double(json.get()); + } + if (json.is_string()) { + // All strings are returned as String literals. + // Conversion to binary/date/time/etc. happens during binding + // when schema type information is available. 
+ return Literal::String(json.get()); + } + return JsonParseError("Unsupported literal JSON type"); +} + +Result ToJson(const Term& term) { + switch (term.kind()) { + case Term::Kind::kReference: + if (term.is_unbound()) + return ToJson(internal::checked_cast(term)); + return ToJson(internal::checked_cast(term)); + case Term::Kind::kTransform: + if (term.is_unbound()) + return ToJson(internal::checked_cast(term)); + return ToJson(internal::checked_cast(term)); + default: + return NotSupported("Unsupported term kind for JSON serialization"); + } +} + +Result ToJson(const UnboundPredicate& pred) { + nlohmann::json json; + ICEBERG_ASSIGN_OR_RAISE(json[kType], ToJson(pred.op())); + + ICEBERG_ASSIGN_OR_RAISE(json[kTerm], ToJson(pred.predicate_term())); + std::span literals = pred.literals(); + + if (IsSetOperation(pred.op())) { + nlohmann::json values = nlohmann::json::array(); + for (const auto& lit : literals) { + ICEBERG_ASSIGN_OR_RAISE(auto lit_json, ToJson(lit)); + values.push_back(std::move(lit_json)); + } + json[kValues] = std::move(values); + } else if (!literals.empty()) { + ICEBERG_DCHECK(literals.size() == 1, + "Expected exactly one literal for non-set predicate"); + ICEBERG_ASSIGN_OR_RAISE(json[kValue], ToJson(literals[0])); + } + return json; +} + +Result ToJson(const BoundPredicate& pred) { + nlohmann::json json; + ICEBERG_ASSIGN_OR_RAISE(json[kType], ToJson(pred.op())); + ICEBERG_ASSIGN_OR_RAISE(json[kTerm], ToJson(*pred.term())); + + if (IsSetOperation(pred.op())) { + const auto& sp = internal::checked_cast(pred); + nlohmann::json values = nlohmann::json::array(); + for (const auto& lit : sp.literal_set()) { + ICEBERG_ASSIGN_OR_RAISE(auto lit_json, ToJson(lit)); + values.push_back(std::move(lit_json)); + } + json[kValues] = std::move(values); + } else if (!IsUnaryOperation(pred.op())) { + const auto& lp = internal::checked_cast(pred); + ICEBERG_ASSIGN_OR_RAISE(json[kValue], ToJson(lp.literal())); + } return json; } +Result> UnboundPredicateFromJson( + 
const nlohmann::json& json) { + if (!json.contains(kType) || !json.contains(kTerm)) [[unlikely]] { + return JsonParseError( + "Invalid predicate JSON: unexpected 'type' or 'term' field : {}", + SafeDumpJson(json)); + } + ICEBERG_ASSIGN_OR_RAISE(auto op, OperationTypeFromJson(json[kType])); + + const auto& term_json = json[kTerm]; + + if (IsTransformTerm(term_json)) { + ICEBERG_ASSIGN_OR_RAISE(auto term, UnboundTransformFromJson(term_json)); + return PredicateFromJson(op, std::move(term), json); + } + + ICEBERG_ASSIGN_OR_RAISE(auto term, NamedReferenceFromJson(term_json)); + return PredicateFromJson(op, std::move(term), json); +} + Result> ExpressionFromJson(const nlohmann::json& json) { - // Handle boolean + // Handle boolean constants if (json.is_boolean()) { return json.get() ? internal::checked_pointer_cast(True::Instance()) : internal::checked_pointer_cast(False::Instance()); } - return JsonParseError("Only booleans are currently supported."); + if (json.is_string()) { + auto s = json.get(); + std::ranges::transform(s, s.begin(), [](unsigned char c) -> char { + return static_cast(std::tolower(c)); + }); + if (s == kTrue) return internal::checked_pointer_cast(True::Instance()); + if (s == kFalse) return internal::checked_pointer_cast(False::Instance()); + } + + if (!json.is_object() || !json.contains(kType)) [[unlikely]] { + return JsonParseError("expresion JSON must be an object with a 'type' field: {}", + SafeDumpJson(json)); + } + + ICEBERG_ASSIGN_OR_RAISE(auto op, OperationTypeFromJson(json[kType])); + + switch (op) { + case Expression::Operation::kAnd: { + if (!json.contains(kLeft) || !json.contains(kRight)) [[unlikely]] { + return JsonParseError("AND expression missing 'left' or 'right' field"); + } + ICEBERG_ASSIGN_OR_RAISE(auto left, ExpressionFromJson(json[kLeft])); + ICEBERG_ASSIGN_OR_RAISE(auto right, ExpressionFromJson(json[kRight])); + return And::Make(std::move(left), std::move(right)); + } + case Expression::Operation::kOr: { + if 
(!json.contains(kLeft) || !json.contains(kRight)) [[unlikely]] { + return JsonParseError("OR expression missing 'left' or 'right' field"); + } + ICEBERG_ASSIGN_OR_RAISE(auto left, ExpressionFromJson(json[kLeft])); + ICEBERG_ASSIGN_OR_RAISE(auto right, ExpressionFromJson(json[kRight])); + return Or::Make(std::move(left), std::move(right)); + } + case Expression::Operation::kNot: { + if (!json.contains(kChild)) [[unlikely]] { + return JsonParseError("NOT expression missing 'child' field"); + } + ICEBERG_ASSIGN_OR_RAISE(auto child, ExpressionFromJson(json[kChild])); + return Not::Make(std::move(child)); + } + default: + // All other operations are predicates however since binding does not happen during + // JSON deserialization, we will always deserialize to an unbound predicate. + return UnboundPredicateFromJson(json); + } } -nlohmann::json ToJson(const Expression& expr) { +Result ToJson(const Expression& expr) { switch (expr.op()) { case Expression::Operation::kTrue: - return true; - + return nlohmann::json(true); case Expression::Operation::kFalse: - return false; + return nlohmann::json(false); + case Expression::Operation::kAnd: { + const auto& and_expr = internal::checked_cast(expr); + nlohmann::json json; + ICEBERG_ASSIGN_OR_RAISE(json[kType], ToJson(expr.op())); + ICEBERG_ASSIGN_OR_RAISE(json[kLeft], ToJson(*and_expr.left())); + ICEBERG_ASSIGN_OR_RAISE(json[kRight], ToJson(*and_expr.right())); + return json; + } + case Expression::Operation::kOr: { + const auto& or_expr = internal::checked_cast(expr); + nlohmann::json json; + ICEBERG_ASSIGN_OR_RAISE(json[kType], ToJson(expr.op())); + ICEBERG_ASSIGN_OR_RAISE(json[kLeft], ToJson(*or_expr.left())); + ICEBERG_ASSIGN_OR_RAISE(json[kRight], ToJson(*or_expr.right())); + return json; + } + case Expression::Operation::kNot: { + const auto& not_expr = internal::checked_cast(expr); + nlohmann::json json; + ICEBERG_ASSIGN_OR_RAISE(json[kType], ToJson(expr.op())); + ICEBERG_ASSIGN_OR_RAISE(json[kChild], 
ToJson(*not_expr.child())); + return json; + } default: - // TODO(evindj): This code will be removed as we implemented the full expression - // serialization. - ICEBERG_CHECK_OR_DIE(false, "Only booleans are currently supported."); + if (expr.is_unbound_predicate()) + return ToJson(dynamic_cast(expr)); + if (expr.is_bound_predicate()) + return ToJson(dynamic_cast(expr)); + return NotSupported("Unsupported expression type for JSON serialization"); } } diff --git a/src/iceberg/expression/json_serde_internal.h b/src/iceberg/expression/json_serde_internal.h index e44234d39..323fdca22 100644 --- a/src/iceberg/expression/json_serde_internal.h +++ b/src/iceberg/expression/json_serde_internal.h @@ -42,7 +42,7 @@ ICEBERG_EXPORT Result OperationTypeFromJson( /// /// \param op The operation to convert /// \return The operation type string (e.g., "eq", "lt-eq", "is-null") -ICEBERG_EXPORT nlohmann::json ToJson(Expression::Operation op); +ICEBERG_EXPORT Result ToJson(Expression::Operation op); /// \brief Deserializes a JSON object into an Expression. /// @@ -54,8 +54,77 @@ ICEBERG_EXPORT Result> ExpressionFromJson( /// \brief Serializes an Expression into its JSON representation. /// /// \param expr The expression to serialize -/// \return A JSON object representing the expression -ICEBERG_EXPORT nlohmann::json ToJson(const Expression& expr); +/// \return A JSON object representing the expression, or an error +ICEBERG_EXPORT Result ToJson(const Expression& expr); + +/// \brief Deserializes a JSON object into a NamedReference. +/// +/// \param json A JSON object representing a named reference +/// \return A shared pointer to the deserialized NamedReference or an error +ICEBERG_EXPORT Result> NamedReferenceFromJson( + const nlohmann::json& json); + +/// \brief Serializes a NamedReference into its JSON representation. 
+/// +/// \param ref The named reference to serialize +/// \return A JSON object representing the named reference, or an error +ICEBERG_EXPORT Result ToJson(const NamedReference& ref); + +/// \brief Serializes an UnboundTransform into its JSON representation. +/// +/// \param transform The unbound transform to serialize +/// \return A JSON object representing the unbound transform, or an error +ICEBERG_EXPORT Result ToJson(const UnboundTransform& transform); + +/// \brief Deserializes a JSON object into an UnboundTransform. +/// +/// \param json A JSON object representing an unbound transform +/// \return A shared pointer to the deserialized UnboundTransform or an error +ICEBERG_EXPORT Result> UnboundTransformFromJson( + const nlohmann::json& json); + +/// \brief Serializes a Literal into its JSON representation. +/// +/// \param literal The literal to serialize +/// \return A JSON value representing the literal, or an error +ICEBERG_EXPORT Result ToJson(const Literal& literal); + +/// \brief Deserializes a JSON value into a Literal. +/// +/// \param json A JSON value representing a literal. There is a discrepancy with the Java +/// implementation. In Java, a schema is taken as a parameter even though the value passed +/// is always null. It could come in handy if we decide to do binding at deserialization +/// time. +/// \return The deserialized Literal or an error. +ICEBERG_EXPORT Result LiteralFromJson(const nlohmann::json& json); + +/// \brief Serializes an UnboundPredicate into its JSON representation. +/// +/// \param pred The unbound predicate to serialize +/// \return A JSON object representing the predicate, or an error +ICEBERG_EXPORT Result ToJson(const UnboundPredicate& pred); + +/// \brief Serializes a BoundReference into its JSON representation (field name string). +ICEBERG_EXPORT Result ToJson(const BoundReference& ref); + +/// \brief Serializes a BoundTransform into its JSON representation.
+ICEBERG_EXPORT Result ToJson(const BoundTransform& transform); + +/// \brief Serializes a BoundPredicate into its JSON representation. +ICEBERG_EXPORT Result ToJson(const BoundPredicate& pred); + +/// \brief Deserializes a JSON object into an UnboundPredicate. +/// +/// \param json A JSON object representing an unbound predicate +/// \return A pointer to the deserialized UnboundPredicate or an error +ICEBERG_EXPORT Result> UnboundPredicateFromJson( + const nlohmann::json& json); + +/// \brief Serializes a Term into its JSON representation. +/// +/// \param term The term to serialize (a bound or unbound reference or transform) +/// \return A JSON value representing the term, or an error +ICEBERG_EXPORT Result ToJson(const Term& term); /// Check if an operation is a unary predicate ICEBERG_EXPORT bool IsUnaryOperation(Expression::Operation op); diff --git a/src/iceberg/expression/predicate.h b/src/iceberg/expression/predicate.h index cdd3d1f52..f54f02938 100644 --- a/src/iceberg/expression/predicate.h +++ b/src/iceberg/expression/predicate.h @@ -76,6 +76,12 @@ class ICEBERG_EXPORT UnboundPredicate : public virtual Expression, bool is_unbound_predicate() const override { return true; } + /// \brief Returns the term of this predicate as a base Term reference. + virtual const Term& predicate_term() const = 0; + + /// \brief Returns the literals of this predicate.
+ virtual std::span literals() const = 0; + protected: UnboundPredicate() = default; }; @@ -130,7 +136,9 @@ class ICEBERG_EXPORT UnboundPredicateImpl : public UnboundPredicate, Result> Negate() const override; - std::span literals() const { return values_; } + const Term& predicate_term() const override { return *BASE::term(); } + + std::span literals() const override { return values_; } private: UnboundPredicateImpl(Expression::Operation op, std::shared_ptr> term); diff --git a/src/iceberg/test/expression_json_test.cc b/src/iceberg/test/expression_json_test.cc index dd3ac5e3e..db0426825 100644 --- a/src/iceberg/test/expression_json_test.cc +++ b/src/iceberg/test/expression_json_test.cc @@ -18,6 +18,7 @@ */ #include +#include #include #include @@ -30,37 +31,373 @@ #include "iceberg/expression/literal.h" #include "iceberg/expression/predicate.h" #include "iceberg/expression/term.h" +#include "iceberg/schema.h" #include "iceberg/test/matchers.h" +#include "iceberg/transform.h" +#include "iceberg/type.h" +#include "iceberg/util/uuid.h" namespace iceberg { -// Test boolean constant expressions -TEST(ExpressionJsonTest, CheckBooleanExpression) { - auto checkBoolean = [](std::shared_ptr expr, bool value) { - auto json = ToJson(*expr); - EXPECT_TRUE(json.is_boolean()); - EXPECT_EQ(json.get(), value); - - auto result = ExpressionFromJson(json); - ASSERT_THAT(result, IsOk()); - if (value) { - EXPECT_EQ(result.value()->op(), Expression::Operation::kTrue); - } else { - EXPECT_EQ(result.value()->op(), Expression::Operation::kFalse); - } - }; - checkBoolean(True::Instance(), true); - checkBoolean(False::Instance(), false); +struct ExpressionJsonRoundTripParam { + std::string name; + nlohmann::json json; + Expression::Operation expected_op; +}; + +class ExpressionJsonRoundTripTest + : public ::testing::TestWithParam {}; + +TEST_P(ExpressionJsonRoundTripTest, RoundTrip) { + const auto& param = GetParam(); + ICEBERG_UNWRAP_OR_FAIL(auto expr, ExpressionFromJson(param.json)); + 
EXPECT_EQ(expr->op(), param.expected_op); + ICEBERG_UNWRAP_OR_FAIL(auto round_trip, ToJson(*expr)); + EXPECT_EQ(round_trip, param.json); +} + +INSTANTIATE_TEST_SUITE_P( + ExpressionJsonTest, ExpressionJsonRoundTripTest, + ::testing::Values( + ExpressionJsonRoundTripParam{"BooleanTrue", true, Expression::Operation::kTrue}, + ExpressionJsonRoundTripParam{"BooleanFalse", false, + Expression::Operation::kFalse}, + ExpressionJsonRoundTripParam{"UnaryIsNull", + {{"type", "is-null"}, {"term", "col"}}, + Expression::Operation::kIsNull}, + ExpressionJsonRoundTripParam{"LiteralGt", + {{"type", "gt"}, {"term", "age"}, {"value", 21}}, + Expression::Operation::kGt}, + ExpressionJsonRoundTripParam{ + "SetIn", + {{"type", "in"}, + {"term", "status"}, + {"values", nlohmann::json::array({"active", "pending"})}}, + Expression::Operation::kIn}, + ExpressionJsonRoundTripParam{ + "AndExpression", + {{"type", "and"}, + {"left", {{"type", "gt"}, {"term", "age"}, {"value", 18}}}, + {"right", {{"type", "lt"}, {"term", "age"}, {"value", 65}}}}, + Expression::Operation::kAnd}, + ExpressionJsonRoundTripParam{ + "NotExpression", + {{"type", "not"}, {"child", {{"type", "is-null"}, {"term", "name"}}}}, + Expression::Operation::kNot}, + ExpressionJsonRoundTripParam{ + "TransformDay", + {{"type", "eq"}, + {"term", {{"type", "transform"}, {"transform", "day"}, {"term", "ts"}}}, + {"value", 19738}}, + Expression::Operation::kEq}, + ExpressionJsonRoundTripParam{ + "TransformYear", + {{"type", "gt"}, + {"term", + {{"type", "transform"}, {"transform", "year"}, {"term", "timestamp_col"}}}, + {"value", 2020}}, + Expression::Operation::kGt}, + ExpressionJsonRoundTripParam{ + "TransformTruncate", + {{"type", "lt"}, + {"term", + {{"type", "transform"}, {"transform", "truncate[4]"}, {"term", "col"}}}, + {"value", 100}}, + Expression::Operation::kLt}, + ExpressionJsonRoundTripParam{ + "LiteralNotEq", + {{"type", "not-eq"}, {"term", "status"}, {"value", "closed"}}, + Expression::Operation::kNotEq}, + 
ExpressionJsonRoundTripParam{ + "LiteralLtEq", + {{"type", "lt-eq"}, {"term", "price"}, {"value", 100}}, + Expression::Operation::kLtEq}, + ExpressionJsonRoundTripParam{ + "LiteralGtEq", + {{"type", "gt-eq"}, {"term", "quantity"}, {"value", 1}}, + Expression::Operation::kGtEq}, + ExpressionJsonRoundTripParam{ + "SetNotIn", + {{"type", "not-in"}, + {"term", "category"}, + {"values", nlohmann::json::array({"archived", "deleted"})}}, + Expression::Operation::kNotIn}, + ExpressionJsonRoundTripParam{"UnaryNotNan", + {{"type", "not-nan"}, {"term", "score"}}, + Expression::Operation::kNotNan}, + ExpressionJsonRoundTripParam{ + "LiteralStartsWith", + {{"type", "starts-with"}, {"term", "name"}, {"value", "prefix"}}, + Expression::Operation::kStartsWith}, + ExpressionJsonRoundTripParam{ + "LiteralNotStartsWith", + {{"type", "not-starts-with"}, {"term", "name"}, {"value", "bad"}}, + Expression::Operation::kNotStartsWith}, + ExpressionJsonRoundTripParam{ + "OrExpression", + {{"type", "or"}, + {"left", {{"type", "lt"}, {"term", "price"}, {"value", 50}}}, + {"right", {{"type", "not-null"}, {"term", "discount"}}}}, + Expression::Operation::kOr}, + ExpressionJsonRoundTripParam{ + "NestedWithDecimals", + {{"type", "or"}, + {"left", + {{"type", "and"}, + {"left", + {{"type", "in"}, + {"term", "price"}, + {"values", nlohmann::json::array({3.14, 2.72})}}}, + {"right", {{"type", "eq"}, {"term", "currency"}, {"value", "USD"}}}}}, + {"right", {{"type", "is-nan"}, {"term", "discount"}}}}, + Expression::Operation::kOr}, + ExpressionJsonRoundTripParam{ + "FixedBinaryInPredicate", + {{"type", "eq"}, {"term", "col"}, {"value", "010203"}}, + Expression::Operation::kEq}, + ExpressionJsonRoundTripParam{"ScaleDecimalInSet", + {{"type", "in"}, + {"term", "amount"}, + {"values", nlohmann::json::array({"3.14E+4"})}}, + Expression::Operation::kIn}), + [](const ::testing::TestParamInfo& info) { + return info.param.name; + }); + +// -- Object wrapper normalization tests -- + +TEST(ExpressionJsonTest, 
PredicateWithObjectLiteral) { + nlohmann::json input = {{"type", "lt-eq"}, + {"term", "col"}, + {"value", {{"type", "literal"}, {"value", 50}}}}; + nlohmann::json expected = {{"type", "lt-eq"}, {"term", "col"}, {"value", 50}}; + ICEBERG_UNWRAP_OR_FAIL(auto expr, ExpressionFromJson(input)); + ICEBERG_UNWRAP_OR_FAIL(auto result, ToJson(*expr)); + EXPECT_EQ(result, expected); +} + +TEST(ExpressionJsonTest, PredicateWithObjectReference) { + nlohmann::json input = {{"type", "lt-eq"}, + {"term", {{"type", "reference"}, {"term", "col"}}}, + {"value", 50}}; + nlohmann::json expected = {{"type", "lt-eq"}, {"term", "col"}, {"value", 50}}; + ICEBERG_UNWRAP_OR_FAIL(auto expr, ExpressionFromJson(input)); + ICEBERG_UNWRAP_OR_FAIL(auto result2, ToJson(*expr)); + EXPECT_EQ(result2, expected); +} + +// -- Parameterized invalid expression tests -- + +struct InvalidExpressionParam { + std::string name; + nlohmann::json json; + std::string expected_error_substr; +}; + +class InvalidExpressionTest : public ::testing::TestWithParam {}; + +TEST_P(InvalidExpressionTest, ReturnsError) { + const auto& param = GetParam(); + auto result = ExpressionFromJson(param.json); + EXPECT_THAT(result, HasErrorMessage(param.expected_error_substr)); +} + +INSTANTIATE_TEST_SUITE_P( + ExpressionJsonTest, InvalidExpressionTest, + ::testing::Values( + InvalidExpressionParam{"NotBooleanOrObject", 42, "an object with a 'type'"}, + InvalidExpressionParam{"UnknownOperationType", + {{"type", "illegal"}, {"term", "col"}}, + "Unknown expression type"}, + InvalidExpressionParam{ + "AndMissingLeft", + {{"type", "and"}, {"right", {{"type", "is-null"}, {"term", "col"}}}}, + "missing 'left' or 'right'"}, + InvalidExpressionParam{ + "OrMissingRight", + {{"type", "or"}, {"left", {{"type", "is-null"}, {"term", "col"}}}}, + "missing 'left' or 'right'"}, + InvalidExpressionParam{"NotMissingChild", {{"type", "not"}}, "missing 'child'"}, + InvalidExpressionParam{"UnaryWithSpuriousValue", + {{"type", "not-nan"}, {"term", 
"col"}, {"value", 42}}, + "invalid 'value' field"}, + InvalidExpressionParam{"UnaryWithSpuriousValues", + {{"type", "is-nan"}, + {"term", "col"}, + {"values", nlohmann::json::array({1, 2})}}, + "invalid 'values' field"}, + InvalidExpressionParam{"NumericTerm", + {{"type", "lt"}, {"term", 23}, {"value", 10}}, + "Expected string for named reference"}, + InvalidExpressionParam{"SetMissingValues", + {{"type", "in"}, {"term", "col"}, {"value", 42}}, + "values"}, + InvalidExpressionParam{ + "LiteralMissingValue", {{"type", "gt"}, {"term", "col"}}, "value"}), + [](const ::testing::TestParamInfo& info) { + return info.param.name; + }); + +struct BooleanStringParam { + std::string name; + std::string json_value; + Expression::Operation expected_op; +}; + +class BooleanStringDeserializationTest + : public ::testing::TestWithParam {}; + +TEST_P(BooleanStringDeserializationTest, ParsesBooleanStrings) { + const auto& param = GetParam(); + ICEBERG_UNWRAP_OR_FAIL(auto expr, ExpressionFromJson(nlohmann::json(param.json_value))); + EXPECT_EQ(expr->op(), param.expected_op); } -TEST(ExpressionJsonTest, OperationTypeTests) { - EXPECT_EQ(OperationTypeFromJson("true"), Expression::Operation::kTrue); - EXPECT_EQ("true", ToJson(Expression::Operation::kTrue)); - EXPECT_TRUE(IsSetOperation(Expression::Operation::kIn)); - EXPECT_FALSE(IsSetOperation(Expression::Operation::kTrue)); +INSTANTIATE_TEST_SUITE_P( + ExpressionJsonTest, BooleanStringDeserializationTest, + ::testing::Values( + BooleanStringParam{"LowerTrue", "true", Expression::Operation::kTrue}, + BooleanStringParam{"LowerFalse", "false", Expression::Operation::kFalse}, + BooleanStringParam{"UpperTrue", "TRuE", Expression::Operation::kTrue}), + [](const ::testing::TestParamInfo& info) { + return info.param.name; + }); + +// -- Bound predicate ToJson tests -- + +struct BoundPredicateToJsonParam { + std::string name; + std::shared_ptr pred; + nlohmann::json expected_json; +}; + +class BoundPredicateToJsonTest + : public 
::testing::TestWithParam { + protected: + static void SetUpTestSuite() { + schema_ = std::make_shared( + std::vector{SchemaField::MakeRequired(1, "id", int64()), + SchemaField::MakeOptional(2, "name", string()), + SchemaField::MakeRequired(3, "age", int32()), + SchemaField::MakeOptional(4, "salary", float64())}, + /*schema_id=*/0); + } + static std::shared_ptr schema_; +}; + +std::shared_ptr BoundPredicateToJsonTest::schema_; - EXPECT_TRUE(IsUnaryOperation(Expression::Operation::kIsNull)); - EXPECT_FALSE(IsUnaryOperation(Expression::Operation::kTrue)); +TEST_P(BoundPredicateToJsonTest, ToJson) { + const auto& param = GetParam(); + ICEBERG_UNWRAP_OR_FAIL(auto bound, param.pred->Bind(*schema_, /*case_sensitive=*/true)); + ICEBERG_UNWRAP_OR_FAIL(auto json, ToJson(*bound)); + EXPECT_EQ(json, param.expected_json); } +INSTANTIATE_TEST_SUITE_P( + ExpressionJsonTest, BoundPredicateToJsonTest, + ::testing::Values( + BoundPredicateToJsonParam{"UnaryIsNull", + Expressions::IsNull("name"), + {{"type", "is-null"}, {"term", "name"}}}, + BoundPredicateToJsonParam{"UnaryNotNull", + Expressions::NotNull("name"), + {{"type", "not-null"}, {"term", "name"}}}, + BoundPredicateToJsonParam{"UnaryIsNan", + Expressions::IsNaN("salary"), + {{"type", "is-nan"}, {"term", "salary"}}}, + BoundPredicateToJsonParam{"UnaryNotNan", + Expressions::NotNaN("salary"), + {{"type", "not-nan"}, {"term", "salary"}}}, + BoundPredicateToJsonParam{"LiteralEq", + Expressions::Equal("age", Literal::Int(25)), + {{"type", "eq"}, {"term", "age"}, {"value", 25}}}, + BoundPredicateToJsonParam{"LiteralLt", + Expressions::LessThan("age", Literal::Int(18)), + {{"type", "lt"}, {"term", "age"}, {"value", 18}}}, + BoundPredicateToJsonParam{ + "LiteralGtEq", + Expressions::GreaterThanOrEqual("age", Literal::Int(21)), + {{"type", "gt-eq"}, {"term", "age"}, {"value", 21}}}, + BoundPredicateToJsonParam{ + "LiteralStartsWith", + Expressions::StartsWith("name", "prefix"), + {{"type", "starts-with"}, {"term", "name"}, {"value", 
"prefix"}}}, + BoundPredicateToJsonParam{"LiteralNotEq", + Expressions::NotEqual("age", Literal::Int(7)), + {{"type", "not-eq"}, {"term", "age"}, {"value", 7}}}), + [](const ::testing::TestParamInfo& info) { + return info.param.name; + }); + +// -- Set operation round-trip tests -- +// Tests the full cycle: bind UnboundPredicate → serialize BoundPredicate to JSON +// → deserialize to UnboundPredicate → compare op, term, and values. + +struct SetOpRoundTripParam { + std::string name; + std::shared_ptr pred; + Expression::Operation expected_op; + std::string expected_term; + std::vector expected_values; +}; + +class SetOpRoundTripTest : public ::testing::TestWithParam { + protected: + static void SetUpTestSuite() { + schema_ = std::make_shared( + std::vector{SchemaField::MakeRequired(1, "id", int64()), + SchemaField::MakeOptional(2, "name", string()), + SchemaField::MakeRequired(3, "age", int32()), + SchemaField::MakeOptional(4, "salary", float64())}, + /*schema_id=*/0); + } + static std::shared_ptr schema_; +}; + +std::shared_ptr SetOpRoundTripTest::schema_; + +TEST_P(SetOpRoundTripTest, RoundTrip) { + const auto& param = GetParam(); + + ICEBERG_UNWRAP_OR_FAIL(auto bound, param.pred->Bind(*schema_, /*case_sensitive=*/true)); + ICEBERG_UNWRAP_OR_FAIL(auto json, ToJson(*bound)); + ICEBERG_UNWRAP_OR_FAIL(auto unbound, UnboundPredicateFromJson(json)); + + EXPECT_EQ(unbound->op(), param.expected_op); + EXPECT_EQ(unbound->reference()->name(), param.expected_term); + std::vector got; + got.reserve(unbound->literals().size()); + for (const auto& lit : unbound->literals()) { + got.push_back(lit.ToString()); + } + + std::vector expected; + expected.reserve(param.expected_values.size()); + for (const auto& lit : param.expected_values) { + expected.push_back(lit.ToString()); + } + + EXPECT_THAT(got, ::testing::UnorderedElementsAreArray(expected)); +} + +INSTANTIATE_TEST_SUITE_P( + ExpressionJsonTest, SetOpRoundTripTest, + ::testing::Values( + SetOpRoundTripParam{ + "In", + 
Expressions::In("age", {Literal::Int(1), Literal::Int(2), Literal::Int(3)}), + Expression::Operation::kIn, + "age", + {Literal::Int(1), Literal::Int(2), Literal::Int(3)}}, + SetOpRoundTripParam{ + "NotIn", + Expressions::NotIn("age", {Literal::Int(5), Literal::Int(10)}), + Expression::Operation::kNotIn, + "age", + {Literal::Int(5), Literal::Int(10)}}), + [](const ::testing::TestParamInfo& info) { + return info.param.name; + }); + } // namespace iceberg diff --git a/src/iceberg/type_fwd.h b/src/iceberg/type_fwd.h index e97de0ac5..491775ee3 100644 --- a/src/iceberg/type_fwd.h +++ b/src/iceberg/type_fwd.h @@ -129,8 +129,11 @@ class BoundReference; class BoundTransform; class Expression; class Literal; +class NamedReference; class Term; +class Transform; class UnboundPredicate; +class UnboundTransform; /// \brief Evaluator. class Evaluator; From 0ed2c5e054c194b5b4f62e3990128d0e3ccba7f2 Mon Sep 17 00:00:00 2001 From: Innocent Date: Sun, 1 Feb 2026 17:22:16 -0700 Subject: [PATCH 2/3] feat: add json serde for expressions --- src/iceberg/expression/json_serde_internal.h | 58 ++++++++++++++++++++ src/iceberg/test/expression_json_test.cc | 2 - 2 files changed, 58 insertions(+), 2 deletions(-) diff --git a/src/iceberg/expression/json_serde_internal.h b/src/iceberg/expression/json_serde_internal.h index 323fdca22..e028a9ca4 100644 --- a/src/iceberg/expression/json_serde_internal.h +++ b/src/iceberg/expression/json_serde_internal.h @@ -19,6 +19,7 @@ #pragma once + #include #include "iceberg/expression/expression.h" @@ -126,6 +127,63 @@ ICEBERG_EXPORT Result> UnboundPredicateFromJso /// \return A JSON value representing the term, or an error ICEBERG_EXPORT Result ToJson(const Term& term); +/// \brief Deserializes a JSON object into a NamedReference. 
+/// +/// \param json A JSON object representing a named reference +/// \return A shared pointer to the deserialized NamedReference or an error +ICEBERG_EXPORT Result> NamedReferenceFromJson( + const nlohmann::json& json); + +/// \brief Serializes a NamedReference into its JSON representation. +/// +/// \param ref The named reference to serialize +/// \return A JSON object representing the named reference +ICEBERG_EXPORT nlohmann::json ToJson(const NamedReference& ref); + +/// \brief Serializes an UnboundTransform into its JSON representation. +/// +/// \param transform The unbound transform to serialize +/// \return A JSON object representing the unbound transform +ICEBERG_EXPORT nlohmann::json ToJson(const UnboundTransform& transform); + +/// \brief Deserializes a JSON object into an UnboundTransform. +/// +/// \param json A JSON object representing an unbound transform +/// \return A shared pointer to the deserialized UnboundTransform or an error +ICEBERG_EXPORT Result> UnboundTransformFromJson( + const nlohmann::json& json); + +/// \brief Serializes a Literal into its JSON representation. +/// +/// \param literal The literal to serialize +/// \return A JSON value representing the literal +ICEBERG_EXPORT nlohmann::json ToJson(const Literal& literal); + +/// \brief Deserializes a JSON value into a Literal. +/// +/// \param json A JSON value representing a literal +/// \return The deserialized Literal or an error +ICEBERG_EXPORT Result LiteralFromJson(const nlohmann::json& json); + +/// \brief Serializes an UnboundPredicate into its JSON representation. +/// +/// \param pred The unbound predicate to serialize +/// \return A JSON object representing the predicate +ICEBERG_EXPORT nlohmann::json ToJson(const UnboundPredicate& pred); + +/// \brief Deserializes a JSON object into an UnboundPredicate. 
+/// +/// \param json A JSON object representing an unbound predicate +/// \return A shared pointer to the deserialized UnboundPredicate or an error +ICEBERG_EXPORT Result> UnboundPredicateFromJson( + const nlohmann::json& json); + +/// \brief Serializes a Term into its JSON representation. +/// +/// \param term The term to serialize (NamedReference or UnboundTransform) +/// \return A JSON value representing the term +ICEBERG_EXPORT nlohmann::json TermToJson(const Term& term); + /// Check if an operation is a unary predicate ICEBERG_EXPORT bool IsUnaryOperation(Expression::Operation op); diff --git a/src/iceberg/test/expression_json_test.cc b/src/iceberg/test/expression_json_test.cc index db0426825..933cb5b7a 100644 --- a/src/iceberg/test/expression_json_test.cc +++ b/src/iceberg/test/expression_json_test.cc @@ -26,10 +26,8 @@ #include #include "iceberg/expression/expression.h" -#include "iceberg/expression/expressions.h" #include "iceberg/expression/json_serde_internal.h" #include "iceberg/expression/literal.h" -#include "iceberg/expression/predicate.h" #include "iceberg/expression/term.h" #include "iceberg/schema.h" #include "iceberg/test/matchers.h" From d8e2630280a0ea580657e4f257df4574eb96cd99 Mon Sep 17 00:00:00 2001 From: Innocent Date: Wed, 11 Feb 2026 22:05:39 -0700 Subject: [PATCH 3/3] feat: correctly bind literals after serde --- src/iceberg/expression/json_serde_internal.h | 58 ----------- src/iceberg/expression/literal.cc | 68 +++++++++++-- src/iceberg/test/expression_json_test.cc | 2 + src/iceberg/test/literal_test.cc | 31 ++++++ src/iceberg/test/transform_util_test.cc | 95 ++++++++++++++++++ src/iceberg/util/transform_util.cc | 100 +++++++++++++++++++ src/iceberg/util/transform_util.h | 31 ++++++ 7 files changed, 321 insertions(+), 64 deletions(-) diff --git a/src/iceberg/expression/json_serde_internal.h b/src/iceberg/expression/json_serde_internal.h index e028a9ca4..323fdca22 100644 --- a/src/iceberg/expression/json_serde_internal.h +++ 
b/src/iceberg/expression/json_serde_internal.h @@ -19,7 +19,6 @@ #pragma once - #include #include "iceberg/expression/expression.h" @@ -127,63 +126,6 @@ ICEBERG_EXPORT Result> UnboundPredicateFromJso /// \return A JSON value representing the term, or an error ICEBERG_EXPORT Result ToJson(const Term& term); -/// \brief Deserializes a JSON object into a NamedReference. -/// -/// \param json A JSON object representing a named reference -/// \return A shared pointer to the deserialized NamedReference or an error -ICEBERG_EXPORT Result> NamedReferenceFromJson( - const nlohmann::json& json); - -/// \brief Serializes a NamedReference into its JSON representation. -/// -/// \param ref The named reference to serialize -/// \return A JSON object representing the named reference -ICEBERG_EXPORT nlohmann::json ToJson(const NamedReference& ref); - -/// \brief Serializes an UnboundTransform into its JSON representation. -/// -/// \param transform The unbound transform to serialize -/// \return A JSON object representing the unbound transform -ICEBERG_EXPORT nlohmann::json ToJson(const UnboundTransform& transform); - -/// \brief Deserializes a JSON object into an UnboundTransform. -/// -/// \param json A JSON object representing an unbound transform -/// \return A shared pointer to the deserialized UnboundTransform or an error -ICEBERG_EXPORT Result> UnboundTransformFromJson( - const nlohmann::json& json); - -/// \brief Serializes a Literal into its JSON representation. -/// -/// \param literal The literal to serialize -/// \return A JSON value representing the literal -ICEBERG_EXPORT nlohmann::json ToJson(const Literal& literal); - -/// \brief Deserializes a JSON value into a Literal. -/// -/// \param json A JSON value representing a literal -/// \return The deserialized Literal or an error -ICEBERG_EXPORT Result LiteralFromJson(const nlohmann::json& json); - -/// \brief Serializes an UnboundPredicate into its JSON representation. 
-/// -/// \param pred The unbound predicate to serialize -/// \return A JSON object representing the predicate -ICEBERG_EXPORT nlohmann::json ToJson(const UnboundPredicate& pred); - -/// \brief Deserializes a JSON object into an UnboundPredicate. -/// -/// \param json A JSON object representing an unbound predicate -/// \return A shared pointer to the deserialized UnboundPredicate or an error -ICEBERG_EXPORT Result> UnboundPredicateFromJson( - const nlohmann::json& json); - -/// \brief Serializes a Term into its JSON representation. -/// -/// \param term The term to serialize (NamedReference or UnboundTransform) -/// \return A JSON value representing the term -ICEBERG_EXPORT nlohmann::json TermToJson(const Term& term); - /// Check if an operation is a unary predicate ICEBERG_EXPORT bool IsUnaryOperation(Expression::Operation op); diff --git a/src/iceberg/expression/literal.cc b/src/iceberg/expression/literal.cc index 88bafd78d..22617d32a 100644 --- a/src/iceberg/expression/literal.cc +++ b/src/iceberg/expression/literal.cc @@ -23,14 +23,46 @@ #include #include #include +#include +#include "iceberg/type.h" #include "iceberg/util/checked_cast.h" #include "iceberg/util/conversions.h" +#include "iceberg/util/decimal.h" #include "iceberg/util/macros.h" #include "iceberg/util/temporal_util.h" +#include "iceberg/util/transform_util.h" namespace iceberg { +namespace { +Result> HexStringToBytes(std::string_view hex) { + if (hex.length() % 2 != 0) { + return InvalidArgument("Hex string must have an even length"); + } + + std::vector bytes; + bytes.reserve(hex.length() / 2); + + auto to_nibble = [](char c) -> uint8_t { + if (c >= '0' && c <= '9') return c - '0'; + if (c >= 'a' && c <= 'f') return c - 'a' + 10; + if (c >= 'A' && c <= 'F') return c - 'A' + 10; + throw std::invalid_argument("Invalid hex character"); + }; + + for (size_t i = 0; i < hex.length(); i += 2) { + try { + bytes.push_back( + static_cast((to_nibble(hex[i]) << 4) | to_nibble(hex[i + 1]))); + } catch (const 
std::invalid_argument& e) { + return InvalidArgument("Invalid hex character in string: {}", e.what()); + } + } + return bytes; +} +} // namespace + /// \brief LiteralCaster handles type casting operations for Literal. /// This is an internal implementation class. class LiteralCaster { @@ -193,12 +225,36 @@ Result LiteralCaster::CastFromString( ICEBERG_ASSIGN_OR_RAISE(auto uuid, Uuid::FromString(str_val)); return Literal::UUID(uuid); } - case TypeId::kDate: - case TypeId::kTime: - case TypeId::kTimestamp: - case TypeId::kTimestampTz: - return NotImplemented("Cast from String to {} is not implemented yet", - target_type->ToString()); + case TypeId::kDate: { + ICEBERG_ASSIGN_OR_RAISE(auto days, TransformUtil::ParseDay(str_val)); + return Literal::Date(days); + } + case TypeId::kTime: { + ICEBERG_ASSIGN_OR_RAISE(auto micros, TransformUtil::ParseTime(str_val)); + return Literal::Time(micros); + } + case TypeId::kTimestamp: { + ICEBERG_ASSIGN_OR_RAISE(auto micros, TransformUtil::ParseTimestamp(str_val)); + return Literal::Timestamp(micros); + } + case TypeId::kTimestampTz: { + ICEBERG_ASSIGN_OR_RAISE(auto micros, + TransformUtil::ParseTimestampWithZone(str_val)); + return Literal::TimestampTz(micros); + } + case TypeId::kBinary: { + ICEBERG_ASSIGN_OR_RAISE(auto bytes, HexStringToBytes(str_val)); + return Literal::Binary(std::move(bytes)); + } + case TypeId::kFixed: { + ICEBERG_ASSIGN_OR_RAISE(auto bytes, HexStringToBytes(str_val)); + return Literal::Fixed(std::move(bytes)); + } + case TypeId::kDecimal: { + const auto& dec_type = internal::checked_cast(*target_type); + ICEBERG_ASSIGN_OR_RAISE(auto dec, Decimal::FromString(str_val)); + return Literal::Decimal(dec.value(), dec_type.precision(), dec_type.scale()); + } default: return NotSupported("Cast from String to {} is not supported", target_type->ToString()); diff --git a/src/iceberg/test/expression_json_test.cc b/src/iceberg/test/expression_json_test.cc index 933cb5b7a..db0426825 100644 --- 
a/src/iceberg/test/expression_json_test.cc +++ b/src/iceberg/test/expression_json_test.cc @@ -26,8 +26,10 @@ #include #include "iceberg/expression/expression.h" +#include "iceberg/expression/expressions.h" #include "iceberg/expression/json_serde_internal.h" #include "iceberg/expression/literal.h" +#include "iceberg/expression/predicate.h" #include "iceberg/expression/term.h" #include "iceberg/schema.h" #include "iceberg/test/matchers.h" diff --git a/src/iceberg/test/literal_test.cc b/src/iceberg/test/literal_test.cc index 01a7a7ce6..97724aad9 100644 --- a/src/iceberg/test/literal_test.cc +++ b/src/iceberg/test/literal_test.cc @@ -787,6 +787,37 @@ INSTANTIATE_TEST_SUITE_P( .target_type = uuid(), .expected_literal = Literal::UUID( Uuid::FromString("123e4567-e89b-12d3-a456-426614174000").value())}, + CastLiteralTestParam{.test_name = "StringToDate", + .source_literal = Literal::String("2024-01-16"), + .target_type = date(), + .expected_literal = Literal::Date(19738)}, + CastLiteralTestParam{.test_name = "StringToTime", + .source_literal = Literal::String("14:30"), + .target_type = time(), + .expected_literal = Literal::Time(52200000000LL)}, + CastLiteralTestParam{.test_name = "StringToTimestamp", + .source_literal = Literal::String("2026-01-01T00:00:01.500"), + .target_type = timestamp(), + .expected_literal = Literal::Timestamp(1767225601500000L)}, + CastLiteralTestParam{ + .test_name = "StringToTimestampTz", + .source_literal = Literal::String("2026-01-01T00:00:01.500+00:00"), + .target_type = timestamp_tz(), + .expected_literal = Literal::TimestampTz(1767225601500000L)}, + CastLiteralTestParam{.test_name = "StringToBinary", + .source_literal = Literal::String("010203FF"), + .target_type = binary(), + .expected_literal = Literal::Binary(std::vector{ + 0x01, 0x02, 0x03, 0xFF})}, + CastLiteralTestParam{.test_name = "StringToFixed", + .source_literal = Literal::String("01020304"), + .target_type = fixed(4), + .expected_literal = Literal::Fixed(std::vector{ + 0x01, 
0x02, 0x03, 0x04})}, + CastLiteralTestParam{.test_name = "StringToDecimal", + .source_literal = Literal::String("1234.56"), + .target_type = decimal(6, 2), + .expected_literal = Literal::Decimal(123456, 6, 2)}, // Same type cast test CastLiteralTestParam{.test_name = "IntToInt", .source_literal = Literal::Int(42), diff --git a/src/iceberg/test/transform_util_test.cc b/src/iceberg/test/transform_util_test.cc index 76f6824b3..48a455973 100644 --- a/src/iceberg/test/transform_util_test.cc +++ b/src/iceberg/test/transform_util_test.cc @@ -21,6 +21,8 @@ #include +#include "iceberg/test/matchers.h" + namespace iceberg { TEST(TransformUtilTest, HumanYear) { @@ -157,4 +159,97 @@ TEST(TransformUtilTest, Base64Encode) { EXPECT_EQ("AA==", TransformUtil::Base64Encode({"\x00", 1})); } +struct ParseRoundTripParam { + std::string name; + std::string str; + int64_t value; + enum Kind { kDay, kTime, kTimestamp, kTimestampTz } kind; +}; + +class ParseRoundTripTest : public ::testing::TestWithParam {}; + +TEST_P(ParseRoundTripTest, RoundTrip) { + const auto& param = GetParam(); + switch (param.kind) { + case ParseRoundTripParam::kDay: { + EXPECT_EQ(TransformUtil::HumanDay(static_cast(param.value)), param.str); + ICEBERG_UNWRAP_OR_FAIL(auto parsed, TransformUtil::ParseDay(param.str)); + EXPECT_EQ(parsed, static_cast(param.value)); + break; + } + case ParseRoundTripParam::kTime: { + EXPECT_EQ(TransformUtil::HumanTime(param.value), param.str); + ICEBERG_UNWRAP_OR_FAIL(auto parsed, TransformUtil::ParseTime(param.str)); + EXPECT_EQ(parsed, param.value); + break; + } + case ParseRoundTripParam::kTimestamp: { + EXPECT_EQ(TransformUtil::HumanTimestamp(param.value), param.str); + ICEBERG_UNWRAP_OR_FAIL(auto parsed, TransformUtil::ParseTimestamp(param.str)); + EXPECT_EQ(parsed, param.value); + break; + } + case ParseRoundTripParam::kTimestampTz: { + EXPECT_EQ(TransformUtil::HumanTimestampWithZone(param.value), param.str); + ICEBERG_UNWRAP_OR_FAIL(auto parsed, + 
TransformUtil::ParseTimestampWithZone(param.str)); + EXPECT_EQ(parsed, param.value); + break; + } + } +} + +INSTANTIATE_TEST_SUITE_P( + TransformUtilTest, ParseRoundTripTest, + ::testing::Values( + // Day round-trips + ParseRoundTripParam{"DayEpoch", "1970-01-01", 0, ParseRoundTripParam::kDay}, + ParseRoundTripParam{"DayNext", "1970-01-02", 1, ParseRoundTripParam::kDay}, + ParseRoundTripParam{"DayBeforeEpoch", "1969-12-31", -1, + ParseRoundTripParam::kDay}, + ParseRoundTripParam{"DayYear999", "0999-12-31", -354286, + ParseRoundTripParam::kDay}, + ParseRoundTripParam{"DayNonLeap", "1971-01-01", 365, ParseRoundTripParam::kDay}, + ParseRoundTripParam{"DayY2K", "2000-01-01", 10957, ParseRoundTripParam::kDay}, + ParseRoundTripParam{"Day2026", "2026-01-01", 20454, ParseRoundTripParam::kDay}, + // Time round-trips + ParseRoundTripParam{"TimeMidnight", "00:00", 0, ParseRoundTripParam::kTime}, + ParseRoundTripParam{"TimeOneSec", "00:00:01", 1000000, + ParseRoundTripParam::kTime}, + ParseRoundTripParam{"TimeMillis", "00:00:01.500", 1500000, + ParseRoundTripParam::kTime}, + ParseRoundTripParam{"TimeOneMillis", "00:00:01.001", 1001000, + ParseRoundTripParam::kTime}, + ParseRoundTripParam{"TimeMicros", "00:00:01.000001", 1000001, + ParseRoundTripParam::kTime}, + ParseRoundTripParam{"TimeHourMinSec", "01:02:03", 3723000000, + ParseRoundTripParam::kTime}, + ParseRoundTripParam{"TimeEndOfDay", "23:59:59", 86399000000, + ParseRoundTripParam::kTime}, + // Timestamp round-trips + ParseRoundTripParam{"TimestampEpoch", "1970-01-01T00:00:00", 0, + ParseRoundTripParam::kTimestamp}, + ParseRoundTripParam{"TimestampOneSec", "1970-01-01T00:00:01", 1000000, + ParseRoundTripParam::kTimestamp}, + ParseRoundTripParam{"TimestampMillis", "2026-01-01T00:00:01.500", + 1767225601500000L, ParseRoundTripParam::kTimestamp}, + ParseRoundTripParam{"TimestampOneMillis", "2026-01-01T00:00:01.001", + 1767225601001000L, ParseRoundTripParam::kTimestamp}, + ParseRoundTripParam{"TimestampMicros", 
"2026-01-01T00:00:01.000001", + 1767225601000001L, ParseRoundTripParam::kTimestamp}, + // TimestampTz round-trips + ParseRoundTripParam{"TimestampTzEpoch", "1970-01-01T00:00:00+00:00", 0, + ParseRoundTripParam::kTimestampTz}, + ParseRoundTripParam{"TimestampTzOneSec", "1970-01-01T00:00:01+00:00", 1000000, + ParseRoundTripParam::kTimestampTz}, + ParseRoundTripParam{"TimestampTzMillis", "2026-01-01T00:00:01.500+00:00", + 1767225601500000L, ParseRoundTripParam::kTimestampTz}, + ParseRoundTripParam{"TimestampTzOneMillis", "2026-01-01T00:00:01.001+00:00", + 1767225601001000L, ParseRoundTripParam::kTimestampTz}, + ParseRoundTripParam{"TimestampTzMicros", "2026-01-01T00:00:01.000001+00:00", + 1767225601000001L, ParseRoundTripParam::kTimestampTz}), + [](const ::testing::TestParamInfo& info) { + return info.param.name; + }); + } // namespace iceberg diff --git a/src/iceberg/util/transform_util.cc b/src/iceberg/util/transform_util.cc index fe1523437..42892835f 100644 --- a/src/iceberg/util/transform_util.cc +++ b/src/iceberg/util/transform_util.cc @@ -20,14 +20,33 @@ #include "iceberg/util/transform_util.h" #include +#include #include +#include "iceberg/util/macros.h" + namespace iceberg { namespace { constexpr auto kEpochDate = std::chrono::year{1970} / std::chrono::January / 1; constexpr int64_t kMicrosPerMillis = 1'000; constexpr int64_t kMicrosPerSecond = 1'000'000; +constexpr int64_t kMicrosPerDay = 86'400'000'000LL; + +/// Parse fractional seconds (after '.') and return micros. +/// Accepts 1-6 digits, zero-padded on the right to 6 digits. 
+Result<int64_t> ParseFractionalMicros(std::string_view frac) {
+  bool valid = !frac.empty() && frac.size() <= 6;
+  int64_t micros = 0;
+  // Digits only (from_chars would admit a sign or stop at junk); absent places are 0.
+  for (size_t i = 0; valid && i < 6; ++i) {
+    const char c = i < frac.size() ? frac[i] : '0';
+    valid = c >= '0' && c <= '9';
+    micros = micros * 10 + (c - '0');
+  }
+  if (!valid) return InvalidArgument("Invalid fractional seconds: '{}'", frac);
+  return micros;
+}
 }  // namespace
 
 std::string TransformUtil::HumanYear(int32_t year_ordinal) {
@@ -92,6 +111,110 @@ std::string TransformUtil::HumanTimestampWithZone(int64_t timestamp_micros) {
   }
 }
 
+Result<int32_t> TransformUtil::ParseDay(std::string_view str) {
+  // Expected format: "yyyy-MM-dd"; a leading '-' denotes a negative year.
+  constexpr auto npos = std::string_view::npos;
+  // Parses str[begin, end) as an integer; fails unless every character is consumed,
+  // which rejects trailing junk such as "2024-01-16xyz".
+  auto parse_field = [str](size_t begin, size_t end, int32_t& out) {
+    const char* last = str.data() + end;
+    auto [ptr, ec] = std::from_chars(str.data() + begin, last, out);
+    return ec == std::errc{} && ptr == last;
+  };
+
+  // Test the length before searching so an empty string is never indexed; searching
+  // from index 1 skips a possible sign in front of the year.
+  const auto dash1 = str.size() < 10 ? npos : str.find('-', 1);
+  const auto dash2 = dash1 == npos ? npos : str.find('-', dash1 + 1);
+  int32_t year = 0, month = 0, day = 0;
+  if (dash2 == npos || !parse_field(0, dash1, year) ||
+      !parse_field(dash1 + 1, dash2, month) ||
+      !parse_field(dash2 + 1, str.size(), day)) [[unlikely]] {
+    return InvalidArgument("Invalid date string: '{}'", str);
+  }
+
+  // std::chrono calendar types hold unspecified values when built from out-of-range
+  // integers, so bound each field first; ok() then rejects dates such as Feb 30.
+  if (year < static_cast<int>(std::chrono::year::min()) ||
+      year > static_cast<int>(std::chrono::year::max()) || month < 1 || month > 12 ||
+      day < 1 || day > 31) [[unlikely]] {
+    return InvalidArgument("Invalid date: '{}'", str);
+  }
+  const auto ymd = std::chrono::year{year} /
+                   std::chrono::month{static_cast<unsigned>(month)} /
+                   std::chrono::day{static_cast<unsigned>(day)};
+  if (!ymd.ok()) [[unlikely]] {
+    return InvalidArgument("Invalid date: '{}'", str);
+  }
+
+  const auto days = std::chrono::sys_days(ymd) - std::chrono::sys_days(kEpochDate);
+  return static_cast<int32_t>(days.count());
+}
+
+Result<int64_t> TransformUtil::ParseTime(std::string_view str) {
+  // Accepted formats: "HH:mm", "HH:mm:ss" and "HH:mm:ss.f" with 1-6 fractional digits.
+  // Reads the two-digit field at pos; yields -1 if it is truncated or not numeric, so
+  // nothing outside the view is ever read.
+  auto two_digits = [str](size_t pos) -> int64_t {
+    if (str.size() < pos + 2) return -1;
+    const char hi = str[pos];
+    const char lo = str[pos + 1];
+    if (hi < '0' || hi > '9' || lo < '0' || lo > '9') return -1;
+    return (hi - '0') * 10 + (lo - '0');
+  };
+
+  // A non-negative minutes value implies size() >= 5, so str[2] is safe to read.
+  const int64_t hours = two_digits(0);
+  const int64_t minutes = two_digits(3);
+  if (hours < 0 || hours > 23 || minutes < 0 || minutes > 59 ||
+      str[2] != ':') [[unlikely]] {
+    return InvalidArgument("Invalid time string: '{}'", str);
+  }
+
+  int64_t seconds = 0;
+  int64_t frac_micros = 0;
+  if (str.size() > 5) {
+    seconds = two_digits(6);
+    if (str[5] != ':' || seconds < 0 || seconds > 59) [[unlikely]] {
+      return InvalidArgument("Invalid time string: '{}'", str);
+    }
+    if (str.size() > 8) {
+      if (str[8] != '.') [[unlikely]] {
+        return InvalidArgument("Invalid time string: '{}'", str);
+      }
+      ICEBERG_ASSIGN_OR_RAISE(frac_micros, ParseFractionalMicros(str.substr(9)));
+    }
+  }
+
+  return hours * 3'600 * kMicrosPerSecond + minutes * 60 * kMicrosPerSecond +
+         seconds * kMicrosPerSecond + frac_micros;
+}
+
+Result<int64_t> TransformUtil::ParseTimestamp(std::string_view str) {
+  // Format: "yyyy-MM-ddTHH:mm[:ss[.f]]" -- the date and time parts are validated by
+  // ParseDay and ParseTime respectively.
+  const auto t_pos = str.find('T');
+  if (t_pos == std::string_view::npos) [[unlikely]] {
+    return InvalidArgument("Invalid timestamp string (missing 'T'): '{}'", str);
+  }
+
+  ICEBERG_ASSIGN_OR_RAISE(auto days, ParseDay(str.substr(0, t_pos)));
+  ICEBERG_ASSIGN_OR_RAISE(auto time_micros, ParseTime(str.substr(t_pos + 1)));
+
+  return static_cast<int64_t>(days) * kMicrosPerDay + time_micros;
+}
+
+Result<int64_t> TransformUtil::ParseTimestampWithZone(std::string_view str) {
+  // Format: same as ParseTimestamp but with a mandatory "+00:00" (UTC) suffix.
+  constexpr std::string_view kZoneSuffix = "+00:00";
+  if (!str.ends_with(kZoneSuffix)) [[unlikely]] {
+    return InvalidArgument("Invalid timestamptz string (missing '+00:00' suffix): '{}'",
+                           str);
+  }
+  str.remove_suffix(kZoneSuffix.size());
+  return ParseTimestamp(str);
+}
+
 std::string TransformUtil::Base64Encode(std::string_view str_to_encode) {
   static constexpr std::string_view kBase64Chars =
       "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
diff --git 
a/src/iceberg/util/transform_util.h b/src/iceberg/util/transform_util.h index 7482b0dba..2c2a78bbf 100644 --- a/src/iceberg/util/transform_util.h +++ b/src/iceberg/util/transform_util.h @@ -22,6 +22,7 @@ #include #include "iceberg/iceberg_export.h" +#include "iceberg/result.h" namespace iceberg { @@ -98,6 +99,36 @@ class ICEBERG_EXPORT TransformUtil { /// \return a string representation of this timestamp. static std::string HumanTimestampWithZone(int64_t timestamp_micros); + /// \brief Parses a date string in "yyyy-MM-dd" format into days since epoch. + /// + /// \param str The date string to parse. + /// \return The number of days since 1970-01-01, or an error. + static Result ParseDay(std::string_view str); + + /// \brief Parses a time string into microseconds from midnight. + /// + /// Accepts: "HH:mm", "HH:mm:ss", "HH:mm:ss.SSS", "HH:mm:ss.SSSSSS". + /// + /// \param str The time string to parse. + /// \return The number of microseconds from midnight, or an error. + static Result ParseTime(std::string_view str); + + /// \brief Parses a timestamp string into microseconds since epoch. + /// + /// Accepts: "yyyy-MM-ddTHH:mm:ss", with optional fractional seconds (.SSS or .SSSSSS). + /// + /// \param str The timestamp string to parse. + /// \return The number of microseconds since epoch, or an error. + static Result ParseTimestamp(std::string_view str); + + /// \brief Parses a timestamp-with-zone string into microseconds since epoch. + /// + /// Accepts the same formats as ParseTimestamp, with a "+00:00" suffix. + /// + /// \param str The timestamp string to parse. + /// \return The number of microseconds since epoch, or an error. + static Result ParseTimestampWithZone(std::string_view str); + /// \brief Base64 encode a string static std::string Base64Encode(std::string_view str_to_encode); };