diff --git a/src/iceberg/expression/json_serde.cc b/src/iceberg/expression/json_serde.cc index 0fd7dd01d..b1617139f 100644 --- a/src/iceberg/expression/json_serde.cc +++ b/src/iceberg/expression/json_serde.cc @@ -17,47 +17,119 @@ * under the License. */ -#include #include #include -#include #include #include +#include "iceberg/expression/binder.h" #include "iceberg/expression/json_serde_internal.h" #include "iceberg/expression/literal.h" +#include "iceberg/expression/predicate.h" +#include "iceberg/expression/term.h" +#include "iceberg/schema.h" +#include "iceberg/transform.h" #include "iceberg/util/checked_cast.h" #include "iceberg/util/json_util_internal.h" #include "iceberg/util/macros.h" +#include "iceberg/util/transform_util.h" namespace iceberg { namespace { -// Expression type strings -constexpr std::string_view kTypeTrue = "true"; -constexpr std::string_view kTypeFalse = "false"; -constexpr std::string_view kTypeEq = "eq"; -constexpr std::string_view kTypeAnd = "and"; -constexpr std::string_view kTypeOr = "or"; -constexpr std::string_view kTypeNot = "not"; -constexpr std::string_view kTypeIn = "in"; -constexpr std::string_view kTypeNotIn = "not-in"; -constexpr std::string_view kTypeLt = "lt"; -constexpr std::string_view kTypeLtEq = "lt-eq"; -constexpr std::string_view kTypeGt = "gt"; -constexpr std::string_view kTypeGtEq = "gt-eq"; -constexpr std::string_view kTypeNotEq = "not-eq"; -constexpr std::string_view kTypeStartsWith = "starts-with"; -constexpr std::string_view kTypeNotStartsWith = "not-starts-with"; -constexpr std::string_view kTypeIsNull = "is-null"; -constexpr std::string_view kTypeNotNull = "not-null"; -constexpr std::string_view kTypeIsNan = "is-nan"; -constexpr std::string_view kTypeNotNan = "not-nan"; -constexpr std::string_view kTypeCount = "count"; -constexpr std::string_view kTypeCountNull = "count-null"; -constexpr std::string_view kTypeCountStar = "count-star"; -constexpr std::string_view kTypeMin = "min"; -constexpr std::string_view 
kTypeMax = "max"; +constexpr std::string_view kType = "type"; +constexpr std::string_view kTerm = "term"; +constexpr std::string_view kTransform = "transform"; +constexpr std::string_view kValue = "value"; +constexpr std::string_view kValues = "values"; +constexpr std::string_view kLeft = "left"; +constexpr std::string_view kRight = "right"; +constexpr std::string_view kChild = "child"; +constexpr std::string_view kTrue = "true"; +constexpr std::string_view kFalse = "false"; +constexpr std::string_view kEq = "eq"; +constexpr std::string_view kAnd = "and"; +constexpr std::string_view kOr = "or"; +constexpr std::string_view kNot = "not"; +constexpr std::string_view kIn = "in"; +constexpr std::string_view kNotIn = "not-in"; +constexpr std::string_view kLt = "lt"; +constexpr std::string_view kLtEq = "lt-eq"; +constexpr std::string_view kGt = "gt"; +constexpr std::string_view kGtEq = "gt-eq"; +constexpr std::string_view kNotEq = "not-eq"; +constexpr std::string_view kStartsWith = "starts-with"; +constexpr std::string_view kNotStartsWith = "not-starts-with"; +constexpr std::string_view kIsNull = "is-null"; +constexpr std::string_view kNotNull = "not-null"; +constexpr std::string_view kIsNan = "is-nan"; +constexpr std::string_view kNotNan = "not-nan"; +constexpr std::string_view kCount = "count"; +constexpr std::string_view kCountNull = "count-null"; +constexpr std::string_view kCountStar = "count-star"; +constexpr std::string_view kMin = "min"; +constexpr std::string_view kMax = "max"; +constexpr std::string_view kLiteral = "literal"; +constexpr std::string_view kReference = "reference"; + +/// Helper to build the transform JSON object shared by Unbound/BoundTransform +nlohmann::json MakeTransformJson(std::string_view transform_str, + std::string_view ref_name) { + nlohmann::json json; + json[kType] = kTransform; + json[kTransform] = transform_str; + json[kTerm] = ref_name; + return json; +} + +/// Helper to check if a JSON term represents a transform +bool 
IsTransformTerm(const nlohmann::json& json) { + return json.is_object() && json.contains(kType) && + json[kType].get() == kTransform && json.contains(kTerm); +} + +/// Template helper to create predicates from JSON with the appropriate term type +template +Result> PredicateFromJson( + Expression::Operation op, std::shared_ptr> term, + const nlohmann::json& json) { + if (IsUnaryOperation(op)) { + if (json.contains(kValue)) [[unlikely]] { + return JsonParseError("Unary predicate has invalid 'value' field: {}", + SafeDumpJson(json)); + } + if (json.contains(kValues)) [[unlikely]] { + return JsonParseError("Unary predicate has invalid 'values' field: {}", + SafeDumpJson(json)); + } + return UnboundPredicateImpl::Make(op, std::move(term)); + } + + if (IsSetOperation(op)) { + std::vector literals; + if (!json.contains(kValues) || !json[kValues].is_array() || json.contains(kValue)) + [[unlikely]] { + return JsonParseError( + "Set predicate must include an array 'values' field and must not include " + "'value': {}", + SafeDumpJson(json)); + } + for (const auto& val : json[kValues]) { + ICEBERG_ASSIGN_OR_RAISE(auto lit, LiteralFromJson(val)); + literals.push_back(std::move(lit)); + } + return UnboundPredicateImpl::Make(op, std::move(term), std::move(literals)); + } + + // Literal predicate + if (!json.contains(kValue) || json.contains(kValues)) [[unlikely]] { + return JsonParseError( + "Literal predicate requires 'value' and must not include 'values': {}", + SafeDumpJson(json)); + } + ICEBERG_ASSIGN_OR_RAISE(auto literal, LiteralFromJson(json[kValue])); + return UnboundPredicateImpl::Make(op, std::move(term), std::move(literal)); +} } // namespace bool IsUnaryOperation(Expression::Operation op) { @@ -83,34 +155,34 @@ bool IsSetOperation(Expression::Operation op) { } Result OperationTypeFromJson(const nlohmann::json& json) { - if (!json.is_string()) { + if (!json.is_string()) [[unlikely]] { return JsonParseError("Unable to create operation. 
Json value is not a string"); } auto typeStr = json.get(); - if (typeStr == kTypeTrue) return Expression::Operation::kTrue; - if (typeStr == kTypeFalse) return Expression::Operation::kFalse; - if (typeStr == kTypeAnd) return Expression::Operation::kAnd; - if (typeStr == kTypeOr) return Expression::Operation::kOr; - if (typeStr == kTypeNot) return Expression::Operation::kNot; - if (typeStr == kTypeEq) return Expression::Operation::kEq; - if (typeStr == kTypeNotEq) return Expression::Operation::kNotEq; - if (typeStr == kTypeLt) return Expression::Operation::kLt; - if (typeStr == kTypeLtEq) return Expression::Operation::kLtEq; - if (typeStr == kTypeGt) return Expression::Operation::kGt; - if (typeStr == kTypeGtEq) return Expression::Operation::kGtEq; - if (typeStr == kTypeIn) return Expression::Operation::kIn; - if (typeStr == kTypeNotIn) return Expression::Operation::kNotIn; - if (typeStr == kTypeIsNull) return Expression::Operation::kIsNull; - if (typeStr == kTypeNotNull) return Expression::Operation::kNotNull; - if (typeStr == kTypeIsNan) return Expression::Operation::kIsNan; - if (typeStr == kTypeNotNan) return Expression::Operation::kNotNan; - if (typeStr == kTypeStartsWith) return Expression::Operation::kStartsWith; - if (typeStr == kTypeNotStartsWith) return Expression::Operation::kNotStartsWith; - if (typeStr == kTypeCount) return Expression::Operation::kCount; - if (typeStr == kTypeCountNull) return Expression::Operation::kCountNull; - if (typeStr == kTypeCountStar) return Expression::Operation::kCountStar; - if (typeStr == kTypeMin) return Expression::Operation::kMin; - if (typeStr == kTypeMax) return Expression::Operation::kMax; + if (typeStr == kTrue) return Expression::Operation::kTrue; + if (typeStr == kFalse) return Expression::Operation::kFalse; + if (typeStr == kAnd) return Expression::Operation::kAnd; + if (typeStr == kOr) return Expression::Operation::kOr; + if (typeStr == kNot) return Expression::Operation::kNot; + if (typeStr == kEq) return 
Expression::Operation::kEq; + if (typeStr == kNotEq) return Expression::Operation::kNotEq; + if (typeStr == kLt) return Expression::Operation::kLt; + if (typeStr == kLtEq) return Expression::Operation::kLtEq; + if (typeStr == kGt) return Expression::Operation::kGt; + if (typeStr == kGtEq) return Expression::Operation::kGtEq; + if (typeStr == kIn) return Expression::Operation::kIn; + if (typeStr == kNotIn) return Expression::Operation::kNotIn; + if (typeStr == kIsNull) return Expression::Operation::kIsNull; + if (typeStr == kNotNull) return Expression::Operation::kNotNull; + if (typeStr == kIsNan) return Expression::Operation::kIsNan; + if (typeStr == kNotNan) return Expression::Operation::kNotNan; + if (typeStr == kStartsWith) return Expression::Operation::kStartsWith; + if (typeStr == kNotStartsWith) return Expression::Operation::kNotStartsWith; + if (typeStr == kCount) return Expression::Operation::kCount; + if (typeStr == kCountNull) return Expression::Operation::kCountNull; + if (typeStr == kCountStar) return Expression::Operation::kCountStar; + if (typeStr == kMin) return Expression::Operation::kMin; + if (typeStr == kMax) return Expression::Operation::kMax; return JsonParseError("Unknown expression type: {}", typeStr); } @@ -120,30 +192,316 @@ nlohmann::json ToJson(Expression::Operation op) { std::ranges::transform(json, json.begin(), [](unsigned char c) -> char { return (c == '_') ? 
'-' : static_cast(std::tolower(c)); }); + return nlohmann::json(std::move(json)); +} + +nlohmann::json ToJson(const NamedReference& ref) { return nlohmann::json(ref.name()); } + +Result> NamedReferenceFromJson( + const nlohmann::json& json) { + if (json.is_object() && json.contains(kType) && + json[kType].get() == kReference && json.contains(kTerm)) { + return NamedReference::Make(json[kTerm].get()); + } + if (!json.is_string()) [[unlikely]] { + return JsonParseError("Expected string for named reference"); + } + return NamedReference::Make(json.get()); +} + +nlohmann::json ToJson(const UnboundTransform& transform) { + auto& mut = const_cast(transform); + return MakeTransformJson(transform.transform()->ToString(), mut.reference()->name()); +} + +nlohmann::json ToJson(const BoundReference& ref) { return nlohmann::json(ref.name()); } + +nlohmann::json ToJson(const BoundTransform& transform) { + auto& mut = const_cast(transform); + return MakeTransformJson(transform.transform()->ToString(), mut.reference()->name()); +} + +Result> UnboundTransformFromJson( + const nlohmann::json& json) { + if (IsTransformTerm(json)) { + ICEBERG_ASSIGN_OR_RAISE(auto transform_str, + GetJsonValue(json, kTransform)); + ICEBERG_ASSIGN_OR_RAISE(auto transform, TransformFromString(transform_str)); + ICEBERG_ASSIGN_OR_RAISE(auto ref, NamedReferenceFromJson(json[kTerm])); + return UnboundTransform::Make(std::move(ref), std::move(transform)); + } + return JsonParseError("Invalid unbound transform json: {}", SafeDumpJson(json)); +} + +Result ToJson(const Literal& literal) { + if (literal.IsNull()) { + return nlohmann::json(nullptr); + } + + const auto type_id = literal.type()->type_id(); + const auto& value = literal.value(); + + switch (type_id) { + case TypeId::kBoolean: + return nlohmann::json(std::get(value)); + case TypeId::kInt: + return nlohmann::json(std::get(value)); + case TypeId::kDate: + return nlohmann::json(TransformUtil::HumanDay(std::get(value))); + case TypeId::kLong: + return 
nlohmann::json(std::get(value)); + case TypeId::kTime: + return nlohmann::json(TransformUtil::HumanTime(std::get(value))); + case TypeId::kTimestamp: + return nlohmann::json(TransformUtil::HumanTimestamp(std::get(value))); + case TypeId::kTimestampTz: + return nlohmann::json( + TransformUtil::HumanTimestampWithZone(std::get(value))); + case TypeId::kFloat: + return nlohmann::json(std::get(value)); + case TypeId::kDouble: + return nlohmann::json(std::get(value)); + case TypeId::kString: + return nlohmann::json(std::get(value)); + case TypeId::kBinary: + case TypeId::kFixed: { + // base 16 encoding for binary data + const auto& bytes = std::get>(value); + std::string hex; + hex.reserve(bytes.size() * 2); + for (uint8_t byte : bytes) { + hex += std::format("{:02X}", byte); + } + return nlohmann::json(std::move(hex)); + } + case TypeId::kDecimal: { + return nlohmann::json(literal.ToString()); + } + case TypeId::kUuid: + return nlohmann::json(std::get(value).ToString()); + default: + return NotSupported("Unsupported literal type for JSON serialization: {}", + literal.type()->ToString()); + } +} + +Result LiteralFromJson(const nlohmann::json& json) { + // Unwrap {"type": "literal", "value": } wrapper + if (json.is_object() && json.contains(kType) && + json[kType].get() == kLiteral && json.contains(kValue)) { + return LiteralFromJson(json[kValue]); + } + if (json.is_null()) { + return Literal::Null(nullptr); + } + if (json.is_boolean()) { + return Literal::Boolean(json.get()); + } + if (json.is_number_integer()) { + return Literal::Long(json.get()); + } + if (json.is_number_float()) { + return Literal::Double(json.get()); + } + if (json.is_string()) { + // All strings are returned as String literals. + // Conversion to binary/date/time/etc. happens during binding + // when schema type information is available. 
+ return Literal::String(json.get()); + } + return JsonParseError("Unsupported literal JSON type"); +} + +Result ToJson(const Term& term) { + switch (term.kind()) { + case Term::Kind::kReference: + if (term.is_unbound()) + return ToJson(internal::checked_cast(term)); + return ToJson(internal::checked_cast(term)); + case Term::Kind::kTransform: + if (term.is_unbound()) + return ToJson(internal::checked_cast(term)); + return ToJson(internal::checked_cast(term)); + default: + return NotSupported( + "Unsupported term kind for JSON serialization only reference and transform " + "terms are supported"); + } +} + +Result ToJson(const UnboundPredicate& pred) { + nlohmann::json json; + json[kType] = ToJson(pred.op()); + + ICEBERG_ASSIGN_OR_RAISE(json[kTerm], ToJson(pred.unbound_term())); + std::span literals = pred.literals(); + + if (IsSetOperation(pred.op())) { + nlohmann::json values = nlohmann::json::array(); + for (const auto& lit : literals) { + ICEBERG_ASSIGN_OR_RAISE(auto lit_json, ToJson(lit)); + values.push_back(std::move(lit_json)); + } + json[kValues] = std::move(values); + } else if (!literals.empty()) { + ICEBERG_DCHECK(literals.size() == 1, + "Expected exactly one literal for non-set predicate"); + ICEBERG_ASSIGN_OR_RAISE(json[kValue], ToJson(literals[0])); + } return json; } -Result> ExpressionFromJson(const nlohmann::json& json) { - // Handle boolean +Result ToJson(const BoundPredicate& pred) { + nlohmann::json json; + json[kType] = ToJson(pred.op()); + ICEBERG_ASSIGN_OR_RAISE(json[kTerm], ToJson(*pred.term())); + + if (IsSetOperation(pred.op())) { + const auto& sp = internal::checked_cast(pred); + nlohmann::json values = nlohmann::json::array(); + for (const auto& lit : sp.literal_set()) { + ICEBERG_ASSIGN_OR_RAISE(auto lit_json, ToJson(lit)); + values.push_back(std::move(lit_json)); + } + json[kValues] = std::move(values); + } else if (!IsUnaryOperation(pred.op())) { + const auto& lp = internal::checked_cast(pred); + ICEBERG_ASSIGN_OR_RAISE(json[kValue], 
ToJson(lp.literal())); + } + return json; +} + +Result> UnboundPredicateFromJson( + const nlohmann::json& json) { + if (!json.contains(kType) || !json.contains(kTerm)) [[unlikely]] { + return JsonParseError("Invalid predicate JSON: missing 'type' or 'term' field : {}", + SafeDumpJson(json)); + } + ICEBERG_ASSIGN_OR_RAISE(auto op, OperationTypeFromJson(json[kType])); + + const auto& term_json = json[kTerm]; + + if (IsTransformTerm(term_json)) { + ICEBERG_ASSIGN_OR_RAISE(auto term, UnboundTransformFromJson(term_json)); + return PredicateFromJson(op, std::move(term), json); + } + + ICEBERG_ASSIGN_OR_RAISE(auto term, NamedReferenceFromJson(term_json)); + return PredicateFromJson(op, std::move(term), json); +} + +Result> ExpressionFromJson(const nlohmann::json& json, + const Schema* schema) { + // Handle boolean constants if (json.is_boolean()) { return json.get() ? internal::checked_pointer_cast(True::Instance()) : internal::checked_pointer_cast(False::Instance()); } - return JsonParseError("Only booleans are currently supported."); + if (json.is_string()) { + auto s = json.get(); + std::ranges::transform(s, s.begin(), [](unsigned char c) -> char { + return static_cast(std::tolower(c)); + }); + if (s == kTrue) return internal::checked_pointer_cast(True::Instance()); + if (s == kFalse) return internal::checked_pointer_cast(False::Instance()); + } + + if (!json.is_object() || !json.contains(kType)) [[unlikely]] { + return JsonParseError("expression JSON must be an object with a 'type' field: {}", + SafeDumpJson(json)); + } + if (json[kType].get() == kLiteral) { + if (!json.contains(kValue) || !json[kValue].is_boolean()) [[unlikely]] { + return JsonParseError( + "Expression of type 'literal' must have a boolean 'value' field: {}", + SafeDumpJson(json)); + } + return json[kValue].get() + ? 
internal::checked_pointer_cast(True::Instance()) + : internal::checked_pointer_cast(False::Instance()); + } + + ICEBERG_ASSIGN_OR_RAISE(auto op, OperationTypeFromJson(json[kType])); + + switch (op) { + case Expression::Operation::kAnd: { + if (!json.contains(kLeft) || !json.contains(kRight)) [[unlikely]] { + return JsonParseError("AND expression missing 'left' or 'right' field"); + } + ICEBERG_ASSIGN_OR_RAISE(auto left, ExpressionFromJson(json[kLeft], schema)); + ICEBERG_ASSIGN_OR_RAISE(auto right, ExpressionFromJson(json[kRight], schema)); + return And::Make(std::move(left), std::move(right)); + } + case Expression::Operation::kOr: { + if (!json.contains(kLeft) || !json.contains(kRight)) [[unlikely]] { + return JsonParseError("OR expression missing 'left' or 'right' field"); + } + ICEBERG_ASSIGN_OR_RAISE(auto left, ExpressionFromJson(json[kLeft], schema)); + ICEBERG_ASSIGN_OR_RAISE(auto right, ExpressionFromJson(json[kRight], schema)); + return Or::Make(std::move(left), std::move(right)); + } + case Expression::Operation::kNot: { + if (!json.contains(kChild)) [[unlikely]] { + return JsonParseError("NOT expression missing 'child' field"); + } + ICEBERG_ASSIGN_OR_RAISE(auto child, ExpressionFromJson(json[kChild], schema)); + return Not::Make(std::move(child)); + } + case Expression::Operation::kCount: + case Expression::Operation::kCountNull: + case Expression::Operation::kCountStar: + case Expression::Operation::kMin: + case Expression::Operation::kMax: { + // unsupported operations for JSON deserialization + return NotSupported("Unsupported expression type for JSON deserialization: {}", + ToString(op)); + } + default: { + ICEBERG_ASSIGN_OR_RAISE(auto pred, UnboundPredicateFromJson(json)); + if (schema != nullptr) { + return pred->Bind(*schema, false); + } + return pred; + } + } } -nlohmann::json ToJson(const Expression& expr) { +Result ToJson(const Expression& expr) { switch (expr.op()) { case Expression::Operation::kTrue: - return true; - + return 
nlohmann::json(true); case Expression::Operation::kFalse: - return false; + return nlohmann::json(false); + case Expression::Operation::kAnd: { + const auto& and_expr = internal::checked_cast(expr); + nlohmann::json json; + json[kType] = ToJson(expr.op()); + ICEBERG_ASSIGN_OR_RAISE(json[kLeft], ToJson(*and_expr.left())); + ICEBERG_ASSIGN_OR_RAISE(json[kRight], ToJson(*and_expr.right())); + return json; + } + case Expression::Operation::kOr: { + const auto& or_expr = internal::checked_cast(expr); + nlohmann::json json; + json[kType] = ToJson(expr.op()); + ICEBERG_ASSIGN_OR_RAISE(json[kLeft], ToJson(*or_expr.left())); + ICEBERG_ASSIGN_OR_RAISE(json[kRight], ToJson(*or_expr.right())); + return json; + } + case Expression::Operation::kNot: { + const auto& not_expr = internal::checked_cast(expr); + nlohmann::json json; + json[kType] = ToJson(expr.op()); + ICEBERG_ASSIGN_OR_RAISE(json[kChild], ToJson(*not_expr.child())); + return json; + } default: - // TODO(evindj): This code will be removed as we implemented the full expression - // serialization. - ICEBERG_CHECK_OR_DIE(false, "Only booleans are currently supported."); + if (expr.is_unbound_predicate()) + return ToJson(internal::checked_cast(expr)); + if (expr.is_bound_predicate()) + return ToJson(internal::checked_cast(expr)); + return NotSupported("Unsupported expression type for JSON serialization"); } } diff --git a/src/iceberg/expression/json_serde_internal.h b/src/iceberg/expression/json_serde_internal.h index e44234d39..cd250a690 100644 --- a/src/iceberg/expression/json_serde_internal.h +++ b/src/iceberg/expression/json_serde_internal.h @@ -46,16 +46,87 @@ ICEBERG_EXPORT nlohmann::json ToJson(Expression::Operation op); /// \brief Deserializes a JSON object into an Expression. /// +/// When \p schema is nullptr the result is an unbound expression. When \p schema is +/// provided the expression is bound against the schema before being returned. 
+/// /// \param json A JSON object representing an expression +/// \param schema Optional schema used to bind field references and coerce literal +/// types. When null, returns an unbound expression. /// \return A shared pointer to the deserialized Expression or an error ICEBERG_EXPORT Result> ExpressionFromJson( - const nlohmann::json& json); + const nlohmann::json& json, const Schema* schema = nullptr); /// \brief Serializes an Expression into its JSON representation. /// /// \param expr The expression to serialize -/// \return A JSON object representing the expression -ICEBERG_EXPORT nlohmann::json ToJson(const Expression& expr); +/// \return A JSON object representing the expression, or an error +ICEBERG_EXPORT Result ToJson(const Expression& expr); + +/// \brief Deserializes a JSON object into a NamedReference. +/// +/// \param json A JSON object representing a named reference +/// \return A shared pointer to the deserialized NamedReference or an error +ICEBERG_EXPORT Result> NamedReferenceFromJson( + const nlohmann::json& json); + +/// \brief Serializes a NamedReference into its JSON representation. +/// +/// \param ref The named reference to serialize +/// \return A JSON string holding the name of the reference +ICEBERG_EXPORT nlohmann::json ToJson(const NamedReference& ref); + +/// \brief Serializes an UnboundTransform into its JSON representation. +/// +/// \param transform The unbound transform to serialize +/// \return A JSON object representing the unbound transform +ICEBERG_EXPORT nlohmann::json ToJson(const UnboundTransform& transform); + +/// \brief Deserializes a JSON object into an UnboundTransform. +/// +/// \param json A JSON object representing an unbound transform +/// \return A shared pointer to the deserialized UnboundTransform or an error +ICEBERG_EXPORT Result> UnboundTransformFromJson( + const nlohmann::json& json); + +/// \brief Serializes a Literal into its JSON representation. 
+/// +/// \param literal The literal to serialize +/// \return A JSON value representing the literal, or an error +ICEBERG_EXPORT Result ToJson(const Literal& literal); + +/// \brief Deserializes a JSON value into a Literal. +/// +/// \param json A JSON value representing a literal. +/// \return The deserialized Literal or an error. +ICEBERG_EXPORT Result LiteralFromJson(const nlohmann::json& json); + +/// \brief Serializes an UnboundPredicate into its JSON representation. +/// +/// \param pred The unbound predicate to serialize +/// \return A JSON object representing the predicate, or an error +ICEBERG_EXPORT Result ToJson(const UnboundPredicate& pred); + +/// \brief Serializes a BoundReference into its JSON representation (field name string). +ICEBERG_EXPORT nlohmann::json ToJson(const BoundReference& ref); + +/// \brief Serializes a BoundTransform into its JSON representation. +ICEBERG_EXPORT nlohmann::json ToJson(const BoundTransform& transform); + +/// \brief Serializes a BoundPredicate into its JSON representation. +ICEBERG_EXPORT Result ToJson(const BoundPredicate& pred); + +/// \brief Deserializes a JSON object into an UnboundPredicate. +/// +/// \param json A JSON object representing an unbound predicate +/// \return A pointer to the deserialized UnboundPredicate or an error +ICEBERG_EXPORT Result> UnboundPredicateFromJson( + const nlohmann::json& json); + +/// \brief Serializes a Term into its JSON representation. 
+/// +/// \param term The term to serialize (NamedReference, UnboundTransform, BoundReference or BoundTransform) +/// \return A JSON value representing the term, or an error +ICEBERG_EXPORT Result ToJson(const Term& term); /// Check if an operation is a unary predicate ICEBERG_EXPORT bool IsUnaryOperation(Expression::Operation op); diff --git a/src/iceberg/expression/predicate.h b/src/iceberg/expression/predicate.h index cdd3d1f52..6df0de1a6 100644 --- a/src/iceberg/expression/predicate.h +++ b/src/iceberg/expression/predicate.h @@ -76,6 +76,12 @@ class ICEBERG_EXPORT UnboundPredicate : public virtual Expression, bool is_unbound_predicate() const override { return true; } + /// \brief Returns the term of this predicate as a base Term reference. + virtual const Term& unbound_term() const = 0; + + /// \brief Returns the literals of this predicate. + virtual std::span literals() const = 0; + protected: UnboundPredicate() = default; }; @@ -130,7 +136,9 @@ class ICEBERG_EXPORT UnboundPredicateImpl : public UnboundPredicate, Result> Negate() const override; - std::span literals() const { return values_; } + const Term& unbound_term() const override { return *BASE::term(); } + + std::span literals() const override { return values_; } private: UnboundPredicateImpl(Expression::Operation op, std::shared_ptr> term); diff --git a/src/iceberg/test/expression_json_test.cc b/src/iceberg/test/expression_json_test.cc index dd3ac5e3e..9f3f4a7a0 100644 --- a/src/iceberg/test/expression_json_test.cc +++ b/src/iceberg/test/expression_json_test.cc @@ -18,6 +18,7 @@ */ #include +#include #include #include @@ -30,37 +31,444 @@ #include "iceberg/expression/literal.h" #include "iceberg/expression/predicate.h" #include "iceberg/expression/term.h" +#include "iceberg/schema.h" #include "iceberg/test/matchers.h" +#include "iceberg/transform.h" +#include "iceberg/type.h" +#include "iceberg/util/uuid.h" namespace iceberg { -// Test boolean constant expressions -TEST(ExpressionJsonTest, CheckBooleanExpression) { - auto 
checkBoolean = [](std::shared_ptr expr, bool value) { - auto json = ToJson(*expr); - EXPECT_TRUE(json.is_boolean()); - EXPECT_EQ(json.get(), value); - - auto result = ExpressionFromJson(json); - ASSERT_THAT(result, IsOk()); - if (value) { - EXPECT_EQ(result.value()->op(), Expression::Operation::kTrue); - } else { - EXPECT_EQ(result.value()->op(), Expression::Operation::kFalse); - } - }; - checkBoolean(True::Instance(), true); - checkBoolean(False::Instance(), false); +struct ExpressionJsonRoundTripParam { + std::string name; + nlohmann::json json; + Expression::Operation expected_op; +}; + +class ExpressionJsonRoundTripTest + : public ::testing::TestWithParam {}; + +TEST_P(ExpressionJsonRoundTripTest, RoundTrip) { + const auto& param = GetParam(); + ICEBERG_UNWRAP_OR_FAIL(auto expr, ExpressionFromJson(param.json)); + EXPECT_EQ(expr->op(), param.expected_op); + ICEBERG_UNWRAP_OR_FAIL(auto round_trip, ToJson(*expr)); + EXPECT_EQ(round_trip, param.json); +} + +INSTANTIATE_TEST_SUITE_P( + ExpressionJsonTest, ExpressionJsonRoundTripTest, + ::testing::Values( + ExpressionJsonRoundTripParam{"BooleanTrue", true, Expression::Operation::kTrue}, + ExpressionJsonRoundTripParam{"BooleanFalse", false, + Expression::Operation::kFalse}, + ExpressionJsonRoundTripParam{"UnaryIsNull", + {{"type", "is-null"}, {"term", "col"}}, + Expression::Operation::kIsNull}, + ExpressionJsonRoundTripParam{"LiteralGt", + {{"type", "gt"}, {"term", "age"}, {"value", 21}}, + Expression::Operation::kGt}, + ExpressionJsonRoundTripParam{ + "SetIn", + {{"type", "in"}, + {"term", "status"}, + {"values", nlohmann::json::array({"active", "pending"})}}, + Expression::Operation::kIn}, + ExpressionJsonRoundTripParam{ + "AndExpression", + {{"type", "and"}, + {"left", {{"type", "gt"}, {"term", "age"}, {"value", 18}}}, + {"right", {{"type", "lt"}, {"term", "age"}, {"value", 65}}}}, + Expression::Operation::kAnd}, + ExpressionJsonRoundTripParam{ + "NotExpression", + {{"type", "not"}, {"child", {{"type", "is-null"}, 
{"term", "name"}}}}, + Expression::Operation::kNot}, + ExpressionJsonRoundTripParam{ + "TransformDay", + {{"type", "eq"}, + {"term", {{"type", "transform"}, {"transform", "day"}, {"term", "ts"}}}, + {"value", 19738}}, + Expression::Operation::kEq}, + ExpressionJsonRoundTripParam{ + "TransformYear", + {{"type", "gt"}, + {"term", + {{"type", "transform"}, {"transform", "year"}, {"term", "timestamp_col"}}}, + {"value", 2020}}, + Expression::Operation::kGt}, + ExpressionJsonRoundTripParam{ + "TransformTruncate", + {{"type", "lt"}, + {"term", + {{"type", "transform"}, {"transform", "truncate[4]"}, {"term", "col"}}}, + {"value", 100}}, + Expression::Operation::kLt}, + ExpressionJsonRoundTripParam{ + "LiteralNotEq", + {{"type", "not-eq"}, {"term", "status"}, {"value", "closed"}}, + Expression::Operation::kNotEq}, + ExpressionJsonRoundTripParam{ + "LiteralLtEq", + {{"type", "lt-eq"}, {"term", "price"}, {"value", 100}}, + Expression::Operation::kLtEq}, + ExpressionJsonRoundTripParam{ + "LiteralGtEq", + {{"type", "gt-eq"}, {"term", "quantity"}, {"value", 1}}, + Expression::Operation::kGtEq}, + ExpressionJsonRoundTripParam{ + "SetNotIn", + {{"type", "not-in"}, + {"term", "category"}, + {"values", nlohmann::json::array({"archived", "deleted"})}}, + Expression::Operation::kNotIn}, + ExpressionJsonRoundTripParam{"UnaryNotNan", + {{"type", "not-nan"}, {"term", "score"}}, + Expression::Operation::kNotNan}, + ExpressionJsonRoundTripParam{ + "LiteralStartsWith", + {{"type", "starts-with"}, {"term", "name"}, {"value", "prefix"}}, + Expression::Operation::kStartsWith}, + ExpressionJsonRoundTripParam{ + "LiteralNotStartsWith", + {{"type", "not-starts-with"}, {"term", "name"}, {"value", "bad"}}, + Expression::Operation::kNotStartsWith}, + ExpressionJsonRoundTripParam{ + "OrExpression", + {{"type", "or"}, + {"left", {{"type", "lt"}, {"term", "price"}, {"value", 50}}}, + {"right", {{"type", "not-null"}, {"term", "discount"}}}}, + Expression::Operation::kOr}, + 
ExpressionJsonRoundTripParam{ + "NestedWithDecimals", + {{"type", "or"}, + {"left", + {{"type", "and"}, + {"left", + {{"type", "in"}, + {"term", "price"}, + {"values", nlohmann::json::array({3.14, 2.72})}}}, + {"right", {{"type", "eq"}, {"term", "currency"}, {"value", "USD"}}}}}, + {"right", {{"type", "is-nan"}, {"term", "discount"}}}}, + Expression::Operation::kOr}, + ExpressionJsonRoundTripParam{ + "FixedBinaryInPredicate", + {{"type", "eq"}, {"term", "col"}, {"value", "010203"}}, + Expression::Operation::kEq}, + ExpressionJsonRoundTripParam{"ScaleDecimalInSet", + {{"type", "in"}, + {"term", "amount"}, + {"values", nlohmann::json::array({"3.14E+4"})}}, + Expression::Operation::kIn}), + [](const ::testing::TestParamInfo& info) { + return info.param.name; + }); + +// -- Object wrapper normalization tests -- + +TEST(ExpressionJsonTest, PredicateWithObjectLiteral) { + nlohmann::json input = {{"type", "lt-eq"}, + {"term", "col"}, + {"value", {{"type", "literal"}, {"value", 50}}}}; + nlohmann::json expected = {{"type", "lt-eq"}, {"term", "col"}, {"value", 50}}; + ICEBERG_UNWRAP_OR_FAIL(auto expr, ExpressionFromJson(input)); + ICEBERG_UNWRAP_OR_FAIL(auto result, ToJson(*expr)); + EXPECT_EQ(result, expected); +} + +TEST(ExpressionJsonTest, LiteralBoolean) { + nlohmann::json input = {{"type", "literal"}, {"value", true}}; + nlohmann::json expected = true; + ICEBERG_UNWRAP_OR_FAIL(auto expr, ExpressionFromJson(input)); + ICEBERG_UNWRAP_OR_FAIL(auto result, ToJson(*expr)); + EXPECT_EQ(result, expected); +} + +TEST(ExpressionJsonTest, PredicateWithObjectReference) { + nlohmann::json input = {{"type", "lt-eq"}, + {"term", {{"type", "reference"}, {"term", "col"}}}, + {"value", 50}}; + nlohmann::json expected = {{"type", "lt-eq"}, {"term", "col"}, {"value", 50}}; + ICEBERG_UNWRAP_OR_FAIL(auto expr, ExpressionFromJson(input)); + ICEBERG_UNWRAP_OR_FAIL(auto result, ToJson(*expr)); + EXPECT_EQ(result, expected); +} + +// -- Parameterized invalid expression tests -- + +struct 
InvalidExpressionParam { + std::string name; + nlohmann::json json; + std::string expected_error_substr; +}; + +class InvalidExpressionTest : public ::testing::TestWithParam {}; + +TEST_P(InvalidExpressionTest, ReturnsError) { + const auto& param = GetParam(); + auto result = ExpressionFromJson(param.json); + EXPECT_THAT(result, HasErrorMessage(param.expected_error_substr)); +} + +INSTANTIATE_TEST_SUITE_P( + ExpressionJsonTest, InvalidExpressionTest, + ::testing::Values( + InvalidExpressionParam{"NotBooleanOrObject", 42, "an object with a 'type'"}, + InvalidExpressionParam{"UnknownOperationType", + {{"type", "illegal"}, {"term", "col"}}, + "Unknown expression type"}, + InvalidExpressionParam{ + "AndMissingLeft", + {{"type", "and"}, {"right", {{"type", "is-null"}, {"term", "col"}}}}, + "missing 'left' or 'right'"}, + InvalidExpressionParam{ + "OrMissingRight", + {{"type", "or"}, {"left", {{"type", "is-null"}, {"term", "col"}}}}, + "missing 'left' or 'right'"}, + InvalidExpressionParam{"NotMissingChild", {{"type", "not"}}, "missing 'child'"}, + InvalidExpressionParam{"UnaryWithSpuriousValue", + {{"type", "not-nan"}, {"term", "col"}, {"value", 42}}, + "invalid 'value' field"}, + InvalidExpressionParam{"UnaryWithSpuriousValues", + {{"type", "is-nan"}, + {"term", "col"}, + {"values", nlohmann::json::array({1, 2})}}, + "invalid 'values' field"}, + InvalidExpressionParam{"NumericTerm", + {{"type", "lt"}, {"term", 23}, {"value", 10}}, + "Expected string for named reference"}, + InvalidExpressionParam{"SetMissingValues", + {{"type", "in"}, {"term", "col"}, {"value", 42}}, + "values"}, + InvalidExpressionParam{ + "LiteralMissingValue", {{"type", "gt"}, {"term", "col"}}, "value"}), + [](const ::testing::TestParamInfo& info) { + return info.param.name; + }); + +struct BooleanStringParam { + std::string name; + std::string json_value; + Expression::Operation expected_op; +}; + +class BooleanStringDeserializationTest + : public ::testing::TestWithParam {}; + 
+/// Wraps each parameter's string in a JSON string value, deserializes it, and
+/// checks that the resulting expression reports the expected operation.
+TEST_P(BooleanStringDeserializationTest, ParsesBooleanStrings) {
+  const auto& param = GetParam();
+  ICEBERG_UNWRAP_OR_FAIL(auto expr, ExpressionFromJson(nlohmann::json(param.json_value)));
+  EXPECT_EQ(expr->op(), param.expected_op);
+}
+
+INSTANTIATE_TEST_SUITE_P(
+    ExpressionJsonTest, BooleanStringDeserializationTest,
+    ::testing::Values(
+        BooleanStringParam{"LowerTrue", "true", Expression::Operation::kTrue},
+        BooleanStringParam{"LowerFalse", "false", Expression::Operation::kFalse},
+        // NOTE(review): mixed-case spelling; presumably exercises case-insensitive
+        // parsing of boolean strings -- confirm against ExpressionFromJson.
+        BooleanStringParam{"UpperTrue", "TRuE", Expression::Operation::kTrue}),
+    [](const ::testing::TestParamInfo<BooleanStringParam>& info) {
+      return info.param.name;
+    });
+
+// -- Bound predicate ToJson tests --
+
+/// One case: a predicate to bind, and the JSON expected from serializing the
+/// bound result.
+struct BoundPredicateToJsonParam {
+  std::string name;
+  // NOTE(review): pointee type reconstructed from the Bind() call in the test
+  // body -- confirm.
+  std::shared_ptr<UnboundPredicate> pred;
+  nlohmann::json expected_json;
+};
+
+/// Fixture holding the four-column schema (id, name, age, salary) that every
+/// case binds against.
+class BoundPredicateToJsonTest
+    : public ::testing::TestWithParam<BoundPredicateToJsonParam> {
+ protected:
+  static void SetUpTestSuite() {
+    schema_ = std::make_shared<Schema>(
+        std::vector<SchemaField>{SchemaField::MakeRequired(1, "id", int64()),
+                                 SchemaField::MakeOptional(2, "name", string()),
+                                 SchemaField::MakeRequired(3, "age", int32()),
+                                 SchemaField::MakeOptional(4, "salary", float64())},
+        /*schema_id=*/0);
+  }
+  static std::shared_ptr<Schema> schema_;
+};
+
+std::shared_ptr<Schema> BoundPredicateToJsonTest::schema_;
+
+/// Binds the case's predicate against the fixture schema (case-sensitive) and
+/// compares the serialized JSON with the expected document.
+TEST_P(BoundPredicateToJsonTest, ToJson) {
+  const auto& param = GetParam();
+  ICEBERG_UNWRAP_OR_FAIL(auto bound, param.pred->Bind(*schema_, /*case_sensitive=*/true));
+  ICEBERG_UNWRAP_OR_FAIL(auto json, ToJson(*bound));
+  EXPECT_EQ(json, param.expected_json);
+}
+
+INSTANTIATE_TEST_SUITE_P(
+    ExpressionJsonTest, BoundPredicateToJsonTest,
+    ::testing::Values(
+        BoundPredicateToJsonParam{"UnaryIsNull",
+                                  Expressions::IsNull("name"),
+                                  {{"type", "is-null"}, {"term", "name"}}},
+        BoundPredicateToJsonParam{"UnaryNotNull",
+                                  Expressions::NotNull("name"),
+                                  {{"type", "not-null"}, {"term", "name"}}},
+        BoundPredicateToJsonParam{"UnaryIsNan",
+                                  Expressions::IsNaN("salary"),
+                                  {{"type", "is-nan"}, {"term", "salary"}}},
+        BoundPredicateToJsonParam{"UnaryNotNan",
+                                  Expressions::NotNaN("salary"),
+                                  {{"type", "not-nan"}, {"term", "salary"}}},
+        BoundPredicateToJsonParam{"LiteralEq",
+                                  Expressions::Equal("age", Literal::Int(25)),
+                                  {{"type", "eq"}, {"term", "age"}, {"value", 25}}},
+        BoundPredicateToJsonParam{"LiteralLt",
+                                  Expressions::LessThan("age", Literal::Int(18)),
+                                  {{"type", "lt"}, {"term", "age"}, {"value", 18}}},
+        BoundPredicateToJsonParam{
+            "LiteralGtEq",
+            Expressions::GreaterThanOrEqual("age", Literal::Int(21)),
+            {{"type", "gt-eq"}, {"term", "age"}, {"value", 21}}},
+        BoundPredicateToJsonParam{
+            "LiteralStartsWith",
+            Expressions::StartsWith("name", "prefix"),
+            {{"type", "starts-with"}, {"term", "name"}, {"value", "prefix"}}},
+        BoundPredicateToJsonParam{"LiteralNotEq",
+                                  Expressions::NotEqual("age", Literal::Int(7)),
+                                  {{"type", "not-eq"}, {"term", "age"}, {"value", 7}}}),
+    [](const ::testing::TestParamInfo<BoundPredicateToJsonParam>& info) {
+      return info.param.name;
+    });
+
+// -- Set operation round-trip tests --
+// Tests the full cycle: bind UnboundPredicate → serialize BoundPredicate to JSON
+// → deserialize to UnboundPredicate → compare op, term, and values.
+
+/// One case: a set predicate to bind, plus the operation, term name and literal
+/// values expected after the JSON round trip.
+struct SetOpRoundTripParam {
+  std::string name;
+  // NOTE(review): pointee type reconstructed from the Bind() call in the test
+  // body -- confirm.
+  std::shared_ptr<UnboundPredicate> pred;
+  Expression::Operation expected_op;
+  std::string expected_term;
+  std::vector<Literal> expected_values;
+};
+
+/// Fixture holding the four-column schema (id, name, age, salary) that every
+/// case binds against.
+class SetOpRoundTripTest : public ::testing::TestWithParam<SetOpRoundTripParam> {
+ protected:
+  static void SetUpTestSuite() {
+    schema_ = std::make_shared<Schema>(
+        std::vector<SchemaField>{SchemaField::MakeRequired(1, "id", int64()),
+                                 SchemaField::MakeOptional(2, "name", string()),
+                                 SchemaField::MakeRequired(3, "age", int32()),
+                                 SchemaField::MakeOptional(4, "salary", float64())},
+        /*schema_id=*/0);
+  }
+  static std::shared_ptr<Schema> schema_;
+};
+
+std::shared_ptr<Schema> SetOpRoundTripTest::schema_;
+
+/// Binds the predicate, serializes it, parses the JSON back as an unbound
+/// predicate, then checks the operation, the reference name and the literals.
+TEST_P(SetOpRoundTripTest, RoundTrip) {
+  const auto& param = GetParam();
+
+  ICEBERG_UNWRAP_OR_FAIL(auto bound, param.pred->Bind(*schema_, /*case_sensitive=*/true));
+  ICEBERG_UNWRAP_OR_FAIL(auto json, ToJson(*bound));
+  ICEBERG_UNWRAP_OR_FAIL(auto unbound, UnboundPredicateFromJson(json));
+
+  EXPECT_EQ(unbound->op(), param.expected_op);
+  EXPECT_EQ(unbound->reference()->name(), param.expected_term);
+  // Literals are compared through their string form and without regard to order.
+  std::vector<std::string> got;
+  got.reserve(unbound->literals().size());
+  for (const auto& lit : unbound->literals()) {
+    got.push_back(lit.ToString());
+  }
+
+  std::vector<std::string> expected;
+  expected.reserve(param.expected_values.size());
+  for (const auto& lit : param.expected_values) {
+    expected.push_back(lit.ToString());
+  }
+
+  EXPECT_THAT(got, ::testing::UnorderedElementsAreArray(expected));
 }
 
-TEST(ExpressionJsonTest, OperationTypeTests) {
-  EXPECT_EQ(OperationTypeFromJson("true"), Expression::Operation::kTrue);
-  EXPECT_EQ("true", ToJson(Expression::Operation::kTrue));
-  EXPECT_TRUE(IsSetOperation(Expression::Operation::kIn));
-  EXPECT_FALSE(IsSetOperation(Expression::Operation::kTrue));
+INSTANTIATE_TEST_SUITE_P(
+    ExpressionJsonTest, SetOpRoundTripTest,
+    ::testing::Values(
+        SetOpRoundTripParam{
+            "In",
+            Expressions::In("age", {Literal::Int(1), Literal::Int(2), Literal::Int(3)}),
+            Expression::Operation::kIn,
+            "age",
+            {Literal::Int(1), Literal::Int(2), Literal::Int(3)}},
+        SetOpRoundTripParam{
+            "NotIn",
+            Expressions::NotIn("age", {Literal::Int(5), Literal::Int(10)}),
+            Expression::Operation::kNotIn,
+            "age",
+            {Literal::Int(5), Literal::Int(10)}}),
+    [](const ::testing::TestParamInfo<SetOpRoundTripParam>& info) {
+      return info.param.name;
+    });
+
+// -- Schema-aware ExpressionFromJson tests --
+/// Fixture holding the four-column schema (id, name, age, salary) passed to
+/// the schema-taking ExpressionFromJson overload.
+class SchemaAwareExpressionFromJsonTest : public ::testing::Test {
+ protected:
+  static void SetUpTestSuite() {
+    schema_ = std::make_shared<Schema>(
+        std::vector<SchemaField>{SchemaField::MakeRequired(1, "id", int64()),
+                                 SchemaField::MakeOptional(2, "name", string()),
+                                 SchemaField::MakeRequired(3, "age", int32()),
+                                 SchemaField::MakeOptional(4, "salary", float64())},
+        /*schema_id=*/0);
+  }
+  static std::shared_ptr<Schema> schema_;
+};
+
+std::shared_ptr<Schema> SchemaAwareExpressionFromJsonTest::schema_;
+
+/// Without a schema argument the parsed predicate stays unbound.
+TEST_F(SchemaAwareExpressionFromJsonTest, NullSchemaReturnsUnbound) {
+  nlohmann::json json = {{"type", "eq"}, {"term", "age"}, {"value", 25}};
+  ICEBERG_UNWRAP_OR_FAIL(auto expr, ExpressionFromJson(json));
+  EXPECT_TRUE(expr->is_unbound_predicate());
+}
+
+/// With a schema argument the parsed predicate comes back bound.
+TEST_F(SchemaAwareExpressionFromJsonTest, WithSchemaReturnsBound) {
+  nlohmann::json json = {{"type", "eq"}, {"term", "age"}, {"value", 25}};
+  ICEBERG_UNWRAP_OR_FAIL(auto expr, ExpressionFromJson(json, schema_.get()));
+  EXPECT_TRUE(expr->is_bound_predicate());
+}
+
+TEST_F(SchemaAwareExpressionFromJsonTest, UnaryPredicateWithSchema) {
+  nlohmann::json json = {{"type", "is-null"}, {"term", "name"}};
+  ICEBERG_UNWRAP_OR_FAIL(auto expr, ExpressionFromJson(json, schema_.get()));
+  EXPECT_TRUE(expr->is_bound_predicate());
+  EXPECT_EQ(expr->op(), Expression::Operation::kIsNull);
+}
+
+/// Both children of an `and` node are expected to be bound.
+TEST_F(SchemaAwareExpressionFromJsonTest, AndExpressionWithSchema) {
+  nlohmann::json json = {{"type", "and"},
+                         {"left", {{"type", "gt"}, {"term", "age"}, {"value", 18}}},
+                         {"right", {{"type", "lt"}, {"term", "age"}, {"value", 65}}}};
+  ICEBERG_UNWRAP_OR_FAIL(auto expr, ExpressionFromJson(json, schema_.get()));
+  // Fatal check: a dynamic_cast to a reference type throws std::bad_cast when
+  // the object is not of the target type, so stop here on a wrong operation.
+  ASSERT_EQ(expr->op(), Expression::Operation::kAnd);
+  const auto& and_expr = dynamic_cast<const And&>(*expr);
+  EXPECT_TRUE(and_expr.left()->is_bound_predicate());
+  EXPECT_TRUE(and_expr.right()->is_bound_predicate());
+}
+
+/// The child of a `not` node is expected to be bound.
+TEST_F(SchemaAwareExpressionFromJsonTest, NotExpressionWithSchema) {
+  nlohmann::json json = {{"type", "not"},
+                         {"child", {{"type", "is-null"}, {"term", "name"}}}};
+  ICEBERG_UNWRAP_OR_FAIL(auto expr, ExpressionFromJson(json, schema_.get()));
+  // Fatal check for the same reason as in AndExpressionWithSchema.
+  ASSERT_EQ(expr->op(), Expression::Operation::kNot);
+  const auto& not_expr = dynamic_cast<const Not&>(*expr);
+  EXPECT_TRUE(not_expr.child()->is_bound_predicate());
+}
 
-  EXPECT_TRUE(IsUnaryOperation(Expression::Operation::kIsNull));
-  EXPECT_FALSE(IsUnaryOperation(Expression::Operation::kTrue));
+/// JSON booleans still parse to the true/false constants when a schema is given.
+TEST_F(SchemaAwareExpressionFromJsonTest, BooleanConstantWithSchemaUnchanged) {
+  ICEBERG_UNWRAP_OR_FAIL(auto t, ExpressionFromJson(nlohmann::json(true), schema_.get()));
+  EXPECT_EQ(t->op(), Expression::Operation::kTrue);
+  ICEBERG_UNWRAP_OR_FAIL(auto f,
+                         ExpressionFromJson(nlohmann::json(false), schema_.get()));
+  EXPECT_EQ(f->op(), Expression::Operation::kFalse);
 }
 
 }  // namespace iceberg
diff --git a/src/iceberg/type_fwd.h b/src/iceberg/type_fwd.h
index e97de0ac5..491775ee3 100644
--- a/src/iceberg/type_fwd.h
+++ b/src/iceberg/type_fwd.h
@@ -129,8 +129,11 @@ class BoundReference;
 class BoundTransform;
 class Expression;
 class Literal;
+class NamedReference;
 class Term;
+class Transform;
 class UnboundPredicate;
+class UnboundTransform;
 
 /// \brief Evaluator.
 class Evaluator;