From 6c3da12151b234fd920f8b0980e5503e362bfb16 Mon Sep 17 00:00:00 2001 From: Hugo Heuzard Date: Tue, 7 Apr 2026 17:34:23 +0200 Subject: [PATCH 01/11] Add IR to compiler; move tag allocation logic out of PPX MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Introduce Ir.t, a pure intermediate representation for sedlex patterns that captures regexp structure and named captures before tag allocation. The compiler's new compile_ir entry point handles tag allocation (Start_plus/End_minus optimizations), discriminator insertion for or-patterns, and DFA construction. The PPX becomes a thin translator from OCaml pattern AST to Ir.t. Reject capture that shadows an inner binding of the same name Validate that `(... as x) as x` is rejected — a Capture node must not bind a name that already appears inside its inner pattern. Co-Authored-By: Claude Opus 4.6 (1M context) --- src/compiler/ir.ml | 137 ++++++++++ src/compiler/ir.mli | 64 +++++ src/compiler/sedlex.ml | 231 ++++++++++++++++ src/compiler/sedlex.mli | 38 +++ src/syntax/ppx_sedlex.ml | 527 +++++++++--------------------------- test/codegen/test_errors.ml | 15 +- 6 files changed, 612 insertions(+), 400 deletions(-) create mode 100644 src/compiler/ir.ml create mode 100644 src/compiler/ir.mli diff --git a/src/compiler/ir.ml b/src/compiler/ir.ml new file mode 100644 index 0000000..85be905 --- /dev/null +++ b/src/compiler/ir.ml @@ -0,0 +1,137 @@ +(* The package sedlex is released under the terms of an MIT-like license. *) +(* See the attached LICENSE file. *) +(* Copyright 2005, 2013 by Alain Frisch and LexiFi. *) + +type t = + | Chars of Cset.t + | Seq of t list + | Alt of t * t + | Star of t + | Plus of t + | Rep of t * int * int + | Eps + | Capture of string * t + +(* Smart constructors *) + +let chars c = Chars c +let eps = Eps +let capture name inner = Capture (name, inner) +let star t = Star t +let plus t = Plus t +let rep t n m = Rep (t, n, m) + +let seq a b = + match (a, b) with + | Eps, x | x, Eps -> x + | Seq l1, Seq l2 -> Seq (l1 @ l2) + | Seq l1, x -> Seq (l1 @ [x]) + | x, Seq l2 -> Seq (x :: l2) + | _ -> Seq [a; b] + +let alt a b = + match (a, b) with + | Chars c1, Chars c2 -> Chars (Cset.union c1 c2) + | _ -> Alt (a, b) + +(* Analysis *) + +let rec fixed_length = function + | Chars _ -> Some 1 + | Eps -> Some 0 + | Capture (_, inner) -> fixed_length inner + | Seq elems -> + List.fold_left + (fun acc e -> + match (acc, fixed_length e) with + | Some a, Some b -> Some (a + b) + | _ -> None) + (Some 0) elems + | Alt (a, b) -> ( + match (fixed_length a, fixed_length b) with + | Some n1, Some n2 when n1 = n2 -> Some n1 + | _ -> None) + | Rep (inner, n, m) -> + if n = m then ( + match fixed_length inner with Some l -> Some (n * l) | None -> None) + else None + | Star _ | Plus _ -> None + +let rec capture_names_acc acc = function + | Chars _ | Eps -> acc + | Capture (name, inner) -> + let acc = if List.mem name acc then acc else name :: acc in + capture_names_acc acc inner + | Seq elems -> List.fold_left capture_names_acc acc elems + | Alt (a, b) -> capture_names_acc (capture_names_acc acc a) b + | Star inner | Plus inner | Rep (inner, _, _) -> capture_names_acc acc inner + +let capture_names t = capture_names_acc [] t |> List.sort_uniq String.compare + +(* Validation *) + +let validate t = + let rec check ~inside_rep t = + match t with + | Chars _ | Eps -> Ok () + | Capture (_, _) when inside_rep -> + Error "'as' bindings are not supported inside repetition operators" + | Capture (name, inner) -> + if List.mem name (capture_names inner) then + Error + (Printf.sprintf + "'as' binding '%s' shadows an inner binding of the same name" + name) + else check ~inside_rep inner + | Seq elems -> + List.fold_left + (fun acc e -> + match acc with Error _ -> acc | Ok () -> check ~inside_rep e) + (Ok ()) elems + | Alt (a, b) -> + let* () = check ~inside_rep a in + let* () = check ~inside_rep b in + let na = capture_names a in + let nb = capture_names b in + if na <> nb && (na <> [] || nb <> []) then + Error "both sides of '|' must bind the same names with 'as'" + else Ok () + | Star inner | Plus inner -> check ~inside_rep:true inner + | Rep (inner, _, _) -> check ~inside_rep:true inner + and ( let* ) r f = match r with Error _ as e -> e | Ok x -> f x in + check ~inside_rep:false t + +(* Pretty-printing *) + +let rec pp fmt = function + | Chars cset -> ( + let intervals = (cset : Cset.t :> (int * int) list) in + match intervals with + | [(c, c')] when c = c' -> + if c >= 32 && c <= 126 then Format.fprintf fmt "'%c'" (Char.chr c) + else Format.fprintf fmt "0x%04X" c + | _ -> + Format.fprintf fmt "[%a]" + (Format.pp_print_list + ~pp_sep:(fun fmt () -> Format.fprintf fmt ", ") + (fun fmt (lo, hi) -> + if lo = hi then + if lo >= 32 && lo <= 126 then + Format.fprintf fmt "'%c'" (Char.chr lo) + else Format.fprintf fmt "0x%04X" lo + else Format.fprintf fmt "0x%04X-0x%04X" lo hi)) + intervals) + | Eps -> Format.fprintf fmt "eps" + | Seq elems -> + Format.fprintf fmt "(%a)" + (Format.pp_print_list + ~pp_sep:(fun fmt () -> Format.fprintf fmt ", ") + pp) + elems + | Alt (a, b) -> Format.fprintf fmt "(%a | %a)" pp a pp b + | Star inner -> Format.fprintf fmt "Star %a" pp inner + | Plus inner -> Format.fprintf fmt "Plus %a" pp inner + | Rep (inner, n, m) -> Format.fprintf fmt "Rep(%a, %d..%d)" pp inner n m + | Capture (name, inner) -> Format.fprintf fmt "(%a as %s)" pp inner name + +let show t = Format.asprintf "%a" pp t diff --git a/src/compiler/ir.mli b/src/compiler/ir.mli new file mode 100644 index 0000000..b84ceca --- /dev/null +++ b/src/compiler/ir.mli @@ -0,0 +1,64 @@ +(* The package sedlex is released under the terms of an MIT-like license. *) +(* See the attached LICENSE file. *) +(* Copyright 2005, 2013 by Alain Frisch and LexiFi. *) + +(** Intermediate representation for sedlex patterns. + + This IR captures the regexp structure including named capture annotations + ([as] bindings) but {b before} tag allocation, fixed-length optimization, or + discriminator handling. The compiler processes this IR to produce a compiled + DFA with binding extraction information. + + The IR does not depend on ppxlib and can be constructed directly in tests. +*) + +(** A pattern with named captures. *) +type t = + | Chars of Cset.t (** Match a single code point in the given set. *) + | Seq of t list + (** Sequence (concatenation). Invariant: length >= 2. Use {!seq} smart + constructor. *) + | Alt of t * t (** Alternation: match left or right. *) + | Star of t (** Kleene star (zero or more). *) + | Plus of t (** One or more. *) + | Rep of t * int * int + (** [Rep (r, n, m)]: between [n] and [m] repetitions. *) + | Eps (** Empty string (epsilon). *) + | Capture of string * t + (** Named capture: [Capture (name, inner)] wraps [inner] with an [as] + binding. The compiler decides tag allocation strategy. *) + +(** {2 Smart constructors} *) + +val chars : Cset.t -> t +val seq : t -> t -> t +val alt : t -> t -> t +val star : t -> t +val plus : t -> t +val rep : t -> int -> int -> t +val eps : t +val capture : string -> t -> t + +(** {2 Analysis} *) + +(** [fixed_length t] returns [Some n] if [t] always matches exactly [n] code + points, or [None] if the length is variable. *) +val fixed_length : t -> int option + +(** [capture_names t] returns the set of capture names in [t], sorted and + deduplicated. *) +val capture_names : t -> string list + +(** {2 Validation} + + [validate t] checks structural constraints: + - [Capture] not inside [Star], [Plus], [Rep], or set operations + - Both sides of [Alt] bind the same capture names + + Returns [Ok ()] or [Error msg]. *) +val validate : t -> (unit, string) result + +(** {2 Pretty-printing} *) + +val pp : Format.formatter -> t -> unit +val show : t -> string diff --git a/src/compiler/sedlex.ml b/src/compiler/sedlex.ml index 5e7a3cd..e3695c8 100644 --- a/src/compiler/sedlex.ml +++ b/src/compiler/sedlex.ml @@ -341,6 +341,237 @@ let compile rs = num_tags = !cur_tag; } +(* High-level compilation from IR. + + [compile_ir] lowers [Ir.t] patterns into low-level regexps with tag + annotations, then compiles them via [compile]. The lowering phase decides + how to allocate tags for [as] bindings: + - [Start_plus n]: the position is [n] code points from the token start. + - [End_minus n]: the position is [n] code points before the token end. + - [Tag {tag; offset}]: read memory cell [tag] and add [offset]. + When both boundaries of a capture can be expressed as [Start_plus] or + [End_minus], no memory cells are needed at all. + + Or-patterns [(p1 as x) | (p2 as x)] additionally use discriminator cells: + integer values that record which branch was taken, so the PPX can extract + the correct positions at match time. *) + +type pos_expr = + | Tag of { tag : int; offset : int } + | Start_plus of int + | End_minus of int + +type compiled_binding = { + name : string; + start_pos : pos_expr; + end_pos : pos_expr; + disc : (int * int) list; +} + +type compiled_ir = { + dfa : dfa; + init_tags : tag_op list; + num_tags : int; + bindings : compiled_binding list array; +} + +(* [shift_pos pe delta] shifts a position expression by [delta] code points + (positive = forward, negative = backward). Returns [None] if either + argument is unknown. *) +let shift_pos pe delta = + match (pe, delta) with + | Some (Start_plus n), Some d -> Some (Start_plus (n + d)) + | Some (End_minus n), Some d -> Some (End_minus (n - d)) + | Some (Tag { tag; offset }), Some d -> + Some (Tag { tag; offset = offset + d }) + | _ -> None + +let advance pe len = shift_pos pe len +let retreat pe len = shift_pos pe (Option.map Int.neg len) + +(* [add_discriminators r1 tags1 r2 tags2] wraps two or-pattern branches + with discriminator tags so the generated code can tell which branch matched. + Reuses an existing discriminator cell from the left branch when possible + (OCaml desugars [a | b | c] into [(a | b) | c], so the left branch may + already carry a discriminator from an inner [|]). *) +let add_discriminators r1 tags1 r2 tags2 = + let names tags = + List.map (fun (ti : compiled_binding) -> ti.name) tags + |> List.sort_uniq String.compare + in + if names tags1 <> names tags2 then + invalid_arg "both sides of '|' must bind the same names with 'as'"; + if tags1 = tags2 then (alt r1 r2, tags1) + else ( + let stamp disc_cell value tags = + List.map + (fun (ti : compiled_binding) -> + { ti with disc = (disc_cell, value) :: ti.disc }) + tags + in + let reusable_cell = + match tags1 with + | [] | { disc = []; _ } :: _ -> None + | { disc = (c, _) :: _; _ } :: rest -> + if + List.for_all + (fun (ti : compiled_binding) -> + match ti.disc with (c2, _) :: _ -> c2 = c | [] -> false) + rest + then Some c + else None + in + match reusable_cell with + | Some disc_cell -> + let max_val = + List.fold_left + (fun acc (ti : compiled_binding) -> + match ti.disc with (_, v) :: _ -> max acc v | [] -> acc) + (-1) tags1 + in + let new_val = max_val + 1 in + let r2w = bind_disc r2 disc_cell new_val in + (alt r1 r2w, tags1 @ stamp disc_cell new_val tags2) + | None -> + let disc_cell = new_disc_cell () in + let r1w = bind_disc r1 disc_cell 0 in + let r2w = bind_disc r2 disc_cell 1 in + (alt r1w r2w, stamp disc_cell 0 tags1 @ stamp disc_cell 1 tags2)) + +(* [lower ir ~left ~right] converts an IR pattern to a low-level regexp + and a list of compiled bindings. [left] and [right] are the known + position contexts at the start and end of this pattern element. *) +let rec lower ~left ~right (ir : Ir.t) : regexp * compiled_binding list = + match ir with + | Ir.Chars cset -> (chars cset, []) + | Ir.Eps -> (eps, []) + | Ir.Star inner -> + let r, _ = lower ~left:None ~right:None inner in + (rep r, []) + | Ir.Plus inner -> + let r, _ = lower ~left:None ~right:None inner in + (plus r, []) + | Ir.Rep (inner, n, m) -> + let r, _ = lower ~left:None ~right:None inner in + (repeat r n m, []) + | Ir.Capture (name, inner) -> + (* Named capture — try to derive each boundary from [left]/[right] + context or [fixed_length]; allocate tags only for boundaries that + cannot be computed statically. Best case: 0 tags. Worst case: 2. *) + let r, tags = lower ~left ~right inner in + let elem_len = Ir.fixed_length inner in + let known_start = + match left with Some _ -> left | None -> retreat right elem_len + in + let known_end = + match right with + | Some _ -> right + | None -> advance known_start elem_len + in + let st, et, r = + match (known_start, known_end) with + | Some st, Some et -> (st, et, r) + | Some st, None -> + let wrapped, end_tag = bind_end_only r in + (st, Tag { tag = end_tag; offset = 0 }, wrapped) + | None, Some et -> + let wrapped, start_tag = bind_start_only r in + (Tag { tag = start_tag; offset = 0 }, et, wrapped) + | None, None -> ( + match elem_len with + | Some len -> + let wrapped, start_tag = bind_start_only r in + ( Tag { tag = start_tag; offset = 0 }, + Tag { tag = start_tag; offset = len }, + wrapped ) + | None -> + let wrapped, start_tag, end_tag = bind r in + ( Tag { tag = start_tag; offset = 0 }, + Tag { tag = end_tag; offset = 0 }, + wrapped )) + in + (r, { name; start_pos = st; end_pos = et; disc = [] } :: tags) + | Ir.Alt (a, b) -> + let r1, tags1 = lower ~left ~right a in + let r2, tags2 = lower ~left ~right b in + if tags1 <> [] || tags2 <> [] then add_discriminators r1 tags1 r2 tags2 + else (alt r1 r2, []) + | Ir.Seq elems -> + (* Sequence — propagate left/right position contexts through elements. + Right positions are computed right-to-left; left positions are + updated left-to-right after lowering each element. *) + let n = List.length elems in + let lengths = List.map Ir.fixed_length elems in + let lengths_arr = Array.of_list lengths in + (* Compute right positions (right-to-left) *) + let rights = Array.make n None in + let () = + let acc = ref right in + for i = n - 1 downto 0 do + rights.(i) <- !acc; + acc := retreat !acc lengths_arr.(i) + done + in + (* Fallback for [update_left]: if [advance] fails (unknown + current left or variable-length element), but the element + was a [Capture] whose end position is a [Tag], that tag + records a runtime position we can use as the [left] anchor + for the next element. [Start_plus]/[End_minus] endpoints + are not useful here: they are relative to token boundaries, + so they only propagate through [advance]/[retreat] — they + cannot serve as fresh anchors after a variable-length gap. *) + let left_from_end_tag ir tags' = + match ir with + | Ir.Capture _ -> ( + match tags' with + | { end_pos = Tag _ as et; _ } :: _ -> Some et + | _ -> None) + | _ -> None + in + let update_left cur i ir tags' = + match advance cur lengths_arr.(i) with + | Some _ as s -> s + | None -> left_from_end_tag ir tags' + in + let elems_arr = Array.of_list elems in + let r0, tags0 = lower ~left ~right:rights.(0) elems_arr.(0) in + let left0 = update_left left 0 elems_arr.(0) tags0 in + let _, _, r_acc, tags_acc = + Array.fold_left + (fun (i, cur_left, r_acc, tags_acc) ir_elem -> + if i = 0 then (1, left0, r_acc, tags_acc) + else ( + let r', tags' = + lower ~left:cur_left ~right:rights.(i) ir_elem + in + let new_left = update_left cur_left i ir_elem tags' in + (i + 1, new_left, seq r_acc r', tags_acc @ tags'))) + (0, left, r0, tags0) elems_arr + in + (r_acc, tags_acc) + +let compile_ir (rules : Ir.t array) = + Array.iter + (fun ir -> + match Ir.validate ir with Ok () -> () | Error msg -> invalid_arg msg) + rules; + reset_tags (); + let lowered = + Array.map + (fun ir -> + lower ~left:(Some (Start_plus 0)) ~right:(Some (End_minus 0)) ir) + rules + in + let regexps = Array.map fst lowered in + let bindings = Array.map snd lowered in + let compiled = compile regexps in + { + dfa = compiled.dfa; + init_tags = compiled.init_tags; + num_tags = compiled.num_tags; + bindings; + } + let cset_to_label cset = let escape_dot c = match c with diff --git a/src/compiler/sedlex.mli b/src/compiler/sedlex.mli index f4360aa..320edd5 100644 --- a/src/compiler/sedlex.mli +++ b/src/compiler/sedlex.mli @@ -120,6 +120,44 @@ type compiled = { initial state. *) val compile : regexp array -> compiled +(** {2 High-level compilation from IR} + + [compile_ir] takes an array of {!Ir.t} patterns, handles tag allocation + (including [Start_plus]/[End_minus] optimizations), discriminator insertion + for or-patterns, and DFA construction. Returns the DFA and all information + needed to generate binding extraction code. *) + +(** How to compute a sub-match boundary position at runtime. *) +type pos_expr = + | Tag of { tag : int; offset : int } + (** Read memory cell [tag] and add [offset]. *) + | Start_plus of int (** Position is [n] code points from the token start. *) + | End_minus of int (** Position is [n] code points before the token end. *) + +(** Information about a single binding in the compiled output. *) +type compiled_binding = { + name : string; + start_pos : pos_expr; + end_pos : pos_expr; + disc : (int * int) list; + (** Discriminator conditions: [(cell, value)] pairs. Empty for simple + (non-or) bindings. For or-patterns, each branch has distinct values. + *) +} + +(** Result of [compile_ir]. *) +type compiled_ir = { + dfa : dfa; + init_tags : tag_op list; + num_tags : int; + bindings : compiled_binding list array; + (** [bindings.(i)] is the list of binding info for rule [i]. *) +} + +(** [compile_ir rules] compiles an array of IR patterns into a tagged DFA. + Raises [Invalid_argument] if validation fails. *) +val compile_ir : Ir.t array -> compiled_ir + (** [dfa_to_dot dfa] returns a Graphviz DOT representation of the DFA, including state labels, accepting state markers, transition character sets, and tag operations on edges. *) diff --git a/src/syntax/ppx_sedlex.ml b/src/syntax/ppx_sedlex.ml index b71a8c9..9d63526 100644 --- a/src/syntax/ppx_sedlex.ml +++ b/src/syntax/ppx_sedlex.ml @@ -129,8 +129,7 @@ end) let builtin_regexps = List.fold_left - (fun acc (n, c) -> - StringMap.add n (Sedlex.chars c, Some 1 (* fixed length *)) acc) + (fun acc (n, c) -> StringMap.add n (Ir.chars c) acc) StringMap.empty ([ ("any", Cset.any); @@ -420,113 +419,14 @@ let rev_csets_of_string ~loc ~encoding s = (* Code generation for `as` bindings. - [regexp_of_pattern] parses OCaml patterns into regexps and collects a - [tag_info list] for every [as] binding it encounters. Each [tag_info] - records the variable name and a [pos_expr] for its start/end positions. - For or-patterns like [(p1 as x) | (p2 as x)], each branch gets its own - positions and a discriminator [(cell, value)] so the generated code can - determine which branch matched. - - Position expressions ([pos_expr]) allow sub-match boundaries to be - computed without memory cells when possible. [fixed_length] determines - if a pattern has a statically known length; [advance] and [retreat] - propagate known positions through concatenation. For example, in - ['a', ((Plus 'b') as x)], the start of [x] is [Start_plus 1] (one - code point from the token start), so no start tag is needed. - - [gen_binding_code] turns the [tag_info list] into [let] bindings that - extract sub-matches from the lexbuf's memory cells (or computed offsets). - For or-patterns with discriminators it emits a chain of if/else - checks on the discriminator cell to select the correct positions. *) - -(* A [pos_expr] represents a sub-match boundary position that may be - computed without a memory cell: - - [Tag {tag; offset}]: read memory cell [tag] and add [offset]. - - [Start_plus n]: the position is [n] code points from the token start. - - [End_minus n]: the position is [n] code points before the token end. - When both boundaries of an [as] binding can be expressed as [Start_plus] - or [End_minus], no memory cells are needed at all. *) -type pos_expr = - | Tag of { tag : int; offset : int } - | Start_plus of int - | End_minus of int - -type tag_info = { - name : string; - start_pos : pos_expr; - end_pos : pos_expr; - disc : (int * int) list; - (* Discriminator conditions: [(cell, value)] pairs. For simple - bindings this is [[]]. For or-patterns, each branch has a - distinct value in the shared discriminator cell. *) -} - -(* [shift_pos pe delta] shifts a position expression by [delta] code points - (positive = forward, negative = backward). Returns [None] if either - argument is unknown. *) -let shift_pos pe delta = - match (pe, delta) with - | Some (Start_plus n), Some d -> Some (Start_plus (n + d)) - | Some (End_minus n), Some d -> Some (End_minus (n - d)) - | Some (Tag { tag; offset }), Some d -> - Some (Tag { tag; offset = offset + d }) - | _ -> None - -let advance pe len = shift_pos pe len -let retreat pe len = shift_pos pe (Option.map Int.neg len) - -(* [add_discriminators ~loc r1 tags1 r2 tags2] wraps two or-pattern branches - with discriminator tags so the generated code can tell which branch matched. - Reuses an existing discriminator cell from the left branch when possible - (OCaml desugars [a | b | c] into [(a | b) | c], so the left branch may - already carry a discriminator from an inner [|]). Returns [(regexp, tags)]. *) -let add_discriminators ~loc r1 tags1 r2 tags2 = - let names tags = - List.map (fun ti -> ti.name) tags |> List.sort_uniq String.compare - in - if names tags1 <> names tags2 then - err loc "both sides of '|' must bind the same names with 'as'"; - (* When both branches produce identical positions (e.g. - same fixed-length elements), no discriminator is needed. *) - if tags1 = tags2 then (Sedlex.alt r1 r2, tags1) - else ( - let stamp disc_cell value tags = - List.map (fun ti -> { ti with disc = (disc_cell, value) :: ti.disc }) tags - in - (* Check if the left branch already has a shared discriminator cell - that we can extend with a new value for the right branch. *) - let reusable_cell = - match tags1 with - | [] | { disc = []; _ } :: _ -> None - | { disc = (c, _) :: _; _ } :: rest -> - if - List.for_all - (fun ti -> - match ti.disc with (c2, _) :: _ -> c2 = c | [] -> false) - rest - then Some c - else None - in - match reusable_cell with - | Some disc_cell -> - let max_val = - List.fold_left - (fun acc ti -> - match ti.disc with (_, v) :: _ -> max acc v | [] -> acc) - (-1) tags1 - in - let new_val = max_val + 1 in - let r2w = Sedlex.bind_disc r2 disc_cell new_val in - (Sedlex.alt r1 r2w, tags1 @ stamp disc_cell new_val tags2) - | None -> - let disc_cell = Sedlex.new_disc_cell () in - let r1w = Sedlex.bind_disc r1 disc_cell 0 in - let r2w = Sedlex.bind_disc r2 disc_cell 1 in - (Sedlex.alt r1w r2w, stamp disc_cell 0 tags1 @ stamp disc_cell 1 tags2)) + [gen_binding_code] turns the [compiled_binding list] (from the compiler) + into [let] bindings that extract sub-matches from the lexbuf's memory cells + (or computed offsets). For or-patterns with discriminators it emits a chain + of if/else checks on the discriminator cell to select the correct positions. *) (* [gen_pos_expr lexbuf pe] generates code that evaluates a [pos_expr] to an integer position (offset from token start, in code points). *) -let gen_pos_expr lexbuf pe = +let gen_pos_expr lexbuf (pe : Sedlex.pos_expr) = let loc = default_loc in match pe with | Tag { tag; offset = 0 } -> @@ -550,26 +450,26 @@ let gen_sub_lexeme lexbuf st et = let __e = [%e gen_pos_expr lexbuf et] in { Sedlexing.lexbuf = [%e lexbuf]; pos = __s; len = __e - __s }] -(* [gen_binding_code lexbuf tag_info action] wraps [action] with [let] +(* [gen_binding_code lexbuf bindings action] wraps [action] with [let] bindings that extract sub-match values from the lexbuf's memory cells. For a single binding (no or-pattern), emits a direct sub_lexeme call. For or-patterns with multiple tag pairs, emits a chain of [if disc_cell = value then ...] to select the correct positions at runtime. *) -let gen_binding_code lexbuf (tag_info : tag_info list) action = +let gen_binding_code lexbuf (bindings : Sedlex.compiled_binding list) action = let loc = default_loc in ignore loc; - if tag_info = [] then action + if bindings = [] then action else ( - (* Group tag_info by variable name *) + (* Group bindings by variable name *) let by_name = let tbl = Hashtbl.create 8 in let order = ref [] in List.iter - (fun { name; start_pos; end_pos; disc } -> + (fun ({ name; start_pos; end_pos; disc } : Sedlex.compiled_binding) -> if not (Hashtbl.mem tbl name) then order := name :: !order; let existing = try Hashtbl.find tbl name with Not_found -> [] in Hashtbl.replace tbl name (existing @ [(start_pos, end_pos, disc)])) - tag_info; + bindings; List.rev_map (fun name -> (name, Hashtbl.find tbl name)) !order in List.fold_right @@ -613,240 +513,75 @@ let gen_binding_code lexbuf (tag_info : tag_info list) action = [%e acc]]) by_name action) -(* [codepoint_count ~encoding s] returns the number of Unicode code points - in string [s] under the given encoding. *) -let codepoint_count ~encoding s = - match encoding with - | Latin1 | Ascii -> String.length s - | Utf8 -> - let n = ref 0 in - String.iter (fun c -> if Char.code c land 0xC0 <> 0x80 then incr n) s; - !n - -(* [fixed_length env ~encoding p] returns [Some n] if pattern [p] always - matches exactly [n] code points, or [None] if the length is variable. - Used to compute [Start_plus]/[End_minus] offsets for [as] bindings. *) -let rec fixed_length env ~encoding p = - match p.ppat_desc with - | Ppat_alias (inner, _) -> fixed_length env ~encoding inner - | Ppat_or (p1, p2) -> ( - match - (fixed_length env ~encoding p1, fixed_length env ~encoding p2) - with - | Some n1, Some n2 when n1 = n2 -> Some n1 - | _ -> None) - | Ppat_tuple (p :: pl) -> - List.fold_left - (fun acc p -> - match (acc, fixed_length env ~encoding p) with - | Some a, Some b -> Some (a + b) - | _ -> None) - (fixed_length env ~encoding p) - pl - | Ppat_constant (Pconst_string (s, _, _)) -> - Some (codepoint_count ~encoding s) - | Ppat_constant (Pconst_char _) -> Some 1 - | Ppat_constant (Pconst_integer _) -> Some 1 - | Ppat_interval _ -> Some 1 - | Ppat_construct ({ txt = Lident "Chars"; _ }, _) -> Some 1 - | Ppat_construct ({ txt = Lident "Compl"; _ }, _) -> Some 1 - | Ppat_construct ({ txt = Lident "Sub"; _ }, _) -> Some 1 - | Ppat_construct ({ txt = Lident "Intersect"; _ }, _) -> Some 1 - | Ppat_construct ({ txt = Lident "Utf8"; _ }, Some (_, p)) -> - fixed_length env ~encoding:Utf8 p - | Ppat_construct ({ txt = Lident "Latin1"; _ }, Some (_, p)) -> - fixed_length env ~encoding:Latin1 p - | Ppat_construct ({ txt = Lident "Ascii"; _ }, Some (_, p)) -> - fixed_length env ~encoding:Ascii p - | Ppat_construct - ( { txt = Lident "Rep"; _ }, - Some - ( _, - { - ppat_desc = - Ppat_tuple - [ - p0; - { - ppat_desc = - Ppat_constant (i1 as i2) | Ppat_interval (i1, i2); - _; - }; - ]; - _; - } ) ) -> ( - match (i1, i2) with - | Pconst_integer (i1, _), Pconst_integer (i2, _) when i1 = i2 -> ( - match fixed_length env ~encoding p0 with - | Some l -> Some (int_of_string i1 * l) - | None -> None) - | _ -> None) - | Ppat_var { txt = x; _ } -> ( - match StringMap.find_opt x env with - | Some (_, len) -> len - | None -> None) - | _ -> None - -(* [regexp_of_pattern env pattern] parses an OCaml pattern AST into a - [Sedlex.regexp] and a [tag_info list] for any [as] bindings encountered. - [env] maps names to previously defined regexps (built-in + user-defined). - Handles all sedlex pattern constructors: literals, Star, Plus, Rep, Opt, - Compl, Sub, Intersect, Chars, character intervals, tuple (sequence), - or-patterns, and [Ppat_alias] for [as] bindings. *) -let regexp_of_pattern env = - let no_tags r = (r, ([] : tag_info list)) in - let reject_tags loc ctx (r, tags) = - if tags <> [] then err loc "'as' bindings are not supported inside %s" ctx; - r +(* [ir_of_pattern env ~encoding p] parses an OCaml pattern AST into an + [Ir.t]. The env maps names to previously defined IR patterns. + All tag allocation, discriminator handling, and fixed-length optimization + are deferred to the compiler's [compile_ir]. *) +let ir_of_pattern env = + let reject_captures loc ctx ir = + if Ir.capture_names ir <> [] then + err loc "'as' bindings are not supported inside %s" ctx; + ir in let rec char_pair_op func name ~encoding ~loc tuple = (* Construct something like Sub(a,b) *) match tuple with - | Some { ppat_desc = Ppat_tuple [p0; p1]; _ } -> begin - let r0 = - reject_tags p0.ppat_loc name - (aux ~left:None ~right:None ~encoding p0) - in - let r1 = - reject_tags p1.ppat_loc name - (aux ~left:None ~right:None ~encoding p1) - in + | Some { ppat_desc = Ppat_tuple [p0; p1]; _ } -> ( + let r0 = reject_captures p0.ppat_loc name (aux ~encoding p0) in + let r1 = reject_captures p1.ppat_loc name (aux ~encoding p1) in match func r0 r1 with - | Some r -> no_tags r + | Some r -> r | None -> err loc "the %s operator can only applied to single-character length \ regexps" - name - end + name) | _ -> err loc "the %s operator requires two arguments, like %s(a,b)" name name - and aux ~left ~right ~encoding p = - (* [left]: known position at the start of this pattern element. - [right]: known position at the end of this pattern element. - Both are [pos_expr option]: any variant is possible, or [None] - when the position cannot be determined statically. *) - match p.ppat_desc with - (* name as x — named sub-match binding. - Try to derive each boundary from [left]/[right] context or - [fixed_length]; allocate tags only for boundaries that cannot - be computed statically. Best case: 0 tags. Worst case: 2. *) - | Ppat_alias (inner, { txt = name; _ }) -> - let r, tags = aux ~left ~right ~encoding inner in - let elem_len = fixed_length env ~encoding inner in - let known_start = - match left with Some _ -> left | None -> retreat right elem_len - in - let known_end = - match right with - | Some _ -> right - | None -> advance known_start elem_len - in - let st, et, r = - match (known_start, known_end) with - | Some st, Some et -> (st, et, r) - | Some st, None -> - let wrapped, end_tag = Sedlex.bind_end_only r in - (st, Tag { tag = end_tag; offset = 0 }, wrapped) - | None, Some et -> - let wrapped, start_tag = Sedlex.bind_start_only r in - (Tag { tag = start_tag; offset = 0 }, et, wrapped) - | None, None -> ( - match elem_len with - | Some len -> - let wrapped, start_tag = Sedlex.bind_start_only r in - ( Tag { tag = start_tag; offset = 0 }, - Tag { tag = start_tag; offset = len }, - wrapped ) - | None -> - let wrapped, start_tag, end_tag = Sedlex.bind r in - ( Tag { tag = start_tag; offset = 0 }, - Tag { tag = end_tag; offset = 0 }, - wrapped )) - in - (r, { name; start_pos = st; end_pos = et; disc = [] } :: tags) + and ir_compl r = + match r with + | Ir.Chars c -> Some (Ir.Chars (Cset.difference Cset.any c)) + | _ -> None + and ir_subtract r0 r1 = + match (r0, r1) with + | Ir.Chars c0, Ir.Chars c1 -> Some (Ir.Chars (Cset.difference c0 c1)) + | _ -> None + and ir_intersection r0 r1 = + match (r0, r1) with + | Ir.Chars c0, Ir.Chars c1 -> Some (Ir.Chars (Cset.intersection c0 c1)) + | _ -> None + and aux ~encoding p = + match p.ppat_desc with + (* name as x — named sub-match binding *) + | Ppat_alias (inner, { txt = name; loc = name_loc }) -> + let ir_inner = aux ~encoding inner in + if List.mem name (Ir.capture_names ir_inner) then + err name_loc + "'as' binding '%s' shadows an inner binding of the same name" name; + Ir.capture name ir_inner (* p1 | p2 — alternation *) - | Ppat_or (p1, p2) -> - let r1, tags1 = aux ~left ~right ~encoding p1 in - let r2, tags2 = aux ~left ~right ~encoding p2 in - if tags1 <> [] || tags2 <> [] then - add_discriminators ~loc:p.ppat_loc r1 tags1 r2 tags2 - else (Sedlex.alt r1 r2, tags1 @ tags2) + | Ppat_or (p1, p2) -> Ir.alt (aux ~encoding p1) (aux ~encoding p2) (* (p1, p2, ...) — sequence *) | Ppat_tuple (p :: pl) -> - let all = p :: pl in - let n = List.length all in - let lengths = List.map (fun p -> fixed_length env ~encoding p) all in - let lengths_arr = Array.of_list lengths in - (* Compute right positions (right-to-left) *) - let rights = Array.make n None in - let () = - let acc = ref right in - for i = n - 1 downto 0 do - rights.(i) <- !acc; - acc := retreat !acc lengths_arr.(i) - done - in - (* Fallback for [update_left]: if [advance] fails (unknown - current left or variable-length element), but the element - was an [as] binding whose end position is a [Tag], that - tag marks exactly where the next element starts — so we - can use it as [left] for the next element. We only use - [Tag] endpoints because they carry a runtime position; - [Start_plus]/[End_minus] endpoints are static offsets - that don't help recover a new anchor. *) - let left_from_end_tag p tags' = - match p.ppat_desc with - | Ppat_alias _ -> ( - match tags' with - | { end_pos = Tag _ as et; _ } :: _ -> Some et - | _ -> None) - | _ -> None - in - (* Update left after processing element i *) - let update_left cur i p tags' = - match advance cur lengths_arr.(i) with - | Some _ as s -> s - | None -> left_from_end_tag p tags' - in - let r0, tags0 = aux ~left ~right:rights.(0) ~encoding p in - let left0 = update_left left 0 p tags0 in - let _, _, result = - List.fold_left - (fun (i, cur_left, (r, tags)) p -> - let r', tags' = - aux ~left:cur_left ~right:rights.(i) ~encoding p - in - let new_left = update_left cur_left i p tags' in - (i + 1, new_left, (Sedlex.seq r r', tags @ tags'))) - (1, left0, (r0, tags0)) - pl - in - result + List.fold_left + (fun acc p -> Ir.seq acc (aux ~encoding p)) + (aux ~encoding p) pl (* Star p — zero-or-more repetition *) | Ppat_construct ({ txt = Lident "Star"; _ }, Some (_, p)) -> - let r = - reject_tags p.ppat_loc "Star" - (aux ~left:None ~right:None ~encoding p) - in - no_tags (Sedlex.rep r) + Ir.star (reject_captures p.ppat_loc "Star" (aux ~encoding p)) (* Plus p — one-or-more repetition *) | Ppat_construct ({ txt = Lident "Plus"; _ }, Some (_, p)) -> - let r = - reject_tags p.ppat_loc "Plus" - (aux ~left:None ~right:None ~encoding p) - in - no_tags (Sedlex.plus r) + Ir.plus (reject_captures p.ppat_loc "Plus" (aux ~encoding p)) (* Utf8 p — switch to UTF-8 encoding *) | Ppat_construct ({ txt = Lident "Utf8"; _ }, Some (_, p)) -> - aux ~left ~right ~encoding:Utf8 p + aux ~encoding:Utf8 p (* Latin1 p — switch to Latin-1 encoding *) | Ppat_construct ({ txt = Lident "Latin1"; _ }, Some (_, p)) -> - aux ~left ~right ~encoding:Latin1 p + aux ~encoding:Latin1 p (* Ascii p — switch to ASCII encoding *) | Ppat_construct ({ txt = Lident "Ascii"; _ }, Some (_, p)) -> - aux ~left ~right ~encoding:Ascii p + aux ~encoding:Ascii p (* Rep (p, n..m) — bounded repetition *) | Ppat_construct ( { txt = Lident "Rep"; _ }, @@ -864,54 +599,44 @@ let regexp_of_pattern env = }; ]; _; - } ) ) -> begin - let r = - reject_tags p0.ppat_loc "Rep" - (aux ~left:None ~right:None ~encoding p0) - in + } ) ) -> ( + let r = reject_captures p0.ppat_loc "Rep" (aux ~encoding p0) in match (i1, i2) with | Pconst_integer (i1, _), Pconst_integer (i2, _) -> let i1 = int_of_string i1 in let i2 = int_of_string i2 in - if 0 <= i1 && i1 <= i2 then no_tags (Sedlex.repeat r i1 i2) + if 0 <= i1 && i1 <= i2 then Ir.rep r i1 i2 else err p.ppat_loc "Invalid range for Rep operator" | _ -> - err p.ppat_loc "Rep must take an integer constant or interval" - end - (* Rep _ — malformed Rep *) + err p.ppat_loc "Rep must take an integer constant or interval") + (* Rep _ — malformed *) | Ppat_construct ({ txt = Lident "Rep"; _ }, _) -> err p.ppat_loc "the Rep operator takes 2 arguments" (* Opt p — optional (zero or one) *) | Ppat_construct ({ txt = Lident "Opt"; _ }, Some (_, p)) -> - let r = - reject_tags p.ppat_loc "Opt" - (aux ~left:None ~right:None ~encoding p) - in - no_tags (Sedlex.alt Sedlex.eps r) + let r = reject_captures p.ppat_loc "Opt" (aux ~encoding p) in + Ir.alt Ir.eps r (* Compl p — complement of a character class *) - | Ppat_construct ({ txt = Lident "Compl"; _ }, arg) -> begin + | Ppat_construct ({ txt = Lident "Compl"; _ }, arg) -> ( match arg with - | Some (_, p0) -> begin + | Some (_, p0) -> ( let r = - reject_tags p0.ppat_loc "Compl" - (aux ~left:None ~right:None ~encoding p0) + reject_captures p0.ppat_loc "Compl" (aux ~encoding p0) in - match Sedlex.compl r with - | Some r -> no_tags r + match ir_compl r with + | Some r -> r | None -> err p.ppat_loc "the Compl operator can only applied to a \ - single-character length regexp" - end - | _ -> err p.ppat_loc "the Compl operator requires an argument" - end + single-character length regexp") + | _ -> err p.ppat_loc "the Compl operator requires an argument") (* Sub (a, b) — character class subtraction *) | Ppat_construct ({ txt = Lident "Sub"; _ }, arg) -> - char_pair_op ~encoding Sedlex.subtract "Sub" ~loc:p.ppat_loc + char_pair_op ~encoding ir_subtract "Sub" ~loc:p.ppat_loc (Option.map (fun (_, arg) -> arg) arg) (* Intersect (a, b) — character class intersection *) | Ppat_construct ({ txt = Lident "Intersect"; _ }, arg) -> - char_pair_op ~encoding Sedlex.intersection "Intersect" ~loc:p.ppat_loc + char_pair_op ~encoding ir_intersection "Intersect" ~loc:p.ppat_loc (Option.map (fun (_, arg) -> arg) arg) (* Chars "..." — character set from string literal *) | Ppat_construct ({ txt = Lident "Chars"; _ }, arg) -> ( @@ -924,11 +649,11 @@ let regexp_of_pattern env = | Some (Pconst_string (s, _, _)) -> let l = rev_csets_of_string ~loc:p.ppat_loc ~encoding s in let chars = List.fold_left Cset.union Cset.empty l in - no_tags (Sedlex.chars chars) + Ir.chars chars | _ -> err p.ppat_loc "the Chars operator requires a string argument") (* 'a' .. 'z' or 0x41 .. 0x5a — character/codepoint range *) - | Ppat_interval (i_start, i_end) -> begin + | Ppat_interval (i_start, i_end) -> ( match (i_start, i_end) with | Pconst_char c1, Pconst_char c2 -> let valid = @@ -943,47 +668,40 @@ let regexp_of_pattern env = err p.ppat_loc "this pattern is not a valid %s interval regexp" (string_of_encoding encoding); - no_tags - (Sedlex.chars (Cset.interval (Char.code c1) (Char.code c2))) + Ir.chars (Cset.interval (Char.code c1) (Char.code c2)) | Pconst_integer (i1, _), Pconst_integer (i2, _) -> - no_tags - (Sedlex.chars - (Cset.interval - (codepoint (int_of_string i1)) - (codepoint (int_of_string i2)))) - | _ -> err p.ppat_loc "this pattern is not a valid interval regexp" - end + Ir.chars + (Cset.interval + (codepoint (int_of_string i1)) + (codepoint (int_of_string i2))) + | _ -> err p.ppat_loc "this pattern is not a valid interval regexp") (* "string" or 'c' or 0x42 — literal string, char, or codepoint *) - | Ppat_constant const -> begin + | Ppat_constant const -> ( match const with | Pconst_string (s, _, _) -> let rev_l = rev_csets_of_string s ~loc:p.ppat_loc ~encoding in - no_tags - (List.fold_left - (fun acc cset -> Sedlex.seq (Sedlex.chars cset) acc) - Sedlex.eps rev_l) - | Pconst_char c -> no_tags (Sedlex.chars (char c)) + List.fold_left + (fun acc cset -> Ir.seq (Ir.chars cset) acc) + Ir.eps rev_l + | Pconst_char c -> Ir.chars (char c) | Pconst_integer (i, _) -> - no_tags - (Sedlex.chars (Cset.singleton (codepoint (int_of_string i)))) - | _ -> err p.ppat_loc "this pattern is not a valid regexp" - end + Ir.chars (Cset.singleton (codepoint (int_of_string i))) + | _ -> err p.ppat_loc "this pattern is not a valid regexp") (* name — reference to a previously defined regexp *) - | Ppat_var { txt = x; _ } -> begin - try no_tags (fst (StringMap.find x env)) - with Not_found -> err p.ppat_loc "unbound regexp %s" x - end + | Ppat_var { txt = x; _ } -> ( + match StringMap.find_opt x env with + | Some ir -> ir + | None -> err p.ppat_loc "unbound regexp %s" x) | _ -> err p.ppat_loc "this pattern is not a valid regexp" in - aux ~left:(Some (Start_plus 0)) ~right:(Some (End_minus 0)) ~encoding:Ascii + aux ~encoding:Ascii (* [handle_sedlex_match_ ~env ~map_rhs match_expr] is the main entry point for compiling a [match%sedlex lexbuf with ...] expression. It: 1. Extracts the lexbuf identifier and match cases. - 2. Parses each case's pattern into a regexp + tag_info via [regexp_of_pattern]. - 3. Compiles all regexps into a single DFA via [Sedlex.compile]. - 4. Applies [map_rhs] to each case's right-hand side (for recursive PPX - expansion of nested [match%sedlex] blocks). + 2. Parses each case's pattern into [Ir.t] via [ir_of_pattern]. + 3. Compiles all patterns via [Sedlex.compile_ir] (tag allocation, DFA). + 4. Applies [map_rhs] to each case's right-hand side. 5. Wraps each RHS with [gen_binding_code] for [as] binding extraction. 6. Generates the full lexer code via [gen_definition]. Returns [(generated_expr, dfa)] for use by the main mapper and tests. *) @@ -1014,30 +732,45 @@ let handle_sedlex_match_ ~env ~map_rhs match_expr = err p.ppat_loc "the last branch must be a catch-all error case" in let cases = List.rev (List.tl cases) in - Sedlex.reset_tags (); - let cases_parsed = + let cases_with_ir = List.map (function | { pc_lhs = p; pc_rhs = e; pc_guard = None } -> - let regexp, tag_info = regexp_of_pattern env p in - (regexp, tag_info, e) + let ir = ir_of_pattern env p in + (match Ir.validate ir with + | Ok () -> () + | Error msg -> err p.ppat_loc "%s" msg); + (ir, p.ppat_loc, e) | { pc_guard = Some e; _ } -> err e.pexp_loc "'when' guards are not supported") cases in let compiled = - Sedlex.compile (Array.of_list (List.map (fun (r, _, _) -> r) cases_parsed)) + Sedlex.compile_ir + (Array.of_list (List.map (fun (ir, _, _) -> ir) cases_with_ir)) in - (* map_rhs is called after compile so that nested match%sedlex blocks - (which call reset_tags) cannot corrupt the outer tag counter. *) + (* map_rhs is called after compile_ir so that nested match%sedlex blocks + (which call compile_ir, resetting tags) cannot corrupt the outer + tag counter. *) let cases = - List.map - (fun (_, tag_info, e) -> - let action = gen_binding_code (snd lexbuf) tag_info (map_rhs e) in + List.mapi + (fun i (_, _, e) -> + let action = + gen_binding_code (snd lexbuf) + (Array.get compiled.bindings i) + (map_rhs e) + in ((), action)) - cases_parsed + cases_with_ir + in + let compiled_basic : Sedlex.compiled = + { + dfa = compiled.dfa; + init_tags = compiled.init_tags; + num_tags = compiled.num_tags; + } in - (gen_definition lexbuf compiled cases error, compiled.dfa) + (gen_definition lexbuf compiled_basic cases error, compiled.dfa) let handle_sedlex_match match_expr = handle_sedlex_match_ ~env:builtin_regexps ~map_rhs:Fun.id match_expr @@ -1054,30 +787,28 @@ let mapper = object (this) inherit Ast_traverse.map as super val env = builtin_regexps - method define_regexp name r_len = {} + method define_regexp name ir = {} method eval_regexp_expr e = match e with (* [%sedlex.regexp? ] *) | [%expr [%sedlex.regexp? [%p? p]]] -> - let r, tags = regexp_of_pattern env p in - if tags <> [] then + let ir = ir_of_pattern env p in + if Ir.capture_names ir <> [] then err p.ppat_loc "'as' bindings are not allowed in regexp definitions"; - let len = fixed_length env ~encoding:Ascii p in - Some (r, len) + Some ir (* let = [%sedlex.regexp? ] in *) | [%expr let [%p? { ppat_desc = Ppat_var { txt = name; _ }; _ }] = [%sedlex.regexp? [%p? p]] in [%e? body]] -> - let r, tags = regexp_of_pattern env p in - if tags <> [] then + let ir = ir_of_pattern env p in + if Ir.capture_names ir <> [] then err p.ppat_loc "'as' bindings are not allowed in regexp definitions"; - let len = fixed_length env ~encoding:Ascii p in - (this#define_regexp name (r, len))#eval_regexp_expr body + (this#define_regexp name ir)#eval_regexp_expr body | _ -> None method! expression e = diff --git a/test/codegen/test_errors.ml b/test/codegen/test_errors.ml index 50745da..7457770 100644 --- a/test/codegen/test_errors.ml +++ b/test/codegen/test_errors.ml @@ -78,6 +78,17 @@ let%expect_test "error: as inside Intersect" = Error: Sedlex: 'as' bindings are not supported inside Intersect |}] +let%expect_test "error: as shadows inner binding" = + [%compile_error + [%sedlex match buf with (('a' as x), 'b') as x -> ignore x | _ -> ()]]; + [%expect + {| + File "test/codegen/test_errors.ml", characters 49-50: + | [%sedlex match buf with (('a' as x), 'b') as x -> ignore x | _ -> ()]]; + ^ + Error: Sedlex: 'as' binding 'x' shadows an inner binding of the same name + |}] + let%expect_test "error: different names in or-pattern" = [%compile_error [%sedlex @@ -97,8 +108,8 @@ let%expect_test "error: Sub on multi-char regexp" = [%expect {| File "test/codegen/test_errors.ml", characters 42-57: - | [%compile_error [%sedlex match buf with Sub ("ab", 'a') -> () | _ -> ()]]; - ^^^^^^^^^^^^^^^ + | [%compile_error [%sedlex match buf with Sub ("ab", 'a') -> () | _ -> ()]]; + ^^^^^^^^^^^^^^^ Error: Sedlex: the Sub operator can only applied to single-character length regexps |}] From 5eb48a030a89ef00ad5938bb35c249c02b5e4215 Mon Sep 17 00:00:00 2001 From: Hugo Heuzard Date: Tue, 7 Apr 2026 17:59:56 +0200 Subject: [PATCH 02/11] Flatten Alt to n-ary in the IR Change Alt from binary (Alt of t * t) to n-ary (Alt of t list) with smart constructor that flattens nested Alts. This lets the compiler see all branches at once and assign discriminators in a single pass, removing the reusable_cell heuristic that was needed to handle OCaml's left-nested desugaring of or-patterns. Branches with identical bindings now share discriminator values, which can reduce the number of distinct values needed. Co-Authored-By: Claude Opus 4.6 (1M context) --- src/compiler/ir.ml | 48 ++++++++++----- src/compiler/ir.mli | 8 ++- src/compiler/sedlex.ml | 116 +++++++++++++++++++----------------- src/compiler/sedlex.mli | 3 +- src/syntax/ppx_sedlex.ml | 8 +-- test/codegen/test_errors.ml | 2 +- test/codegen/test_gen.ml | 32 +++++----- 7 files changed, 121 insertions(+), 96 deletions(-) diff --git a/src/compiler/ir.ml b/src/compiler/ir.ml index 85be905..20a5196 100644 --- a/src/compiler/ir.ml +++ b/src/compiler/ir.ml @@ -5,7 +5,7 @@ type t = | Chars of Cset.t | Seq of t list - | Alt of t * t + | Alt of t list | Star of t | Plus of t | Rep of t * int * int @@ -32,7 +32,10 @@ let seq a b = let alt a b = match (a, b) with | Chars c1, Chars c2 -> Chars (Cset.union c1 c2) - | _ -> Alt (a, b) + | Alt l1, Alt l2 -> Alt (l1 @ l2) + | Alt l1, x -> Alt (l1 @ [x]) + | x, Alt l2 -> Alt (x :: l2) + | _ -> Alt [a; b] (* Analysis *) @@ -47,10 +50,11 @@ let rec fixed_length = function | Some a, Some b -> Some (a + b) | _ -> None) (Some 0) elems - | Alt (a, b) -> ( - match (fixed_length a, fixed_length b) with - | Some n1, Some n2 when n1 = n2 -> Some n1 - | _ -> None) + | Alt branches -> ( + match List.map fixed_length branches with + | [] -> None + | first :: rest -> + if List.for_all (( = ) first) rest then first else None) | Rep (inner, n, m) -> if n = m then ( match fixed_length inner with Some l -> Some (n * l) | None -> None) @@ -63,7 +67,7 @@ let rec capture_names_acc acc = function let acc = if List.mem name acc then acc else name :: acc in capture_names_acc acc inner | Seq elems -> List.fold_left capture_names_acc acc elems - | Alt (a, b) -> capture_names_acc (capture_names_acc acc a) b + | Alt branches -> List.fold_left capture_names_acc acc branches | Star inner | Plus inner | Rep (inner, _, _) -> capture_names_acc acc inner let capture_names t = capture_names_acc [] t |> List.sort_uniq String.compare @@ -88,14 +92,21 @@ let validate t = (fun acc e -> match acc with Error _ -> acc | Ok () -> check ~inside_rep e) (Ok ()) elems - | Alt (a, b) -> - let* () = check ~inside_rep a in - let* () = check ~inside_rep b in - let na = capture_names a in - let nb = capture_names b in - if na <> nb && (na <> [] || nb <> []) then - Error "both sides of '|' must bind the same names with 'as'" - else Ok () + | Alt branches -> ( + let* () = + List.fold_left + (fun acc e -> + match acc with Error _ -> acc | Ok () -> check ~inside_rep e) + (Ok ()) branches + in + let names_per_branch = List.map capture_names branches in + match names_per_branch with + | [] | [_] -> Ok () + | first :: rest -> + if List.for_all (fun ns -> ns = first) rest then Ok () + else + Error "all branches of '|' must bind the same names with 'as'" + ) | Star inner | Plus inner -> check ~inside_rep:true inner | Rep (inner, _, _) -> check ~inside_rep:true inner and ( let* ) r f = match r with Error _ as e -> e | Ok x -> f x in @@ -128,7 +139,12 @@ let rec pp fmt = function ~pp_sep:(fun fmt () -> Format.fprintf fmt ", ") pp) elems - | Alt (a, b) -> Format.fprintf fmt "(%a | %a)" pp a pp b + | Alt branches -> + Format.fprintf fmt "(%a)" + (Format.pp_print_list + ~pp_sep:(fun fmt () -> Format.fprintf fmt " | ") + pp) + branches | Star inner -> Format.fprintf fmt "Star %a" pp inner | Plus inner -> Format.fprintf fmt "Plus %a" pp inner | Rep (inner, n, m) -> Format.fprintf fmt "Rep(%a, %d..%d)" pp inner n m diff --git a/src/compiler/ir.mli b/src/compiler/ir.mli index b84ceca..f5575ec 100644 --- a/src/compiler/ir.mli +++ b/src/compiler/ir.mli @@ -18,7 +18,9 @@ type t = | Seq of t list (** Sequence (concatenation). Invariant: length >= 2. Use {!seq} smart constructor. *) - | Alt of t * t (** Alternation: match left or right. *) + | Alt of t list + (** Alternation. Invariant: length >= 2. Use {!alt} smart constructor + which flattens nested [Alt]s. *) | Star of t (** Kleene star (zero or more). *) | Plus of t (** One or more. *) | Rep of t * int * int @@ -52,8 +54,8 @@ val capture_names : t -> string list (** {2 Validation} [validate t] checks structural constraints: - - [Capture] not inside [Star], [Plus], [Rep], or set operations - - Both sides of [Alt] bind the same capture names + - [Capture] not inside [Star], [Plus], or [Rep] + - All branches of [Alt] bind the same capture names Returns [Ok ()] or [Error msg]. *) val validate : t -> (unit, string) result diff --git a/src/compiler/sedlex.ml b/src/compiler/sedlex.ml index e3695c8..0f006e9 100644 --- a/src/compiler/sedlex.ml +++ b/src/compiler/sedlex.ml @@ -353,8 +353,8 @@ let compile rs = [End_minus], no memory cells are needed at all. Or-patterns [(p1 as x) | (p2 as x)] additionally use discriminator cells: - integer values that record which branch was taken, so the PPX can extract - the correct positions at match time. *) + integer values that record which branch was taken, so the code generator + can emit the correct position extraction at match time. *) type pos_expr = | Tag of { tag : int; offset : int } @@ -389,54 +389,53 @@ let shift_pos pe delta = let advance pe len = shift_pos pe len let retreat pe len = shift_pos pe (Option.map Int.neg len) -(* [add_discriminators r1 tags1 r2 tags2] wraps two or-pattern branches - with discriminator tags so the generated code can tell which branch matched. - Reuses an existing discriminator cell from the left branch when possible - (OCaml desugars [a | b | c] into [(a | b) | c], so the left branch may - already carry a discriminator from an inner [|]). *) -let add_discriminators r1 tags1 r2 tags2 = - let names tags = - List.map (fun (ti : compiled_binding) -> ti.name) tags - |> List.sort_uniq String.compare +(* [add_discriminators branches] takes a list of [(regexp, bindings)] pairs + from an n-ary alternation and wraps each branch with a discriminator tag + so the generated code can tell which branch matched. Branches with + identical bindings share the same discriminator value. If all branches + have identical bindings, no discriminator cell is allocated. *) +let add_discriminators (branches : (regexp * compiled_binding list) list) = + let fold_alt = function + | [] -> assert false + | (r, _) :: rest -> List.fold_left (fun acc (r, _) -> alt acc r) r rest in - if names tags1 <> names tags2 then - invalid_arg "both sides of '|' must bind the same names with 'as'"; - if tags1 = tags2 then (alt r1 r2, tags1) + (* Check if all branches produce identical bindings — if so, no + discriminator is needed at all. *) + let all_same = + match branches with + | [] | [_] -> true + | (_, first) :: rest -> List.for_all (fun (_, tags) -> tags = first) rest + in + if all_same then (fold_alt branches, snd (List.hd branches)) else ( - let stamp disc_cell value tags = + let disc_cell = new_disc_cell () in + let stamp value tags = List.map (fun (ti : compiled_binding) -> { ti with disc = (disc_cell, value) :: ti.disc }) tags in - let reusable_cell = - match tags1 with - | [] | { disc = []; _ } :: _ -> None - | { disc = (c, _) :: _; _ } :: rest -> - if - List.for_all - (fun (ti : compiled_binding) -> - match ti.disc with (c2, _) :: _ -> c2 = c | [] -> false) - rest - then Some c - else None + (* Assign discriminator values. Branches with identical bindings + share the same value. *) + let next_val = ref 0 in + let seen : (compiled_binding list * int) list ref = ref [] in + let get_value tags = + match List.assoc_opt tags !seen with + | Some v -> v + | None -> + let v = !next_val in + incr next_val; + seen := (tags, v) :: !seen; + v in - match reusable_cell with - | Some disc_cell -> - let max_val = - List.fold_left - (fun acc (ti : compiled_binding) -> - match ti.disc with (_, v) :: _ -> max acc v | [] -> acc) - (-1) tags1 - in - let new_val = max_val + 1 in - let r2w = bind_disc r2 disc_cell new_val in - (alt r1 r2w, tags1 @ stamp disc_cell new_val tags2) - | None -> - let disc_cell = new_disc_cell () in - let r1w = bind_disc r1 disc_cell 0 in - let r2w = bind_disc r2 disc_cell 1 in - (alt r1w r2w, stamp disc_cell 0 tags1 @ stamp disc_cell 1 tags2)) + let wrapped = + List.map + (fun (r, tags) -> + let v = get_value tags in + (bind_disc r disc_cell v, stamp v tags)) + branches + in + (fold_alt wrapped, List.concat_map snd wrapped)) (* [lower ir ~left ~right] converts an IR pattern to a low-level regexp and a list of compiled bindings. [left] and [right] are the known @@ -491,11 +490,18 @@ let rec lower ~left ~right (ir : Ir.t) : regexp * compiled_binding list = wrapped )) in (r, { name; start_pos = st; end_pos = et; disc = [] } :: tags) - | Ir.Alt (a, b) -> - let r1, tags1 = lower ~left ~right a in - let r2, tags2 = lower ~left ~right b in - if tags1 <> [] || tags2 <> [] then add_discriminators r1 tags1 r2 tags2 - else (alt r1 r2, []) + | Ir.Alt branches -> + let lowered = List.map (lower ~left ~right) branches in + let has_captures = List.exists (fun (_, tags) -> tags <> []) lowered in + if has_captures then add_discriminators lowered + else ( + let r = + List.fold_left + (fun acc (r, _) -> alt acc r) + (fst (List.hd lowered)) + (List.tl lowered) + in + (r, [])) | Ir.Seq elems -> (* Sequence — propagate left/right position contexts through elements. Right positions are computed right-to-left; left positions are @@ -512,14 +518,14 @@ let rec lower ~left ~right (ir : Ir.t) : regexp * compiled_binding list = acc := retreat !acc lengths_arr.(i) done in - (* Fallback for [update_left]: if [advance] fails (unknown - current left or variable-length element), but the element - was a [Capture] whose end position is a [Tag], that tag - records a runtime position we can use as the [left] anchor - for the next element. [Start_plus]/[End_minus] endpoints - are not useful here: they are relative to token boundaries, - so they only propagate through [advance]/[retreat] — they - cannot serve as fresh anchors after a variable-length gap. *) + (* Fallback for [update_left]: if [advance] returns [None] + (because the current left is unknown or the element has + variable length), but the element was a [Capture] whose + end position is a [Tag], we can use that tag as the [left] + anchor for the next element — it records a runtime position. + [Start_plus]/[End_minus] endpoints don't help here: they + are already factored into [advance], so if [advance] failed, + they have nothing more to offer. *) let left_from_end_tag ir tags' = match ir with | Ir.Capture _ -> ( diff --git a/src/compiler/sedlex.mli b/src/compiler/sedlex.mli index 320edd5..b9c6bac 100644 --- a/src/compiler/sedlex.mli +++ b/src/compiler/sedlex.mli @@ -141,7 +141,8 @@ type compiled_binding = { end_pos : pos_expr; disc : (int * int) list; (** Discriminator conditions: [(cell, value)] pairs. Empty for simple - (non-or) bindings. For or-patterns, each branch has distinct values. + (non-or) bindings. For or-patterns, branches with different positions + have distinct values; branches with identical positions share a value. *) } diff --git a/src/syntax/ppx_sedlex.ml b/src/syntax/ppx_sedlex.ml index 9d63526..9c685ce 100644 --- a/src/syntax/ppx_sedlex.ml +++ b/src/syntax/ppx_sedlex.ml @@ -513,10 +513,10 @@ let gen_binding_code lexbuf (bindings : Sedlex.compiled_binding list) action = [%e acc]]) by_name action) -(* [ir_of_pattern env ~encoding p] parses an OCaml pattern AST into an - [Ir.t]. The env maps names to previously defined IR patterns. - All tag allocation, discriminator handling, and fixed-length optimization - are deferred to the compiler's [compile_ir]. *) +(* [ir_of_pattern env] returns a function [pattern -> Ir.t] that parses an + OCaml pattern AST into an IR node. [env] maps names to previously defined + IR patterns. All tag allocation, discriminator handling, and fixed-length + optimization are deferred to the compiler's [compile_ir]. *) let ir_of_pattern env = let reject_captures loc ctx ir = if Ir.capture_names ir <> [] then diff --git a/test/codegen/test_errors.ml b/test/codegen/test_errors.ml index 7457770..a4425d8 100644 --- a/test/codegen/test_errors.ml +++ b/test/codegen/test_errors.ml @@ -98,7 +98,7 @@ let%expect_test "error: different names in or-pattern" = File "test/codegen/test_errors.ml", characters 21-44: | match buf with ('a' as x) | ('b' as y) -> ignore (x, y) | _ -> ()]]; ^^^^^^^^^^^^^^^^^^^^^^^ - Error: Sedlex: both sides of '|' must bind the same names with 'as' + Error: Sedlex: all branches of '|' must bind the same names with 'as' |}] (* Error tests for Sub/Intersect/Compl on multi-char regexps *) diff --git a/test/codegen/test_gen.ml b/test/codegen/test_gen.ml index 44af2e0..78102f5 100644 --- a/test/codegen/test_gen.ml +++ b/test/codegen/test_gen.ml @@ -398,7 +398,7 @@ let%expect_test "as binding: 3-way or reuses disc cell" = state3 -> state4 [label="'d' {d0=0}"]; state3 -> state6 [label="'e' {d0=1}"]; state4 [label="4\n[rule 0]", shape=doublecircle]; - state4 -> state5 [label="'f' {d0=2}"]; + state4 -> state5 [label="'f' {d0=0}"]; state5 [label="5\n[rule 0]", shape=doublecircle]; state6 [label="6\n[rule 0]", shape=doublecircle]; } @@ -423,7 +423,7 @@ let%expect_test "as binding: 3-way or reuses disc cell" = and __sedlex_state_4 buf = Sedlexing.mark buf 0; (match __sedlex_partition_5 (Sedlexing.__private__next_int buf) with - | 0 -> (Sedlexing.__private__set_mem_value buf 0 2; 0) + | 0 -> (Sedlexing.__private__set_mem_value buf 0 0; 0) | _ -> Sedlexing.backtrack buf) in match Sedlexing.start buf; Sedlexing.__private__init_mem buf 1; @@ -1189,7 +1189,7 @@ let%expect_test "as binding: or-chain then nested or on right" = state2 [label="2"]; state2 -> state3 [label="'e'"]; state3 [label="3"]; - state3 -> state4 [label="'f' {d0=0}"]; + state3 -> state4 [label="'f' {d1=0}"]; state4 [label="4\n[rule 0]", shape=doublecircle]; state5 [label="5"]; state5 -> state6 [label="'d'"]; @@ -1197,14 +1197,14 @@ let%expect_test "as binding: or-chain then nested or on right" = state6 [label="6"]; state6 -> state7 [label="'e'"]; state7 [label="7"]; - state7 -> state8 [label="'f' {d1=0}"]; + state7 -> state8 [label="'f' {d0=0}"]; state8 [label="8"]; state8 -> state9 [label="'g'"]; state9 [label="9"]; - state9 -> state10 [label="'h' {d0=2}"]; + state9 -> state10 [label="'h' {d1=2}"]; state10 [label="10\n[rule 0]", shape=doublecircle]; state11 [label="11"]; - state11 -> state12 [label="'f' {d1=1}"]; + state11 -> state12 [label="'f' {d0=1}"]; state12 [label="12"]; state12 -> state9 [label="'g'"]; } @@ -1224,7 +1224,7 @@ let%expect_test "as binding: or-chain then nested or on right" = | _ -> Sedlexing.backtrack buf and __sedlex_state_3 buf = match __sedlex_partition_4 (Sedlexing.__private__next_int buf) with - | 0 -> (Sedlexing.__private__set_mem_value buf 0 0; 0) + | 0 -> (Sedlexing.__private__set_mem_value buf 1 0; 0) | _ -> Sedlexing.backtrack buf and __sedlex_state_5 buf = match __sedlex_partition_5 (Sedlexing.__private__next_int buf) with @@ -1237,7 +1237,7 @@ let%expect_test "as binding: or-chain then nested or on right" = | _ -> Sedlexing.backtrack buf and __sedlex_state_7 buf = match __sedlex_partition_4 (Sedlexing.__private__next_int buf) with - | 0 -> (Sedlexing.__private__set_mem_value buf 1 0; __sedlex_state_8 buf) + | 0 -> (Sedlexing.__private__set_mem_value buf 0 0; __sedlex_state_8 buf) | _ -> Sedlexing.backtrack buf and __sedlex_state_8 buf = match __sedlex_partition_6 (Sedlexing.__private__next_int buf) with @@ -1245,11 +1245,11 @@ let%expect_test "as binding: or-chain then nested or on right" = | _ -> Sedlexing.backtrack buf and __sedlex_state_9 buf = match __sedlex_partition_7 (Sedlexing.__private__next_int buf) with - | 0 -> (Sedlexing.__private__set_mem_value buf 0 2; 0) + | 0 -> (Sedlexing.__private__set_mem_value buf 1 2; 0) | _ -> Sedlexing.backtrack buf and __sedlex_state_11 buf = match __sedlex_partition_4 (Sedlexing.__private__next_int buf) with - | 0 -> (Sedlexing.__private__set_mem_value buf 1 1; __sedlex_state_12 buf) + | 0 -> (Sedlexing.__private__set_mem_value buf 0 1; __sedlex_state_12 buf) | _ -> Sedlexing.backtrack buf and __sedlex_state_12 buf = match __sedlex_partition_6 (Sedlexing.__private__next_int buf) with @@ -1261,21 +1261,21 @@ let%expect_test "as binding: or-chain then nested or on right" = with | 0 -> let x = - if (Sedlexing.__private__mem_value buf 0) = 0 + if (Sedlexing.__private__mem_value buf 1) = 0 then let __s = 0 in let __e = (Sedlexing.lexeme_length buf) - 2 in { Sedlexing.lexbuf = buf; pos = __s; len = (__e - __s) } else - if (Sedlexing.__private__mem_value buf 0) = 1 + if (Sedlexing.__private__mem_value buf 1) = 1 then (let __s = 0 in let __e = (Sedlexing.lexeme_length buf) - 3 in { Sedlexing.lexbuf = buf; pos = __s; len = (__e - __s) }) else if - ((Sedlexing.__private__mem_value buf 0) = 2) && - ((Sedlexing.__private__mem_value buf 1) = 0) + ((Sedlexing.__private__mem_value buf 1) = 2) && + ((Sedlexing.__private__mem_value buf 0) = 0) then (let __s = 0 in let __e = (Sedlexing.lexeme_length buf) - 3 in @@ -1285,13 +1285,13 @@ let%expect_test "as binding: or-chain then nested or on right" = let __e = (Sedlexing.lexeme_length buf) - 4 in { Sedlexing.lexbuf = buf; pos = __s; len = (__e - __s) }) in let y = - if (Sedlexing.__private__mem_value buf 0) = 0 + if (Sedlexing.__private__mem_value buf 1) = 0 then let __s = 2 in let __e = Sedlexing.lexeme_length buf in { Sedlexing.lexbuf = buf; pos = __s; len = (__e - __s) } else - if (Sedlexing.__private__mem_value buf 0) = 1 + if (Sedlexing.__private__mem_value buf 1) = 1 then (let __s = 1 in let __e = Sedlexing.lexeme_length buf in From 5765898bae668234b5c3964b5c8da4d04cf88e21 Mon Sep 17 00:00:00 2001 From: Hugo Heuzard Date: Tue, 7 Apr 2026 22:09:04 +0200 Subject: [PATCH 03/11] Use SSet for capture_names instead of string list Co-Authored-By: Claude Opus 4.6 (1M context) --- src/compiler/ir.ml | 12 ++++++------ src/compiler/ir.mli | 7 ++++--- src/syntax/ppx_sedlex.ml | 8 ++++---- 3 files changed, 14 insertions(+), 13 deletions(-) diff --git a/src/compiler/ir.ml b/src/compiler/ir.ml index 20a5196..eca29e3 100644 --- a/src/compiler/ir.ml +++ b/src/compiler/ir.ml @@ -61,16 +61,16 @@ let rec fixed_length = function else None | Star _ | Plus _ -> None +module SSet = Set.Make (String) + let rec capture_names_acc acc = function | Chars _ | Eps -> acc - | Capture (name, inner) -> - let acc = if List.mem name acc then acc else name :: acc in - capture_names_acc acc inner + | Capture (name, inner) -> capture_names_acc (SSet.add name acc) inner | Seq elems -> List.fold_left capture_names_acc acc elems | Alt branches -> List.fold_left capture_names_acc acc branches | Star inner | Plus inner | Rep (inner, _, _) -> capture_names_acc acc inner -let capture_names t = capture_names_acc [] t |> List.sort_uniq String.compare +let capture_names t = capture_names_acc SSet.empty t (* Validation *) @@ -81,7 +81,7 @@ let validate t = | Capture (_, _) when inside_rep -> Error "'as' bindings are not supported inside repetition operators" | Capture (name, inner) -> - if List.mem name (capture_names inner) then + if SSet.mem name (capture_names inner) then Error (Printf.sprintf "'as' binding '%s' shadows an inner binding of the same name" @@ -103,7 +103,7 @@ let validate t = match names_per_branch with | [] | [_] -> Ok () | first :: rest -> - if List.for_all (fun ns -> ns = first) rest then Ok () + if List.for_all (SSet.equal first) rest then Ok () else Error "all branches of '|' must bind the same names with 'as'" ) diff --git a/src/compiler/ir.mli b/src/compiler/ir.mli index f5575ec..f9dedb1 100644 --- a/src/compiler/ir.mli +++ b/src/compiler/ir.mli @@ -47,9 +47,10 @@ val capture : string -> t -> t points, or [None] if the length is variable. *) val fixed_length : t -> int option -(** [capture_names t] returns the set of capture names in [t], sorted and - deduplicated. *) -val capture_names : t -> string list +module SSet : Set.S with type elt = string + +(** [capture_names t] returns the set of capture names in [t]. *) +val capture_names : t -> SSet.t (** {2 Validation} diff --git a/src/syntax/ppx_sedlex.ml b/src/syntax/ppx_sedlex.ml index 9c685ce..cad748f 100644 --- a/src/syntax/ppx_sedlex.ml +++ b/src/syntax/ppx_sedlex.ml @@ -519,7 +519,7 @@ let gen_binding_code lexbuf (bindings : Sedlex.compiled_binding list) action = optimization are deferred to the compiler's [compile_ir]. *) let ir_of_pattern env = let reject_captures loc ctx ir = - if Ir.capture_names ir <> [] then + if not (Ir.SSet.is_empty (Ir.capture_names ir)) then err loc "'as' bindings are not supported inside %s" ctx; ir in @@ -556,7 +556,7 @@ let ir_of_pattern env = (* name as x — named sub-match binding *) | Ppat_alias (inner, { txt = name; loc = name_loc }) -> let ir_inner = aux ~encoding inner in - if List.mem name (Ir.capture_names ir_inner) then + if Ir.SSet.mem name (Ir.capture_names ir_inner) then err name_loc "'as' binding '%s' shadows an inner binding of the same name" name; Ir.capture name ir_inner @@ -794,7 +794,7 @@ let mapper = (* [%sedlex.regexp? ] *) | [%expr [%sedlex.regexp? [%p? p]]] -> let ir = ir_of_pattern env p in - if Ir.capture_names ir <> [] then + if not (Ir.SSet.is_empty (Ir.capture_names ir)) then err p.ppat_loc "'as' bindings are not allowed in regexp definitions"; Some ir @@ -805,7 +805,7 @@ let mapper = in [%e? body]] -> let ir = ir_of_pattern env p in - if Ir.capture_names ir <> [] then + if not (Ir.SSet.is_empty (Ir.capture_names ir)) then err p.ppat_loc "'as' bindings are not allowed in regexp definitions"; (this#define_regexp name ir)#eval_regexp_expr body From 02010a9cca8f61d8b441485c9b4a492840eb912b Mon Sep 17 00:00:00 2001 From: Hugo Heuzard Date: Tue, 7 Apr 2026 22:49:57 +0200 Subject: [PATCH 04/11] Validate in IR smart constructors; use unwrap in PPX Move structural validation from the PPX into IR smart constructors that return (t, string) result: capture checks shadowing, alt checks name consistency across branches, star/plus/rep reject inner captures. Add reject_captures for contexts that forbid captures (Opt, Compl, Sub, Intersect). The PPX uses a local unwrap helper to bridge IR results to ppxlib's exception-based error reporting. Remove the now-redundant validate function in favour of a debug-only check_invariant assertion. Co-Authored-By: Claude Opus 4.6 (1M context) --- src/compiler/ir.ml | 144 +++++++++++++++++++++------------------ src/compiler/ir.mli | 38 ++++++----- src/compiler/sedlex.ml | 5 +- src/syntax/ppx_sedlex.ml | 43 ++++++------ 4 files changed, 120 insertions(+), 110 deletions(-) diff --git a/src/compiler/ir.ml b/src/compiler/ir.ml index eca29e3..cd60da9 100644 --- a/src/compiler/ir.ml +++ b/src/compiler/ir.ml @@ -12,30 +12,20 @@ type t = | Eps | Capture of string * t -(* Smart constructors *) +module SSet = Set.Make (String) -let chars c = Chars c -let eps = Eps -let capture name inner = Capture (name, inner) -let star t = Star t -let plus t = Plus t -let rep t n m = Rep (t, n, m) +let rec capture_names_acc acc = function + | Chars _ | Eps -> acc + | Capture (name, inner) -> capture_names_acc (SSet.add name acc) inner + | Seq elems -> List.fold_left capture_names_acc acc elems + | Alt branches -> List.fold_left capture_names_acc acc branches + | Star inner | Plus inner | Rep (inner, _, _) -> capture_names_acc acc inner -let seq a b = - match (a, b) with - | Eps, x | x, Eps -> x - | Seq l1, Seq l2 -> Seq (l1 @ l2) - | Seq l1, x -> Seq (l1 @ [x]) - | x, Seq l2 -> Seq (x :: l2) - | _ -> Seq [a; b] +let capture_names t = capture_names_acc SSet.empty t -let alt a b = - match (a, b) with - | Chars c1, Chars c2 -> Chars (Cset.union c1 c2) - | Alt l1, Alt l2 -> Alt (l1 @ l2) - | Alt l1, x -> Alt (l1 @ [x]) - | x, Alt l2 -> Alt (x :: l2) - | _ -> Alt [a; b] +let reject_captures ctx t = + if SSet.is_empty (capture_names t) then Ok t + else Error (Printf.sprintf "'as' bindings are not supported inside %s" ctx) (* Analysis *) @@ -61,55 +51,75 @@ let rec fixed_length = function else None | Star _ | Plus _ -> None -module SSet = Set.Make (String) +(* Smart constructors *) -let rec capture_names_acc acc = function - | Chars _ | Eps -> acc - | Capture (name, inner) -> capture_names_acc (SSet.add name acc) inner - | Seq elems -> List.fold_left capture_names_acc acc elems - | Alt branches -> List.fold_left capture_names_acc acc branches - | Star inner | Plus inner | Rep (inner, _, _) -> capture_names_acc acc inner +let chars c = Chars c +let eps = Eps -let capture_names t = capture_names_acc SSet.empty t +let capture name inner = + if SSet.mem name (capture_names inner) then + Error + (Printf.sprintf + "'as' binding '%s' shadows an inner binding of the same name" name) + else Ok (Capture (name, inner)) + +let star t = + match reject_captures "Star" t with Error _ as e -> e | Ok t -> Ok (Star t) + +let plus t = + match reject_captures "Plus" t with Error _ as e -> e | Ok t -> Ok (Plus t) + +let rep t n m = + match reject_captures "Rep" t with + | Error _ as e -> e + | Ok t -> Ok (Rep (t, n, m)) + +let seq a b = + match (a, b) with + | Eps, x | x, Eps -> x + | Seq l1, Seq l2 -> Seq (l1 @ l2) + | Seq l1, x -> Seq (l1 @ [x]) + | x, Seq l2 -> Seq (x :: l2) + | _ -> Seq [a; b] + +let alt a b = + let branches = + match (a, b) with + | Chars c1, Chars c2 -> [Chars (Cset.union c1 c2)] + | Alt l1, Alt l2 -> l1 @ l2 + | Alt l1, x -> l1 @ [x] + | x, Alt l2 -> x :: l2 + | _ -> [a; b] + in + let names = List.map capture_names branches in + match names with + | [] | [_] -> Ok (match branches with [x] -> x | _ -> Alt branches) + | first :: rest -> + if List.for_all (SSet.equal first) rest then + Ok (match branches with [x] -> x | _ -> Alt branches) + else Error "all branches of '|' must bind the same names with 'as'" -(* Validation *) - -let validate t = - let rec check ~inside_rep t = - match t with - | Chars _ | Eps -> Ok () - | Capture (_, _) when inside_rep -> - Error "'as' bindings are not supported inside repetition operators" - | Capture (name, inner) -> - if SSet.mem name (capture_names inner) then - Error - (Printf.sprintf - "'as' binding '%s' shadows an inner binding of the same name" - name) - else check ~inside_rep inner - | Seq elems -> - List.fold_left - (fun acc e -> - match acc with Error _ -> acc | Ok () -> check ~inside_rep e) - (Ok ()) elems - | Alt branches -> ( - let* () = - List.fold_left - (fun acc e -> - match acc with Error _ -> acc | Ok () -> check ~inside_rep e) - (Ok ()) branches - in - let names_per_branch = List.map capture_names branches in - match names_per_branch with - | [] | [_] -> Ok () - | first :: rest -> - if List.for_all (SSet.equal first) rest then Ok () - else - Error "all branches of '|' must bind the same names with 'as'" - ) - | Star inner | Plus inner -> check ~inside_rep:true inner - | Rep (inner, _, _) -> check ~inside_rep:true inner - and ( let* ) r f = match r with Error _ as e -> e | Ok x -> f x in +(* All structural constraints are enforced by the smart constructors. + [check_invariant] verifies them as a debug assertion. *) +let check_invariant t = + let rec check ~inside_rep = function + | Chars _ | Eps -> () + | Capture (name, inner) -> + assert (not inside_rep); + assert (not (SSet.mem name (capture_names inner))); + check ~inside_rep inner + | Seq elems -> + assert (List.length elems >= 2); + List.iter (check ~inside_rep) elems + | Alt branches -> + assert (List.length branches >= 2); + (match List.map capture_names branches with + | first :: rest -> assert (List.for_all (SSet.equal first) rest) + | [] -> assert false); + List.iter (check ~inside_rep) branches + | Star inner | Plus inner -> check ~inside_rep:true inner + | Rep (inner, _, _) -> check ~inside_rep:true inner + in check ~inside_rep:false t (* Pretty-printing *) diff --git a/src/compiler/ir.mli b/src/compiler/ir.mli index f9dedb1..0156499 100644 --- a/src/compiler/ir.mli +++ b/src/compiler/ir.mli @@ -30,36 +30,42 @@ type t = (** Named capture: [Capture (name, inner)] wraps [inner] with an [as] binding. The compiler decides tag allocation strategy. *) -(** {2 Smart constructors} *) +(** {2 Smart constructors} + + Constructors that enforce structural invariants return [result]. + - {!alt} checks name consistency across branches. + - {!capture} checks for shadowed inner bindings. + - {!star}, {!plus}, {!rep} reject inner captures. *) val chars : Cset.t -> t val seq : t -> t -> t -val alt : t -> t -> t -val star : t -> t -val plus : t -> t -val rep : t -> int -> int -> t +val alt : t -> t -> (t, string) result +val star : t -> (t, string) result +val plus : t -> (t, string) result +val rep : t -> int -> int -> (t, string) result val eps : t -val capture : string -> t -> t +val capture : string -> t -> (t, string) result -(** {2 Analysis} *) +(** [reject_captures ctx t] returns [Ok t] if [t] contains no [Capture] nodes, + or [Error msg] mentioning [ctx] otherwise. *) +val reject_captures : string -> t -> (t, string) result -(** [fixed_length t] returns [Some n] if [t] always matches exactly [n] code - points, or [None] if the length is variable. *) -val fixed_length : t -> int option +(** {2 Analysis} *) module SSet : Set.S with type elt = string (** [capture_names t] returns the set of capture names in [t]. *) val capture_names : t -> SSet.t -(** {2 Validation} +(** [fixed_length t] returns [Some n] if [t] always matches exactly [n] code + points, or [None] if the length is variable. *) +val fixed_length : t -> int option - [validate t] checks structural constraints: - - [Capture] not inside [Star], [Plus], or [Rep] - - All branches of [Alt] bind the same capture names +(** {2 Invariant checking} - Returns [Ok ()] or [Error msg]. *) -val validate : t -> (unit, string) result + All structural constraints are enforced by the smart constructors. + [check_invariant] asserts these hold. Use for debugging. *) +val check_invariant : t -> unit (** {2 Pretty-printing} *) diff --git a/src/compiler/sedlex.ml b/src/compiler/sedlex.ml index 0f006e9..9571142 100644 --- a/src/compiler/sedlex.ml +++ b/src/compiler/sedlex.ml @@ -557,10 +557,7 @@ let rec lower ~left ~right (ir : Ir.t) : regexp * compiled_binding list = (r_acc, tags_acc) let compile_ir (rules : Ir.t array) = - Array.iter - (fun ir -> - match Ir.validate ir with Ok () -> () | Error msg -> invalid_arg msg) - rules; + Array.iter (fun ir -> Ir.check_invariant ir) rules; reset_tags (); let lowered = Array.map diff --git a/src/syntax/ppx_sedlex.ml b/src/syntax/ppx_sedlex.ml index cad748f..5a91fdb 100644 --- a/src/syntax/ppx_sedlex.ml +++ b/src/syntax/ppx_sedlex.ml @@ -518,17 +518,17 @@ let gen_binding_code lexbuf (bindings : Sedlex.compiled_binding list) action = IR patterns. All tag allocation, discriminator handling, and fixed-length optimization are deferred to the compiler's [compile_ir]. *) let ir_of_pattern env = - let reject_captures loc ctx ir = - if not (Ir.SSet.is_empty (Ir.capture_names ir)) then - err loc "'as' bindings are not supported inside %s" ctx; - ir - in + let unwrap loc = function Ok ir -> ir | Error msg -> err loc "%s" msg in let rec char_pair_op func name ~encoding ~loc tuple = (* Construct something like Sub(a,b) *) match tuple with | Some { ppat_desc = Ppat_tuple [p0; p1]; _ } -> ( - let r0 = reject_captures p0.ppat_loc name (aux ~encoding p0) in - let r1 = reject_captures p1.ppat_loc name (aux ~encoding p1) in + let r0 = + unwrap p0.ppat_loc (Ir.reject_captures name (aux ~encoding p0)) + in + let r1 = + unwrap p1.ppat_loc (Ir.reject_captures name (aux ~encoding p1)) + in match func r0 r1 with | Some r -> r | None -> @@ -555,13 +555,10 @@ let ir_of_pattern env = match p.ppat_desc with (* name as x — named sub-match binding *) | Ppat_alias (inner, { txt = name; loc = name_loc }) -> - let ir_inner = aux ~encoding inner in - if Ir.SSet.mem name (Ir.capture_names ir_inner) then - err name_loc - "'as' binding '%s' shadows an inner binding of the same name" name; - Ir.capture name ir_inner + unwrap name_loc (Ir.capture name (aux ~encoding inner)) (* p1 | p2 — alternation *) - | Ppat_or (p1, p2) -> Ir.alt (aux ~encoding p1) (aux ~encoding p2) + | Ppat_or (p1, p2) -> + unwrap p.ppat_loc (Ir.alt (aux ~encoding p1) (aux ~encoding p2)) (* (p1, p2, ...) — sequence *) | Ppat_tuple (p :: pl) -> List.fold_left @@ -569,10 +566,10 @@ let ir_of_pattern env = (aux ~encoding p) pl (* Star p — zero-or-more repetition *) | Ppat_construct ({ txt = Lident "Star"; _ }, Some (_, p)) -> - Ir.star (reject_captures p.ppat_loc "Star" (aux ~encoding p)) + unwrap p.ppat_loc (Ir.star (aux ~encoding p)) (* Plus p — one-or-more repetition *) | Ppat_construct ({ txt = Lident "Plus"; _ }, Some (_, p)) -> - Ir.plus (reject_captures p.ppat_loc "Plus" (aux ~encoding p)) + unwrap p.ppat_loc (Ir.plus (aux ~encoding p)) (* Utf8 p — switch to UTF-8 encoding *) | Ppat_construct ({ txt = Lident "Utf8"; _ }, Some (_, p)) -> aux ~encoding:Utf8 p @@ -600,12 +597,12 @@ let ir_of_pattern env = ]; _; } ) ) -> ( - let r = reject_captures p0.ppat_loc "Rep" (aux ~encoding p0) in + let r = aux ~encoding p0 in match (i1, i2) with | Pconst_integer (i1, _), Pconst_integer (i2, _) -> let i1 = int_of_string i1 in let i2 = int_of_string i2 in - if 0 <= i1 && i1 <= i2 then Ir.rep r i1 i2 + if 0 <= i1 && i1 <= i2 then unwrap p0.ppat_loc (Ir.rep r i1 i2) else err p.ppat_loc "Invalid range for Rep operator" | _ -> err p.ppat_loc "Rep must take an integer constant or interval") @@ -614,14 +611,17 @@ let ir_of_pattern env = err p.ppat_loc "the Rep operator takes 2 arguments" (* Opt p — optional (zero or one) *) | Ppat_construct ({ txt = Lident "Opt"; _ }, Some (_, p)) -> - let r = reject_captures p.ppat_loc "Opt" (aux ~encoding p) in - Ir.alt Ir.eps r + let r = + unwrap p.ppat_loc (Ir.reject_captures "Opt" (aux ~encoding p)) + in + unwrap p.ppat_loc (Ir.alt Ir.eps r) (* Compl p — complement of a character class *) | Ppat_construct ({ txt = Lident "Compl"; _ }, arg) -> ( match arg with | Some (_, p0) -> ( let r = - reject_captures p0.ppat_loc "Compl" (aux ~encoding p0) + unwrap p0.ppat_loc + (Ir.reject_captures "Compl" (aux ~encoding p0)) in match ir_compl r with | Some r -> r @@ -737,9 +737,6 @@ let handle_sedlex_match_ ~env ~map_rhs match_expr = (function | { pc_lhs = p; pc_rhs = e; pc_guard = None } -> let ir = ir_of_pattern env p in - (match Ir.validate ir with - | Ok () -> () - | Error msg -> err p.ppat_loc "%s" msg); (ir, p.ppat_loc, e) | { pc_guard = Some e; _ } -> err e.pexp_loc "'when' guards are not supported") From 211f7dc33cfd11eb15cdc1e29814c475b7d6bff3 Mon Sep 17 00:00:00 2001 From: Hugo Heuzard Date: Tue, 7 Apr 2026 22:57:47 +0200 Subject: [PATCH 05/11] Helpful errors for misused sedlex operators - Bare operators without argument (Star, Plus, Opt, Utf8, Latin1, Ascii) now produce "the X operator requires an argument" instead of falling through to "this pattern is not a valid regexp". - Unknown constructors (e.g. Some) produce "unknown sedlex operator X". Co-Authored-By: Claude Opus 4.6 (1M context) --- src/syntax/ppx_sedlex.ml | 14 ++++++++++++++ test/codegen/test_errors.ml | 34 +++++++++++++++++++++++++++++++++- 2 files changed, 47 insertions(+), 1 deletion(-) diff --git a/src/syntax/ppx_sedlex.ml b/src/syntax/ppx_sedlex.ml index 5a91fdb..6d77177 100644 --- a/src/syntax/ppx_sedlex.ml +++ b/src/syntax/ppx_sedlex.ml @@ -687,6 +687,20 @@ let ir_of_pattern env = | Pconst_integer (i, _) -> Ir.chars (Cset.singleton (codepoint (int_of_string i))) | _ -> err p.ppat_loc "this pattern is not a valid regexp") + (* Bare operator without argument *) + | Ppat_construct + ( { + txt = + Lident + (("Star" | "Plus" | "Opt" | "Utf8" | "Latin1" | "Ascii") as + name); + _; + }, + None ) -> + err p.ppat_loc "the %s operator requires an argument" name + (* Unknown constructor *) + | Ppat_construct ({ txt = Lident name; _ }, _) -> + err p.ppat_loc "unknown sedlex operator %s" name (* name — reference to a previously defined regexp *) | Ppat_var { txt = x; _ } -> ( match StringMap.find_opt x env with diff --git a/test/codegen/test_errors.ml b/test/codegen/test_errors.ml index a4425d8..1481898 100644 --- a/test/codegen/test_errors.ml +++ b/test/codegen/test_errors.ml @@ -228,7 +228,7 @@ let%expect_test "error: invalid pattern" = File "test/codegen/test_errors.ml", characters 42-50: | [%compile_error [%sedlex match buf with Some 'a' -> () | _ -> ()]]; ^^^^^^^^ - Error: Sedlex: this pattern is not a valid regexp + Error: Sedlex: unknown sedlex operator Some |}] let%expect_test "error: invalid interval type" = @@ -327,6 +327,38 @@ let%expect_test "error: sedlex not on match expression" = Error: Sedlex: the %sedlex extension is only recognized on match expressions |}] +(* Error tests for bare operators (missing argument) *) + +let%expect_test "error: Star without argument" = + [%compile_error [%sedlex match buf with Star -> () | _ -> ()]]; + [%expect + {| + File "test/codegen/test_errors.ml", characters 42-46: + | [%compile_error [%sedlex match buf with Star -> () | _ -> ()]]; + ^^^^ + Error: Sedlex: the Star operator requires an argument + |}] + +let%expect_test "error: Plus without argument" = + [%compile_error [%sedlex match buf with Plus -> () | _ -> ()]]; + [%expect + {| + File "test/codegen/test_errors.ml", characters 42-46: + | [%compile_error [%sedlex match buf with Plus -> () | _ -> ()]]; + ^^^^ + Error: Sedlex: the Plus operator requires an argument + |}] + +let%expect_test "error: Opt without argument" = + [%compile_error [%sedlex match buf with Opt -> () | _ -> ()]]; + [%expect + {| + File "test/codegen/test_errors.ml", characters 42-45: + | [%compile_error [%sedlex match buf with Opt -> () | _ -> ()]]; + ^^^ + Error: Sedlex: the Opt operator requires an argument + |}] + (* Error tests for regexp definitions *) let%expect_test "error: as in regexp definition" = From bcb2f6cb5784b96461cd05e0558551edd3daed2d Mon Sep 17 00:00:00 2001 From: Hugo Heuzard Date: Mon, 13 Apr 2026 22:14:35 +0200 Subject: [PATCH 06/11] Refactor IR alt, check_invariant, and pretty-printer Validate capture names before flattening in alt (simpler logic). Rewrite check_invariant to return SSet.t instead of threading ~inside_rep, making assertions more direct. Extract cp helper in pretty-printer and use it consistently for character ranges. Co-Authored-By: Claude Opus 4.6 (1M context) --- src/compiler/ir.ml | 76 +++++++++++++++++++++++----------------------- 1 file changed, 38 insertions(+), 38 deletions(-) diff --git a/src/compiler/ir.ml b/src/compiler/ir.ml index cd60da9..8fa1c9a 100644 --- a/src/compiler/ir.ml +++ b/src/compiler/ir.ml @@ -83,64 +83,64 @@ let seq a b = | _ -> Seq [a; b] let alt a b = - let branches = - match (a, b) with - | Chars c1, Chars c2 -> [Chars (Cset.union c1 c2)] - | Alt l1, Alt l2 -> l1 @ l2 - | Alt l1, x -> l1 @ [x] - | x, Alt l2 -> x :: l2 - | _ -> [a; b] - in - let names = List.map capture_names branches in - match names with - | [] | [_] -> Ok (match branches with [x] -> x | _ -> Alt branches) - | first :: rest -> - if List.for_all (SSet.equal first) rest then - Ok (match branches with [x] -> x | _ -> Alt branches) - else Error "all branches of '|' must bind the same names with 'as'" + let an = capture_names a in + let bn = capture_names b in + if not (SSet.equal an bn) then + Error "all branches of '|' must bind the same names with 'as'" + else + Ok + (match (a, b) with + | Chars c1, Chars c2 -> Chars (Cset.union c1 c2) + | Alt l1, Alt l2 -> Alt (l1 @ l2) + | Alt l1, x -> Alt (l1 @ [x]) + | x, Alt l2 -> Alt (x :: l2) + | _ -> Alt [a; b]) (* All structural constraints are enforced by the smart constructors. [check_invariant] verifies them as a debug assertion. *) let check_invariant t = - let rec check ~inside_rep = function - | Chars _ | Eps -> () + let rec check = function + | Chars _ | Eps -> SSet.empty | Capture (name, inner) -> - assert (not inside_rep); - assert (not (SSet.mem name (capture_names inner))); - check ~inside_rep inner + let names = check inner in + assert (not (SSet.mem name names)); + SSet.add name names | Seq elems -> assert (List.length elems >= 2); - List.iter (check ~inside_rep) elems - | Alt branches -> - assert (List.length branches >= 2); - (match List.map capture_names branches with - | first :: rest -> assert (List.for_all (SSet.equal first) rest) - | [] -> assert false); - List.iter (check ~inside_rep) branches - | Star inner | Plus inner -> check ~inside_rep:true inner - | Rep (inner, _, _) -> check ~inside_rep:true inner + List.fold_left (fun acc x -> SSet.union acc (check x)) SSet.empty elems + | Alt [] | Alt [_] -> assert false + | Alt (first :: rest) -> + let names = check first in + List.iter (fun x -> assert (SSet.equal names (check x))) rest; + names + | Star inner | Plus inner -> + assert (SSet.is_empty (check inner)); + SSet.empty + | Rep (inner, _, _) -> + assert (SSet.is_empty (check inner)); + SSet.empty in - check ~inside_rep:false t + let _ : SSet.t = check t in + () (* Pretty-printing *) let rec pp fmt = function | Chars cset -> ( let intervals = (cset : Cset.t :> (int * int) list) in + let cp fmt c = + if c >= 32 && c <= 126 then Format.fprintf fmt "'%c'" (Char.chr c) + else Format.fprintf fmt "0x%04X" c + in match intervals with - | [(c, c')] when c = c' -> - if c >= 32 && c <= 126 then Format.fprintf fmt "'%c'" (Char.chr c) - else Format.fprintf fmt "0x%04X" c + | [(c, c')] when c = c' -> cp fmt c | _ -> Format.fprintf fmt "[%a]" (Format.pp_print_list ~pp_sep:(fun fmt () -> Format.fprintf fmt ", ") (fun fmt (lo, hi) -> - if lo = hi then - if lo >= 32 && lo <= 126 then - Format.fprintf fmt "'%c'" (Char.chr lo) - else Format.fprintf fmt "0x%04X" lo - else Format.fprintf fmt "0x%04X-0x%04X" lo hi)) + if lo = hi then cp fmt lo + else Format.fprintf fmt "%a-%a" cp lo cp hi)) intervals) | Eps -> Format.fprintf fmt "eps" | Seq elems -> From 3e45554a595a6d85d644aabd507045f957686315 Mon Sep 17 00:00:00 2001 From: Hugo Heuzard Date: Mon, 13 Apr 2026 22:14:43 +0200 Subject: [PATCH 07/11] Validate Rep range in IR smart constructor Move the 0 <= n <= m check into Ir.rep so it is enforced regardless of call site. Keep the explicit check in the PPX for better error locations. Co-Authored-By: Claude Opus 4.6 (1M context) --- src/compiler/ir.ml | 8 +++++--- src/syntax/ppx_sedlex.ml | 1 + 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/src/compiler/ir.ml b/src/compiler/ir.ml index 8fa1c9a..6cf6b10 100644 --- a/src/compiler/ir.ml +++ b/src/compiler/ir.ml @@ -70,9 +70,11 @@ let plus t = match reject_captures "Plus" t with Error _ as e -> e | Ok t -> Ok (Plus t) let rep t n m = - match reject_captures "Rep" t with - | Error _ as e -> e - | Ok t -> Ok (Rep (t, n, m)) + if not (0 <= n && n <= m) then Error "Invalid range for Rep operator" + else ( + match reject_captures "Rep" t with + | Error _ as e -> e + | Ok t -> Ok (Rep (t, n, m))) let seq a b = match (a, b) with diff --git a/src/syntax/ppx_sedlex.ml b/src/syntax/ppx_sedlex.ml index 6d77177..33df3bc 100644 --- a/src/syntax/ppx_sedlex.ml +++ b/src/syntax/ppx_sedlex.ml @@ -602,6 +602,7 @@ let ir_of_pattern env = | Pconst_integer (i1, _), Pconst_integer (i2, _) -> let i1 = int_of_string i1 in let i2 = int_of_string i2 in + (* Check bounds explicitly here for better error locations *) if 0 <= i1 && i1 <= i2 then unwrap p0.ppat_loc (Ir.rep r i1 i2) else err p.ppat_loc "Invalid range for Rep operator" | _ -> From ccbe668cbd0bf1fd6e115395d6a324a683f7bccc Mon Sep 17 00:00:00 2001 From: Hugo Heuzard Date: Mon, 13 Apr 2026 22:14:52 +0200 Subject: [PATCH 08/11] Simplify Alt lowering: always use add_discriminators MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Remove the special case that skipped discriminators for capture-free branches — add_discriminators already handles that case correctly. Co-Authored-By: Claude Opus 4.6 (1M context) --- src/compiler/sedlex.ml | 11 +---------- 1 file changed, 1 insertion(+), 10 deletions(-) diff --git a/src/compiler/sedlex.ml b/src/compiler/sedlex.ml index 9571142..7a28c66 100644 --- a/src/compiler/sedlex.ml +++ b/src/compiler/sedlex.ml @@ -492,16 +492,7 @@ let rec lower ~left ~right (ir : Ir.t) : regexp * compiled_binding list = (r, { name; start_pos = st; end_pos = et; disc = [] } :: tags) | Ir.Alt branches -> let lowered = List.map (lower ~left ~right) branches in - let has_captures = List.exists (fun (_, tags) -> tags <> []) lowered in - if has_captures then add_discriminators lowered - else ( - let r = - List.fold_left - (fun acc (r, _) -> alt acc r) - (fst (List.hd lowered)) - (List.tl lowered) - in - (r, [])) + add_discriminators lowered | Ir.Seq elems -> (* Sequence — propagate left/right position contexts through elements. Right positions are computed right-to-left; left positions are From 0858c6ab15a7c9c978f85dc2009a36ccceaa4284 Mon Sep 17 00:00:00 2001 From: Hugo Heuzard Date: Mon, 13 Apr 2026 22:19:02 +0200 Subject: [PATCH 09/11] Update copyright headers in IR module Co-Authored-By: Claude Opus 4.6 (1M context) --- src/compiler/ir.ml | 2 +- src/compiler/ir.mli | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/compiler/ir.ml b/src/compiler/ir.ml index 6cf6b10..12a973d 100644 --- a/src/compiler/ir.ml +++ b/src/compiler/ir.ml @@ -1,6 +1,6 @@ (* The package sedlex is released under the terms of an MIT-like license. *) (* See the attached LICENSE file. *) -(* Copyright 2005, 2013 by Alain Frisch and LexiFi. *) +(* Copyright 2026, Hugo Heuzard *) type t = | Chars of Cset.t diff --git a/src/compiler/ir.mli b/src/compiler/ir.mli index 0156499..4a498b4 100644 --- a/src/compiler/ir.mli +++ b/src/compiler/ir.mli @@ -1,6 +1,6 @@ (* The package sedlex is released under the terms of an MIT-like license. *) (* See the attached LICENSE file. *) -(* Copyright 2005, 2013 by Alain Frisch and LexiFi. *) +(* Copyright 2026, Hugo Heuzard *) (** Intermediate representation for sedlex patterns. From 4535511bbf6af2022479db094729428c641201f4 Mon Sep 17 00:00:00 2001 From: Hugo Heuzard Date: Mon, 13 Apr 2026 22:35:09 +0200 Subject: [PATCH 10/11] Fix compile_ir docstring: Assert_failure, not Invalid_argument Co-Authored-By: Claude Opus 4.6 (1M context) --- src/compiler/sedlex.mli | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/compiler/sedlex.mli b/src/compiler/sedlex.mli index b9c6bac..01ac04b 100644 --- a/src/compiler/sedlex.mli +++ b/src/compiler/sedlex.mli @@ -156,7 +156,7 @@ type compiled_ir = { } (** [compile_ir rules] compiles an array of IR patterns into a tagged DFA. - Raises [Invalid_argument] if validation fails. *) + Raises [Assert_failure] if invariant checking fails. *) val compile_ir : Ir.t array -> compiled_ir (** [dfa_to_dot dfa] returns a Graphviz DOT representation of the DFA, including From 38172f7664a4293e3c548eec3d10a144714c9689 Mon Sep 17 00:00:00 2001 From: Hugo Heuzard Date: Mon, 13 Apr 2026 22:35:30 +0200 Subject: [PATCH 11/11] Use Option.equal Int.equal instead of polymorphic ( = ) in fixed_length Co-Authored-By: Claude Opus 4.6 (1M context) --- src/compiler/ir.ml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/compiler/ir.ml b/src/compiler/ir.ml index 12a973d..1c04e80 100644 --- a/src/compiler/ir.ml +++ b/src/compiler/ir.ml @@ -44,7 +44,8 @@ let rec fixed_length = function match List.map fixed_length branches with | [] -> None | first :: rest -> - if List.for_all (( = ) first) rest then first else None) + if List.for_all (Option.equal Int.equal first) rest then first + else None) | Rep (inner, n, m) -> if n = m then ( match fixed_length inner with Some l -> Some (n * l) | None -> None)