From 2b429f50b3c4f23a4ba7ab28c8c294ea0064c4c6 Mon Sep 17 00:00:00 2001 From: Hugo Heuzard Date: Thu, 2 Apr 2026 11:37:40 +0200 Subject: [PATCH 1/4] Add Cset.mem for code point membership testing Co-Authored-By: Claude Opus 4.6 (1M context) --- src/compiler/cset.ml | 5 +++++ src/compiler/cset.mli | 1 + 2 files changed, 6 insertions(+) diff --git a/src/compiler/cset.ml b/src/compiler/cset.ml index 118feeb..fbbef65 100644 --- a/src/compiler/cset.ml +++ b/src/compiler/cset.ml @@ -74,5 +74,10 @@ let complement c = in match c with (-1, j) :: l -> aux (succ j) l | l -> aux (-1) l +let rec mem c = function + | [] -> false + | (lo, hi) :: rest -> + if c < lo then false else if c <= hi then true else mem c rest + let intersection c1 c2 = complement (union (complement c1) (complement c2)) let difference c1 c2 = complement (union (complement c1) c2) diff --git a/src/compiler/cset.mli b/src/compiler/cset.mli index 59f8f34..1791246 100644 --- a/src/compiler/cset.mli +++ b/src/compiler/cset.mli @@ -23,4 +23,5 @@ val is_empty : t -> bool val eof : t val singleton : int -> t val interval : int -> int -> t +val mem : int -> t -> bool val to_seq : t -> int Seq.t From b6cb91349ac997235a14049cd4b9f55be5a5c16b Mon Sep 17 00:00:00 2001 From: Hugo Heuzard Date: Wed, 15 Apr 2026 12:08:29 +0200 Subject: [PATCH 2/4] Vendor ocamllex DFA compiler from OCaml 5.4.0 Copy lex/{cset,lexgen,syntax,table}.{ml,mli} verbatim from the OCaml compiler sources. These provide an independent TDFA implementation used as a reference oracle for testing sedlex's tagged DFA. Co-Authored-By: Claude Opus 4.6 (1M context) --- test/ocamllex_vendored/.ocamlformat-ignore | 1 + test/ocamllex_vendored/README.md | 33 + test/ocamllex_vendored/cset.ml | 97 ++ test/ocamllex_vendored/cset.mli | 34 + test/ocamllex_vendored/dune | 4 + test/ocamllex_vendored/lexgen.ml | 1192 ++++++++++++++++++++ test/ocamllex_vendored/lexgen.mli | 60 + test/ocamllex_vendored/sync.sh | 31 + test/ocamllex_vendored/syntax.ml | 49 + test/ocamllex_vendored/syntax.mli | 46 + test/ocamllex_vendored/table.ml | 56 + 11 files changed, 1603 insertions(+) create mode 100644 test/ocamllex_vendored/.ocamlformat-ignore create mode 100644 test/ocamllex_vendored/README.md create mode 100644 test/ocamllex_vendored/cset.ml create mode 100644 test/ocamllex_vendored/cset.mli create mode 100644 test/ocamllex_vendored/dune create mode 100644 test/ocamllex_vendored/lexgen.ml create mode 100644 test/ocamllex_vendored/lexgen.mli create mode 100755 test/ocamllex_vendored/sync.sh create mode 100644 test/ocamllex_vendored/syntax.ml create mode 100644 test/ocamllex_vendored/syntax.mli create mode 100644 test/ocamllex_vendored/table.ml diff --git a/test/ocamllex_vendored/.ocamlformat-ignore b/test/ocamllex_vendored/.ocamlformat-ignore new file mode 100644 index 0000000..72e8ffc --- /dev/null +++ b/test/ocamllex_vendored/.ocamlformat-ignore @@ -0,0 +1 @@ +* diff --git a/test/ocamllex_vendored/README.md b/test/ocamllex_vendored/README.md new file mode 100644 index 0000000..c5397c4 --- /dev/null +++ b/test/ocamllex_vendored/README.md @@ -0,0 +1,33 @@ +# Vendored ocamllex sources + +These files are copied verbatim from the OCaml compiler's `lex/` directory. +They provide ocamllex's DFA compiler (`Lexgen.make_dfa`) which is used as an +independent reference oracle to test sedlex's tagged DFA compilation. + +## Files + +| File | Description | +|------|-------------| +| `cset.ml` / `.mli` | Character sets | +| `lexgen.ml` / `.mli` | DFA compiler with tagged transitions | +| `syntax.ml` / `.mli` | Regular expression AST | +| `table.ml` | DFA table representation | + +## Updating + +Run `./sync.sh` to copy the latest sources from the current opam switch: + +``` +./test/ocamllex_vendored/sync.sh +``` + +To sync from a specific directory: + +``` +OCAML_LEX_DIR=/path/to/ocaml/lex ./test/ocamllex_vendored/sync.sh +``` + +## License + +These files are part of the OCaml compiler and are distributed under the +GNU Lesser General Public License version 2.1 (see file headers). diff --git a/test/ocamllex_vendored/cset.ml b/test/ocamllex_vendored/cset.ml new file mode 100644 index 0000000..b8f86f1 --- /dev/null +++ b/test/ocamllex_vendored/cset.ml @@ -0,0 +1,97 @@ +(**************************************************************************) +(* *) +(* OCaml *) +(* *) +(* Luc Maranget, Jerome Vouillon projet Cristal, *) +(* INRIA Rocquencourt *) +(* *) +(* Copyright 2002 Institut National de Recherche en Informatique et *) +(* en Automatique. *) +(* *) +(* All rights reserved. This file is distributed under the terms of *) +(* the GNU Lesser General Public License version 2.1, with the *) +(* special exception on linking described in the file LICENSE. *) +(* *) +(**************************************************************************) + +exception Bad + +type t = (int * int) list + + +let empty = [] +let is_empty = function + | [] -> true + | _ -> false + +let singleton c = [c,c] + +let interval c1 c2 = + if c1 <= c2 then [c1,c2] + else [c2,c1] + + +let rec union s1 s2 = match s1,s2 with +| [],_ -> s2 +| _,[] -> s1 +| (c1,d1) as p1::r1, (c2,d2)::r2 -> + if c1 > c2 then + union s2 s1 + else begin (* c1 <= c2 *) + if d1+1 < c2 then + p1::union r1 s2 + else if d1 < d2 then + union ((c1,d2)::r2) r1 + else + union s1 r2 + end + +let rec inter l l' = match l, l' with + _, [] -> [] + | [], _ -> [] + | (c1, c2)::r, (c1', c2')::r' -> + if c2 < c1' then + inter r l' + else if c2' < c1 then + inter l r' + else if c2 < c2' then + (Int.max c1 c1', c2)::inter r l' + else + (Int.max c1 c1', c2')::inter l r' + +let rec diff l l' = match l, l' with + _, [] -> l + | [], _ -> [] + | (c1, c2)::r, (c1', c2')::r' -> + if c2 < c1' then + (c1, c2)::diff r l' + else if c2' < c1 then + diff l r' + else + let r'' = if c2' < c2 then (c2' + 1, c2) :: r else r in + if c1 < c1' then + (c1, c1' - 1)::diff r'' r' + else + diff r'' r' + + +let eof = singleton 256 +and all_chars = interval 0 255 +and all_chars_eof = interval 0 256 + +let complement s = diff all_chars s + +let env_to_array env = match env with +| [] -> assert false +| (_,x)::rem -> + let res = Array.make 257 x in + List.iter + (fun (c,y) -> + List.iter + (fun (i,j) -> + for k=i to j do + res.(k) <- y + done) + c) + rem ; + res diff --git a/test/ocamllex_vendored/cset.mli b/test/ocamllex_vendored/cset.mli new file mode 100644 index 0000000..527d53d --- /dev/null +++ b/test/ocamllex_vendored/cset.mli @@ -0,0 +1,34 @@ +(**************************************************************************) +(* *) +(* OCaml *) +(* *) +(* Luc Maranget, Jerome Vouillon projet Cristal, *) +(* INRIA Rocquencourt *) +(* *) +(* Copyright 2002 Institut National de Recherche en Informatique et *) +(* en Automatique. *) +(* *) +(* All rights reserved. This file is distributed under the terms of *) +(* the GNU Lesser General Public License version 2.1, with the *) +(* special exception on linking described in the file LICENSE. *) +(* *) +(**************************************************************************) + +(* Set of characters encoded as list of intervals *) + +type t + +val empty : t +val is_empty : t -> bool +val all_chars : t +exception Bad + +val all_chars_eof : t +val eof : t +val singleton : int -> t +val interval : int -> int -> t +val union : t -> t -> t +val inter : t -> t -> t +val diff : t -> t -> t +val complement : t -> t +val env_to_array : (t * 'a) list -> 'a array diff --git a/test/ocamllex_vendored/dune b/test/ocamllex_vendored/dune new file mode 100644 index 0000000..4f88e06 --- /dev/null +++ b/test/ocamllex_vendored/dune @@ -0,0 +1,4 @@ +(library + (name ocamllex_vendored) + (flags + (:standard -w -32-34-37-60-69-70))) diff --git a/test/ocamllex_vendored/lexgen.ml b/test/ocamllex_vendored/lexgen.ml new file mode 100644 index 0000000..af79a34 --- /dev/null +++ b/test/ocamllex_vendored/lexgen.ml @@ -0,0 +1,1192 @@ +(**************************************************************************) +(* *) +(* OCaml *) +(* *) +(* Xavier Leroy, projet Cristal, INRIA Rocquencourt *) +(* Luc Maranget, projet Moscova, INRIA Rocquencourt *) +(* *) +(* Copyright 1996 Institut National de Recherche en Informatique et *) +(* en Automatique. *) +(* *) +(* All rights reserved. This file is distributed under the terms of *) +(* the GNU Lesser General Public License version 2.1, with the *) +(* special exception on linking described in the file LICENSE. *) +(* *) +(**************************************************************************) + +(* Compiling a lexer definition *) + +open Syntax +(*open Printf*) + +exception Memory_overflow + +(* Deep abstract syntax for regular expressions *) + +type ident = string * Syntax.location + +type tag_info = {id : string ; start : bool ; action : int} + +type regexp = + Empty + | Chars of int * bool + | Action of int + | Tag of tag_info + | Seq of regexp * regexp + | Alt of regexp * regexp + | Star of regexp + +type tag_base = Start | End | Mem of int +type tag_addr = Sum of (tag_base * int) +type ident_info = + | Ident_string of bool * tag_addr * tag_addr + | Ident_char of bool * tag_addr +type t_env = (ident * ident_info) list + +type ('args,'action) lexer_entry = + { lex_name: string; + lex_regexp: regexp; + lex_mem_tags: int ; + lex_actions: (int * t_env * 'action) list } + + +type automata = + Perform of int * tag_action list + | Shift of automata_trans * (automata_move * memory_action list) array + +and automata_trans = + No_remember + | Remember of int * tag_action list + +and automata_move = + Backtrack + | Goto of int + +and memory_action = + | Copy of int * int + | Set of int + +and tag_action = SetTag of int * int | EraseTag of int + +(* Representation of entry points *) + +type ('args,'action) automata_entry = + { auto_name: string; + auto_args: 'args ; + auto_mem_size : int ; + auto_initial_state: int * memory_action list; + auto_actions: (int * t_env * 'action) list } + + +(* A lot of sets and map structures *) + +module Ints = + Set.Make(struct type t = int let compare (x:t) y = compare x y end) + +let id_compare (id1,_) (id2,_) = String.compare id1 id2 + +let tag_compare + {id=id1; start=start1; action=action1} + {id=id2; start=start2; action=action2} = + let c = String.compare id1 id2 in + if c <> 0 then c else + let c = Bool.compare start1 start2 in + if c <> 0 then c else + Int.compare action1 action2 + +module Tags = Set.Make(struct type t = tag_info let compare = tag_compare end) + +module TagMap = + Map.Make (struct type t = tag_info let compare = tag_compare end) + +module IdSet = + Set.Make (struct type t = ident let compare = id_compare end) + +(*********************) +(* Variable cleaning *) +(*********************) + +(* Silently eliminate nested variables *) + +let rec do_remove_nested to_remove = function + | Bind (e,x) -> + if IdSet.mem x to_remove then + do_remove_nested to_remove e + else + Bind (do_remove_nested (IdSet.add x to_remove) e, x) + | Epsilon|Eof|Characters _ as e -> e + | Sequence (e1, e2) -> + Sequence + (do_remove_nested to_remove e1, do_remove_nested to_remove e2) + | Alternative (e1, e2) -> + Alternative + (do_remove_nested to_remove e1, do_remove_nested to_remove e2) + | Repetition e -> + Repetition (do_remove_nested to_remove e) + +let remove_nested_as e = do_remove_nested IdSet.empty e + +(*********************) +(* Variable analysis *) +(*********************) + +(* + Optional variables. + A variable is optional when matching of regexp does not + implies it binds. + The typical case is: + ("" | 'a' as x) -> optional + ("" as x | 'a' as x) -> non-optional +*) + +let stringset_delta s1 s2 = + IdSet.union + (IdSet.diff s1 s2) + (IdSet.diff s2 s1) + +let rec find_all_vars = function + | Characters _|Epsilon|Eof -> + IdSet.empty + | Bind (e,x) -> + IdSet.add x (find_all_vars e) + | Sequence (e1,e2)|Alternative (e1,e2) -> + IdSet.union (find_all_vars e1) (find_all_vars e2) + | Repetition e -> find_all_vars e + + +let rec do_find_opt = function + | Characters _|Epsilon|Eof -> IdSet.empty, IdSet.empty + | Bind (e,x) -> + let opt,all = do_find_opt e in + opt, IdSet.add x all + | Sequence (e1,e2) -> + let opt1,all1 = do_find_opt e1 + and opt2,all2 = do_find_opt e2 in + IdSet.union opt1 opt2, IdSet.union all1 all2 + | Alternative (e1,e2) -> + let opt1,all1 = do_find_opt e1 + and opt2,all2 = do_find_opt e2 in + IdSet.union + (IdSet.union opt1 opt2) + (stringset_delta all1 all2), + IdSet.union all1 all2 + | Repetition e -> + let r = find_all_vars e in + r,r + +let find_optional e = + let r,_ = do_find_opt e in r + +(* + Double variables + A variable is double when it can be bound more than once + in a single matching + The typical case is: + (e1 as x) (e2 as x) + +*) + +let rec do_find_double = function + | Characters _|Epsilon|Eof -> IdSet.empty, IdSet.empty + | Bind (e,x) -> + let dbl,all = do_find_double e in + (if IdSet.mem x all then + IdSet.add x dbl + else + dbl), + IdSet.add x all + | Sequence (e1,e2) -> + let dbl1, all1 = do_find_double e1 + and dbl2, all2 = do_find_double e2 in + IdSet.union + (IdSet.inter all1 all2) + (IdSet.union dbl1 dbl2), + IdSet.union all1 all2 + | Alternative (e1,e2) -> + let dbl1, all1 = do_find_double e1 + and dbl2, all2 = do_find_double e2 in + IdSet.union dbl1 dbl2, + IdSet.union all1 all2 + | Repetition e -> + let r = find_all_vars e in + r,r + +let find_double e = do_find_double e + +(* + Type of variables: + A variable is bound to a char when all its occurrences + bind a pattern of length 1. + The typical case is: + (_ as x) -> char +*) + +let add_some x = function + | Some i -> Some (x+i) + | None -> None + +let add_some_some x y = match x,y with +| Some i, Some j -> Some (i+j) +| _,_ -> None + +let rec do_find_chars sz = function + | Epsilon|Eof -> IdSet.empty, IdSet.empty, sz + | Characters _ -> IdSet.empty, IdSet.empty, add_some 1 sz + | Bind (e,x) -> + let c,s,e_sz = do_find_chars (Some 0) e in + begin match e_sz with + | Some 1 -> + IdSet.add x c,s,add_some 1 sz + | _ -> + c, IdSet.add x s, add_some_some sz e_sz + end + | Sequence (e1,e2) -> + let c1,s1,sz1 = do_find_chars sz e1 in + let c2,s2,sz2 = do_find_chars sz1 e2 in + IdSet.union c1 c2, + IdSet.union s1 s2, + sz2 + | Alternative (e1,e2) -> + let c1,s1,sz1 = do_find_chars sz e1 + and c2,s2,sz2 = do_find_chars sz e2 in + IdSet.union c1 c2, + IdSet.union s1 s2, + (if sz1 = sz2 then sz1 else None) + | Repetition e -> do_find_chars None e + + + +let find_chars e = + let c,s,_ = do_find_chars (Some 0) e in + IdSet.diff c s + +(*******************************) +(* From shallow to deep syntax *) +(*******************************) + +let chars = ref ([] : Cset.t list) +let chars_count = ref 0 + + +let rec encode_regexp char_vars act = function + Epsilon -> Empty + | Characters cl -> + let n = !chars_count in + chars := cl :: !chars; + incr chars_count; + Chars(n,false) + | Eof -> + let n = !chars_count in + chars := Cset.eof :: !chars; + incr chars_count; + Chars(n,true) + | Sequence(r1,r2) -> + let r1 = encode_regexp char_vars act r1 in + let r2 = encode_regexp char_vars act r2 in + Seq (r1, r2) + | Alternative(r1,r2) -> + let r1 = encode_regexp char_vars act r1 in + let r2 = encode_regexp char_vars act r2 in + Alt(r1, r2) + | Repetition r -> + let r = encode_regexp char_vars act r in + Star r + | Bind (r,((name,_) as x)) -> + let r = encode_regexp char_vars act r in + if IdSet.mem x char_vars then + Seq (Tag {id=name ; start=true ; action=act},r) + else + Seq (Tag {id=name ; start=true ; action=act}, + Seq (r, Tag {id=name ; start=false ; action=act})) + + +(* Optimisation, + Static optimization : + Replace tags by offsets relative to the beginning + or end of matched string. + Dynamic optimization: + Replace some non-optional, non-double tags by offsets w.r.t + a previous similar tag. +*) + +let opt = true + +let mk_seq r1 r2 = match r1,r2 with +| Empty,_ -> r2 +| _,Empty -> r1 +| _,_ -> Seq (r1,r2) + +let add_pos p i = match p with +| Some (Sum (a,n)) -> Some (Sum (a,n+i)) +| None -> None + +let mem_name name id_set = + IdSet.exists (fun (id_name,_) -> name = id_name) id_set + +let opt_regexp all_vars char_vars optional_vars double_vars r = + +(* From removed tags to their addresses *) + let env = Hashtbl.create 17 in + +(* First static optimizations, from start position *) + let rec size_forward pos = function + | Empty|Chars (_,true)|Tag _ -> Some pos + | Chars (_,false) -> Some (pos+1) + | Seq (r1,r2) -> + begin match size_forward pos r1 with + | None -> None + | Some pos -> size_forward pos r2 + end + | Alt (r1,r2) -> + let pos1 = size_forward pos r1 + and pos2 = size_forward pos r2 in + if pos1=pos2 then pos1 else None + | Star _ -> None + | Action _ -> assert false in + + let rec simple_forward pos r = match r with + | Tag n -> + if mem_name n.id double_vars then + r,Some pos + else begin + Hashtbl.add env (n.id,n.start) (Sum (Start, pos)) ; + Empty,Some pos + end + | Empty -> r, Some pos + | Chars (_,is_eof) -> + r,Some (if is_eof then pos else pos+1) + | Seq (r1,r2) -> + let r1,pos = simple_forward pos r1 in + begin match pos with + | None -> mk_seq r1 r2,None + | Some pos -> + let r2,pos = simple_forward pos r2 in + mk_seq r1 r2,pos + end + | Alt (r1,r2) -> + let pos1 = size_forward pos r1 + and pos2 = size_forward pos r2 in + r,(if pos1=pos2 then pos1 else None) + | Star _ -> r,None + | Action _ -> assert false in + +(* Then static optimizations, from end position *) + let rec size_backward pos = function + | Empty|Chars (_,true)|Tag _ -> Some pos + | Chars (_,false) -> Some (pos-1) + | Seq (r1,r2) -> + begin match size_backward pos r2 with + | None -> None + | Some pos -> size_backward pos r1 + end + | Alt (r1,r2) -> + let pos1 = size_backward pos r1 + and pos2 = size_backward pos r2 in + if pos1=pos2 then pos1 else None + | Star _ -> None + | Action _ -> assert false in + + + let rec simple_backward pos r = match r with + | Tag n -> + if mem_name n.id double_vars then + r,Some pos + else begin + Hashtbl.add env (n.id,n.start) (Sum (End, pos)) ; + Empty,Some pos + end + | Empty -> r,Some pos + | Chars (_,is_eof) -> + r,Some (if is_eof then pos else pos-1) + | Seq (r1,r2) -> + let r2,pos = simple_backward pos r2 in + begin match pos with + | None -> mk_seq r1 r2,None + | Some pos -> + let r1,pos = simple_backward pos r1 in + mk_seq r1 r2,pos + end + | Alt (r1,r2) -> + let pos1 = size_backward pos r1 + and pos2 = size_backward pos r2 in + r,(if pos1=pos2 then pos1 else None) + | Star _ -> r,None + | Action _ -> assert false in + + let r = + if opt then + let r,_ = simple_forward 0 r in + let r,_ = simple_backward 0 r in + r + else + r in + + let loc_count = ref 0 in + let get_tag_addr t = + try + Hashtbl.find env t + with + | Not_found -> + let n = !loc_count in + incr loc_count ; + Hashtbl.add env t (Sum (Mem n,0)) ; + Sum (Mem n,0) in + + let rec alloc_exp pos r = match r with + | Tag n -> + if mem_name n.id double_vars then + r,pos + else begin match pos with + | Some a -> + Hashtbl.add env (n.id,n.start) a ; + Empty,pos + | None -> + let a = get_tag_addr (n.id,n.start) in + r,Some a + end + + | Empty -> r,pos + | Chars (_,is_eof) -> r,(if is_eof then pos else add_pos pos 1) + | Seq (r1,r2) -> + let r1,pos = alloc_exp pos r1 in + let r2,pos = alloc_exp pos r2 in + mk_seq r1 r2,pos + | Alt (_,_) -> + let off = size_forward 0 r in + begin match off with + | Some i -> r,add_pos pos i + | None -> r,None + end + | Star _ -> r,None + | Action _ -> assert false in + + let r,_ = alloc_exp None r in + let m = + IdSet.fold + (fun ((name,_) as x) r -> + + let v = + if IdSet.mem x char_vars then + Ident_char + (IdSet.mem x optional_vars, get_tag_addr (name,true)) + else + Ident_string + (IdSet.mem x optional_vars, + get_tag_addr (name,true), + get_tag_addr (name,false)) in + (x,v)::r) + all_vars [] in + m,r, !loc_count + + + +let encode_casedef casedef = + let r = + List.fold_left + (fun (reg,actions,count,ntags) (expr, act) -> + let expr = remove_nested_as expr in + let char_vars = find_chars expr in + let r = encode_regexp char_vars count expr + and opt_vars = find_optional expr + and double_vars,all_vars = find_double expr in + let m,r,loc_ntags = + opt_regexp all_vars char_vars opt_vars double_vars r in + Alt(reg, Seq(r, Action count)), + (count, m ,act) :: actions, + (succ count), + Int.max loc_ntags ntags) + (Empty, [], 0, 0) + casedef in + r + +let encode_lexdef def = + chars := []; + chars_count := 0; + let entry_list = + List.map + (fun {name=entry_name; args=args; shortest=shortest; clauses=casedef} -> + let (re,actions,_,ntags) = encode_casedef casedef in + { lex_name = entry_name; + lex_regexp = re; + lex_mem_tags = ntags ; + lex_actions = List.rev actions },args,shortest) + def in + let chr = Array.of_list (List.rev !chars) in + chars := []; + (chr, entry_list) + +(* To generate directly a NFA from a regular expression. + Confer Aho-Sethi-Ullman, dragon book, chap. 3 + Extension to tagged automata. + Confer + Ville Larikari + 'NFAs with Tagged Transitions, their Conversion to Deterministic + Automata and Application to Regular Expressions'. + Symposium on String Processing and Information Retrieval (SPIRE 2000), + http://kouli.iki.fi/~vlaurika/spire2000-tnfa.ps +(See also) + http://kouli.iki.fi/~vlaurika/regex-submatch.ps.gz +*) + +type t_transition = + OnChars of int + | ToAction of int + +type transition = t_transition * Tags.t + +let trans_compare (t1,tags1) (t2,tags2) = + match Stdlib.compare t1 t2 with + | 0 -> Tags.compare tags1 tags2 + | r -> r + + +module TransSet = + Set.Make(struct type t = transition let compare = trans_compare end) + +let rec nullable = function + | Empty|Tag _ -> true + | Chars (_,_)|Action _ -> false + | Seq(r1,r2) -> nullable r1 && nullable r2 + | Alt(r1,r2) -> nullable r1 || nullable r2 + | Star _ -> true + +let rec emptymatch = function + | Empty | Chars (_,_) | Action _ -> Tags.empty + | Tag t -> Tags.add t Tags.empty + | Seq (r1,r2) -> Tags.union (emptymatch r1) (emptymatch r2) + | Alt(r1,r2) -> + if nullable r1 then + emptymatch r1 + else + emptymatch r2 + | Star r -> + if nullable r then + emptymatch r + else + Tags.empty + +let addtags transs tags = + TransSet.fold + (fun (t,tags_t) r -> TransSet.add (t, Tags.union tags tags_t) r) + transs TransSet.empty + + +let rec firstpos = function + Empty|Tag _ -> TransSet.empty + | Chars (pos,_) -> TransSet.add (OnChars pos,Tags.empty) TransSet.empty + | Action act -> TransSet.add (ToAction act,Tags.empty) TransSet.empty + | Seq(r1,r2) -> + if nullable r1 then + TransSet.union (firstpos r1) (addtags (firstpos r2) (emptymatch r1)) + else + firstpos r1 + | Alt(r1,r2) -> TransSet.union (firstpos r1) (firstpos r2) + | Star r -> firstpos r + + +(* Berry-Sethi followpos *) +let followpos size entry_list = + let v = Array.make size TransSet.empty in + let rec fill s = function + | Empty|Action _|Tag _ -> () + | Chars (n,_) -> v.(n) <- s + | Alt (r1,r2) -> + fill s r1 ; fill s r2 + | Seq (r1,r2) -> + fill + (if nullable r2 then + TransSet.union (firstpos r2) (addtags s (emptymatch r2)) + else + (firstpos r2)) + r1 ; + fill s r2 + | Star r -> + fill (TransSet.union (firstpos r) s) r in + List.iter (fun (entry,_,_) -> fill TransSet.empty entry.lex_regexp) + entry_list; + v + +(************************) +(* The algorithm itself *) +(************************) + +let no_action = max_int + +module StateSet = + Set.Make (struct type t = t_transition let compare = Stdlib.compare end) + + +module MemMap = + Map.Make (struct type t = int + let compare (x:t) y = Stdlib.compare x y end) + +type 'a dfa_state = + {final : int * ('a * int TagMap.t) ; + others : ('a * int TagMap.t) MemMap.t} + + +(* +let dtag oc t = + fprintf oc "%s<%s>" t.id (if t.start then "s" else "e") + +let dmem_map dp ds m = + MemMap.iter + (fun k x -> + eprintf "%d -> " k ; dp x ; ds ()) + m + +and dtag_map dp ds m = + TagMap.iter + (fun t x -> + dtag stderr t ; eprintf " -> " ; dp x ; ds ()) + m + +let dstate {final=(act,(_,m)) ; others=o} = + if act <> no_action then begin + eprintf "final=%d " act ; + dtag_map (fun x -> eprintf "%d" x) (fun () -> prerr_string " ,") m ; + prerr_endline "" + end ; + dmem_map + (fun (_,m) -> + dtag_map (fun x -> eprintf "%d" x) (fun () -> prerr_string " ,") m) + (fun () -> prerr_endline "") + o +*) + + +let dfa_state_empty = + {final=(no_action, (max_int,TagMap.empty)) ; + others=MemMap.empty} + +and dfa_state_is_empty {final=(act,_) ; others=o} = + act = no_action && + o = MemMap.empty + + +(* A key is an abstraction on a dfa state, + two states with the same key can be made the same by + copying some memory cells into others *) + + +module StateSetSet = + Set.Make (struct type t = StateSet.t let compare = StateSet.compare end) + +type t_equiv = {tag:tag_info ; equiv:StateSetSet.t} + +module MemKey = + Set.Make + (struct + type t = t_equiv + + let compare e1 e2 = match Stdlib.compare e1.tag e2.tag with + | 0 -> StateSetSet.compare e1.equiv e2.equiv + | r -> r + end) + +type dfa_key = {kstate : StateSet.t ; kmem : MemKey.t} + +(* Map a state to its key *) +let env_to_class m = + let env1 = + MemMap.fold + (fun _ (tag,s) r -> + TagMap.update tag (function + | None -> Some (StateSetSet.singleton s) + | Some ss -> Some (StateSetSet.add s ss) + ) r) + m TagMap.empty in + TagMap.fold + (fun tag ss r -> MemKey.add {tag=tag ; equiv=ss} r) + env1 MemKey.empty + + +(* trans is nfa_state, m is associated memory map *) +let inverse_mem_map trans m r = + TagMap.fold + (fun tag addr r -> + MemMap.update addr (function + | None -> Some (tag, StateSet.singleton trans) + | Some (otag, s) -> + assert (tag = otag); + Some (tag, StateSet.add trans s) + ) r) + m r + +let inverse_mem_map_other n (_,m) r = inverse_mem_map (OnChars n) m r + +let get_key {final=(act,(_,m_act)) ; others=o} = + let env = + MemMap.fold inverse_mem_map_other + o + (if act = no_action then MemMap.empty + else inverse_mem_map (ToAction act) m_act MemMap.empty) in + let state_key = + MemMap.fold (fun n _ r -> StateSet.add (OnChars n) r) o + (if act=no_action then StateSet.empty + else StateSet.add (ToAction act) StateSet.empty) in + let mem_key = env_to_class env in + {kstate = state_key ; kmem = mem_key} + + +let key_compare k1 k2 = match StateSet.compare k1.kstate k2.kstate with +| 0 -> MemKey.compare k1.kmem k2.kmem +| r -> r + +(* Association dfa_state -> state_num *) + +module StateMap = + Map.Make(struct type t = dfa_key let compare = key_compare end) + +let state_map = ref (StateMap.empty : int StateMap.t) +let todo = Stack.create() +let next_state_num = ref 0 +let next_mem_cell = ref 0 +let temp_pending = ref false +let tag_cells = Hashtbl.create 17 +let state_table = Table.create dfa_state_empty + + +(* Initial reset of state *) +let reset_state () = + Stack.clear todo; + next_state_num := 0 ; + let _ = Table.trim state_table in + () + +(* Reset state before processing a given automata. + We clear both the memory mapping and + the state mapping, as state sharing between different + automata may lead to incorrect estimation of the cell memory size + BUG ID 0004517 *) + + +let reset_state_partial ntags = + next_mem_cell := ntags ; + Hashtbl.clear tag_cells ; + temp_pending := false ; + state_map := StateMap.empty + +let do_alloc_temp () = + temp_pending := true ; + let n = !next_mem_cell in + n + +let do_alloc_cell used t = + let available = + try Hashtbl.find tag_cells t with Not_found -> Ints.empty in + try + Ints.choose (Ints.diff available used) + with + | Not_found -> + temp_pending := false ; + let n = !next_mem_cell in + if n >= 255 then raise Memory_overflow ; + Hashtbl.replace tag_cells t (Ints.add n available) ; + incr next_mem_cell ; + n + +let is_old_addr a = a >= 0 +and is_new_addr a = a < 0 + +let old_in_map m r = + TagMap.fold + (fun _ addr r -> + if is_old_addr addr then + Ints.add addr r + else + r) + m r + +let alloc_map used m mvs = + TagMap.fold + (fun tag a (r,mvs) -> + let a,mvs = + if is_new_addr a then + let a = do_alloc_cell used tag in + a,Ints.add a mvs + else a,mvs in + TagMap.add tag a r,mvs) + m (TagMap.empty,mvs) + +let create_new_state {final=(act,(_,m_act)) ; others=o} = + let used = + MemMap.fold (fun _ (_,m) r -> old_in_map m r) + o (old_in_map m_act Ints.empty) in + + let new_m_act,mvs = alloc_map used m_act Ints.empty in + let new_o,mvs = + MemMap.fold (fun k (x,m) (r,mvs) -> + let m,mvs = alloc_map used m mvs in + MemMap.add k (x,m) r,mvs) + o (MemMap.empty,mvs) in + {final=(act,(0,new_m_act)) ; others=new_o}, + Ints.fold (fun x r -> Set x::r) mvs [] + +type new_addr_gen = {mutable count : int ; mutable env : int TagMap.t} + +let create_new_addr_gen () = {count = -1 ; env = TagMap.empty} + +let alloc_new_addr tag r = + try + TagMap.find tag r.env + with + | Not_found -> + let a = r.count in + r.count <- a-1 ; + r.env <- TagMap.add tag a r.env ; + a + + +let create_mem_map tags gen = + Tags.fold + (fun tag r -> TagMap.add tag (alloc_new_addr tag gen) r) + tags TagMap.empty + +let create_init_state pos = + let gen = create_new_addr_gen () in + let st = + TransSet.fold + (fun (t,tags) st -> + match t with + | ToAction n -> + let on,_otags = st.final in + if n < on then + {st with final = (n, (0,create_mem_map tags gen))} + else + st + | OnChars n -> + try + let _ = MemMap.find n st.others in assert false + with + | Not_found -> + {st with others = + MemMap.add n (0,create_mem_map tags gen) st.others}) + pos dfa_state_empty in + st + + +let get_map t st = match t with +| ToAction _ -> let _,(_,m) = st.final in m +| OnChars n -> + let (_,m) = MemMap.find n st.others in + m + +let dest = function | Copy (d,_) | Set d -> d +and orig = function | Copy (_,o) -> o | Set _ -> -1 + +(* +let pmv oc mv = fprintf oc "%d <- %d" (dest mv) (orig mv) +let pmvs oc mvs = + List.iter (fun mv -> fprintf oc "%a " pmv mv) mvs ; + output_char oc '\n' ; flush oc +*) + + +(* Topological sort << a la louche >> *) +let sort_mvs mvs = + let rec do_rec r mvs = match mvs with + | [] -> r + | _ -> + let dests = + List.fold_left + (fun r mv -> Ints.add (dest mv) r) + Ints.empty mvs in + let rem,here = + List.partition + (fun mv -> Ints.mem (orig mv) dests) + mvs in + match here with + | [] -> + begin match rem with + | Copy (d,_)::_ -> + let d' = do_alloc_temp () in + Copy (d',d):: + do_rec r + (List.map + (fun mv -> + if orig mv = d then + Copy (dest mv,d') + else + mv) + rem) + | _ -> assert false + end + | _ -> do_rec (here@r) rem in + do_rec [] mvs + +let move_to mem_key src tgt = + let mvs = + MemKey.fold + (fun {tag=tag ; equiv=m} r -> + StateSetSet.fold + (fun s r -> + try + let t = StateSet.choose s in + let src = TagMap.find tag (get_map t src) + and tgt = TagMap.find tag (get_map t tgt) in + if src <> tgt then begin + if is_new_addr src then + Set tgt::r + else + Copy (tgt, src)::r + end else + r + with + | Not_found -> assert false) + m r) + mem_key [] in +(* Moves are topologically sorted *) + sort_mvs mvs + + +let get_state st = + let key = get_key st in + try + let num = StateMap.find key !state_map in + num,move_to key.kmem st (Table.get state_table num) + with Not_found -> + let num = !next_state_num in + incr next_state_num; + let st,mvs = create_new_state st in + Table.emit state_table st ; + state_map := StateMap.add key num !state_map; + Stack.push (st, num) todo; + num,mvs + +let map_on_all_states f old_res = + let res = ref old_res in + begin try + while true do + let (st, i) = Stack.pop todo in + let r = f st in + res := (r, i) :: !res + done + with Stack.Empty -> () + end; + !res + +let goto_state st = + if + dfa_state_is_empty st + then + Backtrack,[] + else + let n,moves = get_state st in + Goto n,moves + +(****************************) +(* compute reachable states *) +(****************************) + +let add_tags_to_map gen tags m = + Tags.fold + (fun tag m -> + let m = TagMap.remove tag m in + TagMap.add tag (alloc_new_addr tag gen) m) + tags m + +let apply_transition gen r pri m = function + | ToAction n,tags -> + let on,(opri,_) = r.final in + if n < on || (on=n && pri < opri) then + let m = add_tags_to_map gen tags m in + {r with final=n,(pri,m)} + else r + | OnChars n,tags -> + try + let (opri,_) = MemMap.find n r.others in + if pri < opri then + let m = add_tags_to_map gen tags m in + {r with others=MemMap.add n (pri,m) (MemMap.remove n r.others)} + else + r + with + | Not_found -> + let m = add_tags_to_map gen tags m in + {r with others=MemMap.add n (pri,m) r.others} + +(* add transitions ts to new state r + transitions in ts start from state pri and memory map m +*) +let apply_transitions gen r pri m ts = + TransSet.fold + (fun t r -> apply_transition gen r pri m t) + ts r + + +(* For a given nfa_state pos, refine char partition *) +let rec split_env gen follow pos m s = function + | [] -> (* Can occur ! because of non-matching regexp ([^'\000'-'\255']) *) + [] + | (s1,st1) as p::rem -> + let here = Cset.inter s s1 in + if Cset.is_empty here then + p::split_env gen follow pos m s rem + else + let rest = Cset.diff s here in + let rem = + if Cset.is_empty rest then + rem + else + split_env gen follow pos m rest rem + and new_st = apply_transitions gen st1 pos m follow in + let stay = Cset.diff s1 here in + if Cset.is_empty stay then + (here, new_st)::rem + else + (stay, st1)::(here, new_st)::rem + + +(* For all nfa_state pos in a dfa state st *) +let comp_shift gen chars follow st = + MemMap.fold + (fun pos (_,m) env -> split_env gen follow.(pos) pos m chars.(pos) env) + st [Cset.all_chars_eof,dfa_state_empty] + + +let reachs chars follow st = + let gen = create_new_addr_gen () in +(* build an association list (char set -> new state) *) + let env = comp_shift gen chars follow st in +(* change it into (char set -> new state_num) *) + let env = + List.map + (fun (s,dfa_state) -> s,goto_state dfa_state) env in +(* finally build the char indexed array -> new state num *) + let shift = Cset.env_to_array env in + shift + + +let get_tag_mem n env t = + try + TagMap.find t env.(n) + with + | Not_found -> assert false + +let do_tag_actions n env m = + + let used,r = + TagMap.fold (fun t m (used,r) -> + let a = get_tag_mem n env t in + Ints.add a used,SetTag (a,m)::r) m (Ints.empty,[]) in + let _,r = + TagMap.fold + (fun tag m (used,r) -> + if not (Ints.mem m used) && tag.start then + Ints.add m used, EraseTag m::r + else + used,r) + env.(n) (used,r) in + r + + +let translate_state shortest_match tags chars follow st = + let (n,(_,m)) = st.final in + if MemMap.empty = st.others then + Perform (n,do_tag_actions n tags m) + else if shortest_match then begin + if n=no_action then + Shift (No_remember,reachs chars follow st.others) + else + Perform(n, do_tag_actions n tags m) + end else begin + Shift ( + (if n = no_action then + No_remember + else + Remember (n,do_tag_actions n tags m)), + reachs chars follow st.others) + end + +(* +let dtags chan tags = + Tags.iter + (fun t -> fprintf chan " %a" dtag t) + tags + +let dtransset s = + TransSet.iter + (fun trans -> match trans with + | OnChars i,tags -> + eprintf " (-> %d,%a)" i dtags tags + | ToAction i,tags -> + eprintf " ([%d],%a)" i dtags tags) + s + +let dfollow t = + eprintf "follow=[" ; + for i = 0 to Array.length t-1 do + eprintf "%d:" i ; + dtransset t.(i) + done ; + prerr_endline "]" +*) + + +let make_tag_entry id start act a r = match a with + | Sum (Mem m,0) -> + TagMap.add {id=id ; start=start ; action=act} m r + | _ -> r + +let extract_tags l = + let envs = Array.make (List.length l) TagMap.empty in + List.iter + (fun (act,m,_) -> + envs.(act) <- + List.fold_right + (fun ((name,_),v) r -> match v with + | Ident_char (_,t) -> make_tag_entry name true act t r + | Ident_string (_,t1,t2) -> + make_tag_entry name true act t1 + (make_tag_entry name false act t2 r)) + m TagMap.empty) + l ; + envs + + +let make_dfa lexdef = + let (chars, entry_list) = encode_lexdef lexdef in + let follow = followpos (Array.length chars) entry_list in +(* + dfollow follow ; +*) + reset_state () ; + let r_states = ref [] in + let initial_states = + List.map + (fun (le,args,shortest) -> + let tags = extract_tags le.lex_actions in + reset_state_partial le.lex_mem_tags ; + let pos_set = firstpos le.lex_regexp in +(* + prerr_string "trans={" ; dtransset pos_set ; prerr_endline "}" ; +*) + let init_state = create_init_state pos_set in + let init_num = get_state init_state in + r_states := + map_on_all_states + (translate_state shortest tags chars follow) !r_states ; + { auto_name = le.lex_name; + auto_args = args ; + auto_mem_size = + (if !temp_pending then !next_mem_cell+1 else !next_mem_cell) ; + auto_initial_state = init_num ; + auto_actions = le.lex_actions }) + entry_list in + let states = !r_states in +(* + prerr_endline "** states **" ; + for i = 0 to !next_state_num-1 do + eprintf "+++ %d +++\n" i ; + dstate (Table.get state_table i) ; + prerr_endline "" + done ; + eprintf "%d states\n" !next_state_num ; +*) + let actions = Array.make !next_state_num (Perform (0,[])) in + List.iter (fun (act, i) -> actions.(i) <- act) states; +(* Useless state reset, so as to restrict GC roots *) + reset_state () ; + reset_state_partial 0 ; + (initial_states, actions) diff --git a/test/ocamllex_vendored/lexgen.mli b/test/ocamllex_vendored/lexgen.mli new file mode 100644 index 0000000..306f475 --- /dev/null +++ b/test/ocamllex_vendored/lexgen.mli @@ -0,0 +1,60 @@ +(**************************************************************************) +(* *) +(* OCaml *) +(* *) +(* Xavier Leroy, projet Cristal, INRIA Rocquencourt *) +(* *) +(* Copyright 1996 Institut National de Recherche en Informatique et *) +(* en Automatique. *) +(* *) +(* All rights reserved. This file is distributed under the terms of *) +(* the GNU Lesser General Public License version 2.1, with the *) +(* special exception on linking described in the file LICENSE. *) +(* *) +(**************************************************************************) + +(* raised when there are too many bindings (>= 254 memory cells) *) +exception Memory_overflow + + +(* Representation of automata *) + + +type automata = + Perform of int * tag_action list + | Shift of automata_trans * (automata_move * memory_action list) array +and automata_trans = + No_remember + | Remember of int * tag_action list +and automata_move = + Backtrack + | Goto of int +and memory_action = + | Copy of int * int + | Set of int + +and tag_action = SetTag of int * int | EraseTag of int + +type ident = string * Syntax.location + +(* Representation of entry points *) +type tag_base = Start | End | Mem of int +type tag_addr = Sum of (tag_base * int) +type ident_info = + | Ident_string of bool * tag_addr * tag_addr + | Ident_char of bool * tag_addr + +type t_env = (ident * ident_info) list + +type ('args,'action) automata_entry = + { auto_name: string; + auto_args: 'args ; + auto_mem_size : int ; + auto_initial_state: int * memory_action list ; + auto_actions: (int * t_env * 'action) list } + +(* The entry point *) + +val make_dfa : + ('args, 'action) Syntax.entry list -> + ('args, 'action) automata_entry list * automata array diff --git a/test/ocamllex_vendored/sync.sh b/test/ocamllex_vendored/sync.sh new file mode 100755 index 0000000..ab89b1c --- /dev/null +++ b/test/ocamllex_vendored/sync.sh @@ -0,0 +1,31 @@ +#!/bin/sh +# Synchronize vendored ocamllex sources from the current OCaml compiler. +# +# Usage: ./test/ocamllex_vendored/sync.sh +# +# The source directory is the lex/ subtree of the OCaml compiler sources +# kept by opam. Override with: OCAML_LEX_DIR=/path/to/lex ./sync.sh + +set -eu + +DEST="$(cd "$(dirname "$0")" && pwd)" + +if [ -z "${OCAML_LEX_DIR:-}" ]; then + PREFIX="$(opam var prefix)" + OCAML_VERSION="$(opam var ocaml:version)" + OCAML_LEX_DIR="$PREFIX/.opam-switch/sources/ocaml-compiler.$OCAML_VERSION/lex" +fi + +if [ ! -d "$OCAML_LEX_DIR" ]; then + echo "error: source directory not found: $OCAML_LEX_DIR" >&2 + echo "Set OCAML_LEX_DIR to the lex/ directory of the OCaml compiler sources." >&2 + exit 1 +fi + +FILES="cset.ml cset.mli lexgen.ml lexgen.mli syntax.ml syntax.mli table.ml" + +for f in $FILES; do + cp "$OCAML_LEX_DIR/$f" "$DEST/$f" +done + +echo "Synced from $OCAML_LEX_DIR (OCaml ${OCAML_VERSION:-unknown})" diff --git a/test/ocamllex_vendored/syntax.ml b/test/ocamllex_vendored/syntax.ml new file mode 100644 index 0000000..61a9d79 --- /dev/null +++ b/test/ocamllex_vendored/syntax.ml @@ -0,0 +1,49 @@ +(**************************************************************************) +(* *) +(* OCaml *) +(* *) +(* Xavier Leroy, projet Cristal, INRIA Rocquencourt *) +(* *) +(* Copyright 1996 Institut National de Recherche en Informatique et *) +(* en Automatique. *) +(* *) +(* All rights reserved. This file is distributed under the terms of *) +(* the GNU Lesser General Public License version 2.1, with the *) +(* special exception on linking described in the file LICENSE. *) +(* *) +(**************************************************************************) + +(* This apparently useless implementation file is in fact required + by the pa_ocamllex syntax extension *) + +(* The shallow abstract syntax *) + +type location = { + loc_file : string; + start_pos : int; + end_pos : int; + start_line : int; + start_col : int; +} + +type regular_expression = + Epsilon + | Characters of Cset.t + | Eof + | Sequence of regular_expression * regular_expression + | Alternative of regular_expression * regular_expression + | Repetition of regular_expression + | Bind of regular_expression * (string * location) + +type ('arg,'action) entry = + {name:string ; + shortest : bool ; + args : 'arg ; + clauses : (regular_expression * 'action) list} + +type lexer_definition = { + header: location; + entrypoints: ((string list, location) entry) list; + trailer: location; + refill_handler : location option; +} diff --git a/test/ocamllex_vendored/syntax.mli b/test/ocamllex_vendored/syntax.mli new file mode 100644 index 0000000..eb0acef --- /dev/null +++ b/test/ocamllex_vendored/syntax.mli @@ -0,0 +1,46 @@ +(**************************************************************************) +(* *) +(* OCaml *) +(* *) +(* Xavier Leroy, projet Cristal, INRIA Rocquencourt *) +(* *) +(* Copyright 1996 Institut National de Recherche en Informatique et *) +(* en Automatique. *) +(* *) +(* All rights reserved. This file is distributed under the terms of *) +(* the GNU Lesser General Public License version 2.1, with the *) +(* special exception on linking described in the file LICENSE. *) +(* *) +(**************************************************************************) + +(* The shallow abstract syntax *) + +type location = { + loc_file : string; + start_pos : int; + end_pos : int; + start_line : int; + start_col : int; +} + +type regular_expression = + Epsilon + | Characters of Cset.t + | Eof + | Sequence of regular_expression * regular_expression + | Alternative of regular_expression * regular_expression + | Repetition of regular_expression + | Bind of regular_expression * (string * location) + +type ('arg,'action) entry = + {name:string ; + shortest : bool ; + args : 'arg ; + clauses : (regular_expression * 'action) list} + +type lexer_definition = { + header: location; + entrypoints: ((string list, location) entry) list; + trailer: location; + refill_handler : location option; +} diff --git a/test/ocamllex_vendored/table.ml b/test/ocamllex_vendored/table.ml new file mode 100644 index 0000000..137ce72 --- /dev/null +++ b/test/ocamllex_vendored/table.ml @@ -0,0 +1,56 @@ +(**************************************************************************) +(* *) +(* OCaml *) +(* *) +(* Luc Maranget, projet Moscova, INRIA Rocquencourt *) +(* *) +(* Copyright 2002 Institut National de Recherche en Informatique et *) +(* en Automatique. *) +(* *) +(* All rights reserved. This file is distributed under the terms of *) +(* the GNU Lesser General Public License version 2.1, with the *) +(* special exception on linking described in the file LICENSE. *) +(* *) +(**************************************************************************) + +type 'a t = {mutable next : int ; mutable data : 'a array} + +let default_size = 32 + +let create x = {next = 0 ; data = Array.make default_size x} +and reset t = t.next <- 0 + +let incr_table table new_size = + let t = Array.make new_size table.data.(0) in + Array.blit table.data 0 t 0 (Array.length table.data) ; + table.data <- t + +let emit table i = + let size = Array.length table.data in + if table.next >= size then + incr_table table (2*size); + table.data.(table.next) <- i ; + table.next <- table.next + 1 + + +exception Error + +let get t i = + if 0 <= i && i < t.next then + t.data.(i) + else + raise Error + +let trim t = + let r = Array.sub t.data 0 t.next in + reset t ; + r + +let iter t f = + let size = t.next + and data = t.data in + for i = 0 to size-1 do + f data.(i) + done + +let size t = t.next From 2c9a1de82bbf9dbcbc9c47506edb45694210e9e2 Mon Sep 17 00:00:00 2001 From: Hugo Heuzard Date: Wed, 15 Apr 2026 12:08:43 +0200 Subject: [PATCH 3/4] Add ocamllex and DFA oracle for testing tagged DFA compilation - ocamllex_oracle: converts Ir.t to ocamllex's regex AST, compiles with Lexgen.make_dfa, and simulates the resulting automata - sedlex_oracle: runs both ocamllex and sedlex DFA simulators, compares results, reports errors on any divergence - Add FIXME comment on add_node documenting the known epsilon-closure tag bug with overlapping Star/Plus and captures Co-Authored-By: Claude Opus 4.6 (1M context) --- dune-project | 1 + sedlex.opam | 1 + src/compiler/sedlex.ml | 16 +- test/ocamllex_oracle/dune | 3 + test/ocamllex_oracle/ocamllex_oracle.ml | 186 +++++++++++++++ test/oracle/dune | 3 + test/oracle/sedlex_oracle.ml | 291 ++++++++++++++++++++++++ test/oracle/sedlex_oracle.mli | 42 ++++ 8 files changed, 542 insertions(+), 1 deletion(-) create mode 100644 test/ocamllex_oracle/dune create mode 100644 test/ocamllex_oracle/ocamllex_oracle.ml create mode 100644 test/oracle/dune create mode 100644 test/oracle/sedlex_oracle.ml create mode 100644 test/oracle/sedlex_oracle.mli diff --git a/dune-project b/dune-project index 5421cec..c5fd611 100644 --- a/dune-project +++ b/dune-project @@ -27,4 +27,5 @@ extension.") (ppxlib (>= 0.26.0)) gen (ppx_expect :with-test) + (qcheck-core :with-test) (menhir :with-test))) diff --git a/sedlex.opam b/sedlex.opam index 7b917f4..9f98931 100644 --- a/sedlex.opam +++ b/sedlex.opam @@ -22,6 +22,7 @@ depends: [ "ppxlib" {>= "0.26.0"} "gen" "ppx_expect" {with-test} + "qcheck-core" {with-test} "menhir" {with-test} "odoc" {with-doc} ] diff --git a/src/compiler/sedlex.ml b/src/compiler/sedlex.ml index 7a28c66..b2a43ab 100644 --- a/src/compiler/sedlex.ml +++ b/src/compiler/sedlex.ml @@ -226,7 +226,21 @@ type state = node list (* [add_node (state, tags) node] adds [node] to the NFA-node set [state] via epsilon closure: it follows all epsilon edges recursively, collecting any tag operations encountered along the way. Returns the updated - (state, tags) pair. Physical identity (memq) prevents revisiting nodes. *) + (state, tags) pair. Physical identity (memq) prevents revisiting nodes. + + FIXME: all NFA paths through the epsilon closure contribute to a single + flat tag-op list, which is then attached to the DFA transition. At + runtime, every op in that list executes at the same position (the current + lexbuf position when the transition fires). This is correct when each + tag cell is written by at most one NFA path. But when a repetition + (Star/Plus) creates an epsilon loop that re-enters a bind's tag node, + [memq] prevents the second visit, so the tag op appears only once — set + by whichever path the closure explored first. A correct tagged DFA + (Laurikari, 2000) would maintain per-thread tag registers in each DFA + state so that different NFA paths record independent tag values and the + right one is selected on acceptance. Without this, patterns like + [Star 'a', ('a' as x)] produce wrong capture positions. + See test/test_oracle.ml "known limitation" for counterexamples. *) let rec add_node (state, tags) node = if List.memq node state then (state, tags) else ( diff --git a/test/ocamllex_oracle/dune b/test/ocamllex_oracle/dune new file mode 100644 index 0000000..3832e3b --- /dev/null +++ b/test/ocamllex_oracle/dune @@ -0,0 +1,3 @@ +(library + (name ocamllex_oracle) + (libraries sedlex.compiler ocamllex_vendored)) diff --git a/test/ocamllex_oracle/ocamllex_oracle.ml b/test/ocamllex_oracle/ocamllex_oracle.ml new file mode 100644 index 0000000..979a231 --- /dev/null +++ b/test/ocamllex_oracle/ocamllex_oracle.ml @@ -0,0 +1,186 @@ +(* Oracle using vendored ocamllex DFA compiler. + + Converts Ir.t patterns to ocamllex's Syntax.regular_expression, compiles + them with Lexgen.make_dfa, then simulates the resulting automata to produce + match results comparable to the sedlex oracle. *) + +open Ocamllex_vendored +module Sedlex_cset = Sedlex_compiler.Cset + +(* ================================================================== *) +(* Ir.t → Syntax.regular_expression conversion *) +(* ================================================================== *) + +let dummy_loc : Syntax.location = + { loc_file = ""; start_pos = 0; end_pos = 0; start_line = 0; start_col = 0 } + +(* Convert sedlex Cset.t (Unicode intervals) to ocamllex Cset.t (byte intervals). + Clamps intervals to the byte range 0-255; intervals entirely above 255 are dropped. *) +let convert_cset (cset : Sedlex_cset.t) : Cset.t = + List.fold_left + (fun acc (lo, hi) -> + if lo > 255 then acc else Cset.union acc (Cset.interval lo (min hi 255))) + Cset.empty (Sedlex_cset.to_list cset) + +open Sedlex_compiler.Ir + +let rec convert (ir : Sedlex_compiler.Ir.t) : Syntax.regular_expression = + match ir with + | Chars cset -> Characters (convert_cset cset) + | Eps -> Epsilon + | Seq elems -> + List.fold_left + (fun acc e -> Syntax.Sequence (acc, convert e)) + Epsilon elems + | Alt branches -> ( + match branches with + | [] -> Epsilon + | [b] -> convert b + | first :: rest -> + List.fold_left + (fun acc b -> Syntax.Alternative (acc, convert b)) + (convert first) rest) + | Star inner -> Repetition (convert inner) + | Plus inner -> Sequence (convert inner, Repetition (convert inner)) + | Rep (inner, lo, hi) -> + let r = convert inner in + let mandatory = + let rec repeat n acc = + if n <= 0 then acc else repeat (n - 1) (Syntax.Sequence (r, acc)) + in + repeat lo Epsilon + in + let optional = + let rec opt_repeat n acc = + if n <= 0 then acc + else + opt_repeat (n - 1) + (Syntax.Alternative (Epsilon, Sequence (r, acc))) + in + opt_repeat (hi - lo) Epsilon + in + Sequence (mandatory, optional) + | Capture (name, inner) -> Bind (convert inner, (name, dummy_loc)) + +(* ================================================================== *) +(* Build ocamllex entry from Ir.t rules *) +(* ================================================================== *) + +let build_entry (rules : Sedlex_compiler.Ir.t array) : + (string list, int) Syntax.entry = + let clauses = + Array.to_list (Array.mapi (fun i ir -> (convert ir, i)) rules) + in + { name = "oracle"; shortest = false; args = []; clauses } + +(* ================================================================== *) +(* Automata simulation *) +(* ================================================================== *) + +type binding = { name : string; start_offset : int; end_offset : int } +type result = { rule : int; length : int; bindings : binding list } + +let simulate_automata (entries : (string list, int) Lexgen.automata_entry list) + (auto : Lexgen.automata array) (input : int array) : result option = + let entry = List.hd entries in + let mem = Array.make entry.auto_mem_size (-1) in + let init_state, init_moves = entry.auto_initial_state in + (* Execute initial memory actions *) + let exec_mem_actions pos actions = + List.iter + (fun (action : Lexgen.memory_action) -> + match action with + | Set dst -> mem.(dst) <- pos + | Copy (dst, src) -> mem.(dst) <- mem.(src)) + actions + in + let exec_tag_actions actions = + List.iter + (fun (action : Lexgen.tag_action) -> + match action with + | SetTag (dst, src) -> mem.(dst) <- mem.(src) + | EraseTag dst -> mem.(dst) <- -1) + actions + in + exec_mem_actions 0 init_moves; + let last_action = ref (-1) in + let last_pos = ref 0 in + let last_mem = ref (Array.copy mem) in + let pos = ref 0 in + let state = ref init_state in + let running = ref true in + while !running do + match auto.(!state) with + | Lexgen.Perform (action, tag_ops) -> + exec_tag_actions tag_ops; + last_action := action; + last_pos := !pos; + last_mem := Array.copy mem; + running := false + | Lexgen.Shift (remember, moves) -> ( + (match remember with + | Remember (action, tag_ops) -> + exec_tag_actions tag_ops; + last_action := action; + last_pos := !pos; + last_mem := Array.copy mem + | No_remember -> ()); + let byte = + if !pos >= Array.length input then 256 (* eof *) else input.(!pos) + in + let move, mem_actions = moves.(byte) in + exec_mem_actions (!pos + 1) mem_actions; + match move with + | Goto target -> + incr pos; + state := target + | Backtrack -> running := false) + done; + if !last_action < 0 then None + else ( + (* Resolve bindings from the action's t_env *) + let action_num = !last_action in + let match_len = !last_pos in + let saved_mem = !last_mem in + let env = + try + let _, env, _ = + List.find (fun (n, _, _) -> n = action_num) entry.auto_actions + in + env + with Not_found -> [] + in + let resolve_addr (Lexgen.Sum (base, offset)) = + (match base with + | Lexgen.Start -> 0 + | Lexgen.End -> match_len + | Lexgen.Mem n -> saved_mem.(n)) + + offset + in + let bindings = + List.filter_map + (fun ((name, _loc), (info : Lexgen.ident_info)) -> + match info with + | Ident_string (is_opt, start_addr, end_addr) -> + let s = resolve_addr start_addr in + let e = resolve_addr end_addr in + if is_opt && s < 0 then None + else Some { name; start_offset = s; end_offset = e } + | Ident_char (is_opt, addr) -> + let p = resolve_addr addr in + if is_opt && p < 0 then None + else Some { name; start_offset = p; end_offset = p + 1 }) + env + |> List.sort compare + in + Some { rule = action_num; length = match_len; bindings }) + +(* ================================================================== *) +(* Public API *) +(* ================================================================== *) + +let simulate (rules : Sedlex_compiler.Ir.t array) (input : int array) : + result option = + let entry = build_entry rules in + let entries, auto = Lexgen.make_dfa [entry] in + simulate_automata entries auto input diff --git a/test/oracle/dune b/test/oracle/dune new file mode 100644 index 0000000..22405b1 --- /dev/null +++ b/test/oracle/dune @@ -0,0 +1,3 @@ +(library + (name sedlex_oracle) + (libraries sedlex.compiler ocamllex_oracle qcheck-core)) diff --git a/test/oracle/sedlex_oracle.ml b/test/oracle/sedlex_oracle.ml new file mode 100644 index 0000000..a65c17a --- /dev/null +++ b/test/oracle/sedlex_oracle.ml @@ -0,0 +1,291 @@ +(* Oracle: verify that the DFA compiled from IR preserves the language + and that tagged transitions (for as-bindings) record correct positions. + + Two independent simulators are compared: + - ocamllex: vendored ocamllex DFA compiler (independent TDFA implementation) + - DFA: sedlex's own determinized tagged DFA + + ocamllex serves as the reference oracle. The DFA is the system under test. *) + +open Sedlex_compiler + +(* ================================================================== *) +(* Convenience builders for Ir.t *) +(* ================================================================== *) + +let unwrap = function Ok t -> t | Error msg -> failwith msg +let lit c = Ir.chars (Cset.singleton (Char.code c)) +let cls a b = Ir.chars (Cset.interval (Char.code a) (Char.code b)) +let seq a b = Ir.seq a b +let ( ^. ) = seq +let alt a b = Ir.alt a b |> unwrap +let star r = Ir.star r |> unwrap +let plus r = Ir.plus r |> unwrap +let opt r = alt Ir.eps r +let rep r lo hi = Ir.rep r lo hi |> unwrap + +let compl r = + match (r : Ir.t) with + | Chars cset -> Ir.chars (Cset.difference Cset.any cset) + | _ -> failwith "compl: operand must be Chars" + +let sub a b = + match ((a : Ir.t), (b : Ir.t)) with + | Chars c1, Chars c2 -> Ir.chars (Cset.difference c1 c2) + | _ -> failwith "sub: operands must be Chars" + +let inter a b = + match ((a : Ir.t), (b : Ir.t)) with + | Chars c1, Chars c2 -> Ir.chars (Cset.intersection c1 c2) + | _ -> failwith "inter: operands must be Chars" + +let capture name r = Ir.capture name r |> unwrap + +(* ================================================================== *) +(* DFA simulator *) +(* ================================================================== *) + +type binding = { + name : string; + start_offset : int; (** Start position in input, or [unset] *) + end_offset : int; (** End position (exclusive) in input, or [unset] *) +} + +let unset = -1 + +type result = { rule : int; length : int; bindings : binding list } + +let encode_value v = -2 - v + +let exec_tag (tags : int array) pos = function + | Sedlex.Set_position cell -> tags.(cell) <- pos + | Sedlex.Set_value (cell, v) -> tags.(cell) <- encode_value v + +let resolve_pos_expr tags length = function + | Sedlex.Tag { tag; offset } -> tags.(tag) + offset + | Sedlex.Start_plus n -> n + | Sedlex.End_minus n -> length - n + +let resolve_bindings tags length (cbs : Sedlex.compiled_binding list) = + List.filter_map + (fun (cb : Sedlex.compiled_binding) -> + let active = + match cb.disc with + | [] -> true + | conditions -> + List.for_all + (fun (cell, value) -> tags.(cell) = encode_value value) + conditions + in + if not active then None + else + Some + { + name = cb.name; + start_offset = resolve_pos_expr tags length cb.start_pos; + end_offset = resolve_pos_expr tags length cb.end_pos; + }) + cbs + |> List.sort compare + +module DFA = struct + let find_trans (trans : (Cset.t * int * _) array) c = + let r = ref None in + Array.iter + (fun (cs, tgt, ops) -> + if !r = None && Cset.mem c cs then r := Some (tgt, ops)) + trans; + !r + + let accepting (s : Sedlex.dfa_state) = + let r = ref None in + for i = 0 to Array.length s.finals - 1 do + if !r = None && s.finals.(i) then r := Some i + done; + !r + + (* Simulate DFA with longest-match / first-rule-wins. *) + let simulate_ir (compiled : Sedlex.compiled_ir) input = + let dfa = compiled.dfa in + let binds_per_rule = compiled.bindings in + let tags = Array.make compiled.num_tags unset in + List.iter (exec_tag tags 0) compiled.init_tags; + let last = ref None in + let si = ref 0 in + let pos = ref 0 in + let continue = ref true in + while !continue do + (match accepting dfa.(!si) with + | Some r -> + let bindings = + resolve_bindings (Array.copy tags) !pos binds_per_rule.(r) + in + last := Some { rule = r; length = !pos; bindings } + | None -> ()); + if !pos >= Array.length input then continue := false + else ( + match find_trans dfa.(!si).trans input.(!pos) with + | None -> continue := false + | Some (tgt, ops) -> + List.iter (exec_tag tags (!pos + 1)) ops; + si := tgt; + incr pos) + done; + !last +end + +(* ================================================================== *) +(* Oracle: run ocamllex + DFA, compare, extract + verify captures *) +(* ================================================================== *) + +let input_of_string s = Array.init (String.length s) (fun i -> Char.code s.[i]) + +let format_binding input_str { name; start_offset; end_offset } = + if start_offset < 0 || end_offset < 0 then Printf.sprintf "%s=" name + else if + end_offset < start_offset + || start_offset > String.length input_str + || end_offset > String.length input_str + then Printf.sprintf "%s=" name start_offset end_offset + else + Printf.sprintf "%s=%S" name + (String.sub input_str start_offset (end_offset - start_offset)) + +let convert_ocamllex_result (r : Ocamllex_oracle.result option) : result option + = + match r with + | None -> None + | Some r -> + Some + { + rule = r.rule; + length = r.length; + bindings = + List.map + (fun (b : Ocamllex_oracle.binding) -> + { + name = b.name; + start_offset = b.start_offset; + end_offset = b.end_offset; + }) + r.bindings; + } + +let run_all irs input_str = + let compiled = Sedlex.compile_ir irs in + let input = input_of_string input_str in + let ocamllex = convert_ocamllex_result (Ocamllex_oracle.simulate irs input) in + let dfa = DFA.simulate_ir compiled input in + (ocamllex, dfa) + +let check irs input_str = + let ocamllex, dfa = run_all irs input_str in + ocamllex = dfa + +let oracle irs input_str = + let ocamllex, dfa = run_all irs input_str in + let fmt_result = function + | None -> "no match" + | Some r -> + let fmt_bindings b = + String.concat ", " (List.map (format_binding input_str) b) + in + Printf.sprintf "rule %d, len %d, [%s]" r.rule r.length + (fmt_bindings r.bindings) + in + if ocamllex = dfa then Printf.printf "%S -> %s\n" input_str (fmt_result dfa) + else + Printf.printf "ERROR %S -> ocamllex=%s dfa=%s\n" input_str + (fmt_result ocamllex) (fmt_result dfa) + +(* ================================================================== *) +(* QCheck: random regexp + random input *) +(* ================================================================== *) + +module G = QCheck2.Gen + +(* Generator for random Ir.t patterns with captures. *) +let gen_ir : Ir.t G.t = + let leaf = + G.oneof_weighted + [ + ( 3, + G.map + (fun c -> Ir.chars (Cset.singleton (Char.code c))) + (G.oneof_list ['a'; 'b'; 'c'; 'd']) ); + ( 1, + G.map2 + (fun a b -> Ir.chars (Cset.interval (Char.code a) (Char.code b))) + (G.oneof_list ['a'; 'b']) + (G.oneof_list ['c'; 'd']) ); + ] + in + let names = [| "x"; "y"; "z" |] in + (* gen_inner: no Capture (safe for Star/Plus/Rep bodies) *) + let gen_inner = + G.sized + @@ G.fix (fun self n -> + if n <= 0 then leaf + else ( + let child = self (n / 2) in + G.oneof_weighted + [ + (3, leaf); + (2, G.map2 (fun a b -> Ir.seq a b) child child); + (1, G.map2 (fun a b -> alt a b) child child); + (1, G.map (fun r -> plus r) (self (n / 3))); + (1, G.map (fun r -> star r) (self (n / 3))); + (1, G.map (fun r -> opt r) (self (n / 3))); + ( 1, + G.map2 + (fun lo hi -> + let lo = min lo hi and hi = max lo hi in + rep (cls 'a' 'c') lo hi) + (G.int_range 0 3) (G.int_range 0 3) ); + (1, G.return (sub (cls 'a' 'd') (cls 'c' 'd'))); + (1, G.return (inter (cls 'a' 'd') (cls 'b' 'c'))); + (1, G.return (compl (cls 'a' 'c'))); + ])) + in + (* Safe combinators for the generator: fall back gracefully when IR + validation rejects a combination (e.g. Alt with mismatched captures). *) + let safe_alt a b = + match Ir.alt a b with Ok t -> t | Error _ -> Ir.seq a b + in + let safe_capture name r = + match Ir.capture name r with Ok t -> t | Error _ -> r + in + (* gen_top: may contain Capture wrapping gen_inner, plus all operators. *) + let name_i = ref 0 in + G.sized + @@ G.fix (fun self n -> + if n <= 0 then leaf + else ( + let sub_top = self (n / 2) in + G.oneof_weighted + [ + (3, leaf); + (3, G.map2 (fun a b -> Ir.seq a b) sub_top sub_top); + (1, G.map2 (fun a b -> safe_alt a b) sub_top sub_top); + (1, G.map (fun r -> star r) gen_inner); + (1, G.map (fun r -> plus r) gen_inner); + (1, G.map (fun r -> opt r) gen_inner); + ( 2, + G.map + (fun r -> + let i = !name_i mod Array.length names in + incr name_i; + safe_capture names.(i) r) + gen_inner ); + ])) + +let gen_input = + G.string_size ~gen:(G.oneof_list ['a'; 'b'; 'c'; 'd'; 'e']) (G.int_range 0 15) + +let print_case (irs, input_str) = + let rules = + Array.to_list irs + |> List.mapi (fun i r -> Printf.sprintf " rule%d: %s" i (Ir.show r)) + |> String.concat "\n" + in + Printf.sprintf "rules:\n%s\ninput: %S" rules input_str diff --git a/test/oracle/sedlex_oracle.mli b/test/oracle/sedlex_oracle.mli new file mode 100644 index 0000000..85590ea --- /dev/null +++ b/test/oracle/sedlex_oracle.mli @@ -0,0 +1,42 @@ +(** Oracle for testing sedlex's tagged DFA compilation. + + Two independent simulators are compared: + - ocamllex: vendored ocamllex DFA compiler (independent TDFA) + - DFA: sedlex's own determinized tagged DFA *) + +open Sedlex_compiler + +(** {2 Convenience builders for Ir.t} + + Thin wrappers around {!Ir} smart constructors that raise on error instead of + returning [result]. *) + +val lit : char -> Ir.t +val cls : char -> char -> Ir.t +val seq : Ir.t -> Ir.t -> Ir.t +val alt : Ir.t -> Ir.t -> Ir.t +val star : Ir.t -> Ir.t +val plus : Ir.t -> Ir.t +val opt : Ir.t -> Ir.t +val rep : Ir.t -> int -> int -> Ir.t +val compl : Ir.t -> Ir.t +val sub : Ir.t -> Ir.t -> Ir.t +val inter : Ir.t -> Ir.t -> Ir.t +val capture : string -> Ir.t -> Ir.t +val ( ^. ) : Ir.t -> Ir.t -> Ir.t + +(** {2 Oracle} *) + +(** [check rules input] compiles each rule, runs the ocamllex DFA and sedlex + DFA, and returns [true] iff both agree exactly. *) +val check : Ir.t array -> string -> bool + +(** [oracle rules input] is like {!check} but also prints the result. Prints + [ERROR] when the two disagree. *) +val oracle : Ir.t array -> string -> unit + +(** {2 QCheck support} *) + +val gen_ir : Ir.t QCheck2.Gen.t +val gen_input : string QCheck2.Gen.t +val print_case : Ir.t array * string -> string From 48b7c94af09f9bd5d64316eef488a77d8571825e Mon Sep 17 00:00:00 2001 From: Hugo Heuzard Date: Wed, 15 Apr 2026 12:08:50 +0200 Subject: [PATCH 4/4] Add oracle expect tests: hand-written cases + QCheck fuzzing Hand-written tests cover captures, multi-rule, backtracking, bounded repetition, complement, subtraction, intersection, or-patterns, and the known Star/Plus overlap limitation. QCheck tests (50k iterations) fuzz single-rule and two-rule patterns with random captures. Co-Authored-By: Claude Opus 4.6 (1M context) --- test/dune | 2 +- test/test_oracle.ml | 223 ++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 224 insertions(+), 1 deletion(-) create mode 100644 test/test_oracle.ml diff --git a/test/dune b/test/dune index 53f358a..dbcdd1e 100644 --- a/test/dune +++ b/test/dune @@ -1,6 +1,6 @@ (library (name sedlex_test) - (libraries sedlex) + (libraries sedlex sedlex_oracle qcheck-core) (inline_tests (deps UTF-8-test.txt)) (enabled_if diff --git a/test/test_oracle.ml b/test/test_oracle.ml new file mode 100644 index 0000000..9b59ded --- /dev/null +++ b/test/test_oracle.ml @@ -0,0 +1,223 @@ +open Sedlex_compiler +open Sedlex_oracle +module G = QCheck2.Gen + +(* ================================================================== *) +(* Hand-written tests *) +(* ================================================================== *) + +let%expect_test "simple capture" = + oracle [| lit 'a' ^. capture "x" (plus (lit 'b')) ^. lit 'c' |] "abbc"; + [%expect {| "abbc" -> rule 0, len 4, [x="bb"] |}] + +let%expect_test "whole-match capture" = + oracle [| capture "x" (plus (cls 'a' 'z')) |] "hello"; + [%expect {| "hello" -> rule 0, len 5, [x="hello"] |}] + +let%expect_test "two captures in sequence" = + oracle + [| capture "x" (plus (lit 'a')) ^. capture "y" (plus (lit 'b')) |] + "aaabb"; + [%expect {| "aaabb" -> rule 0, len 5, [x="aaa", y="bb"] |}] + +let%expect_test "variable-length capture" = + oracle [| lit 'd' ^. capture "c" (star (cls 'a' 'c')) ^. lit 'd' |] "dabcd"; + [%expect {| "dabcd" -> rule 0, len 5, [c="abc"] |}] + +let%expect_test "no match" = + oracle [| lit 'a' ^. capture "x" (plus (lit 'b')) ^. lit 'c' |] "xyz"; + [%expect {| "xyz" -> no match |}] + +let%expect_test "multi-rule" = + oracle + [| + lit 'd' ^. capture "x" (plus (lit 'b')) ^. lit 'e'; + capture "x" (plus (cls 'a' 'c')); + |] + "dbbe"; + oracle + [| + lit 'd' ^. capture "x" (plus (lit 'b')) ^. lit 'e'; + capture "x" (plus (cls 'a' 'c')); + |] + "abc"; + [%expect + {| + "dbbe" -> rule 0, len 4, [x="bb"] + "abc" -> rule 1, len 3, [x="abc"] + |}] + +let%expect_test "backtrack restores tags" = + (* Rule 0 needs trailing 'c'; input "aaabb" backtracks to rule 1 *) + oracle + [| + capture "x" (plus (lit 'a')) ^. plus (lit 'b') ^. lit 'c'; + capture "x" (plus (lit 'a')) ^. plus (lit 'b'); + |] + "aaabb"; + [%expect {| "aaabb" -> rule 1, len 5, [x="aaa"] |}] + +let%expect_test "bounded repetition" = + oracle [| rep (lit 'a') 2 4 ^. capture "x" (plus (lit 'b')) |] "aaabbb"; + oracle [| rep (lit 'a') 2 4 ^. capture "x" (plus (lit 'b')) |] "abbb"; + [%expect + {| + "aaabbb" -> rule 0, len 6, [x="bbb"] + "abbb" -> no match + |}] + +let%expect_test "complement" = + (* Compl [a-c] matches any char NOT in a-c *) + oracle [| lit 'd' ^. capture "x" (compl (cls 'a' 'c')) ^. lit 'd' |] "dxd"; + oracle [| lit 'd' ^. capture "x" (compl (cls 'a' 'c')) ^. lit 'd' |] "dad"; + [%expect {| + "dxd" -> rule 0, len 3, [x="x"] + "dad" -> no match + |}] + +let%expect_test "subtraction" = + (* Sub([a-e], [c-e]) = [a-b] *) + oracle [| capture "x" (plus (sub (cls 'a' 'e') (cls 'c' 'e'))) |] "abcde"; + [%expect {| "abcde" -> rule 0, len 2, [x="ab"] |}] + +let%expect_test "intersection" = + (* Inter([a-d], [c-f]) = [c-d] *) + oracle [| capture "x" (plus (inter (cls 'a' 'd') (cls 'c' 'f'))) |] "cdabe"; + [%expect {| "cdabe" -> rule 0, len 2, [x="cd"] |}] + +let%expect_test "or-pattern with discriminator" = + (* (lit 'a' as x) | (lit 'b' as x) — each branch binds "x" with different tags *) + oracle [| alt (capture "x" (lit 'a')) (capture "x" (lit 'b')) |] "a"; + oracle [| alt (capture "x" (lit 'a')) (capture "x" (lit 'b')) |] "b"; + oracle [| alt (capture "x" (lit 'a')) (capture "x" (lit 'b')) |] "c"; + [%expect + {| + "a" -> rule 0, len 1, [x="a"] + "b" -> rule 0, len 1, [x="b"] + "c" -> no match + |}] + +let%expect_test "or-pattern with variable-length branches" = + (* (Plus 'a' as x) | (lit 'b', Plus 'c' as x) *) + oracle + [| + alt + (capture "x" (plus (lit 'a'))) + (capture "x" (seq (lit 'b') (plus (lit 'c')))); + |] + "aaa"; + oracle + [| + alt + (capture "x" (plus (lit 'a'))) + (capture "x" (seq (lit 'b') (plus (lit 'c')))); + |] + "bccc"; + [%expect + {| + "aaa" -> rule 0, len 3, [x="aaa"] + "bccc" -> rule 0, len 4, [x="bccc"] + |}] + +let%expect_test "known limitation: overlapping star and capture" = + (* When Star and the following capture overlap in character sets, + the DFA's epsilon closure (Sedlex.add_node) visits the bind's + start-tag node via the star's loop first, preventing the capture's + own path from recording its tag. See the FIXME on add_node. + + The capture must be variable-length and sandwiched between other + parts of the pattern so that Start_plus/End_minus optimizations + cannot resolve positions without tags. *) + + (* Disjoint char sets: works correctly *) + oracle [| star (lit 'a') ^. capture "x" (plus (lit 'b')) ^. lit 'c' |] "aabbc"; + (* Overlapping char sets: star and capture compete for 'a' *) + oracle [| star (lit 'a') ^. capture "x" (plus (lit 'a')) ^. lit 'b' |] "ab"; + oracle [| star (lit 'a') ^. capture "x" (plus (lit 'a')) ^. lit 'b' |] "aaab"; + (* Plus has the same issue *) + oracle [| plus (lit 'a') ^. capture "x" (plus (lit 'a')) ^. lit 'b' |] "aab"; + [%expect + {| + "aabbc" -> rule 0, len 5, [x="bb"] + ERROR "ab" -> ocamllex=rule 0, len 2, [x="a"] dfa=rule 0, len 2, [x=""] + ERROR "aaab" -> ocamllex=rule 0, len 4, [x="a"] dfa=rule 0, len 4, [x=""] + ERROR "aab" -> ocamllex=rule 0, len 3, [x="a"] dfa=rule 0, len 3, [x=""] + |}] + +let qcheck_test ~name gen = + let test = + QCheck2.Test.make ~count:50_000 ~max_fail:10 ~name + ~print:(fun (descs, input_str) -> + Array.iteri + (fun i d -> Printf.printf " rule%d: %s\n" i (Ir.show d)) + descs; + oracle descs input_str; + "") + gen + (fun (irs, input_str) -> check irs input_str) + in + try QCheck2.Test.check_exn test with QCheck2.Test.Test_fail _ -> () + +let%expect_test "qcheck: single rule" = + let gen = G.map2 (fun r s -> ([| r |], s)) gen_ir gen_input in + qcheck_test ~name:"single" gen; + [%expect + {| + rule0: ((Plus ['a'-'c'] as x), Star 'a', 'a') + ERROR "aaa" -> ocamllex=rule 0, len 3, [x="aa"] dfa=rule 0, len 3, [x="aaa"] + rule0: ((Star ['a'-'c'] as y), 'a', Star 'a') + ERROR "aa" -> ocamllex=rule 0, len 2, [y="a"] dfa=rule 0, len 2, [y="aa"] + rule0: (Star ['a'-'c'], ((Plus 'a', 'a') as y)) + ERROR "aaaaa" -> ocamllex=rule 0, len 5, [y="aa"] dfa=rule 0, len 5, [y=""] + rule0: ((Plus (Star 'a', 'd') as x), Plus 'd') + ERROR "dd" -> ocamllex=rule 0, len 2, [x="d"] dfa=rule 0, len 2, [x="dd"] + rule0: (Star (Rep(['a'-'c'], 0..0) | 'a'), (('a', 'a', ('a' | Star 'a')) as y)) + ERROR "aa" -> ocamllex=rule 0, len 2, [y="aa"] dfa=rule 0, len 2, [y=""] + rule0: (((Star 'a' | 'c') as z), 'c', Star 'a', 'c', ['a'-'c']) + ERROR "cca" -> ocamllex=rule 0, len 3, [z=""] dfa=rule 0, len 3, [z="c"] + rule0: ((('a' | Rep(['a'-'c'], 0..0)) as y), Plus 'a') + ERROR "a" -> ocamllex=rule 0, len 1, [y=""] dfa=rule 0, len 1, [y="a"] + rule0: (Star Rep(['a'-'c'], 0..1), (Star ('a', 'a') as y), 'a') + ERROR "a" -> ocamllex=rule 0, len 1, [y=""] dfa=rule 0, len 1, [y=] + rule0: ((eps | 'c'), (('a' | Plus 'c') as y), ('a' | Star 'a')) + ERROR "cb" -> ocamllex=rule 0, len 1, [y="c"] dfa=rule 0, len 1, [y=""] + rule0: (('a' | Star 'a'), (Star Rep(['a'-'c'], 0..1) | 'a'), (('b' | ('a', 'a')) as x)) + ERROR "b" -> ocamllex=rule 0, len 1, [x="b"] dfa=rule 0, len 1, [x=""] + |}] + +let%expect_test "qcheck: two rules" = + let gen = G.map3 (fun a b s -> ([| a; b |], s)) gen_ir gen_ir gen_input in + qcheck_test ~name:"two rules" gen; + [%expect + {| + rule0: 'a' + rule1: (Star (['a'-'c'], Star 'a'), ('a' as y), Star 'a', 'b') + ERROR "ab" -> ocamllex=rule 1, len 2, [y="a"] dfa=rule 1, len 2, [y=] + rule0: 'a' + rule1: ((Star 'a' as x), 'b', (Star 'a' as x)) + ERROR "b" -> ocamllex=rule 1, len 1, [x=""] dfa=rule 1, len 1, [x="", x=""] + rule0: 'a' + rule1: (Star 'a', (Plus 'a' as y)) + ERROR "aa" -> ocamllex=rule 1, len 2, [y="a"] dfa=rule 1, len 2, [y=""] + rule0: ('d', Star Rep(['a'-'c'], 0..0), (('a' | Star 'a') as x), Plus ['a'-'c'], 'd') + rule1: 'a' + ERROR "dad" -> ocamllex=rule 0, len 3, [x=""] dfa=rule 0, len 3, [x="a"] + rule0: ((('b', Star 'a', ['a'-'c'], 'a', Plus Star 'b') as x), 'b', ('a' | Star 'a')) + rule1: 'a' + ERROR "baab" -> ocamllex=rule 0, len 4, [x="baa"] dfa=rule 0, len 4, [x="baab"] + rule0: 'a' + rule1: (Star ['a'-'c'], 'b', (['a'-'c'] as y), 'b', Star 'a') + ERROR "abab" -> ocamllex=rule 1, len 4, [y="a"] dfa=rule 1, len 4, [y=] + rule0: 'a' + rule1: (Star 'd', ('c' as z), ['a'-'c'], 'c', ('c' as z)) + ERROR "dcacc" -> ocamllex=rule 1, len 5, [z="c"] dfa=rule 1, len 5, [z="c", z="c"] + rule0: 'a' + rule1: (('a' | Star 'a'), ('a' as x), Star 'a') + ERROR "aaaaa" -> ocamllex=rule 1, len 5, [x="a"] dfa=rule 1, len 5, [x=] + rule0: 'a' + rule1: ((Star ['a'-'c'] as z), Star 'a', 'b') + ERROR "b" -> ocamllex=rule 1, len 1, [z=""] dfa=rule 1, len 1, [z="b"] + rule0: 'a' + rule1: (Star ['a'-'c'], ('b' as z), (('a', 'a') | ['a'-'c'])) + ERROR "ba" -> ocamllex=rule 1, len 2, [z="b"] dfa=rule 1, len 2, [z=] + |}]