Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions dune-project
Original file line number Diff line number Diff line change
Expand Up @@ -27,4 +27,5 @@ extension.")
(ppxlib (>= 0.26.0))
gen
(ppx_expect :with-test)
(qcheck-core :with-test)
(menhir :with-test)))
1 change: 1 addition & 0 deletions sedlex.opam
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@ depends: [
"ppxlib" {>= "0.26.0"}
"gen"
"ppx_expect" {with-test}
"qcheck-core" {with-test}
"menhir" {with-test}
"odoc" {with-doc}
]
Expand Down
5 changes: 5 additions & 0 deletions src/compiler/cset.ml
Original file line number Diff line number Diff line change
Expand Up @@ -74,5 +74,10 @@ let complement c =
in
match c with (-1, j) :: l -> aux (succ j) l | l -> aux (-1) l

let rec mem c = function
| [] -> false
| (lo, hi) :: rest ->
if c < lo then false else if c <= hi then true else mem c rest

let intersection c1 c2 = complement (union (complement c1) (complement c2))
let difference c1 c2 = complement (union (complement c1) c2)
1 change: 1 addition & 0 deletions src/compiler/cset.mli
Original file line number Diff line number Diff line change
Expand Up @@ -23,4 +23,5 @@ val is_empty : t -> bool
val eof : t
val singleton : int -> t
val interval : int -> int -> t
val mem : int -> t -> bool
val to_seq : t -> int Seq.t
16 changes: 15 additions & 1 deletion src/compiler/sedlex.ml
Original file line number Diff line number Diff line change
Expand Up @@ -226,7 +226,21 @@ type state = node list
(* [add_node (state, tags) node] adds [node] to the NFA-node set [state]
via epsilon closure: it follows all epsilon edges recursively, collecting
any tag operations encountered along the way. Returns the updated
(state, tags) pair. Physical identity (memq) prevents revisiting nodes. *)
(state, tags) pair. Physical identity (memq) prevents revisiting nodes.

FIXME: all NFA paths through the epsilon closure contribute to a single
flat tag-op list, which is then attached to the DFA transition. At
runtime, every op in that list executes at the same position (the current
lexbuf position when the transition fires). This is correct when each
tag cell is written by at most one NFA path. But when a repetition
(Star/Plus) creates an epsilon loop that re-enters a bind's tag node,
[memq] prevents the second visit, so the tag op appears only once — set
by whichever path the closure explored first. A correct tagged DFA
(Laurikari, 2000) would maintain per-thread tag registers in each DFA
state so that different NFA paths record independent tag values and the
right one is selected on acceptance. Without this, patterns like
[Star 'a', ('a' as x)] produce wrong capture positions.
See test/test_oracle.ml "known limitation" for counterexamples. *)
let rec add_node (state, tags) node =
if List.memq node state then (state, tags)
else (
Expand Down
2 changes: 1 addition & 1 deletion test/dune
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
(library
(name sedlex_test)
(libraries sedlex)
(libraries sedlex sedlex_oracle qcheck-core)
(inline_tests
(deps UTF-8-test.txt))
(enabled_if
Expand Down
3 changes: 3 additions & 0 deletions test/ocamllex_oracle/dune
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
(library
(name ocamllex_oracle)
(libraries sedlex.compiler ocamllex_vendored))
186 changes: 186 additions & 0 deletions test/ocamllex_oracle/ocamllex_oracle.ml
Original file line number Diff line number Diff line change
@@ -0,0 +1,186 @@
(* Oracle using vendored ocamllex DFA compiler.

Converts Ir.t patterns to ocamllex's Syntax.regular_expression, compiles
them with Lexgen.make_dfa, then simulates the resulting automata to produce
match results comparable to the sedlex oracle. *)

open Ocamllex_vendored
module Sedlex_cset = Sedlex_compiler.Cset

(* ================================================================== *)
(* Ir.t → Syntax.regular_expression conversion *)
(* ================================================================== *)

let dummy_loc : Syntax.location =
{ loc_file = ""; start_pos = 0; end_pos = 0; start_line = 0; start_col = 0 }

(* Convert sedlex Cset.t (Unicode intervals) to ocamllex Cset.t (byte intervals).
Clamps intervals to the byte range 0-255; intervals entirely above 255 are dropped. *)
let convert_cset (cset : Sedlex_cset.t) : Cset.t =
List.fold_left
(fun acc (lo, hi) ->
if lo > 255 then acc else Cset.union acc (Cset.interval lo (min hi 255)))
Cset.empty (Sedlex_cset.to_list cset)

open Sedlex_compiler.Ir

let rec convert (ir : Sedlex_compiler.Ir.t) : Syntax.regular_expression =
match ir with
| Chars cset -> Characters (convert_cset cset)
| Eps -> Epsilon
| Seq elems ->
List.fold_left
(fun acc e -> Syntax.Sequence (acc, convert e))
Epsilon elems
| Alt branches -> (
match branches with
| [] -> Epsilon
| [b] -> convert b
| first :: rest ->
List.fold_left
(fun acc b -> Syntax.Alternative (acc, convert b))
(convert first) rest)
| Star inner -> Repetition (convert inner)
| Plus inner -> Sequence (convert inner, Repetition (convert inner))
| Rep (inner, lo, hi) ->
let r = convert inner in
let mandatory =
let rec repeat n acc =
if n <= 0 then acc else repeat (n - 1) (Syntax.Sequence (r, acc))
in
repeat lo Epsilon
in
let optional =
let rec opt_repeat n acc =
if n <= 0 then acc
else
opt_repeat (n - 1)
(Syntax.Alternative (Epsilon, Sequence (r, acc)))
in
opt_repeat (hi - lo) Epsilon
in
Sequence (mandatory, optional)
| Capture (name, inner) -> Bind (convert inner, (name, dummy_loc))

(* ================================================================== *)
(* Build ocamllex entry from Ir.t rules *)
(* ================================================================== *)

let build_entry (rules : Sedlex_compiler.Ir.t array) :
(string list, int) Syntax.entry =
let clauses =
Array.to_list (Array.mapi (fun i ir -> (convert ir, i)) rules)
in
{ name = "oracle"; shortest = false; args = []; clauses }

(* ================================================================== *)
(* Automata simulation *)
(* ================================================================== *)

type binding = { name : string; start_offset : int; end_offset : int }
type result = { rule : int; length : int; bindings : binding list }

let simulate_automata (entries : (string list, int) Lexgen.automata_entry list)
(auto : Lexgen.automata array) (input : int array) : result option =
let entry = List.hd entries in
let mem = Array.make entry.auto_mem_size (-1) in
let init_state, init_moves = entry.auto_initial_state in
(* Execute initial memory actions *)
let exec_mem_actions pos actions =
List.iter
(fun (action : Lexgen.memory_action) ->
match action with
| Set dst -> mem.(dst) <- pos
| Copy (dst, src) -> mem.(dst) <- mem.(src))
actions
in
let exec_tag_actions actions =
List.iter
(fun (action : Lexgen.tag_action) ->
match action with
| SetTag (dst, src) -> mem.(dst) <- mem.(src)
| EraseTag dst -> mem.(dst) <- -1)
actions
in
exec_mem_actions 0 init_moves;
let last_action = ref (-1) in
let last_pos = ref 0 in
let last_mem = ref (Array.copy mem) in
let pos = ref 0 in
let state = ref init_state in
let running = ref true in
while !running do
match auto.(!state) with
| Lexgen.Perform (action, tag_ops) ->
exec_tag_actions tag_ops;
last_action := action;
last_pos := !pos;
last_mem := Array.copy mem;
running := false
| Lexgen.Shift (remember, moves) -> (
(match remember with
| Remember (action, tag_ops) ->
exec_tag_actions tag_ops;
last_action := action;
last_pos := !pos;
last_mem := Array.copy mem
| No_remember -> ());
let byte =
if !pos >= Array.length input then 256 (* eof *) else input.(!pos)
in
let move, mem_actions = moves.(byte) in
exec_mem_actions (!pos + 1) mem_actions;
match move with
| Goto target ->
incr pos;
state := target
| Backtrack -> running := false)
done;
if !last_action < 0 then None
else (
(* Resolve bindings from the action's t_env *)
let action_num = !last_action in
let match_len = !last_pos in
let saved_mem = !last_mem in
let env =
try
let _, env, _ =
List.find (fun (n, _, _) -> n = action_num) entry.auto_actions
in
env
with Not_found -> []
in
let resolve_addr (Lexgen.Sum (base, offset)) =
(match base with
| Lexgen.Start -> 0
| Lexgen.End -> match_len
| Lexgen.Mem n -> saved_mem.(n))
+ offset
in
let bindings =
List.filter_map
(fun ((name, _loc), (info : Lexgen.ident_info)) ->
match info with
| Ident_string (is_opt, start_addr, end_addr) ->
let s = resolve_addr start_addr in
let e = resolve_addr end_addr in
if is_opt && s < 0 then None
else Some { name; start_offset = s; end_offset = e }
| Ident_char (is_opt, addr) ->
let p = resolve_addr addr in
if is_opt && p < 0 then None
else Some { name; start_offset = p; end_offset = p + 1 })
env
|> List.sort compare
in
Some { rule = action_num; length = match_len; bindings })

(* ================================================================== *)
(* Public API *)
(* ================================================================== *)

let simulate (rules : Sedlex_compiler.Ir.t array) (input : int array) :
result option =
let entry = build_entry rules in
let entries, auto = Lexgen.make_dfa [entry] in
simulate_automata entries auto input
1 change: 1 addition & 0 deletions test/ocamllex_vendored/.ocamlformat-ignore
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
*
33 changes: 33 additions & 0 deletions test/ocamllex_vendored/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
# Vendored ocamllex sources

These files are copied verbatim from the OCaml compiler's `lex/` directory.
They provide ocamllex's DFA compiler (`Lexgen.make_dfa`) which is used as an
independent reference oracle to test sedlex's tagged DFA compilation.

## Files

| File | Description |
|------|-------------|
| `cset.ml` / `.mli` | Character sets |
| `lexgen.ml` / `.mli` | DFA compiler with tagged transitions |
| `syntax.ml` / `.mli` | Regular expression AST |
| `table.ml` | DFA table representation |

## Updating

Run `./sync.sh` to copy the latest sources from the current opam switch:

```
./test/ocamllex_vendored/sync.sh
```

To sync from a specific directory:

```
OCAML_LEX_DIR=/path/to/ocaml/lex ./test/ocamllex_vendored/sync.sh
```

## License

These files are part of the OCaml compiler and are distributed under the
GNU Lesser General Public License version 2.1 (see file headers).
97 changes: 97 additions & 0 deletions test/ocamllex_vendored/cset.ml
Original file line number Diff line number Diff line change
@@ -0,0 +1,97 @@
(**************************************************************************)
(* *)
(* OCaml *)
(* *)
(* Luc Maranget, Jerome Vouillon projet Cristal, *)
(* INRIA Rocquencourt *)
(* *)
(* Copyright 2002 Institut National de Recherche en Informatique et *)
(* en Automatique. *)
(* *)
(* All rights reserved. This file is distributed under the terms of *)
(* the GNU Lesser General Public License version 2.1, with the *)
(* special exception on linking described in the file LICENSE. *)
(* *)
(**************************************************************************)

exception Bad

type t = (int * int) list


let empty = []
let is_empty = function
| [] -> true
| _ -> false

let singleton c = [c,c]

let interval c1 c2 =
if c1 <= c2 then [c1,c2]
else [c2,c1]


let rec union s1 s2 = match s1,s2 with
| [],_ -> s2
| _,[] -> s1
| (c1,d1) as p1::r1, (c2,d2)::r2 ->
if c1 > c2 then
union s2 s1
else begin (* c1 <= c2 *)
if d1+1 < c2 then
p1::union r1 s2
else if d1 < d2 then
union ((c1,d2)::r2) r1
else
union s1 r2
end

let rec inter l l' = match l, l' with
_, [] -> []
| [], _ -> []
| (c1, c2)::r, (c1', c2')::r' ->
if c2 < c1' then
inter r l'
else if c2' < c1 then
inter l r'
else if c2 < c2' then
(Int.max c1 c1', c2)::inter r l'
else
(Int.max c1 c1', c2')::inter l r'

let rec diff l l' = match l, l' with
_, [] -> l
| [], _ -> []
| (c1, c2)::r, (c1', c2')::r' ->
if c2 < c1' then
(c1, c2)::diff r l'
else if c2' < c1 then
diff l r'
else
let r'' = if c2' < c2 then (c2' + 1, c2) :: r else r in
if c1 < c1' then
(c1, c1' - 1)::diff r'' r'
else
diff r'' r'


let eof = singleton 256
and all_chars = interval 0 255
and all_chars_eof = interval 0 256

let complement s = diff all_chars s

let env_to_array env = match env with
| [] -> assert false
| (_,x)::rem ->
let res = Array.make 257 x in
List.iter
(fun (c,y) ->
List.iter
(fun (i,j) ->
for k=i to j do
res.(k) <- y
done)
c)
rem ;
res
Loading
Loading