Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions CHANGES.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
# unreleased
- Support nested `let..in` for `[%sedlex.regexp?]` definitions
- Add shortest match support via `match%sedlex.shortest` (#180)

# 3.7 (2025-10-06)
- Update to unicode 17.0.0
Expand Down
7 changes: 7 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -92,6 +92,13 @@ Like ocamllex, sedlex uses **longest match** with **first rule priority**:
the input `"if"` is matched by the first rule because it is listed
first, even though the second rule also accepts `"if"`.

sedlex also supports **shortest match** via `match%sedlex.shortest`.
With shortest match, the lexer returns as soon as *any* rule matches rather
than continuing to find the longest match. First rule priority still applies
when multiple rules match at the same position. For example, `Plus 'a'` with
input `"aaa"` matches just `"a"` in shortest mode, whereas it would match
`"aaa"` in longest mode.

The actions can call functions from the Sedlexing module to extract
(parts of) the matched lexeme, in the desired encoding.

Expand Down
31 changes: 29 additions & 2 deletions src/syntax/ppx_sedlex.ml
Original file line number Diff line number Diff line change
Expand Up @@ -480,7 +480,7 @@ let regexp_of_pattern env =
in
aux ~encoding:Ascii

let handle_sedlex_match ~env ~map_rhs match_expr =
let handle_sedlex_match ?(shortest = false) ~env ~map_rhs match_expr =
let lexbuf =
match match_expr with
| { pexp_desc = Pexp_match (lexbuf, _); _ } -> (
Expand Down Expand Up @@ -517,7 +517,7 @@ let handle_sedlex_match ~env ~map_rhs match_expr =
cases
in
let brs = Array.of_list cases in
let auto = Sedlex.compile (Array.map fst brs) in
let auto = Sedlex.compile ~shortest (Array.map fst brs) in
(gen_definition lexbuf auto cases error, auto)

let previous = ref []
Expand Down Expand Up @@ -550,6 +550,25 @@ let mapper =
| [%expr [%sedlex [%e? { pexp_desc = Pexp_match _; _ } as match_expr]]]
->
fst (handle_sedlex_match ~env ~map_rhs:this#expression match_expr)
(* match%sedlex.shortest <lexbuf> with ... *)
| {
pexp_desc =
Pexp_extension
( { txt = "sedlex.shortest"; _ },
PStr
[
{
pstr_desc =
Pstr_eval
(({ pexp_desc = Pexp_match _; _ } as match_expr), _);
_;
};
] );
_;
} ->
fst
(handle_sedlex_match ~shortest:true ~env ~map_rhs:this#expression
match_expr)
(* let <name> = <rhs> in <body> — intercept when <rhs> is a regexp *)
| [%expr
let [%p? { ppat_desc = Ppat_var { txt = name; _ }; _ }] =
Expand All @@ -563,6 +582,11 @@ let mapper =
| [%expr [%sedlex [%e? _]]] ->
err e.pexp_loc
"the %%sedlex extension is only recognized on match expressions"
| { pexp_desc = Pexp_extension ({ txt = "sedlex.shortest"; _ }, _); _ }
->
err e.pexp_loc
"the %%sedlex.shortest extension is only recognized on match \
expressions"
| _ -> super#expression e

val toplevel = true
Expand Down Expand Up @@ -619,6 +643,9 @@ let extensions =
Extension.declare "sedlex" Extension.Context.expression
Ast_pattern.(single_expr_payload __)
(fun ~loc:_ ~path:_ expr -> mapper#expression expr);
Extension.declare "sedlex.shortest" Extension.Context.expression
Ast_pattern.(single_expr_payload __)
(fun ~loc:_ ~path:_ expr -> mapper#expression expr);
]

let () =
Expand Down
27 changes: 25 additions & 2 deletions src/syntax/sedlex.ml
Original file line number Diff line number Diff line change
Expand Up @@ -120,7 +120,7 @@ let transition (state : state) =
type dfa_state = { trans : (Cset.t * int) array; finals : bool array }
type dfa = dfa_state array

let compile rs =
let compile ?(shortest = false) rs =
let rs = Array.map compile_re rs in
let counter = ref 0 in
let states = Hashtbl.create 31 in
Expand All @@ -141,7 +141,30 @@ let compile rs =
Array.iter (fun (i, _) -> init := add_node !init i) rs;
let i = aux !init in
assert (i = 0);
Array.init !counter (Hashtbl.find states_def)
let dfa = Array.init !counter (Hashtbl.find states_def) in
if shortest then (
(* Collect reachable states, stripping transitions from accepting ones *)
let n = Array.length dfa in
let remap = Array.make n (-1) in
let order = Array.make n 0 in
let next = ref 0 in
let rec mark i =
if remap.(i) = -1 then (
let j = !next in
remap.(i) <- j;
order.(j) <- i;
incr next;
let st = dfa.(i) in
if not (Array.exists Fun.id st.finals) then
Array.iter (fun (_, t) -> mark t) st.trans)
in
mark 0;
Array.init !next (fun j ->
let st = dfa.(order.(j)) in
if Array.exists Fun.id st.finals then { st with trans = [||] }
else
{ st with trans = Array.map (fun (c, t) -> (c, remap.(t))) st.trans }))
Comment on lines +145 to +166
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Could you extract this as separate function so we can reason/change the implementation if it is ever needed?

else dfa

let cset_to_label cset =
let escape_dot c =
Expand Down
2 changes: 1 addition & 1 deletion src/syntax/sedlex.mli
Original file line number Diff line number Diff line change
Expand Up @@ -25,5 +25,5 @@ val intersection : regexp -> regexp -> regexp option
type dfa_state = { trans : (Sedlex_cset.t * int) array; finals : bool array }
type dfa = dfa_state array

val compile : regexp array -> dfa
val compile : ?shortest:bool -> regexp array -> dfa
val dfa_to_dot : dfa -> string
84 changes: 84 additions & 0 deletions test/basic.ml
Original file line number Diff line number Diff line change
Expand Up @@ -1128,3 +1128,87 @@ let%expect_test "nested_let_regexp_toplevel" =
Hex: 0xDEAD
Word: rest
EOF |}]

let%expect_test "shortest match" =
(* "ab" | "abc" with input "abc": shortest should match "ab" (rule 0) *)
let buf = Sedlexing.Utf8.from_string "abc" in
let result =
match%sedlex.shortest buf with
| "ab" -> "matched ab"
| "abc" -> "matched abc"
| _ -> "error"
in
Printf.printf "%s\n" result;
Printf.printf "lexeme: %s\n" (Sedlexing.Utf8.lexeme buf);
[%expect {|
matched ab
lexeme: ab |}];
(* Longest match for comparison *)
let buf = Sedlexing.Utf8.from_string "abc" in
let result =
match%sedlex buf with
| "ab" -> "matched ab"
| "abc" -> "matched abc"
| _ -> "error"
in
Printf.printf "%s\n" result;
Printf.printf "lexeme: %s\n" (Sedlexing.Utf8.lexeme buf);
[%expect {|
matched abc
lexeme: abc |}]

let%expect_test "shortest match priority" =
(* Two rules matching same shortest prefix: first rule wins *)
let buf = Sedlexing.Utf8.from_string "ab" in
let result =
match%sedlex.shortest buf with
| "ab" -> "rule 0"
| "ab" | "abc" -> "rule 1"
| _ -> "error"
in
Printf.printf "%s\n" result;
[%expect {| rule 0 |}]

let%expect_test "shortest match no match" =
(* No match: error case *)
let buf = Sedlexing.Utf8.from_string "xyz" in
let result =
match%sedlex.shortest buf with "ab" -> "matched" | _ -> "no match"
in
Printf.printf "%s\n" result;
[%expect {| no match |}]

let%expect_test "shortest match with star" =
(* Plus 'a' with input "aaa": shortest matches single 'a' *)
let buf = Sedlexing.Utf8.from_string "aaa" in
let result =
match%sedlex.shortest buf with Plus 'a' -> "matched" | _ -> "error"
in
Printf.printf "%s\n" result;
Printf.printf "lexeme: %s\n" (Sedlexing.Utf8.lexeme buf);
[%expect {|
matched
lexeme: a |}]

let%expect_test "shortest match repeated" =
(* Use shortest match in a loop to tokenize *)
let buf = Sedlexing.Utf8.from_string "aabab" in
let rec loop () =
match%sedlex.shortest buf with
| Plus 'a' ->
Printf.printf "a+: %s\n" (Sedlexing.Utf8.lexeme buf);
loop ()
| "b" ->
Printf.printf "b: %s\n" (Sedlexing.Utf8.lexeme buf);
loop ()
| eof -> Printf.printf "done\n"
| _ -> Printf.printf "error\n"
in
loop ();
[%expect {|
a+: a
a+: a
b: b
a+: a
b: b
done |}]
167 changes: 167 additions & 0 deletions test/codegen/test_gen.ml
Original file line number Diff line number Diff line change
Expand Up @@ -118,3 +118,170 @@ let%expect_test "multi-rule" =
Sedlexing.start buf;
(match __sedlex_state_0 buf with | 0 -> () | 1 -> () | 2 -> () | _ -> ())
|}]

(* Shortest match codegen tests *)

let%expect_test "shortest: simple string match" =
(match%sedlex_test_shortest buf with "ab" | "de" -> () | _ -> ());
[%expect
{|
DOT:
digraph {
rankdir=LR;
node [shape=circle];

_start [shape=point];
_start -> state0;

state0 [label="0"];
state0 -> state1 [label="'a'"];
state0 -> state3 [label="'d'"];
state1 [label="1"];
state1 -> state2 [label="'b'"];
state2 [label="2\n[rule 0]", shape=doublecircle];
state3 [label="3"];
state3 -> state2 [label="'e'"];
}
CODE:
let rec __sedlex_state_0 buf =
match __sedlex_partition_1 (Sedlexing.__private__next_int buf) with
| 0 -> __sedlex_state_1 buf
| 1 -> __sedlex_state_3 buf
| _ -> Sedlexing.backtrack buf
and __sedlex_state_1 buf =
match __sedlex_partition_2 (Sedlexing.__private__next_int buf) with
| 0 -> 0
| _ -> Sedlexing.backtrack buf
and __sedlex_state_3 buf =
match __sedlex_partition_3 (Sedlexing.__private__next_int buf) with
| 0 -> 0
| _ -> Sedlexing.backtrack buf in
Sedlexing.start buf; (match __sedlex_state_0 buf with | 0 -> () | _ -> ())
|}]

let%expect_test "shortest: character class (Plus)" =
(match%sedlex_test_shortest buf with Plus 'a' .. 'z' -> () | _ -> ());
[%expect
{|
DOT:
digraph {
rankdir=LR;
node [shape=circle];

_start [shape=point];
_start -> state0;

state0 [label="0"];
state0 -> state1 [label="'a'-'z'"];
state1 [label="1\n[rule 0]", shape=doublecircle];
}
CODE:
let __sedlex_state_0 buf =
match __sedlex_partition_1 (Sedlexing.__private__next_int buf) with
| 0 -> 0
| _ -> Sedlexing.backtrack buf in
Sedlexing.start buf; (match __sedlex_state_0 buf with | 0 -> () | _ -> ())
|}]

let%expect_test "shortest: multi-rule" =
(match%sedlex_test_shortest buf with
| "ab" -> ()
| "de" -> ()
| Plus '0' .. '9' -> ()
| _ -> ());
[%expect
{|
DOT:
digraph {
rankdir=LR;
node [shape=circle];

_start [shape=point];
_start -> state0;

state0 [label="0"];
state0 -> state1 [label="'0'-'9'"];
state0 -> state2 [label="'a'"];
state0 -> state4 [label="'d'"];
state1 [label="1\n[rule 2]", shape=doublecircle];
state2 [label="2"];
state2 -> state3 [label="'b'"];
state3 [label="3\n[rule 0]", shape=doublecircle];
state4 [label="4"];
state4 -> state5 [label="'e'"];
state5 [label="5\n[rule 1]", shape=doublecircle];
}
CODE:
let rec __sedlex_state_0 buf =
match __sedlex_partition_1 (Sedlexing.__private__next_int buf) with
| 0 -> 2
| 1 -> __sedlex_state_2 buf
| 2 -> __sedlex_state_4 buf
| _ -> Sedlexing.backtrack buf
and __sedlex_state_2 buf =
match __sedlex_partition_2 (Sedlexing.__private__next_int buf) with
| 0 -> 0
| _ -> Sedlexing.backtrack buf
and __sedlex_state_4 buf =
match __sedlex_partition_3 (Sedlexing.__private__next_int buf) with
| 0 -> 1
| _ -> Sedlexing.backtrack buf in
Sedlexing.start buf;
(match __sedlex_state_0 buf with | 0 -> () | 1 -> () | 2 -> () | _ -> ())
|}]

let%expect_test "shortest: overlapping prefixes" =
(match%sedlex_test_shortest buf with "ab" -> () | "abc" -> () | _ -> ());
[%expect
{|
DOT:
digraph {
rankdir=LR;
node [shape=circle];

_start [shape=point];
_start -> state0;

state0 [label="0"];
state0 -> state1 [label="'a'"];
state1 [label="1"];
state1 -> state2 [label="'b'"];
state2 [label="2\n[rule 0]", shape=doublecircle];
}
CODE:
let rec __sedlex_state_0 buf =
match __sedlex_partition_1 (Sedlexing.__private__next_int buf) with
| 0 -> __sedlex_state_1 buf
| _ -> Sedlexing.backtrack buf
and __sedlex_state_1 buf =
match __sedlex_partition_2 (Sedlexing.__private__next_int buf) with
| 0 -> 0
| _ -> Sedlexing.backtrack buf in
Sedlexing.start buf;
(match __sedlex_state_0 buf with | 0 -> () | 1 -> () | _ -> ())
|}]

let%expect_test "shortest: dead rule (prefix match)" =
(match%sedlex_test_shortest buf with "a" -> () | "aa" -> () | _ -> ());
[%expect
{|
DOT:
digraph {
rankdir=LR;
node [shape=circle];

_start [shape=point];
_start -> state0;

state0 [label="0"];
state0 -> state1 [label="'a'"];
state1 [label="1\n[rule 0]", shape=doublecircle];
}
CODE:
let __sedlex_state_0 buf =
match __sedlex_partition_1 (Sedlexing.__private__next_int buf) with
| 0 -> 0
| _ -> Sedlexing.backtrack buf in
Sedlexing.start buf;
(match __sedlex_state_0 buf with | 0 -> () | 1 -> () | _ -> ())
|}]
Loading
Loading