Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions CHANGES.md
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
# unreleased
- Add `Sedlexing.accept` for custom buffer control over final state acceptance (#81)
- Support nested `let..in` for `[%sedlex.regexp?]` definitions
- Add support for named captured group (#177, #178)

Expand Down
2 changes: 2 additions & 0 deletions src/lib/sedlexing.ml
Original file line number Diff line number Diff line change
Expand Up @@ -259,6 +259,8 @@ let start lexbuf =
lexbuf.start_line <- lexbuf.curr_line;
mark lexbuf (-1)

let accept _lexbuf = true

let backtrack lexbuf =
lexbuf.pos <- lexbuf.marked_pos;
lexbuf.bytes_pos <- lexbuf.marked_bytes_pos;
Expand Down
21 changes: 16 additions & 5 deletions src/lib/sedlexing.mli
Original file line number Diff line number Diff line change
Expand Up @@ -11,11 +11,12 @@

It is possible to have sedlex-generated lexers work on a custom
implementation for lex buffers. To do this, define a module [L] which
implements the [start], [next], [mark] and [backtrack] functions (See the
Internal Interface section below for a specification). They need not work on
a type named [lexbuf]: you can use the type name you want. Then, just do in
your sedlex-processed source, bind this module to the name [Sedlexing] (for
instance, with a local module definition: [let module Sedlexing = L in ...].
implements the [start], [next], [mark], [backtrack] and [accept] functions
(See the Internal Interface section below for a specification). They need
not work on a type named [lexbuf]: you can use the type name you want. Then,
just do in your sedlex-processed source, bind this module to the name
[Sedlexing] (for instance, with a local module definition:
[let module Sedlexing = L in ...].

Of course, you'll probably want to define functions like [lexeme] to be used
in the lexers semantic actions. *)
Expand Down Expand Up @@ -234,6 +235,16 @@ val backtrack : lexbuf -> int
and can be removed at any time. *)
val __private__next_int : lexbuf -> int

(** [accept lexbuf] is called when the lexer reaches a final state. Returns
[true] to accept the match or [false] to reject it (causing [backtrack] to
be called instead). The default implementation always returns [true].

A custom [Sedlexing] module can override this to inspect the current
position and reject matches, falling back to an earlier marked state. This
is useful, for example, to reject matches that do not end on a grapheme
cluster boundary. *)
val accept : lexbuf -> bool

(** Tagged DFA memory cells for [as] bindings.

The following functions manage an internal array of memory cells used to
Expand Down
17 changes: 12 additions & 5 deletions src/syntax/ppx_sedlex.ml
Original file line number Diff line number Diff line change
Expand Up @@ -231,13 +231,18 @@ let state_fun state = Printf.sprintf "__sedlex_state_%i" state

(* [call_state lexbuf auto state] generates the expression that transitions
into DFA [state]. If the state has no outgoing transitions (a sink), it
returns the accepting rule index directly; otherwise it emits a function
call to the generated state function. *)
calls [accept] to check whether the match is valid, returning the rule
index or falling back to [backtrack]; otherwise it emits a function call
to the generated state function. *)
let call_state lexbuf (auto : Sedlex.dfa) state =
let loc = default_loc in
let { Sedlex.trans; finals } = auto.(state) in
if Array.length trans = 0 then (
match best_final finals with
| Some i -> eint ~loc:default_loc i
| Some i ->
[%expr
if Sedlexing.accept [%e lexbuf] then [%e eint ~loc i]
else Sedlexing.backtrack [%e lexbuf]]
| None -> assert false)
else appfun (state_fun state) [lexbuf]

Expand All @@ -264,7 +269,8 @@ let gen_tag_ops lexbuf (ops : Sedlex.tag_op list) cont =

(* [gen_state (lexbuf_name, lexbuf) auto i {trans; finals}] generates the
function [__sedlex_state_N] for DFA state [i]. The function:
1. If the state is accepting, calls [mark] to save the current position.
1. If the state is accepting, calls [accept] to check whether the
match is valid, then [mark] to save the position if accepted.
2. Reads the next code point, maps it through the partition function to
get an equivalence class index, then pattern-matches on that index.
3. Each transition arm executes its tag operations then calls the target
Expand Down Expand Up @@ -309,7 +315,8 @@ let gen_state (lexbuf_name, lexbuf) (auto : Sedlex.dfa) i
| Some i ->
ret
[%expr
Sedlexing.mark [%e lexbuf] [%e eint ~loc i];
if Sedlexing.accept [%e lexbuf] then
Sedlexing.mark [%e lexbuf] [%e eint ~loc i];
[%e body ()]]

(* [gen_recflag auto] determines whether the generated state functions need
Expand Down
84 changes: 48 additions & 36 deletions test/codegen/test_gen.ml
Original file line number Diff line number Diff line change
Expand Up @@ -27,11 +27,11 @@ let%expect_test "simple string match" =
| _ -> Sedlexing.backtrack buf
and __sedlex_state_1 buf =
match __sedlex_partition_2 (Sedlexing.__private__next_int buf) with
| 0 -> 0
| 0 -> if Sedlexing.accept buf then 0 else Sedlexing.backtrack buf
| _ -> Sedlexing.backtrack buf
and __sedlex_state_3 buf =
match __sedlex_partition_3 (Sedlexing.__private__next_int buf) with
| 0 -> 0
| 0 -> if Sedlexing.accept buf then 0 else Sedlexing.backtrack buf
| _ -> Sedlexing.backtrack buf in
match Sedlexing.start buf; __sedlex_state_0 buf with | 0 -> () | _ -> ()
|}]
Expand Down Expand Up @@ -59,7 +59,7 @@ let%expect_test "character class" =
| 0 -> __sedlex_state_1 buf
| _ -> Sedlexing.backtrack buf
and __sedlex_state_1 buf =
Sedlexing.mark buf 0;
if Sedlexing.accept buf then Sedlexing.mark buf 0;
(match __sedlex_partition_1 (Sedlexing.__private__next_int buf) with
| 0 -> __sedlex_state_1 buf
| _ -> Sedlexing.backtrack buf) in
Expand Down Expand Up @@ -103,17 +103,17 @@ let%expect_test "multi-rule" =
| 2 -> __sedlex_state_4 buf
| _ -> Sedlexing.backtrack buf
and __sedlex_state_1 buf =
Sedlexing.mark buf 2;
if Sedlexing.accept buf then Sedlexing.mark buf 2;
(match __sedlex_partition_2 (Sedlexing.__private__next_int buf) with
| 0 -> __sedlex_state_1 buf
| _ -> Sedlexing.backtrack buf)
and __sedlex_state_2 buf =
match __sedlex_partition_3 (Sedlexing.__private__next_int buf) with
| 0 -> 0
| 0 -> if Sedlexing.accept buf then 0 else Sedlexing.backtrack buf
| _ -> Sedlexing.backtrack buf
and __sedlex_state_4 buf =
match __sedlex_partition_4 (Sedlexing.__private__next_int buf) with
| 0 -> 1
| 0 -> if Sedlexing.accept buf then 1 else Sedlexing.backtrack buf
| _ -> Sedlexing.backtrack buf in
match Sedlexing.start buf; __sedlex_state_0 buf with
| 0 -> ()
Expand Down Expand Up @@ -153,7 +153,7 @@ let%expect_test "as binding: simple" =
| _ -> Sedlexing.backtrack buf
and __sedlex_state_2 buf =
match __sedlex_partition_3 (Sedlexing.__private__next_int buf) with
| 0 -> 0
| 0 -> if Sedlexing.accept buf then 0 else Sedlexing.backtrack buf
| _ -> Sedlexing.backtrack buf in
match Sedlexing.start buf; __sedlex_state_0 buf with
| 0 ->
Expand Down Expand Up @@ -188,7 +188,7 @@ let%expect_test "as binding: whole-match shortcut" =
| 0 -> __sedlex_state_1 buf
| _ -> Sedlexing.backtrack buf
and __sedlex_state_1 buf =
Sedlexing.mark buf 0;
if Sedlexing.accept buf then Sedlexing.mark buf 0;
(match __sedlex_partition_1 (Sedlexing.__private__next_int buf) with
| 0 -> __sedlex_state_1 buf
| _ -> Sedlexing.backtrack buf) in
Expand Down Expand Up @@ -235,7 +235,7 @@ let%expect_test "as binding: multiple bindings" =
| _ -> Sedlexing.backtrack buf
and __sedlex_state_2 buf =
match __sedlex_partition_3 (Sedlexing.__private__next_int buf) with
| 0 -> 0
| 0 -> if Sedlexing.accept buf then 0 else Sedlexing.backtrack buf
| _ -> Sedlexing.backtrack buf in
match Sedlexing.start buf; __sedlex_state_0 buf with
| 0 ->
Expand Down Expand Up @@ -280,12 +280,12 @@ let%expect_test "as binding: or-pattern with discriminator" =
| 1 -> __sedlex_state_2 buf
| _ -> Sedlexing.backtrack buf
and __sedlex_state_1 buf =
Sedlexing.mark buf 0;
if Sedlexing.accept buf then Sedlexing.mark buf 0;
(match __sedlex_partition_2 (Sedlexing.__private__next_int buf) with
| 0 -> __sedlex_state_1 buf
| _ -> Sedlexing.backtrack buf)
and __sedlex_state_2 buf =
Sedlexing.mark buf 0;
if Sedlexing.accept buf then Sedlexing.mark buf 0;
(match __sedlex_partition_3 (Sedlexing.__private__next_int buf) with
| 0 -> __sedlex_state_2 buf
| _ -> Sedlexing.backtrack buf) in
Expand Down Expand Up @@ -352,8 +352,12 @@ let%expect_test "as binding: shared prefix or-pattern" =
| _ -> Sedlexing.backtrack buf
and __sedlex_state_5 buf =
match __sedlex_partition_6 (Sedlexing.__private__next_int buf) with
| 0 -> (Sedlexing.__private__set_mem_value buf 0 0; 0)
| 1 -> (Sedlexing.__private__set_mem_value buf 0 1; 0)
| 0 ->
(Sedlexing.__private__set_mem_value buf 0 0;
if Sedlexing.accept buf then 0 else Sedlexing.backtrack buf)
| 1 ->
(Sedlexing.__private__set_mem_value buf 0 1;
if Sedlexing.accept buf then 0 else Sedlexing.backtrack buf)
| _ -> Sedlexing.backtrack buf in
match Sedlexing.start buf;
Sedlexing.__private__init_mem buf 1;
Expand Down Expand Up @@ -418,12 +422,16 @@ let%expect_test "as binding: 3-way or reuses disc cell" =
and __sedlex_state_3 buf =
match __sedlex_partition_4 (Sedlexing.__private__next_int buf) with
| 0 -> (Sedlexing.__private__set_mem_value buf 0 0; __sedlex_state_4 buf)
| 1 -> (Sedlexing.__private__set_mem_value buf 0 1; 0)
| 1 ->
(Sedlexing.__private__set_mem_value buf 0 1;
if Sedlexing.accept buf then 0 else Sedlexing.backtrack buf)
| _ -> Sedlexing.backtrack buf
and __sedlex_state_4 buf =
Sedlexing.mark buf 0;
if Sedlexing.accept buf then Sedlexing.mark buf 0;
(match __sedlex_partition_5 (Sedlexing.__private__next_int buf) with
| 0 -> (Sedlexing.__private__set_mem_value buf 0 2; 0)
| 0 ->
(Sedlexing.__private__set_mem_value buf 0 2;
if Sedlexing.accept buf then 0 else Sedlexing.backtrack buf)
| _ -> Sedlexing.backtrack buf) in
match Sedlexing.start buf;
Sedlexing.__private__init_mem buf 1;
Expand Down Expand Up @@ -483,11 +491,11 @@ let%expect_test "as binding: multi-rule" =
| _ -> Sedlexing.backtrack buf
and __sedlex_state_1 buf =
match __sedlex_partition_2 (Sedlexing.__private__next_int buf) with
| 0 -> 0
| 0 -> if Sedlexing.accept buf then 0 else Sedlexing.backtrack buf
| _ -> Sedlexing.backtrack buf
and __sedlex_state_3 buf =
match __sedlex_partition_3 (Sedlexing.__private__next_int buf) with
| 0 -> 1
| 0 -> if Sedlexing.accept buf then 1 else Sedlexing.backtrack buf
| _ -> Sedlexing.backtrack buf in
match Sedlexing.start buf; __sedlex_state_0 buf with
| 0 ->
Expand Down Expand Up @@ -525,7 +533,7 @@ let%expect_test "as binding: wrapping alternation" =
| _ -> Sedlexing.backtrack buf
and __sedlex_state_1 buf =
match __sedlex_partition_2 (Sedlexing.__private__next_int buf) with
| 0 -> 0
| 0 -> if Sedlexing.accept buf then 0 else Sedlexing.backtrack buf
| _ -> Sedlexing.backtrack buf in
match Sedlexing.start buf; __sedlex_state_0 buf with
| 0 ->
Expand Down Expand Up @@ -586,7 +594,7 @@ let%expect_test "optim: element-length (Offset_from_tag)" =
| 0 -> __sedlex_state_3 buf
| _ -> Sedlexing.backtrack buf
and __sedlex_state_3 buf =
Sedlexing.mark buf 0;
if Sedlexing.accept buf then Sedlexing.mark buf 0;
(match __sedlex_partition_3 (Sedlexing.__private__next_int buf) with
| 0 -> __sedlex_state_3 buf
| _ -> Sedlexing.backtrack buf) in
Expand Down Expand Up @@ -638,12 +646,12 @@ let%expect_test "optim: or-pattern offset propagation" =
| 1 -> __sedlex_state_2 buf
| _ -> Sedlexing.backtrack buf
and __sedlex_state_1 buf =
Sedlexing.mark buf 0;
if Sedlexing.accept buf then Sedlexing.mark buf 0;
(match __sedlex_partition_2 (Sedlexing.__private__next_int buf) with
| 0 -> __sedlex_state_1 buf
| _ -> Sedlexing.backtrack buf)
and __sedlex_state_2 buf =
Sedlexing.mark buf 0;
if Sedlexing.accept buf then Sedlexing.mark buf 0;
(match __sedlex_partition_3 (Sedlexing.__private__next_int buf) with
| 0 -> __sedlex_state_2 buf
| _ -> Sedlexing.backtrack buf) in
Expand Down Expand Up @@ -691,12 +699,12 @@ let%expect_test "optim: discriminator elision" =
| 1 -> __sedlex_state_2 buf
| _ -> Sedlexing.backtrack buf
and __sedlex_state_1 buf =
Sedlexing.mark buf 0;
if Sedlexing.accept buf then Sedlexing.mark buf 0;
(match __sedlex_partition_2 (Sedlexing.__private__next_int buf) with
| 0 -> __sedlex_state_1 buf
| _ -> Sedlexing.backtrack buf)
and __sedlex_state_2 buf =
Sedlexing.mark buf 0;
if Sedlexing.accept buf then Sedlexing.mark buf 0;
(match __sedlex_partition_3 (Sedlexing.__private__next_int buf) with
| 0 -> __sedlex_state_2 buf
| _ -> Sedlexing.backtrack buf) in
Expand Down Expand Up @@ -748,7 +756,7 @@ let%expect_test "optim: intra-rule tag coalescing" =
| 1 -> __sedlex_state_2 buf
| _ -> Sedlexing.backtrack buf
and __sedlex_state_2 buf =
Sedlexing.mark buf 0;
if Sedlexing.accept buf then Sedlexing.mark buf 0;
(match __sedlex_partition_3 (Sedlexing.__private__next_int buf) with
| 0 -> __sedlex_state_2 buf
| _ -> Sedlexing.backtrack buf) in
Expand Down Expand Up @@ -827,7 +835,7 @@ let%expect_test "optim: cross-rule cell sharing" =
| 1 -> __sedlex_state_3 buf
| _ -> Sedlexing.backtrack buf
and __sedlex_state_3 buf =
Sedlexing.mark buf 0;
if Sedlexing.accept buf then Sedlexing.mark buf 0;
(match __sedlex_partition_4 (Sedlexing.__private__next_int buf) with
| 0 -> __sedlex_state_3 buf
| _ -> Sedlexing.backtrack buf)
Expand All @@ -842,7 +850,7 @@ let%expect_test "optim: cross-rule cell sharing" =
| 1 -> __sedlex_state_6 buf
| _ -> Sedlexing.backtrack buf
and __sedlex_state_6 buf =
Sedlexing.mark buf 1;
if Sedlexing.accept buf then Sedlexing.mark buf 1;
(match __sedlex_partition_7 (Sedlexing.__private__next_int buf) with
| 0 -> __sedlex_state_6 buf
| _ -> Sedlexing.backtrack buf) in
Expand Down Expand Up @@ -913,8 +921,8 @@ let%expect_test "optim: dead tag elimination" =
and __sedlex_state_2 buf =
match __sedlex_partition_3 (Sedlexing.__private__next_int buf) with
| 0 -> __sedlex_state_2 buf
| 1 -> 0
| 2 -> 1
| 1 -> if Sedlexing.accept buf then 0 else Sedlexing.backtrack buf
| 2 -> if Sedlexing.accept buf then 1 else Sedlexing.backtrack buf
| _ -> Sedlexing.backtrack buf in
match Sedlexing.start buf;
Sedlexing.__private__init_mem buf 1;
Expand Down Expand Up @@ -966,7 +974,7 @@ let%expect_test "optim: self-loop tag delay" =
| 1 -> __sedlex_state_2 buf
| _ -> Sedlexing.backtrack buf
and __sedlex_state_2 buf =
Sedlexing.mark buf 0;
if Sedlexing.accept buf then Sedlexing.mark buf 0;
(match __sedlex_partition_3 (Sedlexing.__private__next_int buf) with
| 0 -> __sedlex_state_2 buf
| _ -> Sedlexing.backtrack buf) in
Expand Down Expand Up @@ -1023,7 +1031,7 @@ let%expect_test "optim: tag remapping after coalescing" =
and __sedlex_state_2 buf =
match __sedlex_partition_3 (Sedlexing.__private__next_int buf) with
| 0 -> __sedlex_state_2 buf
| 1 -> 0
| 1 -> if Sedlexing.accept buf then 0 else Sedlexing.backtrack buf
| _ -> Sedlexing.backtrack buf in
match Sedlexing.start buf; __sedlex_state_0 buf with
| 0 ->
Expand Down Expand Up @@ -1082,9 +1090,9 @@ let%expect_test "optim: set_prev with backtracking" =
| 1 -> __sedlex_state_2 buf
| _ -> Sedlexing.backtrack buf
and __sedlex_state_2 buf =
Sedlexing.mark buf 0;
if Sedlexing.accept buf then Sedlexing.mark buf 0;
(match __sedlex_partition_2 (Sedlexing.__private__next_int buf) with
| 0 -> 0
| 0 -> if Sedlexing.accept buf then 0 else Sedlexing.backtrack buf
| 1 -> __sedlex_state_2 buf
| _ -> Sedlexing.backtrack buf) in
match Sedlexing.start buf;
Expand Down Expand Up @@ -1150,7 +1158,7 @@ let%expect_test "Rep fixed-length prefix enables Start_plus" =
| 0 -> __sedlex_state_4 buf
| _ -> Sedlexing.backtrack buf
and __sedlex_state_4 buf =
Sedlexing.mark buf 0;
if Sedlexing.accept buf then Sedlexing.mark buf 0;
(match __sedlex_partition_2 (Sedlexing.__private__next_int buf) with
| 0 -> __sedlex_state_4 buf
| _ -> Sedlexing.backtrack buf) in
Expand Down Expand Up @@ -1224,7 +1232,9 @@ let%expect_test "as binding: or-chain then nested or on right" =
| _ -> Sedlexing.backtrack buf
and __sedlex_state_3 buf =
match __sedlex_partition_4 (Sedlexing.__private__next_int buf) with
| 0 -> (Sedlexing.__private__set_mem_value buf 0 0; 0)
| 0 ->
(Sedlexing.__private__set_mem_value buf 0 0;
if Sedlexing.accept buf then 0 else Sedlexing.backtrack buf)
| _ -> Sedlexing.backtrack buf
and __sedlex_state_5 buf =
match __sedlex_partition_5 (Sedlexing.__private__next_int buf) with
Expand All @@ -1245,7 +1255,9 @@ let%expect_test "as binding: or-chain then nested or on right" =
| _ -> Sedlexing.backtrack buf
and __sedlex_state_9 buf =
match __sedlex_partition_7 (Sedlexing.__private__next_int buf) with
| 0 -> (Sedlexing.__private__set_mem_value buf 0 2; 0)
| 0 ->
(Sedlexing.__private__set_mem_value buf 0 2;
if Sedlexing.accept buf then 0 else Sedlexing.backtrack buf)
| _ -> Sedlexing.backtrack buf
and __sedlex_state_11 buf =
match __sedlex_partition_4 (Sedlexing.__private__next_int buf) with
Expand Down
Loading
Loading