hhugo · hhugo · Jun 5, 2025 · Sep 11, 2025 · Oct 6, 2025 · Nov 24, 2025
diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml
@@ -30,12 +30,6 @@ jobs:
     runs-on: ${{ matrix.os }}
 
     steps:
-      - name: Set git to use LF
-        run: |
-          git config --global core.autocrlf false
-          git config --global core.eol lf
-          git config --global core.ignorecase false
-
       - name: Checkout code
         uses: actions/checkout@v4
 

diff --git a/.github/workflows/changelog.yml b/.github/workflows/changelog.yml
@@ -17,4 +17,4 @@ jobs:
     runs-on: ubuntu-latest
     steps:
       - name: Check changelog
-        uses: tarides/changelog-check-action@v1
+        uses: tarides/changelog-check-action@v3
diff --git a/.ocamlformat b/.ocamlformat
@@ -1,4 +1,4 @@
-version=0.27.0
+version=0.28.1
 profile = conventional
 break-separators = after
 space-around-lists = false

diff --git a/CHANGES.md b/CHANGES.md
@@ -1,4 +1,11 @@
-# dev
+# unreleased
+- Support nested `let..in` for `[%sedlex.regexp?]` definitions
+- Add support for named captured group (#177)
+
+# 3.7 (2025-10-06)
+- Update to unicode 17.0.0
+
+# 3.6 (2025-01-05)
 - Fixed one of the ranges implementing
   Implement Corrigendum #1: UTF-8 Shortest Form
   for 4-bytes long characters (#171)

diff --git a/README.md b/README.md
@@ -43,6 +43,7 @@ if you use camlp4 or camlp5, with the standard or revised syntax.
 
 ## Lexer specifications
 
+### Match rules
 
 sedlex adds a new kind of expression to OCaml: lexer definitions.
 The syntax for the new construction is:
@@ -58,7 +59,7 @@ The syntax for the new construction is:
 or:
 
 ```ocaml
-  [%sedlex match lexbuf with 
+  [%sedlex match lexbuf with
   | R1 -> e1
   ...
   | Rn -> en
@@ -79,97 +80,188 @@ where:
 Unlike ocamllex, lexers work on stream of Unicode codepoints, not
 bytes.
 
+Like ocamllex, sedlex uses **longest match** with **first rule priority**:
+
+- The lexer always tries to match the longest possible prefix of the
+  input.  It does so by continuing to read characters as long as some
+  rule can still match a longer string, while remembering the last
+  position at which a rule did match.
+
+- When two or more rules match the same longest prefix (a tie), the
+  rule that appears first in the `match%sedlex` definition wins.  For
+  example, given the rules `| "if" -> ...` and `| Plus ('a'..'z') -> ...`,
+  the input `"if"` is matched by the first rule because it is listed
+  first, even though the second rule also accepts `"if"`.
+
+**Important:** The `_` (catch-all) case is *not* a regexp — it is the
+fallback when no rule matches.  Because no characters were consumed, the
+matched lexeme is empty (`""`).  If you need to consume and report an
+unexpected character, use `any` instead:
+
+```ocaml
+  | any -> Printf.sprintf "unexpected character: %s" (Sedlexing.Utf8.lexeme buf)
+  | _ -> failwith "unexpected end of input" (* any matches every character, but not eof *)
+```
+
 The actions can call functions from the Sedlexing module to extract
 (parts of) the matched lexeme, in the desired encoding.
 
+### Regexp syntax
+
 Regular expressions are syntactically OCaml patterns:
 
 - `"...."` (string constant): recognize the specified string.
-- `'....'` (character constant) : recognize the specified character
-- `i` (integer constant) : recognize the specified codepoint
-- `'...' .. '...'`: character range
-- `i1 .. i2`: range between two codepoints
-- `R1 | R2` : alternation
-- `R, R2, ..., Rn` : concatenation
-- `Star R` : Kleene star (0 or more repetition)
-- `Plus R` : equivalent to `R, R*`
-- `Opt R` : equivalent to `("" | R)`
-- `Rep (R, n)` : equivalent to `R{n}`
-- `Rep (R, n .. m)` : equivalent to `R{n, m}`
-- `Chars "..."` : recognize any character in the string
-- `Compl R` : assume that R is a single-character length regexp (see below)
-  and recognize the complement set
-- `Sub (R1,R2)` : assume that R is a single-character length regexp (see below)
-  and recognize the set of items in `R1` but not in `R2` ("subtract")
-- `Intersect (R1,R2)` : assume that `R` is a single-character length regexp (see
-  below) and recognize the set of items which are in both `R1` and `R2`
-- `Utf8 R` : string literals inside R are assumed to be utf-8 encoded.
-- `Latin1 R` : string literals inside R are assumed to be latin1 encoded.
-- `Ascii R` : string literals inside R are assumed to be ascii encoded.
-- `lid` (lowercase identifier) : reference a named regexp (see below)
+- `'....'` (character constant): recognize the specified character.
+- `i` (integer constant): recognize the specified codepoint.
+- `'...' .. '...'`: character range.
+- `i1 .. i2`: range between two codepoints.
+- `R1 | R2`: alternation.
+- `R1, R2, ..., Rn`: concatenation.
+- `Star R`: Kleene star (0 or more repetition).
+- `Plus R`: one or more repetitions (equivalent to `R, Star R`).
+- `Opt R`: equivalent to `("" | R)`.
+- `Rep (R, n)`: equivalent to `R{n}`.
+- `Rep (R, n .. m)`: equivalent to `R{n, m}`.
+- `Chars "..."`: recognize any character in the string.
+- `Compl R`: assume that R is a single-character length regexp (see below)
+  and recognize the complement set.
+- `Sub (R1,R2)`: assume that `R1` and `R2` are single-character length regexps (see below)
+  and recognize the set of items in `R1` but not in `R2` ("subtract").
+- `Intersect (R1,R2)`: assume that `R1` and `R2` are single-character length regexps (see
+  below) and recognize the set of items which are in both `R1` and `R2`.
+- `Utf8 R`: string literals inside R are assumed to be utf-8 encoded.
+- `Latin1 R`: string literals inside R are assumed to be latin1 encoded.
+- `Ascii R`: string literals inside R are assumed to be ascii encoded.
+- `lid` (lowercase identifier): reference a named regexp (see below).
 
 A single-character length regexp is a regexp which does not contain (after
 expansion of references) concatenation, Star, Plus, Opt or string constants
 with a length different from one.
 
+### Precedence
 
+Since sedlex regular expressions are encoded as OCaml patterns, they follow
+OCaml's pattern precedence rules. From lowest to highest:
 
-Note:
- - The OCaml source is assumed to be encoded in UTF-8.
- - Strings and chars litterals will be interpreted in ASCII unless otherwise
-   specified by the `Latin1`,`Ascii` and `Utf8` constructors in patterns.
+| Precedence | Operator | Example | Parses as |
+|---|---|---|---|
+| lowest | `\|` (alternation) | `'a' \| 'b', 'c'` | `'a' \| ('b', 'c')` |
+| | `,` (concatenation) | `"ab", Star 'c'` | `"ab", (Star 'c')` |
+| highest | Constructor application (`Star`, `Plus`, `Opt`, ...) | `Star 'a' \| 'b'` | `(Star 'a') \| 'b'` |
 
+In particular, `Star r1, r2` is `(Star r1), r2` (not `Star (r1, r2)`),
+and `r1 | r2, r3` is `r1 | (r2, r3)` (not `(r1 | r2), r3`).
+Use parentheses to override: `Star (r1, r2)`, `(r1 | r2), r3`.
 
-It is possible to define named regular expressions with the following
-construction, that can appear in place of a structure item:
+### Named capture groups (`as` bindings)
+
+You can capture sub-matches using OCaml's `as` pattern syntax:
 
 ```ocaml
-  let lid = [%sedlex.regexp? R]
+match%sedlex buf with
+| (Plus ('0'..'9') as num), '.', (Plus ('0'..'9') as frac) ->
+    let n = Sedlexing.Utf8.of_submatch num in
+    let f = Sedlexing.Utf8.of_submatch frac in
+    Printf.printf "integer=%s fractional=%s\n" n f
+| _ -> ()
 ```
 
-where lid is the regexp name to be defined and R its definition.  The
-scope of the "lid" regular expression is the rest of the structure,
-after the definition.
+Each `as` binding produces a value of type `Sedlexing.submatch`. Use the
+extraction functions to obtain the matched content:
+
+- `Sedlexing.Utf8.of_submatch s` returns the sub-match as a UTF-8 string.
+- `Sedlexing.Latin1.of_submatch s` returns the sub-match as a Latin-1 string.
+- `Sedlexing.lexeme_of_submatch s` returns the sub-match as a `Uchar.t array`.
 
-The same syntax can be used for local binding:
+Or-patterns work as expected — both sides must bind the same names:
 
 ```ocaml
-  let lid = [%sedlex.regexp? R] in
-  body
+match%sedlex buf with
+| ("0x", Plus hex_digit as n) | (Plus ('0'..'9') as n) ->
+    Sedlexing.Utf8.of_submatch n
+| _ -> ...
 ```
 
-The scope of "lid" is the body expression.
+**Restriction:** `as` bindings are not allowed inside repetition operators
+(`Star`, `Plus`, `Opt`, `Rep`) or set operators (`Compl`, `Sub`, `Intersect`).
+
+### Encoding
+
+- The OCaml source is assumed to be encoded in UTF-8.
+- String and character literals are interpreted as ASCII unless otherwise
+  specified by the `Latin1`, `Ascii` and `Utf8` constructors in patterns.
+
+### Named regular expressions
+
+
+
+You can give names to regular expressions with `[%sedlex.regexp? ...]` and
+reference them by name in lexer rules.
+
+**Top-level definitions** are visible for the rest of the module:
+
+```ocaml
+let digit = [%sedlex.regexp? '0' .. '9']
+let number = [%sedlex.regexp? Plus digit]
+
+let rec token buf =
+  match%sedlex buf with
+  | number -> INT (Sedlexing.Utf8.lexeme buf)
+  | _ -> ...
+```
+
+**Local definitions** with `let ... in` are scoped to the body expression:
+
+```ocaml
+let hex_digit =
+  let digit = [%sedlex.regexp? '0' .. '9'] in
+  let hex_letter = [%sedlex.regexp? 'a' .. 'f' | 'A' .. 'F'] in
+  [%sedlex.regexp? digit | hex_letter]
+```
+
+Local definitions also work inside expressions:
+
+```ocaml
+let token buf =
+  let int_lit =
+    let digit = [%sedlex.regexp? '0' .. '9'] in
+    [%sedlex.regexp? Plus digit]
+  in
+  match%sedlex buf with
+  | int_lit -> ...
+  | _ -> ...
+```
 
 
 ## Predefined regexps
 
 sedlex provides a set of predefined regexps:
-- any: any character
-- eof: the virtual end-of-file character
-- xml_letter, xml_digit, xml_extender, xml_base_char, xml_ideographic,
-  xml_combining_char, xml_blank: as defined by the XML recommandation
-- tr8876_ident_char: characters names in identifiers from ISO TR8876
-- cc, cf, cn, co, cs, ll, lm, lo, lt, lu, mc, me, mn, nd, nl, no, pc, pd,
-  pe, pf, pi, po, ps, sc, sk, sm, so, zl, zp, zs: as defined by the
-  Unicode standard (categories)
-- alphabetic, ascii_hex_digit, hex_digit, id_continue, id_start,
-  lowercase, math, other_alphabetic, other_lowercase, other_math,
-  other_uppercase, uppercase, white_space, xid_continue, xid_start: as
-  defined by the Unicode standard (properties)
+- `any`: any character.
+- `eof`: the virtual end-of-file character.
+- `xml_letter`, `xml_digit`, `xml_extender`, `xml_base_char`, `xml_ideographic`,
+  `xml_combining_char`, `xml_blank`: as defined by the XML recommendation.
+- `tr8876_ident_char`: character names in identifiers from ISO TR8876.
+- `cc`, `cf`, `cn`, `co`, `cs`, `ll`, `lm`, `lo`, `lt`, `lu`, `mc`, `me`, `mn`, `nd`, `nl`, `no`, `pc`, `pd`,
+  `pe`, `pf`, `pi`, `po`, `ps`, `sc`, `sk`, `sm`, `so`, `zl`, `zp`, `zs`: as defined by the
+  Unicode standard (categories).
+- `alphabetic`, `ascii_hex_digit`, `hex_digit`, `id_continue`, `id_start`,
+  `lowercase`, `math`, `other_alphabetic`, `other_lowercase`, `other_math`,
+  `other_uppercase`, `uppercase`, `white_space`, `xid_continue`, `xid_start`: as
+  defined by the Unicode standard (properties).
 
 
 ## Running a lexer
 
-See the interface of the Sedlexing module for a description of how to
+See the interface of the `Sedlexing` module for a description of how to
 create lexbuf values (from strings, stream or channels encoded in
 Latin1, utf8 or utf16, or from integer arrays or streams representing
 Unicode code points).
 
 It is possible to work with a custom implementation for lex buffers.
-To do this, you just have to ensure that a module called Sedlexing is
+To do this, you just have to ensure that a module called `Sedlexing` is
 in scope of your lexer specifications, and that it defines at least
-the following functions: start, next, mark, backtrack.  See the interface
-of the Sedlexing module for more information.
+the following functions: `start`, `next`, `mark`, `backtrack`.  See the interface
+of the `Sedlexing` module for more information.
 
 
 
@@ -233,6 +325,58 @@ Once sedlex is installed as per above, simply type
 #require "sedlex.ppx";;
 ```
 
+## Integration with ocamlyacc and menhir
+
+sedlex uses its own `Sedlexing.lexbuf` type, while ocamlyacc and menhir
+(classic API) expect a lexer function of type `Lexing.lexbuf -> token`.
+To bridge the two, create a dummy `Lexing.lexbuf` and update its position
+fields after each token:
+
+```ocaml
+(* In lexer.ml — the sedlex lexer *)
+let rec token buf =
+  match%sedlex buf with
+    | Plus ('0'..'9') -> Parser.INT (int_of_string (Sedlexing.Utf8.lexeme buf))
+    | '+' -> Parser.PLUS
+    | Plus white_space -> token buf
+    | eof -> Parser.EOF
+    | _ -> failwith "Unexpected character"
+
+(* Wrap for ocamlyacc / menhir classic API *)
+let tokenize buf =
+  let lexbuf = Lexing.from_string "" in
+  let tokenize lexbuf =
+    let tok = token buf in
+    let start_pos, end_pos = Sedlexing.lexing_positions buf in
+    lexbuf.Lexing.lex_start_p <- start_pos;
+    lexbuf.Lexing.lex_curr_p <- end_pos;
+    tok
+  in
+  (tokenize, lexbuf)
+
+(* In main.ml *)
+let () =
+  let buf = Sedlexing.Utf8.from_string "1 + 2" in
+  let tokenize, lexbuf = Lexer.tokenize buf in
+  let result = Parser.main tokenize lexbuf in
+  ...
+```
+
+For menhir's **incremental API**, use `Sedlexing.with_tokenizer`, which
+returns a `unit -> token * position * position` supplier directly:
+
+```ocaml
+let supplier = Sedlexing.with_tokenizer token buf in
+let result =
+  Parser.MenhirInterpreter.loop supplier
+    (Parser.Incremental.main Lexing.dummy_pos)
+in
+...
+```
+
+Complete working examples are in `examples/with_ocamlyacc/` and
+`examples/with_menhir/`.
+
 ## Examples
 
 The `examples/` subdirectory contains several samples of sedlex in use.

diff --git a/dune-project b/dune-project
@@ -1,12 +1,15 @@
-(lang dune 3.0)
-(version 3.5)
+(lang dune 3.18)
+(using menhir 2.1)
+(version 3.7)
 (name sedlex)
 (source (github ocaml-community/sedlex))
 (license MIT)
 (authors "Alain Frisch <alain.frisch@lexifi.com>"
          "https://github.com/ocaml-community/sedlex/graphs/contributors")
 (maintainers "Alain Frisch <alain.frisch@lexifi.com>")
 (homepage "https://github.com/ocaml-community/sedlex")
+(maintenance_intent "(latest)")
+(documentation "https://ocaml-community.github.io/sedlex/index.html")
 
 (generate_opam_files true)
 (executables_implicit_empty_intf true)
@@ -23,4 +26,5 @@ extension.")
    dune
    (ppxlib (>= 0.26.0))
    gen
-   (ppx_expect :with-test)))
+   (ppx_expect :with-test)
+   (menhir :with-test)))
diff --git a/examples/dune b/examples/dune
@@ -2,8 +2,7 @@
  (names tokenizer regressions complement subtraction repeat performance)
  (libraries sedlex sedlex_ppx)
  (preprocess
-  (pps sedlex.ppx))
- (flags :standard -w +39))
+  (pps sedlex.ppx)))
 
 (rule
  (alias runtest)

diff --git a/examples/regressions.ml b/examples/regressions.ml
@@ -3,7 +3,7 @@
 module CSet = Sedlex_ppx.Sedlex_cset
 module Unicode = Sedlex_ppx.Unicode
 
-let test_versions = ("15.0.0", "16.0.0")
+let test_versions = ("16.0.0", "17.0.0")
 
 let regressions =
   [ (* Example *)