From 14281bbecdc398222425eeaf6441d354682f109d Mon Sep 17 00:00:00 2001
From: Hugo Heuzard <hugo.heuzard@gmail.com>
Date: Sun, 8 Feb 2026 03:47:46 +0100
Subject: [PATCH 1/4] Tag delay optimization

When a Set_position tag fires on every transition entering a cycle
state, remove it from those transitions and emit Set_position with
offset 1 (pos - 1) on exit transitions. This turns O(n) tag writes
per loop into O(1) on exit.

Handles multi-state cycles via SCC analysis (Tarjan's algorithm) and
multi-exit cycles: exits from other cycle states are safe when they
only reach rules that don't read the tag.

Set_position { cell; offset } with offset 0 for current position and
offset 1 for previous code point (subtracted from pos).

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
---
 CHANGES.md                     |   2 +-
 src/lib/sedlexing.ml           |   5 +-
 src/lib/sedlexing.mli          |   4 +-
 src/syntax/ppx_sedlex.ml       |  13 +-
 src/syntax/sedlex.ml           | 265 ++++++++++++++++++++++++++++---
 src/syntax/sedlex.mli          |   8 +-
 test/basic.ml                  |  62 +++++++-
 test/codegen/test_gen.ml       | 282 +++++++++++++++++++++++++--------
 test/codegen/test_realistic.ml |  25 ++-
 9 files changed, 556 insertions(+), 110 deletions(-)

diff --git a/CHANGES.md b/CHANGES.md
index 0e8de80..4fc1c42 100644
--- a/CHANGES.md
+++ b/CHANGES.md
@@ -1,6 +1,6 @@
 # unreleased
 - Support nested `let..in` for `[%sedlex.regexp?]` definitions
-- Add support for named captured group (#177, #178)
+- Add support for named captured group (#177, #178, #179)
 
 # 3.7 (2025-10-06)
 - Update to unicode 17.0.0
diff --git a/src/lib/sedlexing.ml b/src/lib/sedlexing.ml
index 653a955..0448af8 100644
--- a/src/lib/sedlexing.ml
+++ b/src/lib/sedlexing.ml
@@ -298,7 +298,10 @@ let __private__init_mem lexbuf n =
     Array.fill lexbuf.__private__mem_saved 0 n (-1)
   end
 
-let __private__set_mem_pos lexbuf i = lexbuf.__private__mem.(i) <- lexbuf.pos
+let __private__set_mem_pos lexbuf i offset =
+  assert (offset >= 0);
+  assert (lexbuf.pos - offset >= 0);
+  lexbuf.__private__mem.(i) <- lexbuf.pos - offset
 
 let __private__set_mem_value lexbuf i v =
   assert (v >= 0);
diff --git a/src/lib/sedlexing.mli b/src/lib/sedlexing.mli
index dcba7a6..702c2b0 100644
--- a/src/lib/sedlexing.mli
+++ b/src/lib/sedlexing.mli
@@ -250,10 +250,10 @@ val __private__next_int : lexbuf -> int
     each [match%sedlex] block that uses [as] bindings. *)
 val __private__init_mem : lexbuf -> int -> unit
 
-(** [__private__set_mem_pos lexbuf i] records the current position in cell [i],
+(** [__private__set_mem_pos lexbuf i offset] records [pos - offset] in cell [i],
     for later retrieval by {!__private__mem_pos}. Used by [Set_position] tag
     operations on DFA transitions. *)
-val __private__set_mem_pos : lexbuf -> int -> unit
+val __private__set_mem_pos : lexbuf -> int -> int -> unit
 
 (** [__private__set_mem_value lexbuf i v] stores integer [v] in cell [i],
     encoded as [-(v + 2)] so it is disjoint from positions and the unset
diff --git a/src/syntax/ppx_sedlex.ml b/src/syntax/ppx_sedlex.ml
index 9afe33d..c205103 100644
--- a/src/syntax/ppx_sedlex.ml
+++ b/src/syntax/ppx_sedlex.ml
@@ -242,18 +242,19 @@ let call_state lexbuf (auto : Sedlex.dfa) state =
   else appfun (state_fun state) [lexbuf]
 
 (* [gen_tag_ops lexbuf ops cont] wraps [cont] in a sequence of tag
-   operation calls. Each [Set_position t] becomes a call to
-   [__private__set_mem_pos], and each [Set_value (cell, v)] becomes a call to
-   [__private__set_mem_value]. Operations are folded right so they execute
-   before [cont]. *)
+   operation calls. Each [Set_position] becomes a call to
+   [__private__set_mem_pos] and each [Set_value (cell, v)] becomes a call
+   to [__private__set_mem_value]. Operations are folded right so they
+   execute before [cont]. *)
 let gen_tag_ops lexbuf (ops : Sedlex.tag_op list) cont =
   let loc = default_loc in
   List.fold_right
     (fun (op : Sedlex.tag_op) acc ->
       match op with
-        | Set_position t ->
+        | Set_position { cell; offset } ->
             [%expr
-              Sedlexing.__private__set_mem_pos [%e lexbuf] [%e eint ~loc t];
+              Sedlexing.__private__set_mem_pos [%e lexbuf] [%e eint ~loc cell]
+                [%e eint ~loc offset];
               [%e acc]]
         | Set_value (cell, value) ->
             [%expr
diff --git a/src/syntax/sedlex.ml b/src/syntax/sedlex.ml
index 4b8fe70..2c16a63 100644
--- a/src/syntax/sedlex.ml
+++ b/src/syntax/sedlex.ml
@@ -17,13 +17,20 @@
 
    2. Tags for `as` bindings (Laurikari-style)
       NFA nodes may carry a tag operation (Set_position or Set_value).
-      [bind] wraps a sub-regexp with start/end tagged epsilon nodes so the
-      DFA can record sub-match positions at runtime. When the PPX can
-      compute one boundary from a known offset (see [pos_expr] in
+      [bind] wraps a sub-regexp with start/end tagged epsilon
+      nodes so the DFA can record sub-match positions at runtime. When the
+      PPX can compute one boundary from a known offset (see [pos_expr] in
       ppx_sedlex.ml), [bind_start_only] or [bind_end_only] is used instead,
       saving a memory cell. Discriminator tags (Set_value) disambiguate
       or-patterns where multiple branches bind the same name.
 
+      After determinization, [compile] applies the self-loop tag delay
+      optimization: when a Set_position tag appears on both a self-loop
+      and all entering transitions to a state, it is removed from those
+      transitions and replaced by Set_position with offset 1 on exit
+      transitions. This
+      avoids writing to the memory cell on every loop iteration.
+
    3. Determinization (compile)
       Classic subset construction, extended to handle tags (Laurikari, NFAs
       with Tagged Transitions, 2000).
@@ -54,10 +61,6 @@
    -----------------------------------------
 
    Tag optimizations for `as` bindings:
-   - Self-loop tag delay: tags on self-loops that also appear on all
-     entering transitions can be removed from those transitions and emitted
-     as a "set previous position" on exit. This turns O(n) tag writes in
-     loops (e.g. Star) into O(1) on exit.
    - Intra-rule tag coalescing: tags with identical occurrence signatures
      (same presence in init_tags and same set of transitions) can share a
      single memory cell.
@@ -74,7 +77,9 @@ module Cset = Sedlex_cset
 
 (* NFA *)
 
-type tag_op = Set_position of int | Set_value of int * int
+type tag_op =
+  | Set_position of { cell : int; offset : int }
+  | Set_value of int * int
 
 type node = {
   id : int;  (** Unique identifier, used for sorting transitions by target. *)
@@ -165,10 +170,14 @@ let bind r =
   let start_tag = new_tag () in
   let end_tag = new_tag () in
   let wrapped succ =
-    let end_node = new_tagged_node (Set_position end_tag) in
+    let end_node =
+      new_tagged_node (Set_position { cell = end_tag; offset = 0 })
+    in
     end_node.eps <- [succ];
     let inner = r end_node in
-    let start_node = new_tagged_node (Set_position start_tag) in
+    let start_node =
+      new_tagged_node (Set_position { cell = start_tag; offset = 0 })
+    in
     start_node.eps <- [inner];
     start_node
   in
@@ -178,7 +187,9 @@ let bind_start_only r =
   let start_tag = new_tag () in
   let wrapped succ =
     let inner = r succ in
-    let start_node = new_tagged_node (Set_position start_tag) in
+    let start_node =
+      new_tagged_node (Set_position { cell = start_tag; offset = 0 })
+    in
     start_node.eps <- [inner];
     start_node
   in
@@ -187,7 +198,9 @@ let bind_start_only r =
 let bind_end_only r =
   let end_tag = new_tag () in
   let wrapped succ =
-    let end_node = new_tagged_node (Set_position end_tag) in
+    let end_node =
+      new_tagged_node (Set_position { cell = end_tag; offset = 0 })
+    in
     end_node.eps <- [succ];
     r end_node
   in
@@ -300,6 +313,183 @@ type dfa_state = {
 type dfa = dfa_state array
 type compiled = { dfa : dfa; init_tags : tag_op list; num_tags : int }
 
+(* [compute_sccs num_states raw_dfa] computes strongly connected components
+   using Tarjan's algorithm. Returns [(scc_id, scc_count)] where
+   [scc_id.(s)] is the SCC index for state [s]. *)
+let compute_sccs num_states raw_dfa =
+  let scc_id = Array.make num_states (-1) in
+  let scc_count = ref 0 in
+  let index = Array.make num_states (-1) in
+  let lowlink = Array.make num_states 0 in
+  let on_stack = Array.make num_states false in
+  let stack = ref [] in
+  let idx = ref 0 in
+  let rec strongconnect v =
+    index.(v) <- !idx;
+    lowlink.(v) <- !idx;
+    incr idx;
+    stack := v :: !stack;
+    on_stack.(v) <- true;
+    let trans, _ = raw_dfa.(v) in
+    Array.iter
+      (fun (_, w, _) ->
+        if index.(w) = -1 then (
+          strongconnect w;
+          lowlink.(v) <- min lowlink.(v) lowlink.(w))
+        else if on_stack.(w) then lowlink.(v) <- min lowlink.(v) index.(w))
+      trans;
+    if lowlink.(v) = index.(v) then (
+      let id = !scc_count in
+      incr scc_count;
+      let rec pop () =
+        match !stack with
+          | w :: rest ->
+              stack := rest;
+              on_stack.(w) <- false;
+              scc_id.(w) <- id;
+              if w <> v then pop ()
+          | [] -> assert false
+      in
+      pop ())
+  in
+  for v = 0 to num_states - 1 do
+    if index.(v) = -1 then strongconnect v
+  done;
+  (scc_id, !scc_count)
+
+(* [find_delayable_tags num_states num_rules raw_dfa tag_to_rule]
+   identifies Set_position tags that can be delayed from cycle bodies
+   to cycle exits.
+
+   A tag [t] is delayable at state [s] when:
+   1. [s] is part of a cycle (SCC with ≥ 2 states, or a self-loop),
+   2. [s] has at least one transition leaving the SCC,
+   3. [s] is not reachable without [Set_position t] firing
+      (every transition entering [s] carries it), and
+   4. every transition leaving the SCC from a state other than [s]
+      leads only to states from which [tag_to_rule.(t)] is unreachable.
+
+   Returns [(delayed_at, scc_id)] where [delayed_at.(s)] is the list of
+   delayable tags at state [s]. *)
+let find_delayable_tags num_states num_rules raw_dfa tag_to_rule =
+  let scc_id, scc_count = compute_sccs num_states raw_dfa in
+  let scc_size = Array.make scc_count 0 in
+  Array.iter (fun id -> scc_size.(id) <- scc_size.(id) + 1) scc_id;
+  let has_self_loop =
+    Array.init num_states (fun s ->
+        let trans, _ = raw_dfa.(s) in
+        Array.exists (fun (_, target, _) -> target = s) trans)
+  in
+  let in_cycle s = scc_size.(scc_id.(s)) >= 2 || has_self_loop.(s) in
+  let has_exit s =
+    let trans, _ = raw_dfa.(s) in
+    Array.exists (fun (_, target, _) -> scc_id.(target) <> scc_id.(s)) trans
+  in
+  (* [reachable.(s).(r)] is true if some accepting state for rule [r]
+     is reachable from state [s] via transitions. *)
+  let reachable =
+    Array.init num_states (fun s ->
+        let _, finals = raw_dfa.(s) in
+        Array.copy finals)
+  in
+  let changed = ref true in
+  while !changed do
+    changed := false;
+    for s = 0 to num_states - 1 do
+      let trans, _ = raw_dfa.(s) in
+      Array.iter
+        (fun (_, target, _) ->
+          for r = 0 to num_rules - 1 do
+            if reachable.(target).(r) && not reachable.(s).(r) then (
+              reachable.(s).(r) <- true;
+              changed := true)
+          done)
+        trans
+    done
+  done;
+  (* For each state in a cycle that has exits, find Set_position tags
+     on every entering transition, then keep only those whose owning
+     rule is unreachable from non-[s] exits of the SCC. *)
+  let delayed_at = Array.make num_states [] in
+  for s = 0 to num_states - 1 do
+    if in_cycle s && has_exit s then (
+      let entering = ref [] in
+      for s' = 0 to num_states - 1 do
+        let trans', _ = raw_dfa.(s') in
+        Array.iter
+          (fun (_, target, tags) ->
+            if target = s then entering := tags :: !entering)
+          trans'
+      done;
+      let candidates =
+        match !entering with
+          | [] -> []
+          | first :: rest ->
+              List.filter
+                (fun t ->
+                  match t with
+                    | Set_position { offset = 0; _ } ->
+                        List.for_all (List.mem t) rest
+                    | _ -> false)
+                first
+      in
+      let scc = scc_id.(s) in
+      delayed_at.(s) <-
+        List.filter
+          (fun t ->
+            match t with
+              | Set_position { cell; _ } ->
+                  let rule = tag_to_rule.(cell) in
+                  let dominated = ref true in
+                  for s' = 0 to num_states - 1 do
+                    if s' <> s && scc_id.(s') = scc then (
+                      let trans', _ = raw_dfa.(s') in
+                      Array.iter
+                        (fun (_, target, _) ->
+                          if scc_id.(target) <> scc && reachable.(target).(rule)
+                          then dominated := false)
+                        trans')
+                  done;
+                  !dominated
+              | _ -> false)
+          candidates)
+  done;
+  (delayed_at, scc_id)
+
+(* [apply_tag_delay raw_dfa delayed_at scc_id] rewrites the DFA:
+   - removes delayed tags from transitions entering each state, and
+   - emits [Set_position] with offset 1 on transitions leaving the cycle.
+
+   Offset 1 records the position of the previous code point, which is
+   exactly the value the original [Set_position] would have had on the
+   last iteration.
+   This turns O(n) tag writes per loop into O(1) on exit. *)
+let apply_tag_delay raw_dfa delayed_at scc_id =
+  Array.mapi
+    (fun s (trans, finals) ->
+      let trans =
+        Array.map
+          (fun (cs, target, tags) ->
+            let tags =
+              List.filter (fun t -> not (List.mem t delayed_at.(target))) tags
+            in
+            let delayed_ops =
+              if delayed_at.(s) <> [] && scc_id.(target) <> scc_id.(s) then
+                List.filter_map
+                  (fun op ->
+                    match op with
+                      | Set_position { cell; _ } ->
+                          Some (Set_position { cell; offset = 1 })
+                      | _ -> None)
+                  delayed_at.(s)
+              else []
+            in
+            (cs, target, tags @ delayed_ops))
+          trans
+      in
+      { trans; finals })
+    raw_dfa
+
 (* [compile rs] determinizes the NFA for an array of regexp rules.
    Each rule is compiled to an NFA (entry node, final node) pair. The initial
    DFA state is the epsilon closure of all entry nodes. States are explored
@@ -307,7 +497,16 @@ type compiled = { dfa : dfa; init_tags : tag_op list; num_tags : int }
    (physical identity). Returns a {compiled} record with the DFA, initial
    tag operations, and total number of memory cells needed. *)
 let compile rs =
-  let rs = Array.map compile_re rs in
+  let num_rules = Array.length rs in
+  let tag_starts = Array.make num_rules 0 in
+  let rs =
+    Array.mapi
+      (fun i r ->
+        tag_starts.(i) <- !cur_tag;
+        compile_re r)
+      rs
+  in
+  let num_tags_raw = !cur_tag in
   let counter = ref 0 in
   let states = Hashtbl.create 31 in
   let states_def = Hashtbl.create 31 in
@@ -320,7 +519,7 @@ let compile rs =
       let trans = transition state in
       let trans = Array.map (fun (p, t, tags) -> (p, aux t, tags)) trans in
       let finals = Array.map (fun (_, f) -> List.memq f state) rs in
-      Hashtbl.add states_def i { trans; finals };
+      Hashtbl.add states_def i (trans, finals);
       i
   in
   let init = ref ([], []) in
@@ -328,11 +527,33 @@ let compile rs =
   let init_state, init_tags = !init in
   let i = aux init_state in
   assert (i = 0);
-  {
-    dfa = Array.init !counter (Hashtbl.find states_def);
-    init_tags = dedup_tags init_tags;
-    num_tags = !cur_tag;
-  }
+  let num_states = !counter in
+  let raw_dfa = Array.init num_states (Hashtbl.find states_def) in
+  if num_tags_raw = 0 then (
+    let dfa =
+      Array.map
+        (fun (trans, finals) ->
+          {
+            trans = Array.map (fun (cs, target, _) -> (cs, target, [])) trans;
+            finals;
+          })
+        raw_dfa
+    in
+    { dfa; init_tags = []; num_tags = 0 })
+  else (
+    let tag_to_rule = Array.make num_tags_raw 0 in
+    for i = 0 to num_rules - 1 do
+      let lo = tag_starts.(i) in
+      let hi = if i < num_rules - 1 then tag_starts.(i + 1) else num_tags_raw in
+      for t = lo to hi - 1 do
+        tag_to_rule.(t) <- i
+      done
+    done;
+    let delayed_at, scc_id =
+      find_delayable_tags num_states num_rules raw_dfa tag_to_rule
+    in
+    let dfa = apply_tag_delay raw_dfa delayed_at scc_id in
+    { dfa; init_tags = dedup_tags init_tags; num_tags = num_tags_raw })
 
 let cset_to_label cset =
   let escape_dot c =
@@ -382,8 +603,10 @@ let dfa_to_dot dfa =
         (fun (cset, target, tags) ->
           let label = cset_to_label cset in
           let tag_op_to_string = function
-            | Set_position t -> "t" ^ string_of_int t
-            | Set_value (c, v) -> "d" ^ string_of_int c ^ "=" ^ string_of_int v
+            | Set_position { cell; offset = 0 } -> Printf.sprintf "t%d" cell
+            | Set_position { cell; offset } ->
+                Printf.sprintf "t%d-%d" cell offset
+            | Set_value (c, v) -> Printf.sprintf "d%d=%d" c v
           in
           let label =
             if tags = [] then label
diff --git a/src/syntax/sedlex.mli b/src/syntax/sedlex.mli
index 5bac2b5..64fd212 100644
--- a/src/syntax/sedlex.mli
+++ b/src/syntax/sedlex.mli
@@ -55,9 +55,11 @@ val intersection : regexp -> regexp -> regexp option
 
 (** Tag operations emitted on DFA transitions. *)
 type tag_op =
-  | Set_position of int
-      (** [Set_position i]: record the current lexbuf position in memory cell
-          [i]. *)
+  | Set_position of { cell : int; offset : int }
+      (** [Set_position { cell; offset }]: record [pos - offset] in memory cell
+          [cell]. Offset 0 records the current position; offset 1 records the
+          previous code point's position (emitted by the tag delay optimization
+          on transitions leaving a loop). *)
   | Set_value of int * int
       (** [Set_value (cell, v)]: record integer [v] in memory cell [cell] (used
           for or-pattern discriminators). *)
diff --git a/test/basic.ml b/test/basic.ml
index e44cc4f..145d10d 100644
--- a/test/basic.ml
+++ b/test/basic.ml
@@ -1303,7 +1303,7 @@ let%expect_test "as_bindings" =
           (Sedlexing.Utf8.of_submatch y)
     | _ -> assert false);
   [%expect {| x=d y=gh |}];
-  (* Test 11: Set_prev with backtracking (Opt at end) *)
+  (* Test 11: Delayed tag with backtracking (Opt at end) *)
   let buf = Sedlexing.Utf8.from_string "aabba" in
   (match%sedlex buf with
     | (Plus 'a' as x), ((Plus 'b', Opt 'a') as y) ->
@@ -1475,3 +1475,63 @@ let%expect_test "as_bindings_nested_sedlex" =
     | Plus 'a' .. 'z' as _x -> Printf.printf "mem_cells=%d\n" (num_mem buf)
     | _ -> assert false);
   [%expect {| mem_cells=0 |}]
+
+let%expect_test "as_bindings_multi_state_cycle" =
+  (* Plus "ab" creates a 2-state cycle; tag delay should still be correct *)
+  let buf = Sedlexing.Utf8.from_string "aaababcc" in
+  (match%sedlex buf with
+    | (Plus 'a' as x), (Plus "ab" as y), (Plus 'c' as z) ->
+        Printf.printf "x=%s y=%s z=%s\n"
+          (Sedlexing.Utf8.of_submatch x)
+          (Sedlexing.Utf8.of_submatch y)
+          (Sedlexing.Utf8.of_submatch z)
+    | _ -> assert false);
+  [%expect {| x=aaa y=bab z=cc |}];
+  (* Single iteration of the cycle *)
+  let buf = Sedlexing.Utf8.from_string "aabc" in
+  (match%sedlex buf with
+    | (Plus 'a' as x), (Plus "ab" as y), (Plus 'c' as z) ->
+        Printf.printf "x=%s y=%s z=%s\n"
+          (Sedlexing.Utf8.of_submatch x)
+          (Sedlexing.Utf8.of_submatch y)
+          (Sedlexing.Utf8.of_submatch z)
+    | _ -> assert false);
+  [%expect {| x=aa y=b z=c |}];
+  (* Many iterations *)
+  let buf = Sedlexing.Utf8.from_string "aabababababccc" in
+  (match%sedlex buf with
+    | (Plus 'a' as x), (Plus "ab" as y), (Plus 'c' as z) ->
+        Printf.printf "x=%s y=%s z=%s\n"
+          (Sedlexing.Utf8.of_submatch x)
+          (Sedlexing.Utf8.of_submatch y)
+          (Sedlexing.Utf8.of_submatch z)
+    | _ -> assert false);
+  [%expect {| x=aa y=babababab z=ccc |}]
+
+let%expect_test "as_bindings_multi_exit_tag_delay" =
+  (* Opt 'b' shifts Plus "ab" to share a cycle with Plus "ba".
+     The cycle has exits from two states; tag delay relies on the
+     generalized reachability check. *)
+  let buf = Sedlexing.Utf8.from_string "ababccc" in
+  (match%sedlex buf with
+    | Opt 'b', (Plus ('a', 'b') as x), Plus 'c' ->
+        Printf.printf "x=%s\n" (Sedlexing.Utf8.of_submatch x)
+    | Opt 'a', Plus ('b', 'a'), 'd' -> Printf.printf "rule1\n"
+    | _ -> assert false);
+  [%expect {| x=abab |}];
+  (* With the optional 'b' prefix *)
+  let buf = Sedlexing.Utf8.from_string "bababc" in
+  (match%sedlex buf with
+    | Opt 'b', (Plus ('a', 'b') as x), Plus 'c' ->
+        Printf.printf "x=%s\n" (Sedlexing.Utf8.of_submatch x)
+    | Opt 'a', Plus ('b', 'a'), 'd' -> Printf.printf "rule1\n"
+    | _ -> assert false);
+  [%expect {| x=abab |}];
+  (* Rule 1 match (exits from the other cycle state) *)
+  let buf = Sedlexing.Utf8.from_string "abad" in
+  (match%sedlex buf with
+    | Opt 'b', (Plus ('a', 'b') as x), Plus 'c' ->
+        Printf.printf "x=%s\n" (Sedlexing.Utf8.of_submatch x)
+    | Opt 'a', Plus ('b', 'a'), 'd' -> Printf.printf "rule1\n"
+    | _ -> assert false);
+  [%expect {| rule1 |}]
diff --git a/test/codegen/test_gen.ml b/test/codegen/test_gen.ml
index 44af2e0..7a46239 100644
--- a/test/codegen/test_gen.ml
+++ b/test/codegen/test_gen.ml
@@ -562,10 +562,10 @@ let%expect_test "optim: element-length (Offset_from_tag)" =
       _start -> state0;
 
       state0 [label="0"];
-      state0 -> state1 [label="'a' {t0}"];
+      state0 -> state1 [label="'a'"];
       state1 [label="1"];
-      state1 -> state1 [label="'a' {t0}"];
-      state1 -> state2 [label="'b'"];
+      state1 -> state1 [label="'a'"];
+      state1 -> state2 [label="'b' {t0-1}"];
       state2 [label="2"];
       state2 -> state3 [label="'c'"];
       state3 [label="3\n[rule 0]", shape=doublecircle];
@@ -574,12 +574,12 @@ let%expect_test "optim: element-length (Offset_from_tag)" =
     CODE:
     let rec __sedlex_state_0 buf =
       match __sedlex_partition_1 (Sedlexing.__private__next_int buf) with
-      | 0 -> (Sedlexing.__private__set_mem_pos buf 0; __sedlex_state_1 buf)
+      | 0 -> __sedlex_state_1 buf
       | _ -> Sedlexing.backtrack buf
     and __sedlex_state_1 buf =
       match __sedlex_partition_2 (Sedlexing.__private__next_int buf) with
-      | 0 -> (Sedlexing.__private__set_mem_pos buf 0; __sedlex_state_1 buf)
-      | 1 -> __sedlex_state_2 buf
+      | 0 -> __sedlex_state_1 buf
+      | 1 -> (Sedlexing.__private__set_mem_pos buf 0 1; __sedlex_state_2 buf)
       | _ -> Sedlexing.backtrack buf
     and __sedlex_state_2 buf =
       match __sedlex_partition_3 (Sedlexing.__private__next_int buf) with
@@ -730,22 +730,22 @@ let%expect_test "optim: intra-rule tag coalescing" =
       _start -> state0;
 
       state0 [label="0"];
-      state0 -> state1 [label="'a' {t0}"];
+      state0 -> state1 [label="'a'"];
       state1 [label="1"];
-      state1 -> state1 [label="'a' {t0}"];
-      state1 -> state2 [label="'b'"];
+      state1 -> state1 [label="'a'"];
+      state1 -> state2 [label="'b' {t0-1}"];
       state2 [label="2\n[rule 0]", shape=doublecircle];
       state2 -> state2 [label="'b'"];
     }
     CODE:
     let rec __sedlex_state_0 buf =
       match __sedlex_partition_1 (Sedlexing.__private__next_int buf) with
-      | 0 -> (Sedlexing.__private__set_mem_pos buf 0; __sedlex_state_1 buf)
+      | 0 -> __sedlex_state_1 buf
       | _ -> Sedlexing.backtrack buf
     and __sedlex_state_1 buf =
       match __sedlex_partition_2 (Sedlexing.__private__next_int buf) with
-      | 0 -> (Sedlexing.__private__set_mem_pos buf 0; __sedlex_state_1 buf)
-      | 1 -> __sedlex_state_2 buf
+      | 0 -> __sedlex_state_1 buf
+      | 1 -> (Sedlexing.__private__set_mem_pos buf 0 1; __sedlex_state_2 buf)
       | _ -> Sedlexing.backtrack buf
     and __sedlex_state_2 buf =
       Sedlexing.mark buf 0;
@@ -773,7 +773,7 @@ let%expect_test "optim: intra-rule tag coalescing" =
    Non-interfering rules should reuse the same memory cells.
    Rule 0 and rule 1 never co-exist in the same DFA state (beyond state 0),
    so their tags can share cells.
-   Current: init_mem 4 (2 per rule: start + end tags for variable-length binding).
+   Current: init_mem 4 (2 per rule: start + end tags; delayed on exits).
    Goal: init_mem 2 (cells shared across non-interfering rules). *)
 let%expect_test "optim: cross-rule cell sharing" =
   (match%sedlex_test buf with
@@ -791,40 +791,40 @@ let%expect_test "optim: cross-rule cell sharing" =
       _start -> state0;
 
       state0 [label="0"];
-      state0 -> state1 [label="'a' {t0}"];
-      state0 -> state4 [label="'d' {t2}"];
+      state0 -> state1 [label="'a'"];
+      state0 -> state4 [label="'d'"];
       state1 [label="1"];
-      state1 -> state1 [label="'a' {t0}"];
-      state1 -> state2 [label="'b' {t1}"];
+      state1 -> state1 [label="'a'"];
+      state1 -> state2 [label="'b' {t0-1}"];
       state2 [label="2"];
-      state2 -> state2 [label="'b' {t1}"];
-      state2 -> state3 [label="'c'"];
+      state2 -> state2 [label="'b'"];
+      state2 -> state3 [label="'c' {t1-1}"];
       state3 [label="3\n[rule 0]", shape=doublecircle];
       state3 -> state3 [label="'c'"];
       state4 [label="4"];
-      state4 -> state4 [label="'d' {t2}"];
-      state4 -> state5 [label="'e' {t3}"];
+      state4 -> state4 [label="'d'"];
+      state4 -> state5 [label="'e' {t2-1}"];
       state5 [label="5"];
-      state5 -> state5 [label="'e' {t3}"];
-      state5 -> state6 [label="'f'"];
+      state5 -> state5 [label="'e'"];
+      state5 -> state6 [label="'f' {t3-1}"];
       state6 [label="6\n[rule 1]", shape=doublecircle];
       state6 -> state6 [label="'f'"];
     }
     CODE:
     let rec __sedlex_state_0 buf =
       match __sedlex_partition_1 (Sedlexing.__private__next_int buf) with
-      | 0 -> (Sedlexing.__private__set_mem_pos buf 0; __sedlex_state_1 buf)
-      | 1 -> (Sedlexing.__private__set_mem_pos buf 2; __sedlex_state_4 buf)
+      | 0 -> __sedlex_state_1 buf
+      | 1 -> __sedlex_state_4 buf
       | _ -> Sedlexing.backtrack buf
     and __sedlex_state_1 buf =
       match __sedlex_partition_2 (Sedlexing.__private__next_int buf) with
-      | 0 -> (Sedlexing.__private__set_mem_pos buf 0; __sedlex_state_1 buf)
-      | 1 -> (Sedlexing.__private__set_mem_pos buf 1; __sedlex_state_2 buf)
+      | 0 -> __sedlex_state_1 buf
+      | 1 -> (Sedlexing.__private__set_mem_pos buf 0 1; __sedlex_state_2 buf)
       | _ -> Sedlexing.backtrack buf
     and __sedlex_state_2 buf =
       match __sedlex_partition_3 (Sedlexing.__private__next_int buf) with
-      | 0 -> (Sedlexing.__private__set_mem_pos buf 1; __sedlex_state_2 buf)
-      | 1 -> __sedlex_state_3 buf
+      | 0 -> __sedlex_state_2 buf
+      | 1 -> (Sedlexing.__private__set_mem_pos buf 1 1; __sedlex_state_3 buf)
       | _ -> Sedlexing.backtrack buf
     and __sedlex_state_3 buf =
       Sedlexing.mark buf 0;
@@ -833,13 +833,13 @@ let%expect_test "optim: cross-rule cell sharing" =
        | _ -> Sedlexing.backtrack buf)
     and __sedlex_state_4 buf =
       match __sedlex_partition_5 (Sedlexing.__private__next_int buf) with
-      | 0 -> (Sedlexing.__private__set_mem_pos buf 2; __sedlex_state_4 buf)
-      | 1 -> (Sedlexing.__private__set_mem_pos buf 3; __sedlex_state_5 buf)
+      | 0 -> __sedlex_state_4 buf
+      | 1 -> (Sedlexing.__private__set_mem_pos buf 2 1; __sedlex_state_5 buf)
       | _ -> Sedlexing.backtrack buf
     and __sedlex_state_5 buf =
       match __sedlex_partition_6 (Sedlexing.__private__next_int buf) with
-      | 0 -> (Sedlexing.__private__set_mem_pos buf 3; __sedlex_state_5 buf)
-      | 1 -> __sedlex_state_6 buf
+      | 0 -> __sedlex_state_5 buf
+      | 1 -> (Sedlexing.__private__set_mem_pos buf 3 1; __sedlex_state_6 buf)
       | _ -> Sedlexing.backtrack buf
     and __sedlex_state_6 buf =
       Sedlexing.mark buf 1;
@@ -870,8 +870,7 @@ let%expect_test "optim: cross-rule cell sharing" =
    a final state should be removed.
    Rule 0 has a binding on Plus 'b'; rule 1 does not.
    Both share the Plus 'a', Plus 'b' prefix in the DFA.
-   Current: init_mem 1, tag t0 set on shared prefix transitions
-   even when only rule 1 is reachable via 'd'.
+   Current: init_mem 1, t0 delayed (offset 1 on 'a'→'b' exit).
    Goal: no tags on transitions leading exclusively to rule 1. *)
 let%expect_test "optim: dead tag elimination" =
   (match%sedlex_test buf with
@@ -889,10 +888,10 @@ let%expect_test "optim: dead tag elimination" =
       _start -> state0;
 
       state0 [label="0"];
-      state0 -> state1 [label="'a' {t0}"];
+      state0 -> state1 [label="'a'"];
       state1 [label="1"];
-      state1 -> state1 [label="'a' {t0}"];
-      state1 -> state2 [label="'b'"];
+      state1 -> state1 [label="'a'"];
+      state1 -> state2 [label="'b' {t0-1}"];
       state2 [label="2"];
       state2 -> state2 [label="'b'"];
       state2 -> state3 [label="'c'"];
@@ -903,12 +902,12 @@ let%expect_test "optim: dead tag elimination" =
     CODE:
     let rec __sedlex_state_0 buf =
       match __sedlex_partition_1 (Sedlexing.__private__next_int buf) with
-      | 0 -> (Sedlexing.__private__set_mem_pos buf 0; __sedlex_state_1 buf)
+      | 0 -> __sedlex_state_1 buf
       | _ -> Sedlexing.backtrack buf
     and __sedlex_state_1 buf =
       match __sedlex_partition_2 (Sedlexing.__private__next_int buf) with
-      | 0 -> (Sedlexing.__private__set_mem_pos buf 0; __sedlex_state_1 buf)
-      | 1 -> __sedlex_state_2 buf
+      | 0 -> __sedlex_state_1 buf
+      | 1 -> (Sedlexing.__private__set_mem_pos buf 0 1; __sedlex_state_2 buf)
       | _ -> Sedlexing.backtrack buf
     and __sedlex_state_2 buf =
       match __sedlex_partition_3 (Sedlexing.__private__next_int buf) with
@@ -930,11 +929,10 @@ let%expect_test "optim: dead tag elimination" =
     | _ -> ()
     |}]
 
-(* Optimization 7: Self-loop tag delay (Set_prev)
+(* Optimization 7: Self-loop tag delay [DONE]
    Tags on a self-loop that also appear on all entering transitions
-   should be delayed to exit transitions as Set_prev.
-   Current: init_mem 1, set_mem t0 on every 'a' iteration (O(n)).
-   Goal: no set_mem on the self-loop, set_mem_prev on exit (O(1)). *)
+   are delayed to exit transitions with offset 1.
+   Result: init_mem 1, no set_mem on the self-loop, set_mem_pos with offset 1 on exit (O(1)). *)
 let%expect_test "optim: self-loop tag delay" =
   (match%sedlex_test buf with (Plus 'a' as x), Plus 'b' -> ignore x | _ -> ());
   [%expect
@@ -948,22 +946,22 @@ let%expect_test "optim: self-loop tag delay" =
       _start -> state0;
 
       state0 [label="0"];
-      state0 -> state1 [label="'a' {t0}"];
+      state0 -> state1 [label="'a'"];
       state1 [label="1"];
-      state1 -> state1 [label="'a' {t0}"];
-      state1 -> state2 [label="'b'"];
+      state1 -> state1 [label="'a'"];
+      state1 -> state2 [label="'b' {t0-1}"];
       state2 [label="2\n[rule 0]", shape=doublecircle];
       state2 -> state2 [label="'b'"];
     }
     CODE:
     let rec __sedlex_state_0 buf =
       match __sedlex_partition_1 (Sedlexing.__private__next_int buf) with
-      | 0 -> (Sedlexing.__private__set_mem_pos buf 0; __sedlex_state_1 buf)
+      | 0 -> __sedlex_state_1 buf
       | _ -> Sedlexing.backtrack buf
     and __sedlex_state_1 buf =
       match __sedlex_partition_2 (Sedlexing.__private__next_int buf) with
-      | 0 -> (Sedlexing.__private__set_mem_pos buf 0; __sedlex_state_1 buf)
-      | 1 -> __sedlex_state_2 buf
+      | 0 -> __sedlex_state_1 buf
+      | 1 -> (Sedlexing.__private__set_mem_pos buf 0 1; __sedlex_state_2 buf)
       | _ -> Sedlexing.backtrack buf
     and __sedlex_state_2 buf =
       Sedlexing.mark buf 0;
@@ -985,7 +983,7 @@ let%expect_test "optim: self-loop tag delay" =
 
 (* Optimization 8: Tag remapping
    After coalescing and dead-tag elimination, the PPX should remap
-   Tag references through the compiler's tag_map.
+   tag references.
    Current: 0 tags (all offsets known: x=0..1, y=1..end-1, z=end-1..end).
    Goal: already optimal. *)
 let%expect_test "optim: tag remapping after coalescing" =
@@ -1042,11 +1040,11 @@ let%expect_test "optim: tag remapping after coalescing" =
     | _ -> ()
     |}]
 
-(* Optimization 9: Set_prev with backtracking
+(* Optimization 9: Delayed tag with backtracking [DONE]
    Opt at the end means the DFA can accept at two states (with or without
-   the optional 'a'). When self-loop tag delay is implemented, the delayed
-   tags (Set_prev) must survive mark/backtrack correctly.
-   Current: init_mem 1 (x: start=0, end=tag0; y: start=tag0, end=lexeme_length). *)
+   the optional 'a'). The delayed tag (offset 1) must survive mark/backtrack
+   correctly.
+   Result: init_mem 1, set_mem_pos with offset 1 on exit (x: start=0, end=tag0; y: start=tag0, end=lexeme_length). *)
 let%expect_test "optim: set_prev with backtracking" =
   (match%sedlex_test buf with
     | (Plus 'a' as x), ((Plus 'b', Opt 'a') as y) -> ignore (x, y)
@@ -1062,10 +1060,10 @@ let%expect_test "optim: set_prev with backtracking" =
       _start -> state0;
 
       state0 [label="0"];
-      state0 -> state1 [label="'a' {t0}"];
+      state0 -> state1 [label="'a'"];
       state1 [label="1"];
-      state1 -> state1 [label="'a' {t0}"];
-      state1 -> state2 [label="'b'"];
+      state1 -> state1 [label="'a'"];
+      state1 -> state2 [label="'b' {t0-1}"];
       state2 [label="2\n[rule 0]", shape=doublecircle];
       state2 -> state3 [label="'a'"];
       state2 -> state2 [label="'b'"];
@@ -1074,12 +1072,12 @@ let%expect_test "optim: set_prev with backtracking" =
     CODE:
     let rec __sedlex_state_0 buf =
       match __sedlex_partition_1 (Sedlexing.__private__next_int buf) with
-      | 0 -> (Sedlexing.__private__set_mem_pos buf 0; __sedlex_state_1 buf)
+      | 0 -> __sedlex_state_1 buf
       | _ -> Sedlexing.backtrack buf
     and __sedlex_state_1 buf =
       match __sedlex_partition_2 (Sedlexing.__private__next_int buf) with
-      | 0 -> (Sedlexing.__private__set_mem_pos buf 0; __sedlex_state_1 buf)
-      | 1 -> __sedlex_state_2 buf
+      | 0 -> __sedlex_state_1 buf
+      | 1 -> (Sedlexing.__private__set_mem_pos buf 0 1; __sedlex_state_2 buf)
       | _ -> Sedlexing.backtrack buf
     and __sedlex_state_2 buf =
       Sedlexing.mark buf 0;
@@ -1164,6 +1162,88 @@ let%expect_test "Rep fixed-length prefix enables Start_plus" =
     | _ -> ()
     |}]
 
+(* Multi-exit tag delay: the cycle has exits from two different states,
+   but the tag's owning rule is only reachable from one of them.
+   Opt 'b' shifts Plus "ab" by one character so it shares the cycle
+   with Plus "ba"; each rule exits from a different cycle state.
+   tag_end fires in the cycle; the generalized check allows delay
+   because the other exit only reaches rule 1 (which has no tags). *)
+let%expect_test "optim: multi-exit tag delay" =
+  (match%sedlex_test buf with
+    | Opt 'b', (Plus ('a', 'b') as x), Plus 'c' -> ignore x
+    | Opt 'a', Plus ('b', 'a'), 'd' -> ()
+    | _ -> ());
+  [%expect
+    {|
+    DOT:
+    digraph {
+      rankdir=LR;
+      node [shape=circle];
+
+      _start [shape=point];
+      _start -> state0;
+
+      state0 [label="0"];
+      state0 -> state1 [label="'a'"];
+      state0 -> state6 [label="'b' {t0}"];
+      state1 [label="1"];
+      state1 -> state2 [label="'b'"];
+      state2 [label="2"];
+      state2 -> state3 [label="'a'"];
+      state2 -> state5 [label="'c' {t1-1}"];
+      state3 [label="3"];
+      state3 -> state2 [label="'b'"];
+      state3 -> state4 [label="'d'"];
+      state4 [label="4\n[rule 1]", shape=doublecircle];
+      state5 [label="5\n[rule 0]", shape=doublecircle];
+      state5 -> state5 [label="'c'"];
+      state6 [label="6"];
+      state6 -> state3 [label="'a'"];
+    }
+    CODE:
+    let rec __sedlex_state_0 buf =
+      match __sedlex_partition_1 (Sedlexing.__private__next_int buf) with
+      | 0 -> __sedlex_state_1 buf
+      | 1 -> (Sedlexing.__private__set_mem_pos buf 0 0; __sedlex_state_6 buf)
+      | _ -> Sedlexing.backtrack buf
+    and __sedlex_state_1 buf =
+      match __sedlex_partition_2 (Sedlexing.__private__next_int buf) with
+      | 0 -> __sedlex_state_2 buf
+      | _ -> Sedlexing.backtrack buf
+    and __sedlex_state_2 buf =
+      match __sedlex_partition_3 (Sedlexing.__private__next_int buf) with
+      | 0 -> __sedlex_state_3 buf
+      | 1 -> (Sedlexing.__private__set_mem_pos buf 1 1; __sedlex_state_5 buf)
+      | _ -> Sedlexing.backtrack buf
+    and __sedlex_state_3 buf =
+      match __sedlex_partition_4 (Sedlexing.__private__next_int buf) with
+      | 0 -> __sedlex_state_2 buf
+      | 1 -> 1
+      | _ -> Sedlexing.backtrack buf
+    and __sedlex_state_5 buf =
+      Sedlexing.mark buf 0;
+      (match __sedlex_partition_5 (Sedlexing.__private__next_int buf) with
+       | 0 -> __sedlex_state_5 buf
+       | _ -> Sedlexing.backtrack buf)
+    and __sedlex_state_6 buf =
+      match __sedlex_partition_6 (Sedlexing.__private__next_int buf) with
+      | 0 -> __sedlex_state_3 buf
+      | _ -> Sedlexing.backtrack buf in
+    match Sedlexing.start buf;
+          Sedlexing.__private__init_mem buf 2;
+          Sedlexing.__private__set_mem_pos buf 0 0;
+          __sedlex_state_0 buf
+    with
+    | 0 ->
+        let x =
+          let __s = Sedlexing.__private__mem_pos buf 0 in
+          let __e = Sedlexing.__private__mem_pos buf 1 in
+          { Sedlexing.lexbuf = buf; pos = __s; len = (__e - __s) } in
+        ignore x
+    | 1 -> ()
+    | _ -> ()
+    |}]
+
 let%expect_test "as binding: or-chain then nested or on right" =
   (match%sedlex_test buf with
     | ("ab" as x), ("ef" as y)
@@ -1303,3 +1383,81 @@ let%expect_test "as binding: or-chain then nested or on right" =
         ignore (x, y)
     | _ -> ()
     |}]
+
+let%expect_test "as binding: Plus with self-loop delay" =
+  (match%sedlex_test buf with
+    | (Plus 'a' as x), (Plus "ab" as y), (Plus 'c' as z) -> ignore (x, y, z)
+    | _ -> ());
+  [%expect
+    {|
+    DOT:
+    digraph {
+      rankdir=LR;
+      node [shape=circle];
+
+      _start [shape=point];
+      _start -> state0;
+
+      state0 [label="0"];
+      state0 -> state1 [label="'a' {t0}"];
+      state1 [label="1"];
+      state1 -> state2 [label="'a'"];
+      state2 [label="2"];
+      state2 -> state2 [label="'a'"];
+      state2 -> state3 [label="'b' {t0-1}"];
+      state3 [label="3"];
+      state3 -> state4 [label="'a'"];
+      state3 -> state5 [label="'c' {t1-1}"];
+      state4 [label="4"];
+      state4 -> state3 [label="'b'"];
+      state5 [label="5\n[rule 0]", shape=doublecircle];
+      state5 -> state5 [label="'c'"];
+    }
+    CODE:
+    let rec __sedlex_state_0 buf =
+      match __sedlex_partition_1 (Sedlexing.__private__next_int buf) with
+      | 0 -> (Sedlexing.__private__set_mem_pos buf 0 0; __sedlex_state_1 buf)
+      | _ -> Sedlexing.backtrack buf
+    and __sedlex_state_1 buf =
+      match __sedlex_partition_1 (Sedlexing.__private__next_int buf) with
+      | 0 -> __sedlex_state_2 buf
+      | _ -> Sedlexing.backtrack buf
+    and __sedlex_state_2 buf =
+      match __sedlex_partition_2 (Sedlexing.__private__next_int buf) with
+      | 0 -> __sedlex_state_2 buf
+      | 1 -> (Sedlexing.__private__set_mem_pos buf 0 1; __sedlex_state_3 buf)
+      | _ -> Sedlexing.backtrack buf
+    and __sedlex_state_3 buf =
+      match __sedlex_partition_3 (Sedlexing.__private__next_int buf) with
+      | 0 -> __sedlex_state_4 buf
+      | 1 -> (Sedlexing.__private__set_mem_pos buf 1 1; __sedlex_state_5 buf)
+      | _ -> Sedlexing.backtrack buf
+    and __sedlex_state_4 buf =
+      match __sedlex_partition_4 (Sedlexing.__private__next_int buf) with
+      | 0 -> __sedlex_state_3 buf
+      | _ -> Sedlexing.backtrack buf
+    and __sedlex_state_5 buf =
+      Sedlexing.mark buf 0;
+      (match __sedlex_partition_5 (Sedlexing.__private__next_int buf) with
+       | 0 -> __sedlex_state_5 buf
+       | _ -> Sedlexing.backtrack buf) in
+    match Sedlexing.start buf;
+          Sedlexing.__private__init_mem buf 2;
+          __sedlex_state_0 buf
+    with
+    | 0 ->
+        let x =
+          let __s = 0 in
+          let __e = Sedlexing.__private__mem_pos buf 0 in
+          { Sedlexing.lexbuf = buf; pos = __s; len = (__e - __s) } in
+        let y =
+          let __s = Sedlexing.__private__mem_pos buf 0 in
+          let __e = Sedlexing.__private__mem_pos buf 1 in
+          { Sedlexing.lexbuf = buf; pos = __s; len = (__e - __s) } in
+        let z =
+          let __s = Sedlexing.__private__mem_pos buf 1 in
+          let __e = Sedlexing.lexeme_length buf in
+          { Sedlexing.lexbuf = buf; pos = __s; len = (__e - __s) } in
+        ignore (x, y, z)
+    | _ -> ()
+    |}]
diff --git a/test/codegen/test_realistic.ml b/test/codegen/test_realistic.ml
index 083745f..2a7884d 100644
--- a/test/codegen/test_realistic.ml
+++ b/test/codegen/test_realistic.ml
@@ -8,7 +8,6 @@
    - Rule 3: '(', ('a'..'z' as x), ',', ('a'..'z' as y), ')' → 0 tags (all fixed offsets)
    - Rule 4: (Plus digits as tok) | (Plus letters as tok) → 0 tags (discriminator elided)
    Remaining optimization goals:
-   - Self-loop tag delay → rules 0,1 use Set_prev (O(1) instead of O(n))
    - Cross-rule cell sharing → rules 0,1 share cells (init_mem 2 → 1) *)
 let%expect_test "realistic: multi-token lexer" =
   (match%sedlex_test buf with
@@ -34,8 +33,8 @@ let%expect_test "realistic: multi-token lexer" =
       state0 -> state1 [label="'('"];
       state0 -> state6 [label="'0'"];
       state0 -> state7 [label="'1'-'9'"];
-      state0 -> state11 [label="'A'-'Z' {t1}"];
-      state0 -> state14 [label="'a'-'z' {t0}"];
+      state0 -> state11 [label="'A'-'Z'"];
+      state0 -> state14 [label="'a'-'z'"];
       state1 [label="1"];
       state1 -> state2 [label="'a'-'z'"];
       state2 [label="2"];
@@ -57,15 +56,15 @@ let%expect_test "realistic: multi-token lexer" =
       state9 -> state10 [label="';'"];
       state10 [label="10\n[rule 2]", shape=doublecircle];
       state11 [label="11"];
-      state11 -> state12 [label="'='"];
-      state11 -> state11 [label="'A'-'Z' {t1}"];
+      state11 -> state12 [label="'=' {t1-1}"];
+      state11 -> state11 [label="'A'-'Z'"];
       state12 [label="12"];
       state12 -> state13 [label="'0'-'9'"];
       state13 [label="13\n[rule 1]", shape=doublecircle];
       state13 -> state13 [label="'0'-'9'"];
       state14 [label="14\n[rule 4]", shape=doublecircle];
-      state14 -> state15 [label="'.'"];
-      state14 -> state14 [label="'a'-'z' {t0}"];
+      state14 -> state15 [label="'.' {t0-1}"];
+      state14 -> state14 [label="'a'-'z'"];
       state15 [label="15"];
       state15 -> state16 [label="'a'-'z'"];
       state16 [label="16\n[rule 0]", shape=doublecircle];
@@ -77,8 +76,8 @@ let%expect_test "realistic: multi-token lexer" =
       | 0 -> __sedlex_state_1 buf
       | 1 -> __sedlex_state_6 buf
       | 2 -> __sedlex_state_7 buf
-      | 3 -> (Sedlexing.__private__set_mem_pos buf 1; __sedlex_state_11 buf)
-      | 4 -> (Sedlexing.__private__set_mem_pos buf 0; __sedlex_state_14 buf)
+      | 3 -> __sedlex_state_11 buf
+      | 4 -> __sedlex_state_14 buf
       | _ -> Sedlexing.backtrack buf
     and __sedlex_state_1 buf =
       match __sedlex_partition_2 (Sedlexing.__private__next_int buf) with
@@ -118,8 +117,8 @@ let%expect_test "realistic: multi-token lexer" =
       | _ -> Sedlexing.backtrack buf
     and __sedlex_state_11 buf =
       match __sedlex_partition_9 (Sedlexing.__private__next_int buf) with
-      | 0 -> __sedlex_state_12 buf
-      | 1 -> (Sedlexing.__private__set_mem_pos buf 1; __sedlex_state_11 buf)
+      | 0 -> (Sedlexing.__private__set_mem_pos buf 1 1; __sedlex_state_12 buf)
+      | 1 -> __sedlex_state_11 buf
       | _ -> Sedlexing.backtrack buf
     and __sedlex_state_12 buf =
       match __sedlex_partition_6 (Sedlexing.__private__next_int buf) with
@@ -133,8 +132,8 @@ let%expect_test "realistic: multi-token lexer" =
     and __sedlex_state_14 buf =
       Sedlexing.mark buf 4;
       (match __sedlex_partition_10 (Sedlexing.__private__next_int buf) with
-       | 0 -> __sedlex_state_15 buf
-       | 1 -> (Sedlexing.__private__set_mem_pos buf 0; __sedlex_state_14 buf)
+       | 0 -> (Sedlexing.__private__set_mem_pos buf 0 1; __sedlex_state_15 buf)
+       | 1 -> __sedlex_state_14 buf
        | _ -> Sedlexing.backtrack buf)
     and __sedlex_state_15 buf =
       match __sedlex_partition_2 (Sedlexing.__private__next_int buf) with

From 2d02605b259e0730a61a3b4588378cab30371c40 Mon Sep 17 00:00:00 2001
From: Hugo Heuzard <hugo.heuzard@gmail.com>
Date: Fri, 27 Mar 2026 17:24:35 +0100
Subject: [PATCH 2/4] Fix tag ownership: use NFA walk instead of tag_starts

Tags are allocated during regexp_of_pattern in the PPX, not during
compile_re. The tag_starts approach incorrectly mapped all tags to
rule 0. Use an NFA walk (tag_owners) to correctly discover which
rule each tag belongs to.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
---
 src/syntax/sedlex.ml | 48 +++++++++++++++++++++++++++-----------------
 1 file changed, 30 insertions(+), 18 deletions(-)

diff --git a/src/syntax/sedlex.ml b/src/syntax/sedlex.ml
index 2c16a63..518850d 100644
--- a/src/syntax/sedlex.ml
+++ b/src/syntax/sedlex.ml
@@ -490,6 +490,32 @@ let apply_tag_delay raw_dfa delayed_at scc_id =
       { trans; finals })
     raw_dfa
 
+let tag_cell = function
+  | Set_position { cell; _ } -> cell
+  | Set_value (cell, _) -> cell
+
+(* [tag_owners num_tags rs] maps each tag cell to the rule that owns it.
+   Walks the NFA of each rule to discover which tags it created.
+   Returns an array where [result.(t)] is the rule index for tag [t],
+   or -1 if the tag was not found in any rule's NFA. *)
+let tag_owners num_tags rs =
+  let tag_to_rule = Array.make num_tags (-1) in
+  Array.iteri
+    (fun r (start, _) ->
+      let visited = Hashtbl.create 31 in
+      let rec visit node =
+        if not (Hashtbl.mem visited node.id) then (
+          Hashtbl.add visited node.id ();
+          (match node.tag with
+            | Some op -> tag_to_rule.(tag_cell op) <- r
+            | None -> ());
+          List.iter visit node.eps;
+          List.iter (fun (_, n) -> visit n) node.trans)
+      in
+      visit start)
+    rs;
+  tag_to_rule
+
 (* [compile rs] determinizes the NFA for an array of regexp rules.
    Each rule is compiled to an NFA (entry node, final node) pair. The initial
    DFA state is the epsilon closure of all entry nodes. States are explored
@@ -498,14 +524,7 @@ let apply_tag_delay raw_dfa delayed_at scc_id =
    tag operations, and total number of memory cells needed. *)
 let compile rs =
   let num_rules = Array.length rs in
-  let tag_starts = Array.make num_rules 0 in
-  let rs =
-    Array.mapi
-      (fun i r ->
-        tag_starts.(i) <- !cur_tag;
-        compile_re r)
-      rs
-  in
+  let rs = Array.map compile_re rs in
   let num_tags_raw = !cur_tag in
   let counter = ref 0 in
   let states = Hashtbl.create 31 in
@@ -540,20 +559,13 @@ let compile rs =
         raw_dfa
     in
     { dfa; init_tags = []; num_tags = 0 })
-  else (
-    let tag_to_rule = Array.make num_tags_raw 0 in
-    for i = 0 to num_rules - 1 do
-      let lo = tag_starts.(i) in
-      let hi = if i < num_rules - 1 then tag_starts.(i + 1) else num_tags_raw in
-      for t = lo to hi - 1 do
-        tag_to_rule.(t) <- i
-      done
-    done;
+  else
+    let tag_to_rule = tag_owners num_tags_raw rs in
     let delayed_at, scc_id =
       find_delayable_tags num_states num_rules raw_dfa tag_to_rule
     in
     let dfa = apply_tag_delay raw_dfa delayed_at scc_id in
-    { dfa; init_tags = dedup_tags init_tags; num_tags = num_tags_raw })
+    { dfa; init_tags = dedup_tags init_tags; num_tags = num_tags_raw }
 
 let cset_to_label cset =
   let escape_dot c =

From 426ca687d7d5b176205d8c932f93eb0e364ad435 Mon Sep 17 00:00:00 2001
From: Hugo Heuzard <hugo.heuzard@gmail.com>
Date: Fri, 27 Mar 2026 17:25:36 +0100
Subject: [PATCH 3/4] Add test for tag ownership correctness in multi-exit
 delay

With the tag binding on rule 1 (not rule 0), incorrect tag ownership
would allow an unsafe delay: the check would see rule 0 as unreachable
from the non-s exit (correct) but miss that rule 1 IS reachable there.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
---
 test/basic.ml | 20 ++++++++++++++++++++
 1 file changed, 20 insertions(+)

diff --git a/test/basic.ml b/test/basic.ml
index 145d10d..e0e63fb 100644
--- a/test/basic.ml
+++ b/test/basic.ml
@@ -1508,6 +1508,26 @@ let%expect_test "as_bindings_multi_state_cycle" =
     | _ -> assert false);
   [%expect {| x=aa y=babababab z=ccc |}]
 
+let%expect_test "as_bindings_multi_exit_tag_ownership" =
+  (* The tag belongs to rule 1 (not rule 0). With incorrect tag ownership
+     (all tags mapped to rule 0), the delay check would incorrectly allow
+     delay because rule 0 is unreachable from the non-s exit — but rule 1
+     IS reachable there, and it reads the tag. *)
+  let buf = Sedlexing.Utf8.from_string "ababccc" in
+  (match%sedlex buf with
+    | Opt 'a', Plus ('b', 'a'), 'd' -> Printf.printf "rule0\n"
+    | Opt 'b', (Plus ('a', 'b') as x), Plus 'c' ->
+        Printf.printf "x=%s\n" (Sedlexing.Utf8.of_submatch x)
+    | _ -> assert false);
+  [%expect {| x=abab |}];
+  let buf = Sedlexing.Utf8.from_string "abad" in
+  (match%sedlex buf with
+    | Opt 'a', Plus ('b', 'a'), 'd' -> Printf.printf "rule0\n"
+    | Opt 'b', (Plus ('a', 'b') as x), Plus 'c' ->
+        Printf.printf "x=%s\n" (Sedlexing.Utf8.of_submatch x)
+    | _ -> assert false);
+  [%expect {| rule0 |}]
+
 let%expect_test "as_bindings_multi_exit_tag_delay" =
   (* Opt 'b' shifts Plus "ab" to share a cycle with Plus "ba".
      The cycle has exits from two states; tag delay relies on the

From 67bf0babec525bb0c9643c7d25ae86e5e067d14f Mon Sep 17 00:00:00 2001
From: Hugo Heuzard <hugo.heuzard@gmail.com>
Date: Sun, 8 Feb 2026 04:16:15 +0100
Subject: [PATCH 4/4] Cross-rule cell sharing via graph coloring

Non-interfering tag cells from different rules can share the same
physical memory slot. Uses forward reachability to build an
interference graph, then greedy graph coloring to assign slots.

Run after tag delay so coloring benefits from the reduced forward
reachability of delayed tags.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
---
 src/syntax/ppx_sedlex.ml       |  14 +++
 src/syntax/sedlex.ml           | 158 +++++++++++++++++++++++++++++----
 src/syntax/sedlex.mli          |   1 +
 test/codegen/test_gen.ml       | 126 +++++++++++++++++++++++---
 test/codegen/test_realistic.ml |  18 ++--
 5 files changed, 279 insertions(+), 38 deletions(-)

diff --git a/src/syntax/ppx_sedlex.ml b/src/syntax/ppx_sedlex.ml
index c205103..6e12977 100644
--- a/src/syntax/ppx_sedlex.ml
+++ b/src/syntax/ppx_sedlex.ml
@@ -470,6 +470,19 @@ type tag_info = {
          distinct value in the shared discriminator cell. *)
 }
 
+let remap_tag_info (tag_map : int array) (ti : tag_info) =
+  let remap_pos = function
+    | Tag { tag; offset } -> Tag { tag = tag_map.(tag); offset }
+    | Start_plus _ as pe -> pe
+    | End_minus _ as pe -> pe
+  in
+  {
+    ti with
+    start_pos = remap_pos ti.start_pos;
+    end_pos = remap_pos ti.end_pos;
+    disc = List.map (fun (cell, v) -> (tag_map.(cell), v)) ti.disc;
+  }
+
 (* [advance pe len] shifts a position expression forward by [len] code
    points. Returns [None] if either argument is unknown. *)
 let advance pe len =
@@ -1054,6 +1067,7 @@ let handle_sedlex_match_ ~env ~map_rhs match_expr =
   let cases =
     List.map
       (fun (_, tag_info, e) ->
+        let tag_info = List.map (remap_tag_info compiled.tag_map) tag_info in
         let action = gen_binding_code (snd lexbuf) tag_info (map_rhs e) in
         ((), action))
       cases_parsed
diff --git a/src/syntax/sedlex.ml b/src/syntax/sedlex.ml
index 518850d..dbd4e6d 100644
--- a/src/syntax/sedlex.ml
+++ b/src/syntax/sedlex.ml
@@ -64,8 +64,6 @@
    - Intra-rule tag coalescing: tags with identical occurrence signatures
      (same presence in init_tags and same set of transitions) can share a
      single memory cell.
-   - Cross-rule cell sharing: memory cells from non-interfering rules can
-     share the same physical slot via liveness analysis and graph coloring.
 
    DFA construction:
    - DFA minimization: the generated DFA is not minimized. Hopcroft's or
@@ -311,7 +309,13 @@ type dfa_state = {
 }
 
 type dfa = dfa_state array
-type compiled = { dfa : dfa; init_tags : tag_op list; num_tags : int }
+
+type compiled = {
+  dfa : dfa;
+  init_tags : tag_op list;
+  num_tags : int;
+  tag_map : int array;
+}
 
 (* [compute_sccs num_states raw_dfa] computes strongly connected components
    using Tarjan's algorithm. Returns [(scc_id, scc_count)] where
@@ -487,19 +491,44 @@ let apply_tag_delay raw_dfa delayed_at scc_id =
             (cs, target, tags @ delayed_ops))
           trans
       in
-      { trans; finals })
+      (trans, finals))
     raw_dfa
 
 let tag_cell = function
   | Set_position { cell; _ } -> cell
   | Set_value (cell, _) -> cell
 
-(* [tag_owners num_tags rs] maps each tag cell to the rule that owns it.
-   Walks the NFA of each rule to discover which tags it created.
-   Returns an array where [result.(t)] is the rule index for tag [t],
-   or -1 if the tag was not found in any rule's NFA. *)
-let tag_owners num_tags rs =
-  let tag_to_rule = Array.make num_tags (-1) in
+(* [remap_cells slot_of num_slots tag_map raw_dfa init_tags] rewrites all tag cell
+   references in the DFA and init_tags through [slot_of], and composes
+   [tag_map] with the remapping. Returns [(num_slots, tag_map, raw_dfa, init_tags)]. *)
+let remap_cells slot_of num_slots tag_map raw_dfa init_tags =
+  let tag_map = Array.map (fun c -> slot_of.(c)) tag_map in
+  let remap_op = function
+    | Set_position { cell; offset } ->
+        Set_position { cell = slot_of.(cell); offset }
+    | Set_value (c, v) -> Set_value (slot_of.(c), v)
+  in
+  let raw_dfa =
+    Array.map
+      (fun (trans, finals) ->
+        let trans =
+          Array.map
+            (fun (cs, target, tags) ->
+              (cs, target, List.sort_uniq compare (List.map remap_op tags)))
+            trans
+        in
+        (trans, finals))
+      raw_dfa
+  in
+  let init_tags = List.sort_uniq compare (List.map remap_op init_tags) in
+  (num_slots, tag_map, raw_dfa, init_tags)
+
+(* [cell_owners num_tags rs tag_map] maps each cell to the rule that owns
+   it: -1 = unused, >= 0 = single rule, -2 = multiple rules. Walks the
+   NFA of each rule to discover which tags it created. *)
+let cell_owners num_tags rs tag_map =
+  let num_tags_raw = Array.length tag_map in
+  let tag_to_rule = Array.make num_tags_raw (-1) in
   Array.iteri
     (fun r (start, _) ->
       let visited = Hashtbl.create 31 in
@@ -514,7 +543,95 @@ let tag_owners num_tags rs =
       in
       visit start)
     rs;
-  tag_to_rule
+  let cell_to_rule = Array.make num_tags (-1) in
+  for t = 0 to num_tags_raw - 1 do
+    if tag_to_rule.(t) >= 0 then (
+      let c = tag_map.(t) in
+      let r = tag_to_rule.(t) in
+      if cell_to_rule.(c) = -1 then cell_to_rule.(c) <- r
+      else if cell_to_rule.(c) <> r then cell_to_rule.(c) <- -2)
+  done;
+  cell_to_rule
+
+(* [forward_reachable num_tags num_states raw_dfa init_tags] computes,
+   for each cell [c], the set of DFA states reachable from any transition
+   that writes to [c]. Two cells whose reachable sets overlap may be
+   simultaneously live and must not share a slot. *)
+let forward_reachable num_tags num_states raw_dfa init_tags =
+  let fwd = Array.init num_tags (fun _ -> Array.make num_states false) in
+  (* Seed: mark the target of each transition that writes to cell c. *)
+  for s = 0 to num_states - 1 do
+    let trans, _ = raw_dfa.(s) in
+    Array.iter
+      (fun (_, target, tags) ->
+        List.iter (fun op -> fwd.(tag_cell op).(target) <- true) tags)
+      trans
+  done;
+  List.iter (fun op -> fwd.(tag_cell op).(0) <- true) init_tags;
+  (* BFS: propagate reachability along DFA transitions. *)
+  for c = 0 to num_tags - 1 do
+    let queue = Queue.create () in
+    for s = 0 to num_states - 1 do
+      if fwd.(c).(s) then Queue.push s queue
+    done;
+    while not (Queue.is_empty queue) do
+      let s = Queue.pop queue in
+      let trans, _ = raw_dfa.(s) in
+      Array.iter
+        (fun (_, target, _) ->
+          if not fwd.(c).(target) then (
+            fwd.(c).(target) <- true;
+            Queue.push target queue))
+        trans
+    done
+  done;
+  fwd
+
+(* [color_cells num_tags num_states cell_to_rule fwd] assigns each cell to the
+   lowest-numbered slot that does not conflict with any previously
+   assigned cell. Two cells conflict when they belong to the same rule,
+   when either is shared across rules, or when their forward-reachable
+   sets overlap.
+   Returns [(slot_of, num_slots)]. *)
+let color_cells num_tags num_states cell_to_rule fwd =
+  let slot_of = Array.make num_tags 0 in
+  let max_slot = ref 0 in
+  for c = 0 to num_tags - 1 do
+    let used = Hashtbl.create 8 in
+    for c' = 0 to c - 1 do
+      let conflict =
+        cell_to_rule.(c) < 0
+        || cell_to_rule.(c') < 0
+        || cell_to_rule.(c) = cell_to_rule.(c')
+        ||
+        let overlap = ref false in
+        for s = 0 to num_states - 1 do
+          if fwd.(c).(s) && fwd.(c').(s) then overlap := true
+        done;
+        !overlap
+      in
+      if conflict then Hashtbl.replace used slot_of.(c') ()
+    done;
+    let slot = ref 0 in
+    while Hashtbl.mem used !slot do
+      incr slot
+    done;
+    slot_of.(c) <- !slot;
+    if !slot > !max_slot then max_slot := !slot
+  done;
+  (slot_of, !max_slot + 1)
+
+(* [share_cells] merges non-interfering tag cells from different rules
+   into shared physical slots.
+   Returns [(num_tags, tag_map, raw_dfa, init_tags)] with remapped cells,
+   or the inputs unchanged if no sharing is possible. *)
+let share_cells num_tags num_states rs raw_dfa init_tags tag_map =
+  let cell_to_rule = cell_owners num_tags rs tag_map in
+  let fwd = forward_reachable num_tags num_states raw_dfa init_tags in
+  let slot_of, num_slots = color_cells num_tags num_states cell_to_rule fwd in
+  if num_slots < num_tags then
+    remap_cells slot_of num_slots tag_map raw_dfa init_tags
+  else (num_tags, tag_map, raw_dfa, init_tags)
 
 (* [compile rs] determinizes the NFA for an array of regexp rules.
    Each rule is compiled to an NFA (entry node, final node) pair. The initial
@@ -558,14 +675,25 @@ let compile rs =
           })
         raw_dfa
     in
-    { dfa; init_tags = []; num_tags = 0 })
+    { dfa; init_tags = []; num_tags = 0; tag_map = [||] })
   else
-    let tag_to_rule = tag_owners num_tags_raw rs in
+    let tag_to_rule = cell_owners num_tags_raw rs (Array.init num_tags_raw Fun.id) in
     let delayed_at, scc_id =
       find_delayable_tags num_states num_rules raw_dfa tag_to_rule
     in
-    let dfa = apply_tag_delay raw_dfa delayed_at scc_id in
-    { dfa; init_tags = dedup_tags init_tags; num_tags = num_tags_raw }
+    let raw_dfa = apply_tag_delay raw_dfa delayed_at scc_id in
+    let tag_map = Array.init num_tags_raw Fun.id in
+    let num_tags, tag_map, raw_dfa, init_tags =
+      if num_tags_raw <= 1 || num_rules <= 1 then
+        (num_tags_raw, tag_map, raw_dfa, init_tags)
+      else share_cells num_tags_raw num_states rs raw_dfa init_tags tag_map
+    in
+    let dfa =
+      Array.map
+        (fun (trans, finals) -> { trans; finals })
+        raw_dfa
+    in
+    { dfa; init_tags = dedup_tags init_tags; num_tags; tag_map }
 
 let cset_to_label cset =
   let escape_dot c =
diff --git a/src/syntax/sedlex.mli b/src/syntax/sedlex.mli
index 64fd212..6a2ae48 100644
--- a/src/syntax/sedlex.mli
+++ b/src/syntax/sedlex.mli
@@ -110,6 +110,7 @@ type compiled = {
   num_tags : int;
       (** Total number of memory cells needed at runtime. When [num_tags = 0],
           no memory is allocated (pattern has no [as] bindings). *)
+  tag_map : int array;
 }
 
 (** [compile rules] determinizes the NFA for an array of regexp rules using
diff --git a/test/codegen/test_gen.ml b/test/codegen/test_gen.ml
index 7a46239..418cb83 100644
--- a/test/codegen/test_gen.ml
+++ b/test/codegen/test_gen.ml
@@ -710,7 +710,7 @@ let%expect_test "optim: discriminator elision" =
     | _ -> ()
     |}]
 
-(* Optimization 4: Intra-rule tag coalescing
+(* Optimization 4: Intra-rule tag coalescing [NOT IMPLEMENTED]
    Tags with identical occurrence signatures should share one memory cell.
    Here x_end and y_start fire on the same transitions.
    Current: init_mem 1 (x_start=0, x_end=y_start via Tag offset, y_end=lexeme_length).
@@ -769,12 +769,112 @@ let%expect_test "optim: intra-rule tag coalescing" =
     | _ -> ()
     |}]
 
-(* Optimization 5: Cross-rule cell sharing (graph coloring)
-   Non-interfering rules should reuse the same memory cells.
+(* Or-pattern where both branches bind x at the same DFA position
+   (Plus 'b' after Plus 'a'). Each branch allocates its own start/end
+   tags for x. y's tags differ (Plus 'c' vs Plus 'f' → different states).
+   Current: init_mem 5. With intra-rule coalescing, x's branch tags
+   could share cells since they fire on the same transitions. *)
+let%expect_test "coalescing: or-pattern with same-position bindings" =
+  (match%sedlex_test buf with
+    | Plus 'a', (Plus 'b' as x), (Plus 'c' as y)
+    | Plus 'a', (Plus 'b' as x), (Plus 'f' as y) ->
+        ignore (x, y)
+    | _ -> ());
+  [%expect
+    {|
+    DOT:
+    digraph {
+      rankdir=LR;
+      node [shape=circle];
+
+      _start [shape=point];
+      _start -> state0;
+
+      state0 [label="0"];
+      state0 -> state1 [label="'a'"];
+      state1 [label="1"];
+      state1 -> state1 [label="'a'"];
+      state1 -> state2 [label="'b' {t0-1,t2-1}"];
+      state2 [label="2"];
+      state2 -> state2 [label="'b'"];
+      state2 -> state3 [label="'c' {d4=0,t1-1,t3-1}"];
+      state2 -> state4 [label="'f' {d4=1,t1-1,t3-1}"];
+      state3 [label="3\n[rule 0]", shape=doublecircle];
+      state3 -> state3 [label="'c' {d4=0}"];
+      state4 [label="4\n[rule 0]", shape=doublecircle];
+      state4 -> state4 [label="'f' {d4=1}"];
+    }
+    CODE:
+    let rec __sedlex_state_0 buf =
+      match __sedlex_partition_1 (Sedlexing.__private__next_int buf) with
+      | 0 -> __sedlex_state_1 buf
+      | _ -> Sedlexing.backtrack buf
+    and __sedlex_state_1 buf =
+      match __sedlex_partition_2 (Sedlexing.__private__next_int buf) with
+      | 0 -> __sedlex_state_1 buf
+      | 1 ->
+          (Sedlexing.__private__set_mem_pos buf 0 1;
+           Sedlexing.__private__set_mem_pos buf 2 1;
+           __sedlex_state_2 buf)
+      | _ -> Sedlexing.backtrack buf
+    and __sedlex_state_2 buf =
+      match __sedlex_partition_3 (Sedlexing.__private__next_int buf) with
+      | 0 -> __sedlex_state_2 buf
+      | 1 ->
+          (Sedlexing.__private__set_mem_value buf 4 0;
+           Sedlexing.__private__set_mem_pos buf 1 1;
+           Sedlexing.__private__set_mem_pos buf 3 1;
+           __sedlex_state_3 buf)
+      | 2 ->
+          (Sedlexing.__private__set_mem_value buf 4 1;
+           Sedlexing.__private__set_mem_pos buf 1 1;
+           Sedlexing.__private__set_mem_pos buf 3 1;
+           __sedlex_state_4 buf)
+      | _ -> Sedlexing.backtrack buf
+    and __sedlex_state_3 buf =
+      Sedlexing.mark buf 0;
+      (match __sedlex_partition_4 (Sedlexing.__private__next_int buf) with
+       | 0 -> (Sedlexing.__private__set_mem_value buf 4 0; __sedlex_state_3 buf)
+       | _ -> Sedlexing.backtrack buf)
+    and __sedlex_state_4 buf =
+      Sedlexing.mark buf 0;
+      (match __sedlex_partition_5 (Sedlexing.__private__next_int buf) with
+       | 0 -> (Sedlexing.__private__set_mem_value buf 4 1; __sedlex_state_4 buf)
+       | _ -> Sedlexing.backtrack buf) in
+    match Sedlexing.start buf;
+          Sedlexing.__private__init_mem buf 5;
+          __sedlex_state_0 buf
+    with
+    | 0 ->
+        let x =
+          if (Sedlexing.__private__mem_value buf 4) = 0
+          then
+            let __s = Sedlexing.__private__mem_pos buf 0 in
+            let __e = Sedlexing.__private__mem_pos buf 1 in
+            { Sedlexing.lexbuf = buf; pos = __s; len = (__e - __s) }
+          else
+            (let __s = Sedlexing.__private__mem_pos buf 2 in
+             let __e = Sedlexing.__private__mem_pos buf 3 in
+             { Sedlexing.lexbuf = buf; pos = __s; len = (__e - __s) }) in
+        let y =
+          if (Sedlexing.__private__mem_value buf 4) = 0
+          then
+            let __s = Sedlexing.__private__mem_pos buf 1 in
+            let __e = Sedlexing.lexeme_length buf in
+            { Sedlexing.lexbuf = buf; pos = __s; len = (__e - __s) }
+          else
+            (let __s = Sedlexing.__private__mem_pos buf 3 in
+             let __e = Sedlexing.lexeme_length buf in
+             { Sedlexing.lexbuf = buf; pos = __s; len = (__e - __s) }) in
+        ignore (x, y)
+    | _ -> ()
+    |}]
+
+(* Optimization 5: Cross-rule cell sharing (graph coloring) [DONE]
+   Non-interfering rules reuse the same memory cells.
    Rule 0 and rule 1 never co-exist in the same DFA state (beyond state 0),
-   so their tags can share cells.
-   Current: init_mem 4 (2 per rule: start + end tags; delayed on exits).
-   Goal: init_mem 2 (cells shared across non-interfering rules). *)
+   so their tags share cells.
+   Result: init_mem 2 (cells shared across non-interfering rules; delayed on exits). *)
 let%expect_test "optim: cross-rule cell sharing" =
   (match%sedlex_test buf with
     | Plus 'a', (Plus 'b' as x), Plus 'c' -> ignore x
@@ -803,10 +903,10 @@ let%expect_test "optim: cross-rule cell sharing" =
       state3 -> state3 [label="'c'"];
       state4 [label="4"];
       state4 -> state4 [label="'d'"];
-      state4 -> state5 [label="'e' {t2-1}"];
+      state4 -> state5 [label="'e' {t0-1}"];
       state5 [label="5"];
       state5 -> state5 [label="'e'"];
-      state5 -> state6 [label="'f' {t3-1}"];
+      state5 -> state6 [label="'f' {t1-1}"];
       state6 [label="6\n[rule 1]", shape=doublecircle];
       state6 -> state6 [label="'f'"];
     }
@@ -834,12 +934,12 @@ let%expect_test "optim: cross-rule cell sharing" =
     and __sedlex_state_4 buf =
       match __sedlex_partition_5 (Sedlexing.__private__next_int buf) with
       | 0 -> __sedlex_state_4 buf
-      | 1 -> (Sedlexing.__private__set_mem_pos buf 2 1; __sedlex_state_5 buf)
+      | 1 -> (Sedlexing.__private__set_mem_pos buf 0 1; __sedlex_state_5 buf)
       | _ -> Sedlexing.backtrack buf
     and __sedlex_state_5 buf =
       match __sedlex_partition_6 (Sedlexing.__private__next_int buf) with
       | 0 -> __sedlex_state_5 buf
-      | 1 -> (Sedlexing.__private__set_mem_pos buf 3 1; __sedlex_state_6 buf)
+      | 1 -> (Sedlexing.__private__set_mem_pos buf 1 1; __sedlex_state_6 buf)
       | _ -> Sedlexing.backtrack buf
     and __sedlex_state_6 buf =
       Sedlexing.mark buf 1;
@@ -847,7 +947,7 @@ let%expect_test "optim: cross-rule cell sharing" =
        | 0 -> __sedlex_state_6 buf
        | _ -> Sedlexing.backtrack buf) in
     match Sedlexing.start buf;
-          Sedlexing.__private__init_mem buf 4;
+          Sedlexing.__private__init_mem buf 2;
           __sedlex_state_0 buf
     with
     | 0 ->
@@ -858,8 +958,8 @@ let%expect_test "optim: cross-rule cell sharing" =
         ignore x
     | 1 ->
         let y =
-          let __s = Sedlexing.__private__mem_pos buf 2 in
-          let __e = Sedlexing.__private__mem_pos buf 3 in
+          let __s = Sedlexing.__private__mem_pos buf 0 in
+          let __e = Sedlexing.__private__mem_pos buf 1 in
           { Sedlexing.lexbuf = buf; pos = __s; len = (__e - __s) } in
         ignore y
     | _ -> ()
diff --git a/test/codegen/test_realistic.ml b/test/codegen/test_realistic.ml
index 2a7884d..635fac4 100644
--- a/test/codegen/test_realistic.ml
+++ b/test/codegen/test_realistic.ml
@@ -1,14 +1,12 @@
 (* Realistic multi-rule lexer exercising many patterns simultaneously.
-   Current: init_mem 2.
+   Current: init_mem 1 (rules 0,1 share cell via cross-rule graph coloring).
    - Rule 0: (Plus 'a'..'z' as ns), '.', (Plus 'a'..'z' as name) → 1 tag
      (ns: start=0, end=tag0; name: start=tag0+1 via Tag offset, end=lexeme_length)
    - Rule 1: (Plus 'A'..'Z' as label), '=', (Plus '0'..'9' as value) → 1 tag
-     (label: start=0, end=tag1; value: start=tag1+1 via Tag offset, end=lexeme_length)
+     (label: start=0, end=tag0; value: start=tag0+1 via Tag offset, end=lexeme_length)
    - Rule 2: "0x", (Plus hex as hex), ';' → 0 tags (prefix=2, suffix=1)
    - Rule 3: '(', ('a'..'z' as x), ',', ('a'..'z' as y), ')' → 0 tags (all fixed offsets)
-   - Rule 4: (Plus digits as tok) | (Plus letters as tok) → 0 tags (discriminator elided)
-   Remaining optimization goals:
-   - Cross-rule cell sharing → rules 0,1 share cells (init_mem 2 → 1) *)
+   - Rule 4: (Plus digits as tok) | (Plus letters as tok) → 0 tags (discriminator elided) *)
 let%expect_test "realistic: multi-token lexer" =
   (match%sedlex_test buf with
     | (Plus 'a' .. 'z' as ns), '.', (Plus 'a' .. 'z' as name) ->
@@ -56,7 +54,7 @@ let%expect_test "realistic: multi-token lexer" =
       state9 -> state10 [label="';'"];
       state10 [label="10\n[rule 2]", shape=doublecircle];
       state11 [label="11"];
-      state11 -> state12 [label="'=' {t1-1}"];
+      state11 -> state12 [label="'=' {t0-1}"];
       state11 -> state11 [label="'A'-'Z'"];
       state12 [label="12"];
       state12 -> state13 [label="'0'-'9'"];
@@ -117,7 +115,7 @@ let%expect_test "realistic: multi-token lexer" =
       | _ -> Sedlexing.backtrack buf
     and __sedlex_state_11 buf =
       match __sedlex_partition_9 (Sedlexing.__private__next_int buf) with
-      | 0 -> (Sedlexing.__private__set_mem_pos buf 1 1; __sedlex_state_12 buf)
+      | 0 -> (Sedlexing.__private__set_mem_pos buf 0 1; __sedlex_state_12 buf)
       | 1 -> __sedlex_state_11 buf
       | _ -> Sedlexing.backtrack buf
     and __sedlex_state_12 buf =
@@ -145,7 +143,7 @@ let%expect_test "realistic: multi-token lexer" =
        | 0 -> __sedlex_state_16 buf
        | _ -> Sedlexing.backtrack buf) in
     match Sedlexing.start buf;
-          Sedlexing.__private__init_mem buf 2;
+          Sedlexing.__private__init_mem buf 1;
           __sedlex_state_0 buf
     with
     | 0 ->
@@ -161,10 +159,10 @@ let%expect_test "realistic: multi-token lexer" =
     | 1 ->
         let label =
           let __s = 0 in
-          let __e = Sedlexing.__private__mem_pos buf 1 in
+          let __e = Sedlexing.__private__mem_pos buf 0 in
           { Sedlexing.lexbuf = buf; pos = __s; len = (__e - __s) } in
         let value =
-          let __s = (Sedlexing.__private__mem_pos buf 1) + 1 in
+          let __s = (Sedlexing.__private__mem_pos buf 0) + 1 in
           let __e = Sedlexing.lexeme_length buf in
           { Sedlexing.lexbuf = buf; pos = __s; len = (__e - __s) } in
         ignore (label, value)