diff --git a/CHANGES.md b/CHANGES.md
index 0e8de80..4fc1c42 100644
--- a/CHANGES.md
+++ b/CHANGES.md
@@ -1,6 +1,6 @@
 # unreleased
 - Support nested `let..in` for `[%sedlex.regexp?]` definitions
-- Add support for named captured group (#177, #178)
+- Add support for named captured group (#177, #178, #179)
 
 # 3.7 (2025-10-06)
 - Update to unicode 17.0.0
diff --git a/src/lib/sedlexing.ml b/src/lib/sedlexing.ml
index 653a955..0448af8 100644
--- a/src/lib/sedlexing.ml
+++ b/src/lib/sedlexing.ml
@@ -298,7 +298,10 @@ let __private__init_mem lexbuf n =
     Array.fill lexbuf.__private__mem_saved 0 n (-1)
   end
 
-let __private__set_mem_pos lexbuf i = lexbuf.__private__mem.(i) <- lexbuf.pos
+let __private__set_mem_pos lexbuf i offset =
+  assert (offset >= 0);
+  assert (lexbuf.pos - offset >= 0);
+  lexbuf.__private__mem.(i) <- lexbuf.pos - offset
 
 let __private__set_mem_value lexbuf i v =
   assert (v >= 0);
diff --git a/src/lib/sedlexing.mli b/src/lib/sedlexing.mli
index dcba7a6..702c2b0 100644
--- a/src/lib/sedlexing.mli
+++ b/src/lib/sedlexing.mli
@@ -250,10 +250,10 @@ val __private__next_int : lexbuf -> int
     each [match%sedlex] block that uses [as] bindings. *)
 val __private__init_mem : lexbuf -> int -> unit
 
-(** [__private__set_mem_pos lexbuf i] records the current position in cell [i],
+(** [__private__set_mem_pos lexbuf i offset] records [pos - offset] in cell [i],
     for later retrieval by {!__private__mem_pos}. Used by [Set_position] tag
     operations on DFA transitions. *)
-val __private__set_mem_pos : lexbuf -> int -> unit
+val __private__set_mem_pos : lexbuf -> int -> int -> unit
 
 (** [__private__set_mem_value lexbuf i v] stores integer [v] in cell [i],
     encoded as [-(v + 2)] so it is disjoint from positions and the unset
diff --git a/src/syntax/ppx_sedlex.ml b/src/syntax/ppx_sedlex.ml
index 9afe33d..6e12977 100644
--- a/src/syntax/ppx_sedlex.ml
+++ b/src/syntax/ppx_sedlex.ml
@@ -242,18 +242,19 @@ let call_state lexbuf (auto : Sedlex.dfa) state =
   else appfun (state_fun state) [lexbuf]
 
 (* [gen_tag_ops lexbuf ops cont] wraps [cont] in a sequence of tag
-   operation calls. Each [Set_position t] becomes a call to
-   [__private__set_mem_pos], and each [Set_value (cell, v)] becomes a call to
-   [__private__set_mem_value]. Operations are folded right so they execute
-   before [cont]. *)
+   operation calls. Each [Set_position] becomes a call to
+   [__private__set_mem_pos] and each [Set_value (cell, v)] becomes a call
+   to [__private__set_mem_value]. Operations are folded right so they
+   execute before [cont]. *)
 let gen_tag_ops lexbuf (ops : Sedlex.tag_op list) cont =
   let loc = default_loc in
   List.fold_right
     (fun (op : Sedlex.tag_op) acc ->
       match op with
-        | Set_position t ->
+        | Set_position { cell; offset } ->
             [%expr
-              Sedlexing.__private__set_mem_pos [%e lexbuf] [%e eint ~loc t];
+              Sedlexing.__private__set_mem_pos [%e lexbuf] [%e eint ~loc cell]
+                [%e eint ~loc offset];
               [%e acc]]
         | Set_value (cell, value) ->
             [%expr
@@ -469,6 +470,19 @@ type tag_info = {
          distinct value in the shared discriminator cell. *)
 }
 
+let remap_tag_info (tag_map : int array) (ti : tag_info) =
+  let remap_pos = function
+    | Tag { tag; offset } -> Tag { tag = tag_map.(tag); offset }
+    | Start_plus _ as pe -> pe
+    | End_minus _ as pe -> pe
+  in
+  {
+    ti with
+    start_pos = remap_pos ti.start_pos;
+    end_pos = remap_pos ti.end_pos;
+    disc = List.map (fun (cell, v) -> (tag_map.(cell), v)) ti.disc;
+  }
+
 (* [advance pe len] shifts a position expression forward by [len] code
    points. Returns [None] if either argument is unknown. *)
 let advance pe len =
@@ -1053,6 +1067,7 @@ let handle_sedlex_match_ ~env ~map_rhs match_expr =
   let cases =
     List.map
       (fun (_, tag_info, e) ->
+        let tag_info = List.map (remap_tag_info compiled.tag_map) tag_info in
         let action = gen_binding_code (snd lexbuf) tag_info (map_rhs e) in
         ((), action))
       cases_parsed
diff --git a/src/syntax/sedlex.ml b/src/syntax/sedlex.ml
index 4b8fe70..dbd4e6d 100644
--- a/src/syntax/sedlex.ml
+++ b/src/syntax/sedlex.ml
@@ -17,13 +17,20 @@
 
    2. Tags for `as` bindings (Laurikari-style)
       NFA nodes may carry a tag operation (Set_position or Set_value).
-      [bind] wraps a sub-regexp with start/end tagged epsilon nodes so the
-      DFA can record sub-match positions at runtime. When the PPX can
-      compute one boundary from a known offset (see [pos_expr] in
+      [bind] wraps a sub-regexp with start/end tagged epsilon
+      nodes so the DFA can record sub-match positions at runtime. When the
+      PPX can compute one boundary from a known offset (see [pos_expr] in
       ppx_sedlex.ml), [bind_start_only] or [bind_end_only] is used instead,
       saving a memory cell. Discriminator tags (Set_value) disambiguate
       or-patterns where multiple branches bind the same name.
 
+      After determinization, [compile] applies the self-loop tag delay
+      optimization: when a Set_position tag appears on both a self-loop
+      and all entering transitions to a state, it is removed from those
+      transitions and replaced by Set_position with offset 1 on exit
+      transitions. This
+      avoids writing to the memory cell on every loop iteration.
+
    3. Determinization (compile)
       Classic subset construction, extended to handle tags (Laurikari, NFAs
       with Tagged Transitions, 2000).
@@ -54,15 +61,9 @@
    -----------------------------------------
 
    Tag optimizations for `as` bindings:
-   - Self-loop tag delay: tags on self-loops that also appear on all
-     entering transitions can be removed from those transitions and emitted
-     as a "set previous position" on exit. This turns O(n) tag writes in
-     loops (e.g. Star) into O(1) on exit.
    - Intra-rule tag coalescing: tags with identical occurrence signatures
      (same presence in init_tags and same set of transitions) can share a
      single memory cell.
-   - Cross-rule cell sharing: memory cells from non-interfering rules can
-     share the same physical slot via liveness analysis and graph coloring.
 
    DFA construction:
    - DFA minimization: the generated DFA is not minimized. Hopcroft's or
@@ -74,7 +75,9 @@ module Cset = Sedlex_cset
 
 (* NFA *)
 
-type tag_op = Set_position of int | Set_value of int * int
+type tag_op =
+  | Set_position of { cell : int; offset : int }
+  | Set_value of int * int
 
 type node = {
   id : int;  (** Unique identifier, used for sorting transitions by target. *)
@@ -165,10 +168,14 @@ let bind r =
   let start_tag = new_tag () in
   let end_tag = new_tag () in
   let wrapped succ =
-    let end_node = new_tagged_node (Set_position end_tag) in
+    let end_node =
+      new_tagged_node (Set_position { cell = end_tag; offset = 0 })
+    in
     end_node.eps <- [succ];
     let inner = r end_node in
-    let start_node = new_tagged_node (Set_position start_tag) in
+    let start_node =
+      new_tagged_node (Set_position { cell = start_tag; offset = 0 })
+    in
     start_node.eps <- [inner];
     start_node
   in
@@ -178,7 +185,9 @@ let bind_start_only r =
   let start_tag = new_tag () in
   let wrapped succ =
     let inner = r succ in
-    let start_node = new_tagged_node (Set_position start_tag) in
+    let start_node =
+      new_tagged_node (Set_position { cell = start_tag; offset = 0 })
+    in
     start_node.eps <- [inner];
     start_node
   in
@@ -187,7 +196,9 @@ let bind_start_only r =
 let bind_end_only r =
   let end_tag = new_tag () in
   let wrapped succ =
-    let end_node = new_tagged_node (Set_position end_tag) in
+    let end_node =
+      new_tagged_node (Set_position { cell = end_tag; offset = 0 })
+    in
     end_node.eps <- [succ];
     r end_node
   in
@@ -298,7 +309,329 @@ type dfa_state = {
 }
 
 type dfa = dfa_state array
-type compiled = { dfa : dfa; init_tags : tag_op list; num_tags : int }
+
+type compiled = {
+  dfa : dfa;
+  init_tags : tag_op list;
+  num_tags : int;
+  tag_map : int array;
+}
+
+(* [compute_sccs num_states raw_dfa] computes strongly connected components
+   using Tarjan's algorithm. Returns [(scc_id, scc_count)] where
+   [scc_id.(s)] is the SCC index for state [s]. *)
+let compute_sccs num_states raw_dfa =
+  let scc_id = Array.make num_states (-1) in
+  let scc_count = ref 0 in
+  let index = Array.make num_states (-1) in
+  let lowlink = Array.make num_states 0 in
+  let on_stack = Array.make num_states false in
+  let stack = ref [] in
+  let idx = ref 0 in
+  let rec strongconnect v =
+    index.(v) <- !idx;
+    lowlink.(v) <- !idx;
+    incr idx;
+    stack := v :: !stack;
+    on_stack.(v) <- true;
+    let trans, _ = raw_dfa.(v) in
+    Array.iter
+      (fun (_, w, _) ->
+        if index.(w) = -1 then (
+          strongconnect w;
+          lowlink.(v) <- min lowlink.(v) lowlink.(w))
+        else if on_stack.(w) then lowlink.(v) <- min lowlink.(v) index.(w))
+      trans;
+    if lowlink.(v) = index.(v) then (
+      let id = !scc_count in
+      incr scc_count;
+      let rec pop () =
+        match !stack with
+          | w :: rest ->
+              stack := rest;
+              on_stack.(w) <- false;
+              scc_id.(w) <- id;
+              if w <> v then pop ()
+          | [] -> assert false
+      in
+      pop ())
+  in
+  for v = 0 to num_states - 1 do
+    if index.(v) = -1 then strongconnect v
+  done;
+  (scc_id, !scc_count)
+
+(* [find_delayable_tags num_states num_rules raw_dfa tag_to_rule]
+   identifies Set_position tags that can be delayed from cycle bodies
+   to cycle exits.
+
+   A tag [t] is delayable at state [s] when:
+   1. [s] is part of a cycle (SCC with ≥ 2 states, or a self-loop),
+   2. [s] has at least one transition leaving the SCC,
+   3. [s] is not reachable without [Set_position t] firing
+      (every transition entering [s] carries it), and
+   4. every transition leaving the SCC from a state other than [s]
+      leads only to states from which [tag_to_rule.(t)] is unreachable.
+
+   Returns [(delayed_at, scc_id)] where [delayed_at.(s)] is the list of
+   delayable tags at state [s]. *)
+let find_delayable_tags num_states num_rules raw_dfa tag_to_rule =
+  let scc_id, scc_count = compute_sccs num_states raw_dfa in
+  let scc_size = Array.make scc_count 0 in
+  Array.iter (fun id -> scc_size.(id) <- scc_size.(id) + 1) scc_id;
+  let has_self_loop =
+    Array.init num_states (fun s ->
+        let trans, _ = raw_dfa.(s) in
+        Array.exists (fun (_, target, _) -> target = s) trans)
+  in
+  let in_cycle s = scc_size.(scc_id.(s)) >= 2 || has_self_loop.(s) in
+  let has_exit s =
+    let trans, _ = raw_dfa.(s) in
+    Array.exists (fun (_, target, _) -> scc_id.(target) <> scc_id.(s)) trans
+  in
+  (* [reachable.(s).(r)] is true if some accepting state for rule [r]
+     is reachable from state [s] via transitions. *)
+  let reachable =
+    Array.init num_states (fun s ->
+        let _, finals = raw_dfa.(s) in
+        Array.copy finals)
+  in
+  let changed = ref true in
+  while !changed do
+    changed := false;
+    for s = 0 to num_states - 1 do
+      let trans, _ = raw_dfa.(s) in
+      Array.iter
+        (fun (_, target, _) ->
+          for r = 0 to num_rules - 1 do
+            if reachable.(target).(r) && not reachable.(s).(r) then (
+              reachable.(s).(r) <- true;
+              changed := true)
+          done)
+        trans
+    done
+  done;
+  (* For each state in a cycle that has exits, find Set_position tags
+     on every entering transition, then keep only those whose owning
+     rule is unreachable from non-[s] exits of the SCC. *)
+  let delayed_at = Array.make num_states [] in
+  for s = 0 to num_states - 1 do
+    if in_cycle s && has_exit s then (
+      let entering = ref [] in
+      for s' = 0 to num_states - 1 do
+        let trans', _ = raw_dfa.(s') in
+        Array.iter
+          (fun (_, target, tags) ->
+            if target = s then entering := tags :: !entering)
+          trans'
+      done;
+      let candidates =
+        match !entering with
+          | [] -> []
+          | first :: rest ->
+              List.filter
+                (fun t ->
+                  match t with
+                    | Set_position { offset = 0; _ } ->
+                        List.for_all (List.mem t) rest
+                    | _ -> false)
+                first
+      in
+      let scc = scc_id.(s) in
+      delayed_at.(s) <-
+        List.filter
+          (fun t ->
+            match t with
+              | Set_position { cell; _ } ->
+                  let rule = tag_to_rule.(cell) in
+                  let dominated = ref true in
+                  for s' = 0 to num_states - 1 do
+                    if s' <> s && scc_id.(s') = scc then (
+                      let trans', _ = raw_dfa.(s') in
+                      Array.iter
+                        (fun (_, target, _) ->
+                          if scc_id.(target) <> scc && reachable.(target).(rule)
+                          then dominated := false)
+                        trans')
+                  done;
+                  !dominated
+              | _ -> false)
+          candidates)
+  done;
+  (delayed_at, scc_id)
+
+(* [apply_tag_delay raw_dfa delayed_at scc_id] rewrites the DFA:
+   - removes delayed tags from transitions entering each state, and
+   - emits [Set_position] with offset 1 on transitions leaving the cycle.
+
+   Offset 1 records the position of the previous code point, which is
+   exactly the value the original [Set_position] would have had on the
+   last iteration.
+   This turns O(n) tag writes per loop into O(1) on exit. *)
+let apply_tag_delay raw_dfa delayed_at scc_id =
+  Array.mapi
+    (fun s (trans, finals) ->
+      let trans =
+        Array.map
+          (fun (cs, target, tags) ->
+            let tags =
+              List.filter (fun t -> not (List.mem t delayed_at.(target))) tags
+            in
+            let delayed_ops =
+              if delayed_at.(s) <> [] && scc_id.(target) <> scc_id.(s) then
+                List.filter_map
+                  (fun op ->
+                    match op with
+                      | Set_position { cell; _ } ->
+                          Some (Set_position { cell; offset = 1 })
+                      | _ -> None)
+                  delayed_at.(s)
+              else []
+            in
+            (cs, target, tags @ delayed_ops))
+          trans
+      in
+      (trans, finals))
+    raw_dfa
+
+let tag_cell = function
+  | Set_position { cell; _ } -> cell
+  | Set_value (cell, _) -> cell
+
+(* [remap_cells slot_of num_slots tag_map raw_dfa init_tags] rewrites all tag cell
+   references in the DFA and init_tags through [slot_of], and composes
+   [tag_map] with the remapping. Returns [(num_slots, tag_map, raw_dfa, init_tags)]. *)
+let remap_cells slot_of num_slots tag_map raw_dfa init_tags =
+  let tag_map = Array.map (fun c -> slot_of.(c)) tag_map in
+  let remap_op = function
+    | Set_position { cell; offset } ->
+        Set_position { cell = slot_of.(cell); offset }
+    | Set_value (c, v) -> Set_value (slot_of.(c), v)
+  in
+  let raw_dfa =
+    Array.map
+      (fun (trans, finals) ->
+        let trans =
+          Array.map
+            (fun (cs, target, tags) ->
+              (cs, target, List.sort_uniq compare (List.map remap_op tags)))
+            trans
+        in
+        (trans, finals))
+      raw_dfa
+  in
+  let init_tags = List.sort_uniq compare (List.map remap_op init_tags) in
+  (num_slots, tag_map, raw_dfa, init_tags)
+
+(* [cell_owners num_tags rs tag_map] maps each cell to the rule that owns
+   it: -1 = unused, >= 0 = single rule, -2 = multiple rules. Walks the
+   NFA of each rule to discover which tags it created. *)
+let cell_owners num_tags rs tag_map =
+  let num_tags_raw = Array.length tag_map in
+  let tag_to_rule = Array.make num_tags_raw (-1) in
+  Array.iteri
+    (fun r (start, _) ->
+      let visited = Hashtbl.create 31 in
+      let rec visit node =
+        if not (Hashtbl.mem visited node.id) then (
+          Hashtbl.add visited node.id ();
+          (match node.tag with
+            | Some op -> tag_to_rule.(tag_cell op) <- r
+            | None -> ());
+          List.iter visit node.eps;
+          List.iter (fun (_, n) -> visit n) node.trans)
+      in
+      visit start)
+    rs;
+  let cell_to_rule = Array.make num_tags (-1) in
+  for t = 0 to num_tags_raw - 1 do
+    if tag_to_rule.(t) >= 0 then (
+      let c = tag_map.(t) in
+      let r = tag_to_rule.(t) in
+      if cell_to_rule.(c) = -1 then cell_to_rule.(c) <- r
+      else if cell_to_rule.(c) <> r then cell_to_rule.(c) <- -2)
+  done;
+  cell_to_rule
+
+(* [forward_reachable num_tags num_states raw_dfa init_tags] computes,
+   for each cell [c], the set of DFA states reachable from any transition
+   that writes to [c]. Two cells whose reachable sets overlap may be
+   simultaneously live and must not share a slot. *)
+let forward_reachable num_tags num_states raw_dfa init_tags =
+  let fwd = Array.init num_tags (fun _ -> Array.make num_states false) in
+  (* Seed: mark the target of each transition that writes to cell c. *)
+  for s = 0 to num_states - 1 do
+    let trans, _ = raw_dfa.(s) in
+    Array.iter
+      (fun (_, target, tags) ->
+        List.iter (fun op -> fwd.(tag_cell op).(target) <- true) tags)
+      trans
+  done;
+  List.iter (fun op -> fwd.(tag_cell op).(0) <- true) init_tags;
+  (* BFS: propagate reachability along DFA transitions. *)
+  for c = 0 to num_tags - 1 do
+    let queue = Queue.create () in
+    for s = 0 to num_states - 1 do
+      if fwd.(c).(s) then Queue.push s queue
+    done;
+    while not (Queue.is_empty queue) do
+      let s = Queue.pop queue in
+      let trans, _ = raw_dfa.(s) in
+      Array.iter
+        (fun (_, target, _) ->
+          if not fwd.(c).(target) then (
+            fwd.(c).(target) <- true;
+            Queue.push target queue))
+        trans
+    done
+  done;
+  fwd
+
+(* [color_cells num_tags num_states cell_to_rule fwd] assigns each cell to the
+   lowest-numbered slot that does not conflict with any previously
+   assigned cell. Two cells conflict when they belong to the same rule,
+   when either is shared across rules, or when their forward-reachable
+   sets overlap.
+   Returns [(slot_of, num_slots)]. *)
+let color_cells num_tags num_states cell_to_rule fwd =
+  let slot_of = Array.make num_tags 0 in
+  let max_slot = ref 0 in
+  for c = 0 to num_tags - 1 do
+    let used = Hashtbl.create 8 in
+    for c' = 0 to c - 1 do
+      let conflict =
+        cell_to_rule.(c) < 0
+        || cell_to_rule.(c') < 0
+        || cell_to_rule.(c) = cell_to_rule.(c')
+        ||
+        let overlap = ref false in
+        for s = 0 to num_states - 1 do
+          if fwd.(c).(s) && fwd.(c').(s) then overlap := true
+        done;
+        !overlap
+      in
+      if conflict then Hashtbl.replace used slot_of.(c') ()
+    done;
+    let slot = ref 0 in
+    while Hashtbl.mem used !slot do
+      incr slot
+    done;
+    slot_of.(c) <- !slot;
+    if !slot > !max_slot then max_slot := !slot
+  done;
+  (slot_of, !max_slot + 1)
+
+(* [share_cells] merges non-interfering tag cells from different rules
+   into shared physical slots.
+   Returns [(num_tags, tag_map, raw_dfa, init_tags)] with remapped cells,
+   or the inputs unchanged if no sharing is possible. *)
+let share_cells num_tags num_states rs raw_dfa init_tags tag_map =
+  let cell_to_rule = cell_owners num_tags rs tag_map in
+  let fwd = forward_reachable num_tags num_states raw_dfa init_tags in
+  let slot_of, num_slots = color_cells num_tags num_states cell_to_rule fwd in
+  if num_slots < num_tags then
+    remap_cells slot_of num_slots tag_map raw_dfa init_tags
+  else (num_tags, tag_map, raw_dfa, init_tags)
 
 (* [compile rs] determinizes the NFA for an array of regexp rules.
    Each rule is compiled to an NFA (entry node, final node) pair. The initial
@@ -307,7 +640,9 @@ type compiled = { dfa : dfa; init_tags : tag_op list; num_tags : int }
    (physical identity). Returns a {compiled} record with the DFA, initial
    tag operations, and total number of memory cells needed. *)
 let compile rs =
+  let num_rules = Array.length rs in
   let rs = Array.map compile_re rs in
+  let num_tags_raw = !cur_tag in
   let counter = ref 0 in
   let states = Hashtbl.create 31 in
   let states_def = Hashtbl.create 31 in
@@ -320,7 +655,7 @@ let compile rs =
       let trans = transition state in
       let trans = Array.map (fun (p, t, tags) -> (p, aux t, tags)) trans in
       let finals = Array.map (fun (_, f) -> List.memq f state) rs in
-      Hashtbl.add states_def i { trans; finals };
+      Hashtbl.add states_def i (trans, finals);
       i
   in
   let init = ref ([], []) in
@@ -328,11 +663,37 @@ let compile rs =
   let init_state, init_tags = !init in
   let i = aux init_state in
   assert (i = 0);
-  {
-    dfa = Array.init !counter (Hashtbl.find states_def);
-    init_tags = dedup_tags init_tags;
-    num_tags = !cur_tag;
-  }
+  let num_states = !counter in
+  let raw_dfa = Array.init num_states (Hashtbl.find states_def) in
+  if num_tags_raw = 0 then (
+    let dfa =
+      Array.map
+        (fun (trans, finals) ->
+          {
+            trans = Array.map (fun (cs, target, _) -> (cs, target, [])) trans;
+            finals;
+          })
+        raw_dfa
+    in
+    { dfa; init_tags = []; num_tags = 0; tag_map = [||] })
+  else
+    let tag_to_rule = cell_owners num_tags_raw rs (Array.init num_tags_raw Fun.id) in
+    let delayed_at, scc_id =
+      find_delayable_tags num_states num_rules raw_dfa tag_to_rule
+    in
+    let raw_dfa = apply_tag_delay raw_dfa delayed_at scc_id in
+    let tag_map = Array.init num_tags_raw Fun.id in
+    let num_tags, tag_map, raw_dfa, init_tags =
+      if num_tags_raw <= 1 || num_rules <= 1 then
+        (num_tags_raw, tag_map, raw_dfa, init_tags)
+      else share_cells num_tags_raw num_states rs raw_dfa init_tags tag_map
+    in
+    let dfa =
+      Array.map
+        (fun (trans, finals) -> { trans; finals })
+        raw_dfa
+    in
+    { dfa; init_tags = dedup_tags init_tags; num_tags; tag_map }
 
 let cset_to_label cset =
   let escape_dot c =
@@ -382,8 +743,10 @@ let dfa_to_dot dfa =
         (fun (cset, target, tags) ->
           let label = cset_to_label cset in
           let tag_op_to_string = function
-            | Set_position t -> "t" ^ string_of_int t
-            | Set_value (c, v) -> "d" ^ string_of_int c ^ "=" ^ string_of_int v
+            | Set_position { cell; offset = 0 } -> Printf.sprintf "t%d" cell
+            | Set_position { cell; offset } ->
+                Printf.sprintf "t%d-%d" cell offset
+            | Set_value (c, v) -> Printf.sprintf "d%d=%d" c v
           in
           let label =
             if tags = [] then label
diff --git a/src/syntax/sedlex.mli b/src/syntax/sedlex.mli
index 5bac2b5..6a2ae48 100644
--- a/src/syntax/sedlex.mli
+++ b/src/syntax/sedlex.mli
@@ -55,9 +55,11 @@ val intersection : regexp -> regexp -> regexp option
 
 (** Tag operations emitted on DFA transitions. *)
 type tag_op =
-  | Set_position of int
-      (** [Set_position i]: record the current lexbuf position in memory cell
-          [i]. *)
+  | Set_position of { cell : int; offset : int }
+      (** [Set_position { cell; offset }]: record [pos - offset] in memory cell
+          [cell]. Offset 0 records the current position; offset 1 records the
+          previous code point's position (emitted by the tag delay optimization
+          on transitions leaving a loop). *)
   | Set_value of int * int
       (** [Set_value (cell, v)]: record integer [v] in memory cell [cell] (used
           for or-pattern discriminators). *)
@@ -108,6 +110,7 @@ type compiled = {
   num_tags : int;
       (** Total number of memory cells needed at runtime. When [num_tags = 0],
           no memory is allocated (pattern has no [as] bindings). *)
+  tag_map : int array;
 }
 
 (** [compile rules] determinizes the NFA for an array of regexp rules using
diff --git a/test/basic.ml b/test/basic.ml
index e44cc4f..e0e63fb 100644
--- a/test/basic.ml
+++ b/test/basic.ml
@@ -1303,7 +1303,7 @@ let%expect_test "as_bindings" =
           (Sedlexing.Utf8.of_submatch y)
     | _ -> assert false);
   [%expect {| x=d y=gh |}];
-  (* Test 11: Set_prev with backtracking (Opt at end) *)
+  (* Test 11: Delayed tag with backtracking (Opt at end) *)
   let buf = Sedlexing.Utf8.from_string "aabba" in
   (match%sedlex buf with
     | (Plus 'a' as x), ((Plus 'b', Opt 'a') as y) ->
@@ -1475,3 +1475,83 @@ let%expect_test "as_bindings_nested_sedlex" =
     | Plus 'a' .. 'z' as _x -> Printf.printf "mem_cells=%d\n" (num_mem buf)
     | _ -> assert false);
   [%expect {| mem_cells=0 |}]
+
+let%expect_test "as_bindings_multi_state_cycle" =
+  (* Plus "ab" creates a 2-state cycle; tag delay should still be correct *)
+  let buf = Sedlexing.Utf8.from_string "aaababcc" in
+  (match%sedlex buf with
+    | (Plus 'a' as x), (Plus "ab" as y), (Plus 'c' as z) ->
+        Printf.printf "x=%s y=%s z=%s\n"
+          (Sedlexing.Utf8.of_submatch x)
+          (Sedlexing.Utf8.of_submatch y)
+          (Sedlexing.Utf8.of_submatch z)
+    | _ -> assert false);
+  [%expect {| x=aaa y=bab z=cc |}];
+  (* Single iteration of the cycle *)
+  let buf = Sedlexing.Utf8.from_string "aabc" in
+  (match%sedlex buf with
+    | (Plus 'a' as x), (Plus "ab" as y), (Plus 'c' as z) ->
+        Printf.printf "x=%s y=%s z=%s\n"
+          (Sedlexing.Utf8.of_submatch x)
+          (Sedlexing.Utf8.of_submatch y)
+          (Sedlexing.Utf8.of_submatch z)
+    | _ -> assert false);
+  [%expect {| x=aa y=b z=c |}];
+  (* Many iterations *)
+  let buf = Sedlexing.Utf8.from_string "aabababababccc" in
+  (match%sedlex buf with
+    | (Plus 'a' as x), (Plus "ab" as y), (Plus 'c' as z) ->
+        Printf.printf "x=%s y=%s z=%s\n"
+          (Sedlexing.Utf8.of_submatch x)
+          (Sedlexing.Utf8.of_submatch y)
+          (Sedlexing.Utf8.of_submatch z)
+    | _ -> assert false);
+  [%expect {| x=aa y=babababab z=ccc |}]
+
+let%expect_test "as_bindings_multi_exit_tag_ownership" =
+  (* The tag belongs to rule 1 (not rule 0). With incorrect tag ownership
+     (all tags mapped to rule 0), the delay check would incorrectly allow
+     delay because rule 0 is unreachable from the non-s exit — but rule 1
+     IS reachable there, and it reads the tag. *)
+  let buf = Sedlexing.Utf8.from_string "ababccc" in
+  (match%sedlex buf with
+    | Opt 'a', Plus ('b', 'a'), 'd' -> Printf.printf "rule0\n"
+    | Opt 'b', (Plus ('a', 'b') as x), Plus 'c' ->
+        Printf.printf "x=%s\n" (Sedlexing.Utf8.of_submatch x)
+    | _ -> assert false);
+  [%expect {| x=abab |}];
+  let buf = Sedlexing.Utf8.from_string "abad" in
+  (match%sedlex buf with
+    | Opt 'a', Plus ('b', 'a'), 'd' -> Printf.printf "rule0\n"
+    | Opt 'b', (Plus ('a', 'b') as x), Plus 'c' ->
+        Printf.printf "x=%s\n" (Sedlexing.Utf8.of_submatch x)
+    | _ -> assert false);
+  [%expect {| rule0 |}]
+
+let%expect_test "as_bindings_multi_exit_tag_delay" =
+  (* Opt 'b' shifts Plus "ab" to share a cycle with Plus "ba".
+     The cycle has exits from two states; tag delay relies on the
+     generalized reachability check. *)
+  let buf = Sedlexing.Utf8.from_string "ababccc" in
+  (match%sedlex buf with
+    | Opt 'b', (Plus ('a', 'b') as x), Plus 'c' ->
+        Printf.printf "x=%s\n" (Sedlexing.Utf8.of_submatch x)
+    | Opt 'a', Plus ('b', 'a'), 'd' -> Printf.printf "rule1\n"
+    | _ -> assert false);
+  [%expect {| x=abab |}];
+  (* With the optional 'b' prefix *)
+  let buf = Sedlexing.Utf8.from_string "bababc" in
+  (match%sedlex buf with
+    | Opt 'b', (Plus ('a', 'b') as x), Plus 'c' ->
+        Printf.printf "x=%s\n" (Sedlexing.Utf8.of_submatch x)
+    | Opt 'a', Plus ('b', 'a'), 'd' -> Printf.printf "rule1\n"
+    | _ -> assert false);
+  [%expect {| x=abab |}];
+  (* Rule 1 match (exits from the other cycle state) *)
+  let buf = Sedlexing.Utf8.from_string "abad" in
+  (match%sedlex buf with
+    | Opt 'b', (Plus ('a', 'b') as x), Plus 'c' ->
+        Printf.printf "x=%s\n" (Sedlexing.Utf8.of_submatch x)
+    | Opt 'a', Plus ('b', 'a'), 'd' -> Printf.printf "rule1\n"
+    | _ -> assert false);
+  [%expect {| rule1 |}]
diff --git a/test/codegen/test_gen.ml b/test/codegen/test_gen.ml
index 44af2e0..418cb83 100644
--- a/test/codegen/test_gen.ml
+++ b/test/codegen/test_gen.ml
@@ -562,10 +562,10 @@ let%expect_test "optim: element-length (Offset_from_tag)" =
       _start -> state0;
 
       state0 [label="0"];
-      state0 -> state1 [label="'a' {t0}"];
+      state0 -> state1 [label="'a'"];
       state1 [label="1"];
-      state1 -> state1 [label="'a' {t0}"];
-      state1 -> state2 [label="'b'"];
+      state1 -> state1 [label="'a'"];
+      state1 -> state2 [label="'b' {t0-1}"];
       state2 [label="2"];
       state2 -> state3 [label="'c'"];
       state3 [label="3\n[rule 0]", shape=doublecircle];
@@ -574,12 +574,12 @@ let%expect_test "optim: element-length (Offset_from_tag)" =
     CODE:
     let rec __sedlex_state_0 buf =
       match __sedlex_partition_1 (Sedlexing.__private__next_int buf) with
-      | 0 -> (Sedlexing.__private__set_mem_pos buf 0; __sedlex_state_1 buf)
+      | 0 -> __sedlex_state_1 buf
       | _ -> Sedlexing.backtrack buf
     and __sedlex_state_1 buf =
       match __sedlex_partition_2 (Sedlexing.__private__next_int buf) with
-      | 0 -> (Sedlexing.__private__set_mem_pos buf 0; __sedlex_state_1 buf)
-      | 1 -> __sedlex_state_2 buf
+      | 0 -> __sedlex_state_1 buf
+      | 1 -> (Sedlexing.__private__set_mem_pos buf 0 1; __sedlex_state_2 buf)
       | _ -> Sedlexing.backtrack buf
     and __sedlex_state_2 buf =
       match __sedlex_partition_3 (Sedlexing.__private__next_int buf) with
@@ -710,7 +710,7 @@ let%expect_test "optim: discriminator elision" =
     | _ -> ()
     |}]
 
-(* Optimization 4: Intra-rule tag coalescing
+(* Optimization 4: Intra-rule tag coalescing [NOT IMPLEMENTED]
    Tags with identical occurrence signatures should share one memory cell.
    Here x_end and y_start fire on the same transitions.
    Current: init_mem 1 (x_start=0, x_end=y_start via Tag offset, y_end=lexeme_length).
@@ -730,22 +730,22 @@ let%expect_test "optim: intra-rule tag coalescing" =
       _start -> state0;
 
       state0 [label="0"];
-      state0 -> state1 [label="'a' {t0}"];
+      state0 -> state1 [label="'a'"];
       state1 [label="1"];
-      state1 -> state1 [label="'a' {t0}"];
-      state1 -> state2 [label="'b'"];
+      state1 -> state1 [label="'a'"];
+      state1 -> state2 [label="'b' {t0-1}"];
       state2 [label="2\n[rule 0]", shape=doublecircle];
       state2 -> state2 [label="'b'"];
     }
     CODE:
     let rec __sedlex_state_0 buf =
       match __sedlex_partition_1 (Sedlexing.__private__next_int buf) with
-      | 0 -> (Sedlexing.__private__set_mem_pos buf 0; __sedlex_state_1 buf)
+      | 0 -> __sedlex_state_1 buf
       | _ -> Sedlexing.backtrack buf
     and __sedlex_state_1 buf =
       match __sedlex_partition_2 (Sedlexing.__private__next_int buf) with
-      | 0 -> (Sedlexing.__private__set_mem_pos buf 0; __sedlex_state_1 buf)
-      | 1 -> __sedlex_state_2 buf
+      | 0 -> __sedlex_state_1 buf
+      | 1 -> (Sedlexing.__private__set_mem_pos buf 0 1; __sedlex_state_2 buf)
       | _ -> Sedlexing.backtrack buf
     and __sedlex_state_2 buf =
       Sedlexing.mark buf 0;
@@ -769,12 +769,112 @@ let%expect_test "optim: intra-rule tag coalescing" =
     | _ -> ()
     |}]
 
-(* Optimization 5: Cross-rule cell sharing (graph coloring)
-   Non-interfering rules should reuse the same memory cells.
+(* Or-pattern where both branches bind x at the same DFA position
+   (Plus 'b' after Plus 'a'). Each branch allocates its own start/end
+   tags for x. y's tags differ (Plus 'c' vs Plus 'f' → different states).
+   Current: init_mem 5. With intra-rule coalescing, x's branch tags
+   could share cells since they fire on the same transitions. *)
+let%expect_test "coalescing: or-pattern with same-position bindings" =
+  (match%sedlex_test buf with
+    | Plus 'a', (Plus 'b' as x), (Plus 'c' as y)
+    | Plus 'a', (Plus 'b' as x), (Plus 'f' as y) ->
+        ignore (x, y)
+    | _ -> ());
+  [%expect
+    {|
+    DOT:
+    digraph {
+      rankdir=LR;
+      node [shape=circle];
+
+      _start [shape=point];
+      _start -> state0;
+
+      state0 [label="0"];
+      state0 -> state1 [label="'a'"];
+      state1 [label="1"];
+      state1 -> state1 [label="'a'"];
+      state1 -> state2 [label="'b' {t0-1,t2-1}"];
+      state2 [label="2"];
+      state2 -> state2 [label="'b'"];
+      state2 -> state3 [label="'c' {d4=0,t1-1,t3-1}"];
+      state2 -> state4 [label="'f' {d4=1,t1-1,t3-1}"];
+      state3 [label="3\n[rule 0]", shape=doublecircle];
+      state3 -> state3 [label="'c' {d4=0}"];
+      state4 [label="4\n[rule 0]", shape=doublecircle];
+      state4 -> state4 [label="'f' {d4=1}"];
+    }
+    CODE:
+    let rec __sedlex_state_0 buf =
+      match __sedlex_partition_1 (Sedlexing.__private__next_int buf) with
+      | 0 -> __sedlex_state_1 buf
+      | _ -> Sedlexing.backtrack buf
+    and __sedlex_state_1 buf =
+      match __sedlex_partition_2 (Sedlexing.__private__next_int buf) with
+      | 0 -> __sedlex_state_1 buf
+      | 1 ->
+          (Sedlexing.__private__set_mem_pos buf 0 1;
+           Sedlexing.__private__set_mem_pos buf 2 1;
+           __sedlex_state_2 buf)
+      | _ -> Sedlexing.backtrack buf
+    and __sedlex_state_2 buf =
+      match __sedlex_partition_3 (Sedlexing.__private__next_int buf) with
+      | 0 -> __sedlex_state_2 buf
+      | 1 ->
+          (Sedlexing.__private__set_mem_value buf 4 0;
+           Sedlexing.__private__set_mem_pos buf 1 1;
+           Sedlexing.__private__set_mem_pos buf 3 1;
+           __sedlex_state_3 buf)
+      | 2 ->
+          (Sedlexing.__private__set_mem_value buf 4 1;
+           Sedlexing.__private__set_mem_pos buf 1 1;
+           Sedlexing.__private__set_mem_pos buf 3 1;
+           __sedlex_state_4 buf)
+      | _ -> Sedlexing.backtrack buf
+    and __sedlex_state_3 buf =
+      Sedlexing.mark buf 0;
+      (match __sedlex_partition_4 (Sedlexing.__private__next_int buf) with
+       | 0 -> (Sedlexing.__private__set_mem_value buf 4 0; __sedlex_state_3 buf)
+       | _ -> Sedlexing.backtrack buf)
+    and __sedlex_state_4 buf =
+      Sedlexing.mark buf 0;
+      (match __sedlex_partition_5 (Sedlexing.__private__next_int buf) with
+       | 0 -> (Sedlexing.__private__set_mem_value buf 4 1; __sedlex_state_4 buf)
+       | _ -> Sedlexing.backtrack buf) in
+    match Sedlexing.start buf;
+          Sedlexing.__private__init_mem buf 5;
+          __sedlex_state_0 buf
+    with
+    | 0 ->
+        let x =
+          if (Sedlexing.__private__mem_value buf 4) = 0
+          then
+            let __s = Sedlexing.__private__mem_pos buf 0 in
+            let __e = Sedlexing.__private__mem_pos buf 1 in
+            { Sedlexing.lexbuf = buf; pos = __s; len = (__e - __s) }
+          else
+            (let __s = Sedlexing.__private__mem_pos buf 2 in
+             let __e = Sedlexing.__private__mem_pos buf 3 in
+             { Sedlexing.lexbuf = buf; pos = __s; len = (__e - __s) }) in
+        let y =
+          if (Sedlexing.__private__mem_value buf 4) = 0
+          then
+            let __s = Sedlexing.__private__mem_pos buf 1 in
+            let __e = Sedlexing.lexeme_length buf in
+            { Sedlexing.lexbuf = buf; pos = __s; len = (__e - __s) }
+          else
+            (let __s = Sedlexing.__private__mem_pos buf 3 in
+             let __e = Sedlexing.lexeme_length buf in
+             { Sedlexing.lexbuf = buf; pos = __s; len = (__e - __s) }) in
+        ignore (x, y)
+    | _ -> ()
+    |}]
+
+(* Optimization 5: Cross-rule cell sharing (graph coloring) [DONE]
+   Non-interfering rules reuse the same memory cells.
    Rule 0 and rule 1 never co-exist in the same DFA state (beyond state 0),
-   so their tags can share cells.
-   Current: init_mem 4 (2 per rule: start + end tags for variable-length binding).
-   Goal: init_mem 2 (cells shared across non-interfering rules). *)
+   so their tags share cells.
+   Result: init_mem 2 (cells shared across non-interfering rules; delayed on exits). *)
 let%expect_test "optim: cross-rule cell sharing" =
   (match%sedlex_test buf with
     | Plus 'a', (Plus 'b' as x), Plus 'c' -> ignore x
@@ -791,40 +891,40 @@ let%expect_test "optim: cross-rule cell sharing" =
       _start -> state0;
 
       state0 [label="0"];
-      state0 -> state1 [label="'a' {t0}"];
-      state0 -> state4 [label="'d' {t2}"];
+      state0 -> state1 [label="'a'"];
+      state0 -> state4 [label="'d'"];
       state1 [label="1"];
-      state1 -> state1 [label="'a' {t0}"];
-      state1 -> state2 [label="'b' {t1}"];
+      state1 -> state1 [label="'a'"];
+      state1 -> state2 [label="'b' {t0-1}"];
       state2 [label="2"];
-      state2 -> state2 [label="'b' {t1}"];
-      state2 -> state3 [label="'c'"];
+      state2 -> state2 [label="'b'"];
+      state2 -> state3 [label="'c' {t1-1}"];
       state3 [label="3\n[rule 0]", shape=doublecircle];
       state3 -> state3 [label="'c'"];
       state4 [label="4"];
-      state4 -> state4 [label="'d' {t2}"];
-      state4 -> state5 [label="'e' {t3}"];
+      state4 -> state4 [label="'d'"];
+      state4 -> state5 [label="'e' {t0-1}"];
       state5 [label="5"];
-      state5 -> state5 [label="'e' {t3}"];
-      state5 -> state6 [label="'f'"];
+      state5 -> state5 [label="'e'"];
+      state5 -> state6 [label="'f' {t1-1}"];
       state6 [label="6\n[rule 1]", shape=doublecircle];
       state6 -> state6 [label="'f'"];
     }
     CODE:
     let rec __sedlex_state_0 buf =
       match __sedlex_partition_1 (Sedlexing.__private__next_int buf) with
-      | 0 -> (Sedlexing.__private__set_mem_pos buf 0; __sedlex_state_1 buf)
-      | 1 -> (Sedlexing.__private__set_mem_pos buf 2; __sedlex_state_4 buf)
+      | 0 -> __sedlex_state_1 buf
+      | 1 -> __sedlex_state_4 buf
       | _ -> Sedlexing.backtrack buf
     and __sedlex_state_1 buf =
       match __sedlex_partition_2 (Sedlexing.__private__next_int buf) with
-      | 0 -> (Sedlexing.__private__set_mem_pos buf 0; __sedlex_state_1 buf)
-      | 1 -> (Sedlexing.__private__set_mem_pos buf 1; __sedlex_state_2 buf)
+      | 0 -> __sedlex_state_1 buf
+      | 1 -> (Sedlexing.__private__set_mem_pos buf 0 1; __sedlex_state_2 buf)
       | _ -> Sedlexing.backtrack buf
     and __sedlex_state_2 buf =
       match __sedlex_partition_3 (Sedlexing.__private__next_int buf) with
-      | 0 -> (Sedlexing.__private__set_mem_pos buf 1; __sedlex_state_2 buf)
-      | 1 -> __sedlex_state_3 buf
+      | 0 -> __sedlex_state_2 buf
+      | 1 -> (Sedlexing.__private__set_mem_pos buf 1 1; __sedlex_state_3 buf)
       | _ -> Sedlexing.backtrack buf
     and __sedlex_state_3 buf =
       Sedlexing.mark buf 0;
@@ -833,13 +933,13 @@ let%expect_test "optim: cross-rule cell sharing" =
        | _ -> Sedlexing.backtrack buf)
     and __sedlex_state_4 buf =
       match __sedlex_partition_5 (Sedlexing.__private__next_int buf) with
-      | 0 -> (Sedlexing.__private__set_mem_pos buf 2; __sedlex_state_4 buf)
-      | 1 -> (Sedlexing.__private__set_mem_pos buf 3; __sedlex_state_5 buf)
+      | 0 -> __sedlex_state_4 buf
+      | 1 -> (Sedlexing.__private__set_mem_pos buf 0 1; __sedlex_state_5 buf)
       | _ -> Sedlexing.backtrack buf
     and __sedlex_state_5 buf =
       match __sedlex_partition_6 (Sedlexing.__private__next_int buf) with
-      | 0 -> (Sedlexing.__private__set_mem_pos buf 3; __sedlex_state_5 buf)
-      | 1 -> __sedlex_state_6 buf
+      | 0 -> __sedlex_state_5 buf
+      | 1 -> (Sedlexing.__private__set_mem_pos buf 1 1; __sedlex_state_6 buf)
       | _ -> Sedlexing.backtrack buf
     and __sedlex_state_6 buf =
       Sedlexing.mark buf 1;
@@ -847,7 +947,7 @@ let%expect_test "optim: cross-rule cell sharing" =
        | 0 -> __sedlex_state_6 buf
        | _ -> Sedlexing.backtrack buf) in
     match Sedlexing.start buf;
-          Sedlexing.__private__init_mem buf 4;
+          Sedlexing.__private__init_mem buf 2;
           __sedlex_state_0 buf
     with
     | 0 ->
@@ -858,8 +958,8 @@ let%expect_test "optim: cross-rule cell sharing" =
         ignore x
     | 1 ->
         let y =
-          let __s = Sedlexing.__private__mem_pos buf 2 in
-          let __e = Sedlexing.__private__mem_pos buf 3 in
+          let __s = Sedlexing.__private__mem_pos buf 0 in
+          let __e = Sedlexing.__private__mem_pos buf 1 in
           { Sedlexing.lexbuf = buf; pos = __s; len = (__e - __s) } in
         ignore y
     | _ -> ()
@@ -870,8 +970,7 @@ let%expect_test "optim: cross-rule cell sharing" =
    a final state should be removed.
    Rule 0 has a binding on Plus 'b'; rule 1 does not.
    Both share the Plus 'a', Plus 'b' prefix in the DFA.
-   Current: init_mem 1, tag t0 set on shared prefix transitions
-   even when only rule 1 is reachable via 'd'.
+   Current: init_mem 1, t0 delayed (offset 1 on 'a'→'b' exit).
    Goal: no tags on transitions leading exclusively to rule 1. *)
 let%expect_test "optim: dead tag elimination" =
   (match%sedlex_test buf with
@@ -889,10 +988,10 @@ let%expect_test "optim: dead tag elimination" =
       _start -> state0;
 
       state0 [label="0"];
-      state0 -> state1 [label="'a' {t0}"];
+      state0 -> state1 [label="'a'"];
       state1 [label="1"];
-      state1 -> state1 [label="'a' {t0}"];
-      state1 -> state2 [label="'b'"];
+      state1 -> state1 [label="'a'"];
+      state1 -> state2 [label="'b' {t0-1}"];
       state2 [label="2"];
       state2 -> state2 [label="'b'"];
       state2 -> state3 [label="'c'"];
@@ -903,12 +1002,12 @@ let%expect_test "optim: dead tag elimination" =
     CODE:
     let rec __sedlex_state_0 buf =
       match __sedlex_partition_1 (Sedlexing.__private__next_int buf) with
-      | 0 -> (Sedlexing.__private__set_mem_pos buf 0; __sedlex_state_1 buf)
+      | 0 -> __sedlex_state_1 buf
       | _ -> Sedlexing.backtrack buf
     and __sedlex_state_1 buf =
       match __sedlex_partition_2 (Sedlexing.__private__next_int buf) with
-      | 0 -> (Sedlexing.__private__set_mem_pos buf 0; __sedlex_state_1 buf)
-      | 1 -> __sedlex_state_2 buf
+      | 0 -> __sedlex_state_1 buf
+      | 1 -> (Sedlexing.__private__set_mem_pos buf 0 1; __sedlex_state_2 buf)
       | _ -> Sedlexing.backtrack buf
     and __sedlex_state_2 buf =
       match __sedlex_partition_3 (Sedlexing.__private__next_int buf) with
@@ -930,11 +1029,10 @@ let%expect_test "optim: dead tag elimination" =
     | _ -> ()
     |}]
 
-(* Optimization 7: Self-loop tag delay (Set_prev)
+(* Optimization 7: Self-loop tag delay [DONE]
    Tags on a self-loop that also appear on all entering transitions
-   should be delayed to exit transitions as Set_prev.
-   Current: init_mem 1, set_mem t0 on every 'a' iteration (O(n)).
-   Goal: no set_mem on the self-loop, set_mem_prev on exit (O(1)). *)
+   are delayed to exit transitions with offset 1.
+   Result: init_mem 1, no set_mem on the self-loop, set_mem_pos with offset 1 on exit (O(1)). *)
 let%expect_test "optim: self-loop tag delay" =
   (match%sedlex_test buf with (Plus 'a' as x), Plus 'b' -> ignore x | _ -> ());
   [%expect
@@ -948,22 +1046,22 @@ let%expect_test "optim: self-loop tag delay" =
       _start -> state0;
 
       state0 [label="0"];
-      state0 -> state1 [label="'a' {t0}"];
+      state0 -> state1 [label="'a'"];
       state1 [label="1"];
-      state1 -> state1 [label="'a' {t0}"];
-      state1 -> state2 [label="'b'"];
+      state1 -> state1 [label="'a'"];
+      state1 -> state2 [label="'b' {t0-1}"];
       state2 [label="2\n[rule 0]", shape=doublecircle];
       state2 -> state2 [label="'b'"];
     }
     CODE:
     let rec __sedlex_state_0 buf =
       match __sedlex_partition_1 (Sedlexing.__private__next_int buf) with
-      | 0 -> (Sedlexing.__private__set_mem_pos buf 0; __sedlex_state_1 buf)
+      | 0 -> __sedlex_state_1 buf
       | _ -> Sedlexing.backtrack buf
     and __sedlex_state_1 buf =
       match __sedlex_partition_2 (Sedlexing.__private__next_int buf) with
-      | 0 -> (Sedlexing.__private__set_mem_pos buf 0; __sedlex_state_1 buf)
-      | 1 -> __sedlex_state_2 buf
+      | 0 -> __sedlex_state_1 buf
+      | 1 -> (Sedlexing.__private__set_mem_pos buf 0 1; __sedlex_state_2 buf)
       | _ -> Sedlexing.backtrack buf
     and __sedlex_state_2 buf =
       Sedlexing.mark buf 0;
@@ -985,7 +1083,7 @@ let%expect_test "optim: self-loop tag delay" =
 
 (* Optimization 8: Tag remapping
    After coalescing and dead-tag elimination, the PPX should remap
-   Tag references through the compiler's tag_map.
+   tag references.
    Current: 0 tags (all offsets known: x=0..1, y=1..end-1, z=end-1..end).
    Goal: already optimal. *)
 let%expect_test "optim: tag remapping after coalescing" =
@@ -1042,11 +1140,11 @@ let%expect_test "optim: tag remapping after coalescing" =
     | _ -> ()
     |}]
 
-(* Optimization 9: Set_prev with backtracking
+(* Optimization 9: Delayed tag with backtracking [DONE]
    Opt at the end means the DFA can accept at two states (with or without
-   the optional 'a'). When self-loop tag delay is implemented, the delayed
-   tags (Set_prev) must survive mark/backtrack correctly.
-   Current: init_mem 1 (x: start=0, end=tag0; y: start=tag0, end=lexeme_length). *)
+   the optional 'a'). The delayed tag (offset 1) must survive mark/backtrack
+   correctly.
+   Result: init_mem 1, set_mem_pos with offset 1 on exit (x: start=0, end=tag0; y: start=tag0, end=lexeme_length). *)
 let%expect_test "optim: set_prev with backtracking" =
   (match%sedlex_test buf with
     | (Plus 'a' as x), ((Plus 'b', Opt 'a') as y) -> ignore (x, y)
@@ -1062,10 +1160,10 @@ let%expect_test "optim: set_prev with backtracking" =
       _start -> state0;
 
       state0 [label="0"];
-      state0 -> state1 [label="'a' {t0}"];
+      state0 -> state1 [label="'a'"];
       state1 [label="1"];
-      state1 -> state1 [label="'a' {t0}"];
-      state1 -> state2 [label="'b'"];
+      state1 -> state1 [label="'a'"];
+      state1 -> state2 [label="'b' {t0-1}"];
       state2 [label="2\n[rule 0]", shape=doublecircle];
       state2 -> state3 [label="'a'"];
       state2 -> state2 [label="'b'"];
@@ -1074,12 +1172,12 @@ let%expect_test "optim: set_prev with backtracking" =
     CODE:
     let rec __sedlex_state_0 buf =
       match __sedlex_partition_1 (Sedlexing.__private__next_int buf) with
-      | 0 -> (Sedlexing.__private__set_mem_pos buf 0; __sedlex_state_1 buf)
+      | 0 -> __sedlex_state_1 buf
       | _ -> Sedlexing.backtrack buf
     and __sedlex_state_1 buf =
       match __sedlex_partition_2 (Sedlexing.__private__next_int buf) with
-      | 0 -> (Sedlexing.__private__set_mem_pos buf 0; __sedlex_state_1 buf)
-      | 1 -> __sedlex_state_2 buf
+      | 0 -> __sedlex_state_1 buf
+      | 1 -> (Sedlexing.__private__set_mem_pos buf 0 1; __sedlex_state_2 buf)
       | _ -> Sedlexing.backtrack buf
     and __sedlex_state_2 buf =
       Sedlexing.mark buf 0;
@@ -1164,6 +1262,88 @@ let%expect_test "Rep fixed-length prefix enables Start_plus" =
     | _ -> ()
     |}]
 
+(* Multi-exit tag delay: the cycle has exits from two different states,
+   but the tag's owning rule is only reachable from one of them.
+   Opt 'b' shifts Plus "ab" by one character so it shares the cycle
+   with Plus "ba"; each rule exits from a different cycle state.
+   tag_end fires in the cycle; the generalized check allows delay
+   because the other exit only reaches rule 1 (which has no tags). *)
+let%expect_test "optim: multi-exit tag delay" =
+  (match%sedlex_test buf with
+    | Opt 'b', (Plus ('a', 'b') as x), Plus 'c' -> ignore x
+    | Opt 'a', Plus ('b', 'a'), 'd' -> ()
+    | _ -> ());
+  [%expect
+    {|
+    DOT:
+    digraph {
+      rankdir=LR;
+      node [shape=circle];
+
+      _start [shape=point];
+      _start -> state0;
+
+      state0 [label="0"];
+      state0 -> state1 [label="'a'"];
+      state0 -> state6 [label="'b' {t0}"];
+      state1 [label="1"];
+      state1 -> state2 [label="'b'"];
+      state2 [label="2"];
+      state2 -> state3 [label="'a'"];
+      state2 -> state5 [label="'c' {t1-1}"];
+      state3 [label="3"];
+      state3 -> state2 [label="'b'"];
+      state3 -> state4 [label="'d'"];
+      state4 [label="4\n[rule 1]", shape=doublecircle];
+      state5 [label="5\n[rule 0]", shape=doublecircle];
+      state5 -> state5 [label="'c'"];
+      state6 [label="6"];
+      state6 -> state3 [label="'a'"];
+    }
+    CODE:
+    let rec __sedlex_state_0 buf =
+      match __sedlex_partition_1 (Sedlexing.__private__next_int buf) with
+      | 0 -> __sedlex_state_1 buf
+      | 1 -> (Sedlexing.__private__set_mem_pos buf 0 0; __sedlex_state_6 buf)
+      | _ -> Sedlexing.backtrack buf
+    and __sedlex_state_1 buf =
+      match __sedlex_partition_2 (Sedlexing.__private__next_int buf) with
+      | 0 -> __sedlex_state_2 buf
+      | _ -> Sedlexing.backtrack buf
+    and __sedlex_state_2 buf =
+      match __sedlex_partition_3 (Sedlexing.__private__next_int buf) with
+      | 0 -> __sedlex_state_3 buf
+      | 1 -> (Sedlexing.__private__set_mem_pos buf 1 1; __sedlex_state_5 buf)
+      | _ -> Sedlexing.backtrack buf
+    and __sedlex_state_3 buf =
+      match __sedlex_partition_4 (Sedlexing.__private__next_int buf) with
+      | 0 -> __sedlex_state_2 buf
+      | 1 -> 1
+      | _ -> Sedlexing.backtrack buf
+    and __sedlex_state_5 buf =
+      Sedlexing.mark buf 0;
+      (match __sedlex_partition_5 (Sedlexing.__private__next_int buf) with
+       | 0 -> __sedlex_state_5 buf
+       | _ -> Sedlexing.backtrack buf)
+    and __sedlex_state_6 buf =
+      match __sedlex_partition_6 (Sedlexing.__private__next_int buf) with
+      | 0 -> __sedlex_state_3 buf
+      | _ -> Sedlexing.backtrack buf in
+    match Sedlexing.start buf;
+          Sedlexing.__private__init_mem buf 2;
+          Sedlexing.__private__set_mem_pos buf 0 0;
+          __sedlex_state_0 buf
+    with
+    | 0 ->
+        let x =
+          let __s = Sedlexing.__private__mem_pos buf 0 in
+          let __e = Sedlexing.__private__mem_pos buf 1 in
+          { Sedlexing.lexbuf = buf; pos = __s; len = (__e - __s) } in
+        ignore x
+    | 1 -> ()
+    | _ -> ()
+    |}]
+
 let%expect_test "as binding: or-chain then nested or on right" =
   (match%sedlex_test buf with
     | ("ab" as x), ("ef" as y)
@@ -1303,3 +1483,81 @@ let%expect_test "as binding: or-chain then nested or on right" =
         ignore (x, y)
     | _ -> ()
     |}]
+
+let%expect_test "as binding: Plus with self-loop delay" =
+  (match%sedlex_test buf with
+    | (Plus 'a' as x), (Plus "ab" as y), (Plus 'c' as z) -> ignore (x, y, z)
+    | _ -> ());
+  [%expect
+    {|
+    DOT:
+    digraph {
+      rankdir=LR;
+      node [shape=circle];
+
+      _start [shape=point];
+      _start -> state0;
+
+      state0 [label="0"];
+      state0 -> state1 [label="'a' {t0}"];
+      state1 [label="1"];
+      state1 -> state2 [label="'a'"];
+      state2 [label="2"];
+      state2 -> state2 [label="'a'"];
+      state2 -> state3 [label="'b' {t0-1}"];
+      state3 [label="3"];
+      state3 -> state4 [label="'a'"];
+      state3 -> state5 [label="'c' {t1-1}"];
+      state4 [label="4"];
+      state4 -> state3 [label="'b'"];
+      state5 [label="5\n[rule 0]", shape=doublecircle];
+      state5 -> state5 [label="'c'"];
+    }
+    CODE:
+    let rec __sedlex_state_0 buf =
+      match __sedlex_partition_1 (Sedlexing.__private__next_int buf) with
+      | 0 -> (Sedlexing.__private__set_mem_pos buf 0 0; __sedlex_state_1 buf)
+      | _ -> Sedlexing.backtrack buf
+    and __sedlex_state_1 buf =
+      match __sedlex_partition_1 (Sedlexing.__private__next_int buf) with
+      | 0 -> __sedlex_state_2 buf
+      | _ -> Sedlexing.backtrack buf
+    and __sedlex_state_2 buf =
+      match __sedlex_partition_2 (Sedlexing.__private__next_int buf) with
+      | 0 -> __sedlex_state_2 buf
+      | 1 -> (Sedlexing.__private__set_mem_pos buf 0 1; __sedlex_state_3 buf)
+      | _ -> Sedlexing.backtrack buf
+    and __sedlex_state_3 buf =
+      match __sedlex_partition_3 (Sedlexing.__private__next_int buf) with
+      | 0 -> __sedlex_state_4 buf
+      | 1 -> (Sedlexing.__private__set_mem_pos buf 1 1; __sedlex_state_5 buf)
+      | _ -> Sedlexing.backtrack buf
+    and __sedlex_state_4 buf =
+      match __sedlex_partition_4 (Sedlexing.__private__next_int buf) with
+      | 0 -> __sedlex_state_3 buf
+      | _ -> Sedlexing.backtrack buf
+    and __sedlex_state_5 buf =
+      Sedlexing.mark buf 0;
+      (match __sedlex_partition_5 (Sedlexing.__private__next_int buf) with
+       | 0 -> __sedlex_state_5 buf
+       | _ -> Sedlexing.backtrack buf) in
+    match Sedlexing.start buf;
+          Sedlexing.__private__init_mem buf 2;
+          __sedlex_state_0 buf
+    with
+    | 0 ->
+        let x =
+          let __s = 0 in
+          let __e = Sedlexing.__private__mem_pos buf 0 in
+          { Sedlexing.lexbuf = buf; pos = __s; len = (__e - __s) } in
+        let y =
+          let __s = Sedlexing.__private__mem_pos buf 0 in
+          let __e = Sedlexing.__private__mem_pos buf 1 in
+          { Sedlexing.lexbuf = buf; pos = __s; len = (__e - __s) } in
+        let z =
+          let __s = Sedlexing.__private__mem_pos buf 1 in
+          let __e = Sedlexing.lexeme_length buf in
+          { Sedlexing.lexbuf = buf; pos = __s; len = (__e - __s) } in
+        ignore (x, y, z)
+    | _ -> ()
+    |}]
diff --git a/test/codegen/test_realistic.ml b/test/codegen/test_realistic.ml
index 083745f..635fac4 100644
--- a/test/codegen/test_realistic.ml
+++ b/test/codegen/test_realistic.ml
@@ -1,15 +1,12 @@
 (* Realistic multi-rule lexer exercising many patterns simultaneously.
-   Current: init_mem 2.
+   Current: init_mem 1 (rules 0,1 share cell via cross-rule graph coloring).
    - Rule 0: (Plus 'a'..'z' as ns), '.', (Plus 'a'..'z' as name) → 1 tag
      (ns: start=0, end=tag0; name: start=tag0+1 via Tag offset, end=lexeme_length)
    - Rule 1: (Plus 'A'..'Z' as label), '=', (Plus '0'..'9' as value) → 1 tag
-     (label: start=0, end=tag1; value: start=tag1+1 via Tag offset, end=lexeme_length)
+     (label: start=0, end=tag0; value: start=tag0+1 via Tag offset, end=lexeme_length)
    - Rule 2: "0x", (Plus hex as hex), ';' → 0 tags (prefix=2, suffix=1)
    - Rule 3: '(', ('a'..'z' as x), ',', ('a'..'z' as y), ')' → 0 tags (all fixed offsets)
-   - Rule 4: (Plus digits as tok) | (Plus letters as tok) → 0 tags (discriminator elided)
-   Remaining optimization goals:
-   - Self-loop tag delay → rules 0,1 use Set_prev (O(1) instead of O(n))
-   - Cross-rule cell sharing → rules 0,1 share cells (init_mem 2 → 1) *)
+   - Rule 4: (Plus digits as tok) | (Plus letters as tok) → 0 tags (discriminator elided) *)
 let%expect_test "realistic: multi-token lexer" =
   (match%sedlex_test buf with
     | (Plus 'a' .. 'z' as ns), '.', (Plus 'a' .. 'z' as name) ->
@@ -34,8 +31,8 @@ let%expect_test "realistic: multi-token lexer" =
       state0 -> state1 [label="'('"];
       state0 -> state6 [label="'0'"];
       state0 -> state7 [label="'1'-'9'"];
-      state0 -> state11 [label="'A'-'Z' {t1}"];
-      state0 -> state14 [label="'a'-'z' {t0}"];
+      state0 -> state11 [label="'A'-'Z'"];
+      state0 -> state14 [label="'a'-'z'"];
       state1 [label="1"];
       state1 -> state2 [label="'a'-'z'"];
       state2 [label="2"];
@@ -57,15 +54,15 @@ let%expect_test "realistic: multi-token lexer" =
       state9 -> state10 [label="';'"];
       state10 [label="10\n[rule 2]", shape=doublecircle];
       state11 [label="11"];
-      state11 -> state12 [label="'='"];
-      state11 -> state11 [label="'A'-'Z' {t1}"];
+      state11 -> state12 [label="'=' {t0-1}"];
+      state11 -> state11 [label="'A'-'Z'"];
       state12 [label="12"];
       state12 -> state13 [label="'0'-'9'"];
       state13 [label="13\n[rule 1]", shape=doublecircle];
       state13 -> state13 [label="'0'-'9'"];
       state14 [label="14\n[rule 4]", shape=doublecircle];
-      state14 -> state15 [label="'.'"];
-      state14 -> state14 [label="'a'-'z' {t0}"];
+      state14 -> state15 [label="'.' {t0-1}"];
+      state14 -> state14 [label="'a'-'z'"];
       state15 [label="15"];
       state15 -> state16 [label="'a'-'z'"];
       state16 [label="16\n[rule 0]", shape=doublecircle];
@@ -77,8 +74,8 @@ let%expect_test "realistic: multi-token lexer" =
       | 0 -> __sedlex_state_1 buf
       | 1 -> __sedlex_state_6 buf
       | 2 -> __sedlex_state_7 buf
-      | 3 -> (Sedlexing.__private__set_mem_pos buf 1; __sedlex_state_11 buf)
-      | 4 -> (Sedlexing.__private__set_mem_pos buf 0; __sedlex_state_14 buf)
+      | 3 -> __sedlex_state_11 buf
+      | 4 -> __sedlex_state_14 buf
       | _ -> Sedlexing.backtrack buf
     and __sedlex_state_1 buf =
       match __sedlex_partition_2 (Sedlexing.__private__next_int buf) with
@@ -118,8 +115,8 @@ let%expect_test "realistic: multi-token lexer" =
       | _ -> Sedlexing.backtrack buf
     and __sedlex_state_11 buf =
       match __sedlex_partition_9 (Sedlexing.__private__next_int buf) with
-      | 0 -> __sedlex_state_12 buf
-      | 1 -> (Sedlexing.__private__set_mem_pos buf 1; __sedlex_state_11 buf)
+      | 0 -> (Sedlexing.__private__set_mem_pos buf 0 1; __sedlex_state_12 buf)
+      | 1 -> __sedlex_state_11 buf
       | _ -> Sedlexing.backtrack buf
     and __sedlex_state_12 buf =
       match __sedlex_partition_6 (Sedlexing.__private__next_int buf) with
@@ -133,8 +130,8 @@ let%expect_test "realistic: multi-token lexer" =
     and __sedlex_state_14 buf =
       Sedlexing.mark buf 4;
       (match __sedlex_partition_10 (Sedlexing.__private__next_int buf) with
-       | 0 -> __sedlex_state_15 buf
-       | 1 -> (Sedlexing.__private__set_mem_pos buf 0; __sedlex_state_14 buf)
+       | 0 -> (Sedlexing.__private__set_mem_pos buf 0 1; __sedlex_state_15 buf)
+       | 1 -> __sedlex_state_14 buf
        | _ -> Sedlexing.backtrack buf)
     and __sedlex_state_15 buf =
       match __sedlex_partition_2 (Sedlexing.__private__next_int buf) with
@@ -146,7 +143,7 @@ let%expect_test "realistic: multi-token lexer" =
        | 0 -> __sedlex_state_16 buf
        | _ -> Sedlexing.backtrack buf) in
     match Sedlexing.start buf;
-          Sedlexing.__private__init_mem buf 2;
+          Sedlexing.__private__init_mem buf 1;
           __sedlex_state_0 buf
     with
     | 0 ->
@@ -162,10 +159,10 @@ let%expect_test "realistic: multi-token lexer" =
     | 1 ->
         let label =
           let __s = 0 in
-          let __e = Sedlexing.__private__mem_pos buf 1 in
+          let __e = Sedlexing.__private__mem_pos buf 0 in
           { Sedlexing.lexbuf = buf; pos = __s; len = (__e - __s) } in
         let value =
-          let __s = (Sedlexing.__private__mem_pos buf 1) + 1 in
+          let __s = (Sedlexing.__private__mem_pos buf 0) + 1 in
           let __e = Sedlexing.lexeme_length buf in
           { Sedlexing.lexbuf = buf; pos = __s; len = (__e - __s) } in
         ignore (label, value)