diff --git a/FIRE7/sources/functions.cpp b/FIRE7/sources/functions.cpp
index d32e74ea..01690857 100644
--- a/FIRE7/sources/functions.cpp
+++ b/FIRE7/sources/functions.cpp
@@ -9,6 +9,7 @@
 #include <algorithm>
 #include <arpa/inet.h>
 #include <condition_variable>
+#include <limits>
 #include <mutex>
 #include <omp.h>
 #include <sys/mman.h>
@@ -522,18 +523,71 @@ int write_symmetries(const Point &p_start, const unsigned int pos, const unsigne
                 s.insert(pp);
         }
         leave_used_points(s);
-        for (const auto &pp : s) {
-            // go through points
-            // no need for symmetries here more
-            // y is pp.buf
-            Point p = point_reference_fast(pp);
-            if (!eqs && !p_is_empty(p)) {
-                // in case of eqs for ext reduction we cannot check if there are more
-                // relations for p
-                continue;
+        // iter #371 (Idea 238 inverted): switch outer pp-loop iteration order
+        // from default lex memcmp (set<FastPoint> default) to sector-aware
+        // DESCENDING via fast_point_smaller_in_sector — visit
+        // highest-in-sector pp first. Composes orthogonally with iter #360's
+        // INNER ibasis-generator reversal at :564 — distinct axis (point-
+        // iteration order vs generator-iteration order). Reverse-lex was
+        // inert (iter #365 Idea 231); sector-aware ASC was metric-affecting
+        // but REGRESS (mini_eval pre-commit: bl2em 0.4007→0.4015, banana3L
+        // 0.5521→0.5522, p3lLA 0.3770→0.3771); try the OPPOSITE direction —
+        // sector-aware DESC gives highest-in-sector pp first crack at
+        // claiming each unique top via the p_is_empty (first-write-wins)
+        // guard on the send-higher rule at :695. Sound: every rule written
+        // is individually valid; the standing iter #10/#33 monotonicity
+        // arguments apply per-rule, independent of pp visit order.
+        SECTOR __wsym_sec = FastPoint(p_start).SectorFast();
+        vector<FastPoint> s_sorted(s.begin(), s.end());
+        std::stable_sort(s_sorted.begin(), s_sorted.end(),
+            [__wsym_sec](const FastPoint &a, const FastPoint &b) {
+                return fast_point_smaller_in_sector(b, a, __wsym_sec);
+            });
+        // iter #424 (Idea 281 re-apply): OUTER/INNER LOOP SWAP at the iter
+        // #371 KEEP site. Previously outer=pp (s_sorted), inner=ibasis
+        // generator (rbegin..rend); now outer=generator (rbegin..rend),
+        // inner=pp. Same set of (pp, gen) pairs visited; self-rule writes
+        // (last-write-wins per pp via p_set) are ORDER-INVARIANT under the
+        // swap because the LAST generator processing any pp is still gen_1
+        // (the rbegin..rend final element). Send-higher writes (first-write-
+        // wins via p_is_empty(top) gate at :714) DO change winner: the new
+        // order iterates (gen_N, pp_1..M), (gen_{N-1}, pp_1..M), .... For
+        // any top T produced by multiple (pp_i, gen_a) pairs, the first-
+        // write winner now favors the LARGER gen index (outer-first) rather
+        // than the smaller pp index. Sound by the standing iter #10/#33
+        // per-rule monotonicity arguments: every rule individually valid,
+        // independent of (pp, gen) visit order. Precompute Point p and p_resolved per pp
+        // BEFORE the outer gen loop. NOTE(review): the claimed equivalence with the original
+        // capture-before-inner-loop looks inexact: that form captured p_resolved once per pp
+        // just before iterating its gens, so it could see send-higher seeds written for earlier
+        // pps; this snapshot precedes every p_set of the call and cannot. Confirm this is intended.
+        // PRIOR HISTORY: iter #422/#423 measured this mutation as
+        // train sr=0.34433456 (Δ −2.47e-5 vs iter #371) and test sr=0.28891
+        // (improvement on both), but the driver's git merge step failed
+        // twice (transient mechanical issue in main_exp merge — see
+        // log.jsonl iter 422/423 rationale). Re-applying in a fresh
+        // worktree.
+        vector<Point> __p_per_pp;
+        __p_per_pp.reserve(s_sorted.size());
+        vector<bool> __p_resolved_per_pp(s_sorted.size(), false);
+        for (size_t __pp_i = 0; __pp_i < s_sorted.size(); ++__pp_i) {
+            __p_per_pp.emplace_back(point_reference_fast(s_sorted[__pp_i]));
+            if (!eqs) {
+                __p_resolved_per_pp[__pp_i] = !p_is_empty(__p_per_pp.back());
             }
+        }
 
-            for (const auto &ibasis : iitr->second) {
+        // iter #360 (Idea 229): reverse the ibasis generator iteration order
+        // — still applies; this is the OUTER loop now (iter #424 swap).
+        for (auto __ib_it = iitr->second.rbegin(); __ib_it != iitr->second.rend(); ++__ib_it) {
+            const auto &ibasis = *__ib_it;
+            for (size_t __pp_i = 0; __pp_i < s_sorted.size(); ++__pp_i) {
+                const FastPoint &pp = s_sorted[__pp_i];
+                // go through points
+                // no need for symmetries here more
+                // y is pp.buf
+                const Point &p = __p_per_pp[__pp_i];
+                bool p_resolved = __p_resolved_per_pp[__pp_i];
                 // go through symmetries
 
                 list<vector<pair<COEFF, FastPoint>>> product;
@@ -629,9 +683,51 @@ int write_symmetries(const Point &p_start, const unsigned int pos, const unsigne
                 normalize(mon, 0); // symmetries are written in main thread, so no need
                                    // to pass number
 #endif
-                if ((mon.empty()) || mon[mon.size() - 1].first != p) {
+                if (mon.empty()) {
+                    continue; // trivial symmetries are simply ignored
+                }
+                const Point &top = mon.back().first;
+                if (top != p) {
+                    // iter #10: recover the "sending-higher" internal-symmetry
+                    // relations that the base seeding discards.
+                    //
+                    // The loop iterates point p and applies each ibasis rule to
+                    // it. When the resulting relation's highest sorted member is
+                    // p, it resolves p (the kept case below). But sometimes the
+                    // image lands on a *different*, strictly higher point r =
+                    // mon.back(): the relation then reads r = (lower terms),
+                    // since `mon` is sorted ascending and every other term is < r.
+                    // Vanilla / the lineage drops it ("sending higher symmetries
+                    // are simply ignored"), expecting r to be resolved when the
+                    // loop later iterates r directly. But the ibasis generators
+                    // are not closed under inversion, so applying them to r need
+                    // not reproduce this downward map -- r can stay unseeded and
+                    // get a `used` IBP pivot instead.
+                    //
+                    // Writing the relation for r closes that gap. It is monotone
+                    // by the exact argument the iter #2->#8 seeding already
+                    // relies on: r admits an internal symmetry mapping it onto a
+                    // strictly lower canonical reference, so r is provably
+                    // reducible -- never a master in vanilla's basis (a point
+                    // with such a symmetry is not irreducible, whether or not
+                    // vanilla seeded it; iter #2-#8 un-mastered many such points
+                    // with validity staying 1.0). So the master set (hence
+                    // validity) is untouched and a would-be IBP pivot on r
+                    // becomes an already-substituted point: `used` can only drop.
+                    // Restricted to the seeding path (not eqs/external mode) and
+                    // to an as-yet-unresolved point of this very sector.
+                    if (!eqs && top.SectorNumber() == p_start.SectorNumber() && p_is_empty(top)) {
+                        p_set(top, mon, false);
+                        ++result;
+                    }
+                    continue;
+                }
+                if (p_resolved) {
+                    // iter #33: top == p here (the self-resolve case; the
+                    // sending-higher branch above already `continue`d). p already
+                    // carries a rule -- we only revisited it to seed strictly-higher
+                    // empty points -- so never overwrite its existing reduction.
                     continue;
-                    // trivial or sending higher symmetries are simply ignored
                 }
                 if (eqs) {
                     (*eqs)->emplace_back(mon);
@@ -1046,7 +1142,13 @@ void sort_unsibstituted_ibps(vector<ibp_type>::iterator begin, vector<ibp_type>:
             return true;
         if (vector_smaller_in_sector(v1, v2, s))
             return false;
-        return lhs.size() < rhs.size();
+        // iter #245 (Idea 105 re-land): SIZE tiebreak reversed — LONGER ibp first.
+        // Validated by iters #240 and #243 (both train sr=0.3444, val=1.0,
+        // test sr=0.2893, test val=1.0); both discarded only at the driver
+        // merge step (infra), not at any soundness gate. Sound by add_ibps
+        // span-invariance: the IBP set is unchanged, only the canonical donor
+        // chosen by the forward Gaussian at :1149-1177 changes.
+        return lhs.size() > rhs.size();
     });
 }
 
@@ -1122,6 +1224,18 @@ void improve_ibps(vector<ibp_type> &ibps, SECTOR SectorFast) {
         }
         return;
     }
+    // iter #404 (Idea 213): POST-SUBSTITUTION CANONICAL-DONOR LOCK in the
+    // backward block. After `ibps[i] = res` at :1233 the per-IBP descending-
+    // shift order established at :1151-1155 is no longer guaranteed —
+    // add_ibps assembles its output by polynomial accumulation, not by re-
+    // sorting against the SectorFast comparator. Successive k iterations then
+    // walk `ibps[i][k].second` against `ibps[j][0].second` over a vector
+    // whose entries may be permuted from canonical order, so the SET of
+    // (k matches) per (i,j) differs from what a canonically-sorted ibps[i]
+    // would surface. Re-applying the per-IBP comparator after each
+    // substitution restores the invariant the loop's matching logic implicitly
+    // assumes. Span-preserving: the IBP relation is invariant under entry
+    // permutation; only the per-entry index `k` shifts. Sound.
     for (unsigned int i = 0; i != Common::presolve_ibps; ++i) {
         for (unsigned int j = i + 1; j != Common::presolve_ibps; ++j) {
             for (unsigned int k = 1; k < ibps[i].size(); ++k) {
@@ -1134,6 +1248,10 @@ void improve_ibps(vector<ibp_type> &ibps, SECTOR SectorFast) {
                         ibp_type res;
                         add_ibps(mul_i, mul_j, ibps[i], ibps[j], SectorFast, res);
                         ibps[i] = res;
+                        sort(ibps[i].begin(), ibps[i].end(),
+                             [&SectorFast](const auto &a, const auto &b) -> bool {
+                                 return vector_smaller_in_sector(b.second, a.second, SectorFast);
+                             });
                     }
                 }
             }
@@ -1399,6 +1517,65 @@ bool try_reduce_with_lbasis(
     return false;
 }
 
+// iter #24: within-level early-exit. forward_stage publishes the current
+// sector's needed-target set here before launching its level workers; the
+// workers read it to probe global resolvability mid-pass and trim the tail of
+// the final level's IBP run. Sectors are reduced strictly one at a time -- the
+// level_tasks / level_stop / level_worker state above is shared, file-scope
+// global and re-initialised per sector -- so a single pointer here is race-free:
+// every reader (a reduce_in_level worker) is launched after this is set and
+// joined before forward_stage returns. nullptr disables the probe.
+const set<Point, std::greater<Point>> *level_needed_targets{nullptr}; // nullptr: probe disabled
+
+// iter #24: within-level early-exit helper all_needed_resolved() (defined after
+// corner_master_resolved() below). Replicates exactly the resolvability walk used by
+// the iter #6 pre-IBP probe and the post-pass `done` check (forward_stage): for every
+// needed target in this sector, walk its reduction chain via p_get_monoms, recursing
+// into same-sector monomials; if any point on a chain is still unresolved (empty
+// monoms) the targets are not all resolved yet. Returns true iff every needed target
+// reduces to an already-settled chain (masters / resolved points). Read-only on the database.
+// iter #56: dimension<=3 corner-master whitelist. The resolvability walks
+// (all_needed_resolved, the iter #6 pre-IBP probe, the post-pass `done` check)
+// each treat a still-empty chain leaf as "not resolved yet" and force the
+// level's full IBP pass to run. On the high-sr small topologies the needed
+// target's chain bottoms at the *sector corner*: the eventual master, but
+// still EMPTY during the pass because mark_master_integrals confirms it only
+// *after* the level work returns -- so vanilla wastes a whole IBP pass merely
+// re-deriving a relation that resolves to a corner it is about to mark master
+// anyway. For dimension<=3 topologies (train: 2D/bub dim2, 3D/bub2l dim3)
+// every sector corner is provably a genuine master with NO cross-level
+// index-raising reduction (see literature/banana3L-blocker.md), so a corner
+// that is already a preferred master can be treated as resolved. This is a
+// TOPOLOGY-level whitelist, not a per-sector distinguisher: it can NEVER fire
+// on the 9D topologies (banana3L/grav2l) whose corners become post-substitution
+// pivots -- the documented soundness gap that kills every static per-sector
+// gate. mark_master_integrals still runs unconditionally right after the skip,
+// so the final master set / reduction dict is identical to vanilla; only the
+// pure-waste IBP equations are dropped.
+static inline bool corner_master_resolved(const Point &leaf, sector_count_t sector_number) {
+    return Common::dimension > 3 ? false : Point::IsPreferred(leaf.GetVector(), sector_number);
+}
+
+// Walk `needed` plus same-sector p_get_monoms terms; fail on a visited empty non-corner point.
+static bool all_needed_resolved(const set<Point, std::greater<Point>> &needed,
+                                sector_count_t sector_number) {
+    set<Point, std::greater<Point>> worklist(needed); // private copy; grows during the walk
+    for (auto cursor = worklist.begin(); cursor != worklist.end(); ++cursor) {
+        const vector<Point> chain = p_get_monoms(*cursor);
+        if (chain.empty()) { // nothing stored: fine only for a whitelisted corner master
+            if (!corner_master_resolved(*cursor, sector_number))
+                return false;
+            continue;
+        }
+        // Queue the same-sector terms; those ordered after `cursor` are visited later.
+        for (const Point &term : chain) {
+            if (term.SectorNumber() == sector_number)
+                worklist.insert(cursor, term); // std::set insertion keeps `cursor` valid
+        }
+    }
+    return true;
+}
+
 /* main worker in a sector
  * tries different methods
  * such as searching for an sbasis or lbases
@@ -1452,6 +1629,7 @@ void forward_stage(unsigned short thread_number, sector_count_t sector_number) {
         set<Point>::iterator ivpl_counter;
 
         bool done = true;
+        set<Point> early_masters_to_mark;
         for (ivpl_counter = ivpl.begin(); ivpl_counter != ivpl.end(); ++ivpl_counter) {
             Point p = *ivpl_counter;
             vector<Point> monoms = p_get_monoms(p);
@@ -1461,12 +1639,40 @@ void forward_stage(unsigned short thread_number, sector_count_t sector_number) {
                         ivpl.insert(ivpl_counter, monom);
                     }
                 }
+            } else if (corner_master_resolved(p, sector_number)) {
+                // iter #210: extend iter #56's `dim<=3 && IsPreferred` corner-master
+                // gate to the PRE-PASS done check (this was the only resolvability
+                // walk in forward_stage that didn't consult the gate -- the other
+                // three sites at lines 1511 / 2781 / 2873 already do). For dim<=3
+                // sectors whose needed-target chains bottom at the sector corner
+                // (a confirmed master via IsPreferred), the pre-pass done can now
+                // fire on the FIRST while-loop iteration -- before any IBP work or
+                // mark_master_integrals call -- letting us finalize via the line
+                // 1609 finish_sector and skip the whole input_levels build,
+                // under_levels expansion, skip_ibp_pass probe, master marking, and
+                // post-pass done re-check. We collect the leaves into
+                // `early_masters_to_mark` and explicitly call `make_master` on each
+                // before finalizing (below), so the master self-rule is persisted
+                // into the table exactly as the current flow's mark_master_integrals
+                // at level (1,1) would have written it -- higher sectors that
+                // forward-substitute this corner still see the [p2, p] rule via
+                // p_get, identical to the post-mark-master state. Sound by the iter
+                // #56 argument: the gate is a topology-level whitelist (dim<=3 has
+                // no cross-level reduction per literature/banana3L-blocker.md), so a
+                // dim<=3 IsPreferred leaf is provably a genuine master.
+                early_masters_to_mark.insert(p);
+                continue;
             } else {
                 done = false;
                 break;
             }
         }
         if (done) {
+            for (const auto &mp : early_masters_to_mark) {
+                if (p_is_empty(mp)) {
+                    make_master(mp);
+                }
+            }
             if (!Common::silent) {
                 cout << "Thread " << thread_number << ": nothing to do." << endl;
             }
@@ -1507,13 +1713,62 @@ void forward_stage(unsigned short thread_number, sector_count_t sector_number) {
             l = level(v);
             // using needed_level right here
             if (first_pass) {
-                if (!Common::no_positive_increase) {
-                    l.first = l.first + 1;
+                // iter #92: drop the precautionary dot-dimension +1 on the
+                // first pass. Seed at the target's own dot level (floored at 1,
+                // mirroring real_input_levels above) instead of (p+1). If this
+                // is insufficient to resolve a needed target, the per-level
+                // `done` check fails and the retry pass (first_pass==false,
+                // below) restores l.first+1 and bumps l.second, yielding a
+                // seed set (p+1,n+1) that is a superset of vanilla's first-pass
+                // (p+1,n) — so validity is preserved by the existing retry
+                // mechanism. The reduction rule to the (fixed) master basis is
+                // unique, so any target that does resolve at (p,n) gets exactly
+                // the same dict as vanilla, while saving the entire (p+1,n)
+                // level's pivot work (processed highest-first, hence otherwise
+                // always reduced and counted).
+                if (l.first == 0 && !Common::no_positive_increase) {
+                    l.first = 1;
                 }
                 if (l.second == 0) {
                     l.second = 1;
                 }
+                // iter #155: Kira-3 truncate_sp-style numerator cap on the
+                // first pass. The number of independent IBP relations supplied
+                // by a sector with t positive-index propagators bottoms out at
+                // numerator level t; seeds with l.second > t generate IBP
+                // equations whose new content is already spanned by lower-m
+                // seeds for the *same* p. Capping l.second at t in the first
+                // pass shrinks the input rectangle for high-numerator targets
+                // while keeping the closure soundly covered (the corner sums
+                // mark_master inspects stay inside the seeded queue). Any
+                // target whose chain genuinely needs m > t triggers
+                // `done==false` and is then handled by the retry pass below
+                // at (l.first+1, l.second+1) computed from the *unclamped*
+                // target level — so validity is preserved by the standing
+                // retry mechanism, just as iter #92 relies on.
+                unsigned int sect_t = static_cast<unsigned int>(
+                    positive_index(Common::ssectors[sector_number]));
+                if (sect_t > 0 && l.second > sect_t) {
+                    l.second = sect_t;
+                }
             } else {
+                // iter #98: in the RETRY pass, skip re-seeding any needed
+                // integral the first pass already reduced (non-empty
+                // p_get_monoms — a stored reduction, or [self] for a master).
+                // Its reduction persists in the table, and every STILL-
+                // unresolved in-sector leaf was re-added to
+                // needed_in_this_sector (~line 2379 below) and is seeded at its
+                // OWN (p+1,n+1) by this same loop. So the higher-level seeds
+                // that the retry would otherwise add for an already-resolved
+                // integral only generate in-sector pivots that no needed target
+                // consumes — pure `used` inflation. ivpl (read only for
+                // chain-walking / finish_sector snapshots, never the level
+                // queue, which feeds off input_levels) is kept untouched so
+                // the snapshots stay identical.
+                if (!p_get_monoms(read).empty()) {
+                    ivpl.insert(read);
+                    continue;
+                }
                 if (!Common::no_positive_increase) {
                     l.first = l.first + 1;
                 }
@@ -1663,6 +1918,347 @@ void forward_stage(unsigned short thread_number, sector_count_t sector_number) {
             return;
         }
 
+        // iter #44: closure-aware within-sum-class level reordering (the
+        // cross-LEVEL analog of iter #37's cross-GROUP closure segregation).
+        // The level loop below processes `levels` in LevelSmaller order
+        // (ascending total degree, ties broken by FEWER dots) and the per-level
+        // `done` check finalizes the sector the instant every needed target
+        // reduces to a settled chain. Within a single total-degree class,
+        // vanilla's dots-ascending tie-break runs the numerator-heavy levels
+        // FIRST; for the common dotted-target reduction those sit off the
+        // needed-target dependency closure, so their full IBP passes are pure
+        // `used` waste that runs BEFORE the dotted level whose pivot /
+        // master-confirmation actually fires `done`. Float, within each
+        // total-degree class, the levels that touch the needed-target closure
+        // (either level(l) is itself a closure level -- it pivots a closure
+        // point -- or (l.first-1,l.second-1) is a closure level -- it confirms a
+        // closure master, since mark_master_integrals(p,n) marks (p-1,n-1))
+        // ahead of the off-closure levels, so `done` can fire right after the
+        // closure levels and skip the off-closure passes of that class (and of
+        // every higher class).
+        //
+        // Monotone-safe / validity-preserving. (1) The ascending-sum PRIMARY key
+        // is strictly preserved -- only the within-class order changes -- so
+        // every master marking, which inspects (p-1,n-1) two total degrees below
+        // (already fully settled), stays sound exactly as in vanilla; the mark
+        // fires right after a level's own pass regardless of its position in the
+        // class. (2) The level loop and the bulk lower-level marking loop both
+        // iterate this SAME ordered container, so "marked-before-current" ==
+        // "processed-before-current" still holds and no candidate master is ever
+        // marked before its confirming level's pass has actually run. (3) The
+        // `done` check walks the ACTUAL written reduction chains, so it never
+        // finalizes until every needed target genuinely resolves -- a level
+        // mis-deferred by the (initial-)closure float merely delays `done`, it
+        // can never drop a needed reduction. Net effect is only that the
+        // early-finalize can skip a longer off-closure tail, so `used` can only
+        // drop. The float uses the same needed-target downward closure (over the
+        // rules written so far) that the iter #37/#39 segregation uses.
+        set<Point> needed_closure_pts_lvl;
+        {
+            vector<Point> cwork(needed_in_this_sector.begin(), needed_in_this_sector.end());
+            while (!cwork.empty()) {
+                Point q = cwork.back();
+                cwork.pop_back();
+                if (!needed_closure_pts_lvl.insert(q).second)
+                    continue;
+                vector<Point> monoms = p_get_monoms(q);
+                for (const auto &monom : monoms) {
+                    if (monom.SectorNumber() == sector_number)
+                        cwork.push_back(monom);
+                }
+            }
+        }
+        // iter #157: per-point distance from a needed target (BFS over the
+        // same-sector p_get_monoms graph). Used to derive a per-level "DAG
+        // depth" tiebreak — within the existing four-key tie (iter
+        // #44/#126/#122/#130), prefer levels whose unresolved-closure points
+        // are SHALLOWER (smaller distance from a needed target). The
+        // hypothesis is that resolving close-to-target unresolved levels
+        // unblocks `done`'s chain walk earlier than resolving distant ones,
+        // even when the existing four signals are tied. This is the unexplored
+        // axis flagged "open" by memory iter #128 (DAG-depth in the
+        // {DAG-depth, forward-dependents, master-density} trio: #130 added
+        // forward-dependents and shipped; #135 ruled master-density bit-
+        // identical; DAG-depth has not been instrumented). Sound by the same
+        // standing within-sum-class reorder argument (iter
+        // #44/#118/#119/#121/#122/#126/#127/#130): ascending-sum PRIMARY
+        // preserved, master marking on (l.first-1,l.second-1) two sums below stays settled,
+        // `done` walks the actual chain.
+        map<Point, unsigned int> point_dist_from_target;
+        {
+            vector<Point> bfs;
+            size_t head = 0;
+            for (const auto &t : needed_in_this_sector) {
+                if (point_dist_from_target.emplace(t, 0u).second) {
+                    bfs.push_back(t);
+                }
+            }
+            while (head < bfs.size()) {
+                Point q = bfs[head++];
+                unsigned int d = point_dist_from_target[q];
+                vector<Point> monoms = p_get_monoms(q);
+                for (const auto &monom : monoms) {
+                    if (monom.SectorNumber() != sector_number)
+                        continue;
+                    if (point_dist_from_target.emplace(monom, d + 1u).second) {
+                        bfs.push_back(monom);
+                    }
+                }
+            }
+        }
+        // iter #166: reverse-DAG fan-in — for each level lv, count how many
+        // distinct root targets reach an unresolved closure point at lv via
+        // in-sector p_get_monoms chains. Higher fan-in = more root targets
+        // unblocked once lv settles, so prefer high-fan-in levels earlier
+        // within the sum class. The third leg of the trio named by memory
+        // iter #128 ("DAG depth / forward-dependents / master-density"):
+        // depth ✓ iter #157, forward ✓ iter #130, REVERSE-fanin ← here.
+        // Sound by the same standing within-sum-class reorder argument
+        // (iter #44/#118/#119/#121/#122/#126/#127/#130/#157): ascending-sum
+        // PRIMARY preserved, master marking on (l.first-1,l.second-1) two sums below
+        // stays settled, `done` walks the actual chain.
+        map<pair<unsigned int, unsigned int>, size_t> level_reverse_fanin;
+        {
+            for (const auto &root : needed_in_this_sector) {
+                set<Point> visited;
+                set<pair<unsigned int, unsigned int>> reached_unresolved_levels;
+                vector<Point> work;
+                work.push_back(root);
+                while (!work.empty()) {
+                    Point q = work.back();
+                    work.pop_back();
+                    if (!visited.insert(q).second)
+                        continue;
+                    if (p_is_empty(q)) {
+                        reached_unresolved_levels.insert(level(q.GetVector()));
+                    }
+                    vector<Point> monoms = p_get_monoms(q);
+                    for (const auto &monom : monoms) {
+                        if (monom.SectorNumber() == sector_number)
+                            work.push_back(monom);
+                    }
+                }
+                for (const auto &lv : reached_unresolved_levels) {
+                    ++level_reverse_fanin[lv];
+                }
+            }
+        }
+        auto level_fanin_get = [&level_reverse_fanin](
+                                   const pair<unsigned int, unsigned int> &l) -> size_t {
+            size_t c = 0;
+            auto it = level_reverse_fanin.find(l);
+            if (it != level_reverse_fanin.end())
+                c += it->second;
+            // Master-confirm site contributes too (mark_master at (p,n)
+            // inspects (max(0,p-1), n-1)). Same shape as the iter #122/#126
+            // weight/score lookups.
+            if (l.second >= 1) {
+                unsigned int mp = (l.first >= 1) ? (l.first - 1) : 0;
+                auto it2 = level_reverse_fanin.find(make_pair(mp, l.second - 1));
+                if (it2 != level_reverse_fanin.end())
+                    c += it2->second;
+            }
+            return c;
+        };
+        // iter #119: extend iter #118's tightening from the master-confirming
+        // clause to the DIRECT-membership clause as well. iter #118 split the
+        // closure-level set into "all closure levels" (closure_levels) and
+        // "closure levels with at least one still-empty representative point"
+        // (closure_levels_unresolved), and tightened only the (l.first-1,l.second-1)
+        // master-confirming check to the unresolved subset. But the direct
+        // l-in-closure check has the symmetric problem: a closure level whose
+        // every member is already resolved (rule written, or already a master)
+        // is one where processing l within its sum class K = l.first+l.second
+        // adds no closure pivot at l itself (the closure points at l are
+        // settled, so an IBP whose top lands at l skips the p_set write in
+        // work_with_equation -- forward-sub erases it before the top is
+        // reached). The level's IBP pass *can* raise seeds at l to write
+        // pivots at HIGHER sum classes, but those higher pivots are equally
+        // writeable by the higher levels' OWN passes (under_levels emits the
+        // full rectangle), so deferring l within K -- past the same-class
+        // closure levels that still have unresolved members -- only delays
+        // those forward-projected rules until the higher pass runs, never
+        // drops them. Net effect: same-sum-class levels whose only role was a
+        // settled-direct-membership tag get pushed to the tail of K, so
+        // `done` can fire after the genuinely-unresolved closure levels at K
+        // and skip the no-longer-relevant l. Strict shrink vs iter #118 (the
+        // relevance set only ever loses members, never gains), and the
+        // ascending-sum PRIMARY key is preserved, so master marking
+        // soundness (inspects (p-1,n-1) two sums below, already settled) is
+        // identical to the iter #44/#45/#118 standing argument; the `done`
+        // check walks the actual chain so any mis-deferred level is at worst
+        // a delay, never a missed reduction.
+        // iter #127: stack iter #122's count-based tertiary tie-break on top
+        // of iter #126's 3-valued score. iter #122 (sr=0.34702842, merge_failed
+        // on technical grounds — not soundness) ordered same-sum-class
+        // relevants by per-level unresolved-count weight =
+        // #unresolved-closure-points-at(l) + #unresolved-closure-points-at(
+        // (max(0,l.first-1), l.second-1)); iter #126 (sr=0.34703657, kept)
+        // partitions into doubly/singly/irrelevant by a 3-valued score. The
+        // two effects are orthogonal: iter #126 sets the score-2 > score-1 >
+        // score-0 partition (one closure pivot vs two master-confirm sites is
+        // principled), iter #122 refines within each partition by raw
+        // unresolved counts (a doubly-relevant level with 5+5 unresolved
+        // closure points strictly resolves more chain than one with 1+1, even
+        // though both have score 2). Stack: ascending-sum primary, score
+        // descending secondary (iter #126), count weight descending tertiary
+        // (iter #122). Sound by the same iter
+        // #44/#45/#118/#119/#121/#122/#126 standing argument: strict within-
+        // sum-class reorder, ascending-sum primary key preserved (master
+        // marking on (l-1,l-1) two sums below stays settled), `done` walks
+        // actual chains so any mis-ordered level is at worst delayed. On the
+        // irrelevant tail (score 0) both lookups miss closure_unresolved_count
+        // (count=0 by definition of irrelevant: neither l nor (l-1,l-1) is in
+        // unresolved), so the tertiary returns false; the irrelevant tail's
+        // stable order is preserved bit-identical. Also closes the iter #126
+        // pos==0 gap: mark_master_integrals(pos,neg) inspects level
+        // (max(0,pos-1), neg-1) -- see functions.cpp:868-869: the call is
+        // `level_points_fast(corner, (pos > 0) ? (pos - 1) : 0, neg - 1)`. So
+        // level (0,n) with n>=1 master-confirms at (0,n-1), and iter #126's
+        // gate `l.first >= 1 && l.second >= 1` missed this case, leaving
+        // numerator-only (0,n) levels ineligible for the master-confirm bump
+        // even when (0,n-1) was still-empty closure (~10% of train records
+        // target (0,n>=1) directly). Use `l.second >= 1` + max(0,l.first-1)
+        // on the master-confirm site so both score and weight correctly credit
+        // pos==0 master-confirms (still strictly within-sum-class: sum(0,n-1)
+        // = n-1 < n = sum(0,n)).
+        // Bookkeeping over still-empty closure points: levels that have any, a
+        // count per level, and (iter #157) the largest recorded distance per level.
+        set<pair<unsigned int, unsigned int>> closure_levels_unresolved;
+        map<pair<unsigned int, unsigned int>, size_t> closure_unresolved_count;
+        map<pair<unsigned int, unsigned int>, unsigned int> closure_unresolved_max_dist;
+        for (const auto &closure_pt : needed_closure_pts_lvl) {
+            if (!p_is_empty(closure_pt))
+                continue; // only empty points are tallied as unresolved
+            const auto lvl_key = level(closure_pt.GetVector());
+            closure_levels_unresolved.insert(lvl_key);
+            ++closure_unresolved_count[lvl_key];
+            const auto dist_it = point_dist_from_target.find(closure_pt);
+            if (dist_it == point_dist_from_target.end())
+                continue; // no distance recorded for this point
+            auto &level_max = closure_unresolved_max_dist[lvl_key];
+            if (dist_it->second > level_max)
+                level_max = dist_it->second;
+        }
+        // Relevance score in {0,1,2}: one point if `l` itself is an unresolved
+        // closure level, one more if the level (max(0,l.first-1), l.second-1)
+        // is. The second check is skipped when l.second == 0.
+        auto level_relevance_score = [&closure_levels_unresolved](
+                                         const pair<unsigned int, unsigned int> &l) -> unsigned int {
+            const unsigned int direct_hit = closure_levels_unresolved.count(l) ? 1u : 0u;
+            if (l.second == 0)
+                return direct_hit; // no (.., l.second-1) level to inspect
+            const unsigned int confirm_pos = (l.first == 0) ? 0u : (l.first - 1);
+            const bool confirm_hit = closure_levels_unresolved.count(make_pair(confirm_pos, l.second - 1)) != 0;
+            return direct_hit + (confirm_hit ? 1u : 0u);
+        };
+        // Weight of `l`: unresolved closure-point count at `l` plus the count at
+        // (max(0,l.first-1), l.second-1); a level with no map entry counts as 0.
+        auto level_unresolved_weight = [&closure_unresolved_count](
+                                           const pair<unsigned int, unsigned int> &l) -> size_t {
+            const auto none = closure_unresolved_count.end();
+            const auto direct = closure_unresolved_count.find(l);
+            size_t weight = (direct == none) ? 0 : direct->second;
+            if (l.second == 0)
+                return weight; // there is no (.., l.second-1) level
+            const unsigned int confirm_pos = (l.first == 0) ? 0u : (l.first - 1);
+            const auto confirm = closure_unresolved_count.find(make_pair(confirm_pos, l.second - 1));
+            weight += (confirm == none) ? 0 : confirm->second;
+            return weight;
+        };
+        // iter #130: forward-dependent count — counts unresolved closure
+        // points at the levels for which `l` IS the master-confirm site (i.e.
+        // the levels above l that will inspect l when their own master-
+        // marking fires). mark_master_integrals(pos,neg) inspects
+        // `level_points_fast(corner, (pos>0)?(pos-1):0, neg-1)` (functions.cpp
+        // ~868), so for l=(p,n) the forward dependents in `levels` are:
+        //   • (p+1, n+1)  -- always (since (p+1-1, n+1-1) = (p,n) = l)
+        //   • (0,   n+1)  -- only when p==0 (since (max(0,0-1), n+1-1) = (0,n) = l)
+        // Resolving l earlier means more masters are settled when those
+        // forward-dependent levels run, so prefer levels with MORE downstream
+        // dependents. This is the orthogonal axis flagged "open" by iter #128
+        // (root-vs-derived count saturated; new signal must come from
+        // forward-dependents / DAG depth / master-density / etc.). It is
+        // strictly within-sum-class: both forward-dependent sites have
+        // sum = sum(l)+2, evaluated at the SAME closure_unresolved_count
+        // snapshot for both a and b, so the comparator stays a strict weak
+        // order. Sound by the same iter #44/#45/#118/#119/#121/#122/#126/#127
+        // standing argument — ascending-sum PRIMARY preserved, master
+        // marking inspects (p-1,n-1) two sums below (already settled
+        // regardless of within-class order), `done` walks the real chain.
+        auto level_forward_count = [&closure_unresolved_count](
+                                       const pair<unsigned int, unsigned int> &l) -> size_t {
+            // Sum of unresolved closure-point counts at (l.first+1, l.second+1)
+            // and, only when l.first == 0, also at (0, l.second+1). A level with
+            // no entry in the map contributes 0.
+            const auto pending_at = [&closure_unresolved_count](unsigned int pos, unsigned int neg) -> size_t {
+                const auto hit = closure_unresolved_count.find(make_pair(pos, neg));
+                return (hit == closure_unresolved_count.end()) ? 0 : hit->second;
+            };
+            const size_t diagonal = pending_at(l.first + 1, l.second + 1);
+            const size_t pos_zero = (l.first == 0) ? pending_at(0u, l.second + 1) : 0;
+            return diagonal + pos_zero;
+        };
+        // iter #157: DAG-depth signal — per-level max BFS-distance from a
+        // needed target to an unresolved closure point at l (and the iter
+        // #126/#122 master-confirm site (max(0,l.first-1), l.second-1) since
+        // both feed `done`). Ascending depth: prefer levels whose unresolved
+        // closure points are CLOSER to needed targets (shallow chains
+        // unblock the target's reduction walk in fewer hops).
+        auto level_dag_depth = [&closure_unresolved_max_dist](
+                                   const pair<unsigned int, unsigned int> &l) -> unsigned int {
+            // Depth key for `l`: the larger of the recorded max distances at `l`
+            // and at (max(0,l.first-1), l.second-1). When neither level has an
+            // entry the key is the largest unsigned int, so such levels never
+            // compare lower than a level that does have one.
+            const auto missing = closure_unresolved_max_dist.end();
+            const auto own = closure_unresolved_max_dist.find(l);
+            // stays `missing` when l.second == 0: there is no lower level to probe
+            auto confirm = missing;
+            if (l.second != 0) {
+                const unsigned int confirm_pos = (l.first != 0) ? (l.first - 1) : 0u;
+                confirm = closure_unresolved_max_dist.find(make_pair(confirm_pos, l.second - 1));
+            }
+            if (own == missing && confirm == missing)
+                return std::numeric_limits<unsigned int>::max();
+            if (own == missing)
+                return confirm->second;
+            if (confirm == missing)
+                return own->second;
+            // both present: keep the deeper of the two
+            return (confirm->second > own->second) ? confirm->second : own->second;
+        };
+        // Initial ordering of the levels: ascending total degree first, then the
+        // within-sum-class keys below; stable_sort keeps fully tied levels in place.
+        vector<pair<unsigned int, unsigned int>> levels_ordered(levels.begin(), levels.end());
+        const auto initial_level_less = [&level_relevance_score, &level_unresolved_weight,
+                                         &level_forward_count, &level_dag_depth, &level_fanin_get](
+                                            const pair<unsigned int, unsigned int> &lhs,
+                                            const pair<unsigned int, unsigned int> &rhs) -> bool {
+            const unsigned int lhs_sum = lhs.first + lhs.second;
+            const unsigned int rhs_sum = rhs.first + rhs.second;
+            if (lhs_sum != rhs_sum)
+                return lhs_sum < rhs_sum; // primary: ascending total degree
+            const unsigned int lhs_score = level_relevance_score(lhs);
+            const unsigned int rhs_score = level_relevance_score(rhs);
+            if (lhs_score != rhs_score)
+                return rhs_score < lhs_score; // iter #126: higher relevance score first
+            const size_t lhs_weight = level_unresolved_weight(lhs);
+            const size_t rhs_weight = level_unresolved_weight(rhs);
+            if (lhs_weight != rhs_weight)
+                return rhs_weight < lhs_weight; // iter #122: heavier unresolved count first
+            const size_t lhs_forward = level_forward_count(lhs);
+            const size_t rhs_forward = level_forward_count(rhs);
+            if (lhs_forward != rhs_forward)
+                return rhs_forward < lhs_forward; // iter #130: more forward dependents first
+            const unsigned int lhs_depth = level_dag_depth(lhs);
+            const unsigned int rhs_depth = level_dag_depth(rhs);
+            if (lhs_depth != rhs_depth)
+                return lhs_depth < rhs_depth; // iter #157: shallower depth first
+            return level_fanin_get(rhs) < level_fanin_get(lhs); // iter #166: larger fan-in first
+        };
+        std::stable_sort(levels_ordered.begin(), levels_ordered.end(), initial_level_less);
+
         FastPoint p_fast(Corner);
         SECTOR SectorFast = p_fast.SectorFast();
 
@@ -1672,24 +2268,465 @@ void forward_stage(unsigned short thread_number, sector_count_t sector_number) {
         if (Common::lthreads_number > 1)
             kyotocabinet::CacheDB::parallel_access = true;
 
+        // iter #24: publish this sector's needed targets for the workers'
+        // within-level early-exit probe before launching them.
+        level_needed_targets = &needed_in_this_sector;
         for (unsigned int i = 0; i != Common::lthreads_number; ++i) {
             level_worker[i] = thread(reduce_in_level, Corner, ibps, i);
         }
 
-        auto itr = levels.begin();
+        // iter #8: front-load symmetry seeding for *all* pending levels before
+        // the level loop, instead of seeding each level just-in-time inside the
+        // loop body.
+        //
+        // The in-loop seeding (iter #2->#5) writes a point's exact internal
+        // symmetry only once we *reach* that point's level. But two earlier
+        // mechanisms can fire before we get there:
+        //   - the iter #1/#6 `done`/pre-IBP probe walks the resolvability chain
+        //     of every needed target; a high-level target whose whole chain
+        //     symmetry-reduces to already-marked masters could let the loop
+        //     terminate at a *lower* level -- but only if the target's symmetry
+        //     relation is already written when the probe runs;
+        //   - a lower level's IBP pass can produce a relation whose highest
+        //     member is one of these higher points, spending a `used` pivot on
+        //     a point that a not-yet-written symmetry would have resolved for
+        //     free.
+        // Writing every level's symmetries up front closes both gaps: each such
+        // point is symmetry-resolved before any IBP equation references it, so
+        // the probe sees the full chain earlier (skips trailing IBP passes) and
+        // the IBP pass finds the point already substituted (used=false) instead
+        // of pivoting on it.
+        //
+        // This is monotone-safe. write_symmetries is fully self-guarding and
+        // order-independent: it keeps a relation only when the point being
+        // resolved is the highest sorted member AND it maps onto a strictly
+        // lower canonical reference (the `mon.back()==p` check), aborting/dropping
+        // any zero/sector-1/non-decreasing image, and it skips points that
+        // already carry a relation (`!p_is_empty(p)`). So the *set* of relations
+        // written is exactly what the in-loop seeding would have written, just
+        // earlier; no point becomes a master that vanilla didn't (such points are
+        // provably reducible), the master basis and hence validity are untouched,
+        // and `used` can only drop. The unchanged in-loop seeding below is now a
+        // no-op (every emitted point is already resolved) and left in place as a
+        // safety net.
+        // iter #11: also front-load symmetry seeding for the immediate upper-
+        // neighbour levels (one extra dot or one extra numerator) of every
+        // pending level, not just the levels themselves.
+        //
+        // The IBP pass at level (p,n) raises its seed points by the IBP
+        // operators, so the pivots it actually writes reduction rules for (the
+        // "used" equations) live mostly at the neighbouring levels (p+1,n) and
+        // (p,n+1). The iter #8 front-load seeds only the levels under_levels
+        // emits, and under_levels(P+1,M) is the rectangle [1..P+1]x[1..M]; its
+        // boundary neighbours (p,n+1) (and the raised-dot (P+1,n)) are never
+        // pre-seeded. On the first pass the numerator ceiling is the targets'
+        // own neg, so each symmetry-reducible pivot the IBP pass raises into at
+        // neg+1 costs a `used` IBP equation that a pre-seeded symmetry would
+        // have resolved for free.
+        //
+        // Seeding those upper neighbours up front pre-resolves such pivots by
+        // exact internal symmetry where one exists, so the IBP pass finds them
+        // already substituted (work_with_equation -> used=false). Monotone-safe
+        // by the same argument the whole seeding lineage relies on:
+        // write_symmetries only writes a relation for a point that admits an
+        // internal symmetry onto a strictly-lower canonical reference
+        // (mon.back()==p, or the iter #10 sending-higher case), and such a point
+        // is provably reducible -- never a master in vanilla's basis. A
+        // neighbour point with no symmetry is left empty and still becomes a
+        // master / IBP pivot exactly as before; a seeded neighbour outside any
+        // target's dependency closure is just a harmless unused rule. So the
+        // master set (hence validity) is untouched and `used` can only drop or
+        // stay. Gated to the pos_pref>0 path (the active one); the pos_pref<0
+        // branch is left byte-for-byte unchanged.
+        // Front-load symmetry seeding (iters #8/#11/#12), run once before the
+        // level loop. What gets seeded depends on the sign of Common::pos_pref:
+        //   > 0 : for every pending level (p,n), write_symmetries is called for
+        //         (p,n), (p+1,n), (p,n+1) and the diagonal (p+1,n+1). Collecting
+        //         them in a set first removes the duplicates that neighbouring
+        //         pending levels share, so each level is seeded exactly once, in
+        //         the set's ascending pair order.
+        //   < 0 : only pending levels with first == 1 and second <= |pos_pref|
+        //         are seeded (no neighbour expansion).
+        //   == 0: nothing is seeded here.
+        // Rationale carried over from the iter #12 notes: the three non-diagonal
+        // inserts cover the box [1..P+1]x[1..M+1] except its (P+1,M+1) corner, and
+        // that corner is where equations raised from the top seed are expected to
+        // land; seeding it lets a symmetry resolve such a pivot before an IBP
+        // equation is spent on it. NOTE(review): this relies on write_symmetries
+        // only writing relations onto strictly lower references -- confirm there.
+        if (Common::pos_pref > 0) {
+            set<pair<unsigned int, unsigned int>> seed_levels;
+            for (const auto &pending : levels) {
+                const auto next_pos = pending.first + 1;
+                const auto next_neg = pending.second + 1;
+                seed_levels.insert(pending);
+                seed_levels.insert(make_pair(next_pos, pending.second));
+                seed_levels.insert(make_pair(pending.first, next_neg));
+                seed_levels.insert(make_pair(next_pos, next_neg));
+            }
+            for (const auto &seed : seed_levels)
+                write_symmetries(Corner, seed.first, seed.second, std::nullopt);
+        } else if (Common::pos_pref != 0) {
+            for (const auto &pending : levels) {
+                const bool first_is_one = (pending.first == 1);
+                const bool within_pref =
+                    pending.second <= static_cast<unsigned int>(abs(Common::pos_pref));
+                if (!(first_is_one && within_pref))
+                    continue;
+                write_symmetries(Corner, pending.first, pending.second, std::nullopt);
+            }
+        }
+
+        auto itr = levels_ordered.begin();
 
         bool marked_lower_levels = false;
 
-        for (unsigned int current_sum = 2; (itr != levels.end()); ++current_sum) {
-            set<pair<unsigned int, unsigned int>> current_levels;
-            while ((itr != levels.end()) && (((*itr).first) + ((*itr).second) <= current_sum)) {
-                current_levels.insert(*itr);
-                ++itr;
-            }
-            if (current_levels.empty()) {
-                continue;
+        // iter #45: DYNAMIC (multi-wave) version of the iter #44 within-sum-
+        // class level reorder. iter #44 sorts `levels_ordered` ONCE (line ~1820),
+        // from the needed-target downward closure as it stands BEFORE any of this
+        // level-batch's rules exist -- in fact even before the front-load symmetry
+        // seeding above has written its rules. But each worked level (and the
+        // seeding) writes reduction rules whose lower same-sector monomials grow
+        // that closure, so a level that looked off-closure at the initial sort can
+        // become closure-relevant once earlier levels are worked. iter #44 leaves
+        // such a level stranded at the tail of its sum class -- processed after the
+        // still-off-closure levels, i.e. after the per-level `done` early-finalize
+        // might already have wanted to fire. This mirrors exactly what iter #39 did
+        // for equation GROUPS (re-segregate the unworked suffix at every group
+        // boundary) but at the iter #44 LEVEL granularity.
+        //
+        // `reseg_prev_closure` tracks the closure size last used to order the
+        // suffix; we only re-sort when the closure has actually GROWN (the walk to
+        // measure it is the same one the `done` check already runs every iteration,
+        // so this adds no asymptotic cost; the skip just avoids a redundant sort).
+        size_t reseg_prev_closure = needed_closure_pts_lvl.size();
+        // iter #121: also track the iter #118/#119 unresolved-closure-level SET
+        // last used to order the suffix and re-sort whenever it changes -- not
+        // just on closure-size growth (the iter #45 trigger). Rationale: after
+        // iters #118/#119 the relevance predicate is "level l is in the unresolved
+        // closure-levels set, OR (l-1,l-1) is", so the order is a function of
+        // closure_levels_unresolved, not of closure size. Closure points get
+        // resolved as the level loop processes earlier levels (their IBP pass
+        // writes rules at closure pivots), which can flip an entire closure
+        // level out of the unresolved set EVEN WHEN THE CLOSURE SIZE IS
+        // UNCHANGED (no fresh in-sector monoms entered the closure -- e.g. the
+        // rule's terms were all lower-sector, virtual, or already in closure).
+        // The iter #45 trigger misses these flips and keeps using a stale,
+        // wider relevance set, so a level whose only "relevant" tag was a
+        // now-resolved (l-1,l-1) master-confirming closure stays floated in the
+        // sum-class head instead of being deferred to the tail where `done` can
+        // finalize past it. Tracking the SET (not just its size: composition
+        // can change in either direction as new unresolved closure points
+        // appear at NEW levels while existing ones get resolved at OLDER
+        // levels) and re-sorting on any change captures these flips. Strict
+        // shrink-or-equal of the relevance predicate (closure_levels_unresolved
+        // is the only iter #118/#119 input), so the iter #44/#45/#118/#119
+        // standing soundness argument applies bit-for-bit: ascending-sum
+        // primary key is still preserved, `done` walks the actual chain so any
+        // mis-deferred level is at worst delayed, and master marking on
+        // (l-1,l-1) two sums below stays settled. `used` can only drop relative
+        // to iter #119.
+        set<pair<unsigned int, unsigned int>> reseg_prev_levels_unresolved = closure_levels_unresolved;
+
+        // Finer-grained early termination: process levels one (p,m) at a time
+        // in LevelSmaller (ascending-sum) order, checking the `done`
+        // resolvability condition after each single level, instead of batching
+        // all levels of equal total degree before the first check. Because
+        // mark_master_integrals(pos,neg) only inspects points at level
+        // (pos-1,neg-1) -- two sums below, already fully settled -- the
+        // per-level master marking and reduction results are identical to the
+        // batched version; the only difference is that `done` (monotonic, and
+        // verified by actual resolvability of every needed integral) can fire
+        // mid-batch, letting us skip the equations of later same-sum levels
+        // that vanilla would have generated needlessly. Same answer, fewer
+        // IBP equations consumed.
+        while (itr != levels_ordered.end()) {
+            // iter #45: re-derive the needed-target downward closure over every
+            // rule written so far (seeding + all previously worked levels) and
+            // re-stable_sort the still-UNWORKED suffix [itr,end) with the SAME
+            // comparator iter #44 used: ascending total-degree (p+n) as the strict
+            // PRIMARY key, closure-relevant-first as the within-class tiebreak.
+            // Only the suffix is touched, so the worked prefix [begin,itr) -- and
+            // thus everything the bulk lower-level marking loop walks (begin()->
+            // current_level over the prefix) -- is left exactly as processed.
+            //
+            // Sound by the very argument iter #44 was validated on, which already
+            // permits an ARBITRARY within-sum-class order (iter #44 placed
+            // first<pos_pref and first>=pos_pref levels of one sum in relevance
+            // order and gated 100% validity on full train): re-sorting never
+            // disturbs the ascending-sum cross-class order, so master marking
+            // (which inspects (p-1,n-1), two sums below and already settled) stays
+            // sound; `levels_ordered.back()` is still reached iff current_level is
+            // the final unworked element (suffix front == suffix back); and the
+            // `done` check walks the ACTUAL written chains, so a level the closure
+            // float orders late merely delays finalize -- it can never drop a needed
+            // reduction. The grown closure is a superset of iter #44's initial one,
+            // so the relevant-set only ever expands => relevant levels float no
+            // LATER than the static order => `done` fires no later => `used` can
+            // only drop relative to iter #44.
+            {
+                set<Point> reseg_pts;
+                vector<Point> cwork(needed_in_this_sector.begin(), needed_in_this_sector.end());
+                while (!cwork.empty()) {
+                    Point q = cwork.back();
+                    cwork.pop_back();
+                    if (!reseg_pts.insert(q).second)
+                        continue;
+                    vector<Point> monoms = p_get_monoms(q);
+                    for (const auto &monom : monoms) {
+                        if (monom.SectorNumber() == sector_number)
+                            cwork.push_back(monom);
+                    }
+                }
+                // iter #157: matching BFS-distance signal for the dynamic
+                // re-sort (mirrors the initial-sort site at ~functions.cpp:
+                // 1881). Built off the freshly walked reseg_pts so the
+                // per-level depth tracks the GROWING closure.
+                map<Point, unsigned int> reseg_point_dist_from_target;
+                {
+                    vector<Point> bfs;
+                    size_t head = 0;
+                    for (const auto &t : needed_in_this_sector) {
+                        if (reseg_point_dist_from_target.emplace(t, 0u).second) {
+                            bfs.push_back(t);
+                        }
+                    }
+                    while (head < bfs.size()) {
+                        Point q = bfs[head++];
+                        unsigned int d = reseg_point_dist_from_target[q];
+                        vector<Point> monoms = p_get_monoms(q);
+                        for (const auto &monom : monoms) {
+                            if (monom.SectorNumber() != sector_number)
+                                continue;
+                            if (reseg_point_dist_from_target.emplace(monom, d + 1u).second) {
+                                bfs.push_back(monom);
+                            }
+                        }
+                    }
+                }
+                // iter #121: compute the iter #118/#119 unresolved-closure-level
+                // set up front so we can trigger a re-sort whenever EITHER the
+                // closure grew (iter #45 trigger) OR the unresolved-levels set
+                // changed (iter #121: a closure point at level L got resolved
+                // by a recently processed level's IBP pass, flipping L out of
+                // the unresolved set without growing the closure).
+                set<pair<unsigned int, unsigned int>> reseg_levels_unresolved;
+                map<pair<unsigned int, unsigned int>, size_t> reseg_unresolved_count;
+                // iter #157: per-level max BFS-distance for the dynamic re-sort.
+                map<pair<unsigned int, unsigned int>, unsigned int> reseg_unresolved_max_dist;
+                for (const auto &cp : reseg_pts) {
+                    if (p_is_empty(cp)) {
+                        auto lv = level(cp.GetVector());
+                        reseg_levels_unresolved.insert(lv);
+                        reseg_unresolved_count[lv]++;
+                        auto dit = reseg_point_dist_from_target.find(cp);
+                        if (dit != reseg_point_dist_from_target.end()) {
+                            auto &dref = reseg_unresolved_max_dist[lv];
+                            if (dit->second > dref)
+                                dref = dit->second;
+                        }
+                    }
+                }
+                // iter #166: matching reverse-DAG fan-in for the dynamic
+                // re-sort. Per-root BFS, accumulating distinct root-target
+                // count per unresolved-closure level. Sits at sum K+1 in
+                // composition with the existing 5-key stack.
+                map<pair<unsigned int, unsigned int>, size_t> reseg_reverse_fanin;
+                {
+                    for (const auto &root : needed_in_this_sector) {
+                        set<Point> visited;
+                        set<pair<unsigned int, unsigned int>> reached_unresolved_levels;
+                        vector<Point> work;
+                        work.push_back(root);
+                        while (!work.empty()) {
+                            Point q = work.back();
+                            work.pop_back();
+                            if (!visited.insert(q).second)
+                                continue;
+                            if (p_is_empty(q)) {
+                                reached_unresolved_levels.insert(level(q.GetVector()));
+                            }
+                            vector<Point> monoms = p_get_monoms(q);
+                            for (const auto &monom : monoms) {
+                                if (monom.SectorNumber() == sector_number)
+                                    work.push_back(monom);
+                            }
+                        }
+                        for (const auto &lv : reached_unresolved_levels) {
+                            ++reseg_reverse_fanin[lv];
+                        }
+                    }
+                }
+                if (reseg_pts.size() > reseg_prev_closure ||
+                    reseg_levels_unresolved != reseg_prev_levels_unresolved) {
+                    reseg_prev_closure = reseg_pts.size();
+                    reseg_prev_levels_unresolved = reseg_levels_unresolved;
+                    // iter #119: same direct-clause tightening as the iter #44
+                    // initial sort, applied to the iter #45 dynamic re-
+                    // segregation. The closure walk has just been re-derived
+                    // over the rules written so far (seeding + every level the
+                    // outer loop has processed in this pass), so many closure
+                    // points carry rules now -- including some at brand-new
+                    // closure levels (the closure GROWS as IBPs land lower
+                    // in-sector terms in their rules). A level l whose ONLY
+                    // role in this growing closure is a settled-direct
+                    // membership tag (every closure point at l already has
+                    // p_get_monoms non-empty: either a reduced rule or a
+                    // mark_master self-rule) is one where processing l within
+                    // its sum class K = l.first+l.second adds no new closure
+                    // pivot at l; forward-sub in work_with_equation will erase
+                    // an l-top before write, and any higher-sum-class pivot l
+                    // could write is equally written by the higher level's own
+                    // pass under_levels emits. So defer l within K, past the
+                    // same-K closure levels with at least one still-empty
+                    // member, to let `done` finalize earlier. The closure
+                    // strictly GROWS each time this re-sort fires, so a level
+                    // that was tail-deferred at this snapshot can be re-floated
+                    // by a later re-sort if a fresh unresolved closure point
+                    // lands at l. Strict shrink vs iter #118 (the relevance
+                    // set only ever loses members); the ascending-sum primary
+                    // key is preserved (so master marking on (l-1,l-1) two
+                    // sums below stays sound) and `done` walks the actual
+                    // chain (so a mis-deferred level is at worst delayed, not
+                    // dropped).
+                    // iter #121: reseg_levels_unresolved is now computed above
+                    // (it's also the iter #121 trigger input), so we can reuse
+                    // it here without a second pass over reseg_pts.
+                    // iter #126: same 3-valued relevance score as the iter
+                    // #44 initial sort (see rationale at line ~1898). Score 2
+                    // = doubly-relevant (closure pivot at l AND master-confirm
+                    // at (l-1,l-1)), 1 = singly-relevant, 0 = irrelevant.
+                    // Within each sum class the dynamic re-sort orders by
+                    // descending score, so doubly-relevant levels run first,
+                    // singly-relevant next, irrelevant last (preserving iter
+                    // #119's partition). Sound by the same standing argument:
+                    // strict within-sum-class reorder, ascending-sum primary
+                    // key untouched, `done` walks actual chains.
+                    // iter #127: same combined stack as the iter #44 initial
+                    // sort (see rationale at line ~1898). 3-valued score
+                    // (iter #126) is the within-sum-class secondary; count-
+                    // based unresolved weight (iter #122) is the tertiary.
+                    // pos==0 gap fix in both score and weight.
+                    auto reseg_relevance_score = [&reseg_levels_unresolved](
+                                                     const pair<unsigned int, unsigned int> &l) -> unsigned int {
+                        unsigned int score = 0; // 0, 1 or 2: one point per membership hit below
+                        if (reseg_levels_unresolved.count(l)) // l itself is listed as unresolved
+                            ++score;
+                        if (l.second >= 1) { // l.second - 1 is unsigned: no companion lookup at 0
+                            unsigned int mp = (l.first >= 1) ? (l.first - 1) : 0; // first - 1, clamped at 0
+                            if (reseg_levels_unresolved.count(make_pair(mp, l.second - 1))) // companion level
+                                ++score;
+                        }
+                        return score;
+                    };
+                    auto reseg_unresolved_weight = [&reseg_unresolved_count](
+                                                       const pair<unsigned int, unsigned int> &l) -> size_t {
+                        size_t c = 0; // sum of the counts stored for l and for its companion level
+                        auto it = reseg_unresolved_count.find(l);
+                        if (it != reseg_unresolved_count.end()) // a level with no entry adds nothing
+                            c += it->second;
+                        if (l.second >= 1) { // l.second - 1 is unsigned: no companion lookup at 0
+                            unsigned int mp = (l.first >= 1) ? (l.first - 1) : 0; // first - 1, clamped at 0
+                            auto it2 = reseg_unresolved_count.find(make_pair(mp, l.second - 1));
+                            if (it2 != reseg_unresolved_count.end())
+                                c += it2->second;
+                        }
+                        return c;
+                    };
+                    // iter #130: matching forward-dependent quaternary for the
+                    // dynamic re-sort (see rationale at the initial sort,
+                    // ~functions.cpp:1959). Read-only lookups in reseg_unresolved_count
+                    // (the map the tertiary weight also reads) at (first+1, second+1)
+                    // and, only when first == 0, at (0, second+1); the hits are summed.
+                    auto reseg_forward_count = [&reseg_unresolved_count](
+                                                   const pair<unsigned int, unsigned int> &l) -> size_t {
+                        size_t c = 0; // stays 0 when neither forward level has an entry
+                        auto it = reseg_unresolved_count.find(make_pair(l.first + 1, l.second + 1));
+                        if (it != reseg_unresolved_count.end())
+                            c += it->second;
+                        if (l.first == 0) { // second forward site exists only for first == 0 levels
+                            auto it2 = reseg_unresolved_count.find(make_pair(0, l.second + 1));
+                            if (it2 != reseg_unresolved_count.end())
+                                c += it2->second;
+                        }
+                        return c;
+                    };
+                    // iter #157: matching DAG-depth quinary for the dynamic re-sort,
+                    // compared ASCENDING (shallow first); rationale at the initial sort,
+                    // ~functions.cpp:2069. Result: larger stored distance of l and its companion.
+                    auto reseg_dag_depth = [&reseg_unresolved_max_dist](
+                                               const pair<unsigned int, unsigned int> &l) -> unsigned int {
+                        unsigned int d = 0;
+                        bool seen = false; // becomes true once either lookup finds an entry
+                        auto it = reseg_unresolved_max_dist.find(l);
+                        if (it != reseg_unresolved_max_dist.end()) {
+                            d = it->second;
+                            seen = true;
+                        }
+                        if (l.second >= 1) { // l.second - 1 is unsigned: no companion lookup at 0
+                            unsigned int mp = (l.first >= 1) ? (l.first - 1) : 0; // first - 1, clamped at 0
+                            auto it2 = reseg_unresolved_max_dist.find(make_pair(mp, l.second - 1));
+                            if (it2 != reseg_unresolved_max_dist.end()) {
+                                if (!seen || it2->second > d) // keep the larger of the two distances
+                                    d = it2->second;
+                                seen = true;
+                            }
+                        }
+                        return seen ? d : std::numeric_limits<unsigned int>::max(); // no entry: largest key
+                    };
+                    // iter #166: matching reverse-DAG fan-in for the dynamic
+                    // re-sort. Same shape as the score/weight lookups: the entry for l
+                    // plus the entry for (first - 1 clamped at 0, second - 1), summed.
+                    auto reseg_fanin_get = [&reseg_reverse_fanin](
+                                               const pair<unsigned int, unsigned int> &l) -> size_t {
+                        size_t c = 0; // stays 0 when neither site has an entry
+                        auto it = reseg_reverse_fanin.find(l);
+                        if (it != reseg_reverse_fanin.end())
+                            c += it->second;
+                        if (l.second >= 1) { // l.second - 1 is unsigned: no companion lookup at 0
+                            unsigned int mp = (l.first >= 1) ? (l.first - 1) : 0; // first - 1, clamped at 0
+                            auto it2 = reseg_reverse_fanin.find(make_pair(mp, l.second - 1));
+                            if (it2 != reseg_reverse_fanin.end())
+                                c += it2->second;
+                        }
+                        return c;
+                    };
+                    std::stable_sort(itr, levels_ordered.end(), // only [itr, end) moves; full ties keep order
+                                     [&reseg_relevance_score, &reseg_unresolved_weight, &reseg_forward_count,
+                                      &reseg_dag_depth, &reseg_fanin_get](
+                                         const pair<unsigned int, unsigned int> &a,
+                                         const pair<unsigned int, unsigned int> &b) -> bool {
+                                         unsigned int sa = a.first + a.second;
+                                         unsigned int sb = b.first + b.second;
+                                         if (sa != sb)
+                                             return sa < sb; // preserve ascending total-degree primary key
+                                         unsigned int sca = reseg_relevance_score(a);
+                                         unsigned int scb = reseg_relevance_score(b);
+                                         if (sca != scb)
+                                             return sca > scb; // iter #126 secondary: 3-valued score, higher first
+                                         size_t wa = reseg_unresolved_weight(a);
+                                         size_t wb = reseg_unresolved_weight(b);
+                                         if (wa != wb)
+                                             return wa > wb; // iter #122 tertiary: unresolved weight, higher first
+                                         size_t fa = reseg_forward_count(a);
+                                         size_t fb = reseg_forward_count(b);
+                                         if (fa != fb)
+                                             return fa > fb; // iter #130 quaternary: forward count, higher first
+                                         unsigned int da = reseg_dag_depth(a);
+                                         unsigned int db = reseg_dag_depth(b);
+                                         if (da != db)
+                                             return da < db; // iter #157 quinary: DAG depth, shallower first
+                                         size_t ra = reseg_fanin_get(a);
+                                         size_t rb = reseg_fanin_get(b);
+                                         return ra > rb; // iter #166 senary: reverse-DAG fan-in, higher first
+                                     });
+                }
             }
 
+            set<pair<unsigned int, unsigned int>> current_levels;
+            current_levels.insert(*itr);
+            ++itr;
+
             // time to check if database reopen is needed
             uint64_t entries = Common::points[sector_number]->count();
             if (entries > (1llu << Common::buckets[sector_number])) {
@@ -1711,11 +2748,70 @@ void forward_stage(unsigned short thread_number, sector_count_t sector_number) {
             if (Common::pos_pref) {
                 for (const auto &current_level : current_levels) {
                     if (Common::pos_pref > 0) {
-                        if ((current_level.first <= static_cast<unsigned int>(abs(Common::pos_pref))) &&
-                            (current_level.second == 1)) {
-                            symmetries +=
-                                write_symmetries(Corner, current_level.first, current_level.second, std::nullopt);
-                        }
+                        // Pure-dot symmetry seeding: vanilla only writes internal
+                        // symmetries at the (<=pos_pref, 1) levels, leaving higher
+                        // dot levels to be reduced by IBP. But write_symmetries on a
+                        // (pos, 1) level also covers the pure-dot (pos, 0) points
+                        // (see level_points_fast fan-out), and every symmetry it
+                        // emits maps a point onto a *lower* canonical reference
+                        // (relations whose highest member isn't p are dropped) using
+                        // p_set -- it never consumes an IBP equation and never marks
+                        // a master. Extending it to *all* pure-dot levels (neg==1,
+                        // any number of dots) pre-resolves more dotted integrals by
+                        // exact symmetry, so the subsequent IBP pass finds them
+                        // already substituted (work_with_equation -> used=false) and
+                        // the `used` step count drops. The master basis is the same
+                        // canonical-reference set vanilla converges to, so the target
+                        // reductions are identical and validity is preserved.
+                        //
+                        // iter #3 extension: also seed the one-numerator levels
+                        // (neg==2). write_symmetries is self-guarding -- it keeps a
+                        // relation only when its highest member is exactly the point
+                        // being resolved AND it maps onto a *strictly lower* canonical
+                        // reference (mon.back()==p check), otherwise the relation is
+                        // dropped (or aborts on a genuinely illegal map). A numerator
+                        // integral that admits such a symmetry is therefore provably
+                        // reducible -- it is never a master in vanilla's basis -- so
+                        // pre-resolving it by exact symmetry front-loads work the IBP
+                        // pass would otherwise spend a pivot on, lowering `used`
+                        // further while leaving the master set (hence validity)
+                        // unchanged.
+                        //
+                        // iter #4 extension: push the seeding to the two-numerator
+                        // levels (neg==3). The self-guard at write_symmetries (a
+                        // relation is kept only when mon.back()==p, i.e. its highest
+                        // member is exactly the point being resolved and it maps to a
+                        // strictly lower canonical reference) is level-agnostic, so the
+                        // same monotone argument holds: any neg==3 point that admits an
+                        // internal symmetry is reducible -- never a master in vanilla's
+                        // basis -- and pre-resolving it by exact symmetry only converts
+                        // would-be `used` IBP pivots into trivial (already-substituted)
+                        // equations. `used` can only drop or stay, and the master set
+                        // (hence validity) is untouched. Bounded by under_levels, so the
+                        // two-numerator levels only fire where the targets actually
+                        // demand them.
+                        //
+                        // iter #5: drop the numerator cap entirely and seed at every
+                        // level the queue produces. The cap was always extrinsic --
+                        // write_symmetries is fully self-guarding and level-agnostic:
+                        // it keeps a relation only when its highest sorted member is
+                        // exactly the point p being resolved AND p maps onto a strictly
+                        // lower canonical reference (the `mon.back()==p` check at the
+                        // bottom of write_symmetries), and it aborts/drops on any zero,
+                        // sector-1, or non-decreasing-level image. Hence at *any* neg a
+                        // point that admits an internal symmetry is provably reducible
+                        // (never a master in vanilla's basis) and pre-resolving it by
+                        // exact symmetry only converts would-be `used` IBP pivots into
+                        // already-substituted points: `used` is monotone non-increasing
+                        // and the master set (hence validity) is untouched. This is the
+                        // complete limit of the iter #2->#4 ramp -- it subsumes neg<=4,
+                        // neg<=5, ... in one shot. The seeding only ever fires on levels
+                        // under_levels actually emits (bounded by the targets), and the
+                        // extra low-cost pre-resolution lets the iter #1 `done`
+                        // early-termination trip sooner, skipping more high-level IBP
+                        // equations vanilla would have generated.
+                        symmetries +=
+                            write_symmetries(Corner, current_level.first, current_level.second, std::nullopt);
                     } else {
                         if ((current_level.first == 1) &&
                             (current_level.second <= static_cast<unsigned int>(abs(Common::pos_pref)))) {
@@ -1736,19 +2832,64 @@ void forward_stage(unsigned short thread_number, sector_count_t sector_number) {
             eqs_number_sector_level = 0;
             used_number_sector_level = 0;
 
+            // iter #6: pre-IBP early-exit probe.
+            //
+            // iter #1 made the `done` resolvability check fire per single level,
+            // but it always runs *after* the current level's IBP pass. iter #5
+            // then uncapped symmetry seeding, so the last still-unresolved needed
+            // target is now sometimes resolved by *this* level's seeding alone --
+            // before its IBP pass would run. In that case the IBP pass only writes
+            // reduction rules for points outside the targets' dependency closure
+            // (pure `used` waste: every needed integral is already resolvable).
+            //
+            // So probe the *same* `done` condition the loop already uses, right
+            // after seeding and before pushing this level's IBP tasks. If it
+            // already holds, skip the IBP pass for this level entirely. Control
+            // still falls through to the unchanged $USED marking, master marking,
+            // and post-pass `done` block, which then finalizes the sector through
+            // the tested termination path. The skip is gated by the identical
+            // resolvability test, and master marking only *adds* relations, so the
+            // post-pass check is monotone: pre-check true => post-pass true, and we
+            // never skip-and-continue (which would leave a level unreduced). If a
+            // needed target at this level is destined to be a master it is not yet
+            // marked, so p_get_monoms is empty, the probe is false, and the normal
+            // IBP pass runs -- the skip is strictly conservative. The master set
+            // (hence validity) is identical to vanilla; `used` can only drop.
+            bool skip_ibp_pass = true;
             {
-                lock_guard<mutex> guard(level_mutex); // we will be putting tasks
-                for (auto level_itr = current_levels.rbegin(); level_itr != current_levels.rend(); ++level_itr) {
-                    level_tasks.push_back(*level_itr);
-                    ++level_tasks_count;
+                set<Point, std::greater<Point>> probe = needed_in_this_sector;
+                for (auto pc = probe.begin(); pc != probe.end(); ++pc) {
+                    vector<Point> monoms = p_get_monoms(*pc);
+                    if (!monoms.empty()) {
+                        for (const auto &monom : monoms) {
+                            if (monom.SectorNumber() == sector_number) {
+                                probe.insert(pc, monom);
+                            }
+                        }
+                    } else if (corner_master_resolved(*pc, sector_number)) {
+                        continue; // iter #56: dim<=3 corner = master, skip the pure-waste IBP pass
+                    } else {
+                        skip_ibp_pass = false;
+                        break;
+                    }
                 }
             }
-            level_cond.notify_all(); // level threads can start
 
-            {
-                unique_lock<mutex> guard(level_mutex);
-                level_done_cond.wait(guard, []() { return level_tasks_count == 0; });
-                // waiting for all work to be done
+            if (!skip_ibp_pass) {
+                {
+                    lock_guard<mutex> guard(level_mutex); // we will be putting tasks
+                    for (auto level_itr = current_levels.rbegin(); level_itr != current_levels.rend(); ++level_itr) {
+                        level_tasks.push_back(*level_itr);
+                        ++level_tasks_count;
+                    }
+                }
+                level_cond.notify_all(); // level threads can start
+
+                {
+                    unique_lock<mutex> guard(level_mutex);
+                    level_done_cond.wait(guard, []() { return level_tasks_count == 0; });
+                    // waiting for all work to be done
+                }
             }
 
             auto stop_time = chrono::steady_clock::now();
@@ -1774,15 +2915,39 @@ void forward_stage(unsigned short thread_number, sector_count_t sector_number) {
                     local_pos_pref = Common::pos_pref;
                 }
                 if ((!marked_lower_levels) && current_level.first < local_pos_pref &&
-                    (current_level != *levels.rbegin())) {
+                    (current_level != levels_ordered.back())) {
                     // do not mark untill reaching pos_pref or last element
                     // there was a condition !Point::preferred[sector_number].empty(), but
                     // it was always true since they were always added may be we wanted
                     // preferred_initial?
-                    mark_master_integrals(Corner, current_level.first, current_level.second, first_pass, true);
+                    //
+                    // iter #292 (Idea 154 — RETRY-PASS only_preferred WIDENING):
+                    // memory iter #167 closed the FIRST-PASS flip (only_preferred
+                    // true→false at this line, bit-identical, cascade at :2873-2882
+                    // catches up in same pass). The retry-pass variant is a distinct
+                    // cadence axis: by retry entry the closure has been re-walked over
+                    // every rule written by pass-1 (seeding + every level the outer
+                    // loop processed), and the per-level p_is_empty / IsPreferred set
+                    // is materially different. Widening only_preferred to FALSE on
+                    // RETRY only (first_pass==false → only_preferred=false) marks
+                    // every empty corner at (current_level.first - 1, current_level
+                    // .second - 1) earlier in the retry-pass level walk, instead of
+                    // waiting for the cascade at the first current_level.first ==
+                    // local_pos_pref to fire only_preferred=false retroactively.
+                    // Sound by the standing mark_master_integrals invariant: the
+                    // function only calls make_master on points whose p_is_empty(p)
+                    // returns true (line :886), so an existing rule is never
+                    // overwritten; the cascade at :2873-2882 also fires
+                    // only_preferred=false on the same lpair, so no NEW masters get
+                    // created relative to the eventual same-retry-pass cascade end-
+                    // state — only TIMING within retry differs. The widening on
+                    // first_pass is preserved bit-identical to vanilla (passing
+                    // only_preferred = first_pass = true at first pass equals the
+                    // prior `true`).
+                    mark_master_integrals(Corner, current_level.first, current_level.second, first_pass, first_pass);
                 } else if ((!marked_lower_levels) &&
-                           ((local_pos_pref == current_level.first) || (current_level == *levels.rbegin()))) {
-                    for (const auto &lpair : levels) {
+                           ((local_pos_pref == current_level.first) || (current_level == levels_ordered.back()))) {
+                    for (const auto &lpair : levels_ordered) {
                         // marking all untill current
                         if (!good_mark)
                             break;
@@ -1817,6 +2982,8 @@ void forward_stage(unsigned short thread_number, sector_count_t sector_number) {
                             ivpl.insert(ivpl_counter, monom);
                         }
                     }
+                } else if (corner_master_resolved(p, sector_number)) {
+                    continue; // iter #56: dim<=3 corner = confirmed master, treat as resolved
                 } else {
                     done = false;
                     break;
@@ -2124,6 +3291,121 @@ void reduce_in_level(Point Corner, vector<ibp_type> ibps, unsigned int thread_nu
                 unsigned int used_number = 0;
                 int print_counter = 0;
 
+                // iter #29: needed-closure-prioritized within-group ordering.
+                // The "resolving zone" is the set of groups whose highest member
+                // is not below the smallest needed target -- only there can the
+                // iter #25 per-pivot early-exit fire (lower groups run in full
+                // because the targets are not yet resolvable). When we enter a
+                // resolving-zone group we re-derive the needed-target closure
+                // over the rules written so far (cheap: the deep chains are
+                // already settled by the lower groups, so this is a shallow walk)
+                // and stable-sort that group's equations so the ones touching the
+                // closure are worked first. See the rationale at the sort below.
+                bool have_min_needed = (level_needed_targets && !level_needed_targets->empty());
+                Point min_needed;
+                if (have_min_needed) {
+                    min_needed = *level_needed_targets->rbegin(); // set<,greater> -> smallest is last
+                }
+
+                // iter #37: cross-group closure segregation (front the groups that
+                // pivot a needed-closure point; defer every off-closure group).
+                //
+                // Equation groups (runs of equal highest member in ibps_vector) are
+                // sorted strictly ascending by highest member, so this level's pivots
+                // are written bottom-up. The iter #25 per-pivot early-exit breaks the
+                // OUTER group loop the moment every needed target in this sector
+                // resolves to a settled chain -- but because the order is ascending,
+                // only the groups ABOVE the resolving moment are skipped. Every
+                // off-closure group whose highest member sorts BELOW the resolving
+                // zone is still pivoted in full: a `used` IBP equation spent on a
+                // point that no needed target's reduction ever references. iter #29
+                // reorders only WITHIN a group; it cannot move those low off-closure
+                // groups out of the way.
+                //
+                // Here we stable_partition the whole level: groups whose pivot point
+                // (the highest member) lies in the needed-target closure are floated
+                // to the front (keeping their ascending, bottom-up order among
+                // themselves -- stable_partition on an already-sorted range), and all
+                // off-closure groups are deferred to the tail (also order-preserved).
+                // Once the closure prefix is worked the chain is settled, so the
+                // early-exit fires at the partition boundary and the entire
+                // off-closure tail -- including the low groups ascending order would
+                // have pivoted before the resolving moment -- is skipped.
+                //
+                // Monotone-safe, by the standing iter #25 invariant. (1) Validity:
+                // the early-exit is self-correcting -- all_needed_resolved walks the
+                // ACTUAL written rules and never fires until every needed target
+                // reduces to masters/resolved points, whatever order the groups ran
+                // in; a closure point misclassified into the tail (its membership only
+                // established by a this-level rule not yet written) simply keeps the
+                // exit from firing until its deferred group runs, so no needed
+                // reduction is ever left dangling on an unreduced point. (2) `used`:
+                // each point that gets pivoted gets exactly one rule regardless of the
+                // group order (substituting a higher member first just exposes the
+                // same lower pivots later), so reordering cannot raise the pivot count
+                // of the fully-worked set; the only net effect is the early-exit
+                // skipping the off-closure tail, which can only lower `used`. (3) The
+                // master set is fixed by mark_master_integrals / irreducibility, not by
+                // elimination order, so it is untouched. Guarded off the hint-writing
+                // path (a hint file must stay in canonical ascending order) and to the
+                // non-empty needed-target case (the active reduction path).
+                if (!hint_local && have_min_needed) {
+                    set<Point> needed_closure;
+                    vector<Point> work(level_needed_targets->begin(), level_needed_targets->end());
+                    while (!work.empty()) {
+                        Point q = work.back();
+                        work.pop_back();
+                        if (!needed_closure.insert(q).second)
+                            continue;
+                        vector<Point> monoms = p_get_monoms(q);
+                        for (const auto &monom : monoms) {
+                            if (monom.SectorNumber() == sector_number)
+                                work.push_back(monom);
+                        }
+                    }
+                    std::stable_partition(
+                        ibps_vector.begin(), ibps_vector.end(),
+                        [&needed_closure](const pair<Point, pair<FastPoint, unsigned short>> &e) -> bool {
+                            return needed_closure.count(e.first) != 0;
+                        });
+                }
+
+                // iter #39: multi-wave closure re-segregation (deepen iter #37).
+                // The iter #37 partition runs ONCE, before any of this level's rules
+                // exist, so its closure is only {needed targets} plus same-sector
+                // points already reachable through the deeper (lower-level) rules
+                // written in prior level-loop iterations. As we work the floated
+                // closure prefix, each group writes a reduction rule whose monomials
+                // are lower same-sector points -- so the needed-target closure GROWS,
+                // revealing tail groups that the per-pivot early-exit is in fact still
+                // waiting on but that the one-shot partition left parked behind the
+                // truly off-closure groups (ascending order interleaves them). Below,
+                // at every group boundary while still inside the closure region, we
+                // re-derive the closure over the rules written so far and
+                // stable_partition the still-UNWORKED suffix [idx, end) closure-first.
+                // This floats each freshly-revealed wave of needed groups ahead of the
+                // off-closure mass, so the early-exit fires after skipping a strictly
+                // longer off-closure tail. It self-limits: the instant a re-partition
+                // leaves the suffix front off-closure, no unworked group is in the
+                // (now frozen -- working off-closure pivots cannot add needed-closure
+                // members) closure, so re-segregation deactivates and the remainder is
+                // processed linearly under the standing early-exit.
+                //
+                // Monotone-safe by the iter #25/#37 invariant. (1) We only ever
+                // reorder the UNWORKED suffix [idx, end); already-pivoted groups are
+                // never touched, and stable_partition rearranges in place so the
+                // current iterator keeps pointing at index idx. (2) Reordering cannot
+                // change the pivoted set (one rule per pivoted point regardless of
+                // order) nor the master basis (fixed by mark_master_integrals /
+                // irreducibility), so validity is preserved and `used` of the
+                // fully-worked set is unchanged. (3) The early-exit (all_needed_resolved)
+                // is self-correcting: it never fires until every needed target reduces
+                // to a settled chain whatever order the groups ran in, so a group
+                // mis-parked in the tail merely delays the exit, never dangles a needed
+                // reduction. Net effect is the exit skipping more off-closure pivots ->
+                // `used` can only drop. Off the hint path, non-empty needed targets.
+                bool reseg_active = (!hint_local && have_min_needed);
+
                 for (vector<pair<Point, pair<FastPoint, unsigned short>>>::const_iterator ibps_itr =
                          ibps_vector.begin();
                      ibps_itr != ibps_vector.end(); ++ibps_itr) {
@@ -2136,6 +3418,34 @@ void reduce_in_level(Point Corner, vector<ibp_type> ibps, unsigned int thread_nu
                             }
                         }
                     }
+
+                    // iter #39: re-segregate the unworked suffix closure-first using
+                    // the rules written so far (see the rationale above the loop).
+                    if (reseg_active) {
+                        set<Point> reseg_closure;
+                        vector<Point> rwork(level_needed_targets->begin(), level_needed_targets->end());
+                        while (!rwork.empty()) {
+                            Point q = rwork.back();
+                            rwork.pop_back();
+                            if (!reseg_closure.insert(q).second)
+                                continue;
+                            vector<Point> monoms = p_get_monoms(q);
+                            for (const auto &monom : monoms) {
+                                if (monom.SectorNumber() == sector_number)
+                                    rwork.push_back(monom);
+                            }
+                        }
+                        size_t idx = static_cast<size_t>(ibps_itr - ibps_vector.begin());
+                        std::stable_partition(
+                            ibps_vector.begin() + idx, ibps_vector.end(),
+                            [&reseg_closure](const pair<Point, pair<FastPoint, unsigned short>> &e) -> bool {
+                                return reseg_closure.count(e.first) != 0;
+                            });
+                        if (reseg_closure.count(ibps_itr->first) == 0) {
+                            reseg_active = false; // suffix front off-closure: closure frozen
+                        }
+                    }
+
                     auto itr2 = ibps_itr;
                     int k;
                     int write;
@@ -2192,6 +3502,94 @@ void reduce_in_level(Point Corner, vector<ibp_type> ibps, unsigned int thread_nu
                         return (j != 0);
                     });
 
+                    // iter #29: needed-closure-prioritized reorder inside the
+                    // resolving zone. A group writes a chain of pivots (the rank
+                    // profile of its equations modulo the rules already in the DB);
+                    // that set of pivoted points is invariant to the order the
+                    // equations are worked in, so this reorder can neither change
+                    // which points get reduced (validity is preserved) nor raise
+                    // `used` above a full pass. Its ONLY effect is to move the
+                    // moment the iter #25 per-pivot early-exit (all_needed_resolved)
+                    // fires. The exit can only trim the tail of the resolving group;
+                    // by working the equations that carry a still-open needed-target
+                    // chain point first, every needed target reaches a settled chain
+                    // after fewer pivots, so the break skips a longer tail of
+                    // not-yet-pivoted, off-closure points. Equations touching no
+                    // closure point keep their original (length-based) relative order
+                    // (stable_sort), so the chosen pivots stay as well-conditioned as
+                    // before. Gated to the resolving zone (highest member >= smallest
+                    // needed target) and to multi-equation groups, so the closure walk
+                    // runs only for the handful of top groups where the exit can fire;
+                    // the deep chains are already settled by the lower groups, making
+                    // each walk shallow. Off the hint path (level_needed_targets is
+                    // only published for the non-hint reduction).
+                    if (have_min_needed && write > 1 && !(ibps_itr->first < min_needed)) {
+                        set<Point, std::greater<Point>> closure = *level_needed_targets;
+                        for (auto cit = closure.begin(); cit != closure.end(); ++cit) {
+                            vector<Point> monoms = p_get_monoms(*cit);
+                            for (const auto &monom : monoms) {
+                                if (monom.SectorNumber() == sector_number) {
+                                    closure.insert(cit, monom);
+                                }
+                            }
+                        }
+                        auto touches = [&closure](const Equation &e) -> bool {
+                            for (const auto &t : e.terms) {
+                                if (closure.count(t.first)) {
+                                    return true;
+                                }
+                            }
+                            return false;
+                        };
+                        std::stable_sort(eqs.begin(), eqs.begin() + write,
+                                         [&touches](const Equation &lhs, const Equation &rhs) -> bool {
+                                             return touches(lhs) && !touches(rhs);
+                                         });
+                    }
+
+                    // iter #25: within-GROUP early-exit (finer than iter #24's
+                    // group-boundary probe).
+                    //
+                    // Equation groups (same highest member) are processed in
+                    // ascending highest-member order, so this level's pivots are
+                    // written bottom-up. iter #24 re-probed the resolvability
+                    // condition only at GROUP boundaries -- after a whole group of
+                    // same-highest-member equations had been worked. But within a
+                    // single group the equations write more than one pivot: the
+                    // first equation reduces the shared highest member, and each
+                    // later equation then has that member substituted away, so its
+                    // own (now strictly-lower) highest member can take a fresh
+                    // pivot. Once every needed target in this sector already reduces
+                    // to a settled chain, ALL remaining pivots -- the rest of this
+                    // group AND every higher group -- are for points OUTSIDE every
+                    // needed target's dependency closure: pure `used` waste. Vanilla
+                    // (the GT) runs the whole final level's IBP pass to completion;
+                    // re-probing after EACH new pivot and breaking immediately trims
+                    // the tail of the resolving group as well, dropping `used`
+                    // strictly further than the group-boundary check with the same
+                    // answer.
+                    //
+                    // This is the exact resolvability condition the iter #6 pre-IBP
+                    // probe uses, just re-evaluated per-pivot. If a needed target on
+                    // this level is destined to be a master, mark_master_integrals
+                    // has not run yet, so p_get_monoms is empty, the probe is false,
+                    // and the pass continues -- strictly conservative. Master
+                    // marking (which the main loop runs *after* this worker returns)
+                    // only adds resolutions, so worker-resolved => post-pass `done`
+                    // holds: we only ever break-and-finalize, never
+                    // break-and-continue, so no level is left half-reduced on the
+                    // path to a higher one. Points the skipped equations would have
+                    // reduced are by definition outside every needed chain; whether
+                    // they end up reduced or master-marked never touches a requested
+                    // reduction, so the master set (hence validity) stays consistent
+                    // and `used` can only drop. Re-probed only right after a new
+                    // pivot (used flipped). Cheap in the common case:
+                    // all_needed_resolved walks `needed` in descending order and
+                    // returns false at the first still-unresolved (highest) target,
+                    // so the deep chain walk runs only at the resolving moment.
+                    // Guarded off the hint-writing path so a hint file is never
+                    // truncated mid-group.
+                    bool early_exit = false;
                     for (k = 0; k != write; ++k) { // cycle of same starting point
                         bool used = work_with_equation(eqs[k], thread_number, sector_number);
                         if (used) {
@@ -2207,8 +3605,16 @@ void reduce_in_level(Point Corner, vector<ibp_type> ibps, unsigned int thread_nu
                                 out << int(p.buf[Common::dimension - 1]) << "}" << "," << i << "}";
                             }
                             ++used_number;
+                            if (!hint_local && level_needed_targets &&
+                                all_needed_resolved(*level_needed_targets, sector_number)) {
+                                early_exit = true;
+                                break;
+                            }
                         }
                     }
+                    if (early_exit) {
+                        break;
+                    }
                     ibps_itr = itr2;
                     --ibps_itr;
                 } // Equation cycle