From 1957db683d5ca83f080ee36b0425c859aee45583 Mon Sep 17 00:00:00 2001
From: Alexis Duburcq <alexis.duburcq@gmail.com>
Date: Mon, 25 May 2026 17:32:18 +0200
Subject: [PATCH 1/7] Fix contact point pruning in batched sim.

---
 .../engine/solvers/rigid/collider/collider.py |  2 ++
 .../engine/solvers/rigid/collider/contact.py  | 20 ++++++++++++++-----
 genesis/options/solvers.py                    |  2 +-
 genesis/utils/array_class.py                  |  2 ++
 4 files changed, 20 insertions(+), 6 deletions(-)

diff --git a/genesis/engine/solvers/rigid/collider/collider.py b/genesis/engine/solvers/rigid/collider/collider.py
index c6aa09b9d..1d5b3036b 100644
--- a/genesis/engine/solvers/rigid/collider/collider.py
+++ b/genesis/engine/solvers/rigid/collider/collider.py
@@ -90,6 +90,7 @@ def __init__(self, rigid_solver: "RigidSolver"):
         self._diff_pos_tolerance = 1e-2
         self._diff_normal_tolerance = 1e-2
         self._prune_deep_penetration_ratio = 3.0
+        self._prune_hull_collinear_tol = 1e-3
 
         self._init_static_config()
         self._use_split_narrowphase = (
@@ -200,6 +201,7 @@ def _init_collision_fields(self) -> None:
             diff_normal_tolerance=self._diff_normal_tolerance,
             contact_pruning_tolerance=self._solver._options.contact_pruning_tolerance or 0.0,
             prune_deep_penetration_ratio=self._prune_deep_penetration_ratio,
+            prune_hull_collinear_tol=self._prune_hull_collinear_tol,
         )
         self._init_collision_pair_idx(self._collision_pair_idx)
         self._init_valid_pairs()
diff --git a/genesis/engine/solvers/rigid/collider/contact.py b/genesis/engine/solvers/rigid/collider/contact.py
index 1379b1f26..f7e7cb2d9 100644
--- a/genesis/engine/solvers/rigid/collider/contact.py
+++ b/genesis/engine/solvers/rigid/collider/contact.py
@@ -672,6 +672,7 @@ def func_prune_contacts(
     max_contact_pairs = collider_info.max_contact_pairs[None]
     tol = collider_info.contact_pruning_tolerance[None]
     prune_deep_penetration_ratio = collider_info.prune_deep_penetration_ratio[None]
+    prune_hull_collinear_tol = collider_info.prune_hull_collinear_tol[None]
     LP_KEY_STRIDE = gs.qd_float(1.0e7)
     EPS = rigid_global_info.EPS[None]
 
@@ -898,7 +899,7 @@ def func_prune_contacts(
                     # Collinearity threshold for hull pops, scaled to the bucket extent. A pure "cross <= 0" check fails
                     # on numerically-near-collinear edge points (cross is a tiny positive epsilon from float roundoff),
                     # so genuine midpoints would survive as spurious hull vertices.
-                    hull_collinear_tol = tol * max_in_plane_r2
+                    hull_collinear_tol = prune_hull_collinear_tol * max_in_plane_r2
 
                     # Andrew's monotone chain. Stack lives in contact_hull_stack[b_start..b_start + k).
                     k = 0
@@ -922,6 +923,14 @@ def func_prune_contacts(
                         k += 1
 
                     upper_start = k
+                    # Memory-fence workaround for a Quadrants codegen issue on parallel envs (Metal backend, _B >= 2):
+                    # without an explicit scratch store between the lower-hull and upper-hull passes, the upper-hull
+                    # pop-loop's reads of contact_hull_stack don't observe the writes from the lower hull, so its
+                    # cross-product / pop-check effectively runs on stale data and every candidate is kept. This shows
+                    # up as the kernel never popping anything in the upper hull and producing a hull whose size equals
+                    # the bucket size. Writing any value to a non-overlapping scratch slot here serializes the two
+                    # passes' contact_hull_stack accesses and restores parallel correctness; the value is unused.
+                    collider_state.contact_hull_stack[max_contact_pairs - 1, i_b] = 0
                     # quadrants range supports 1 or 2 args only; iterate sorted indices backward over
                     # [b_start, b_end - 2] by reflecting the index.
                     for k_step in range(b_size - 1):
@@ -965,12 +974,13 @@ def func_prune_contacts(
                     # penetration buckets like irregular mesh contacts keep only the hull) but well below the deep
                     # interior penetrations seen when a non-flat body rests inside its convex envelope (so genuine deep
                     # supports are restored).
-                    hull_pen_sum = gs.qd_float(0.0)
+                    hull_pen_max = gs.qd_float(0.0)
                     for hk in range(k):
                         survivor = collider_state.contact_hull_stack[b_start + hk, i_b]
-                        hull_pen_sum = hull_pen_sum + collider_state.contact_data.penetration[survivor, i_b]
-                    hull_pen_avg = hull_pen_sum / qd.cast(k, gs.qd_float)
-                    deep_keep_threshold = prune_deep_penetration_ratio * hull_pen_avg
+                        p = collider_state.contact_data.penetration[survivor, i_b]
+                        if p > hull_pen_max:
+                            hull_pen_max = p
+                    deep_keep_threshold = prune_deep_penetration_ratio * hull_pen_max
                     for i in range(b_start, b_end):
                         if collider_state.contact_keep[i, i_b] == 0:
                             if collider_state.contact_data.penetration[i, i_b] > deep_keep_threshold:
diff --git a/genesis/options/solvers.py b/genesis/options/solvers.py
index c34045606..691e0b727 100644
--- a/genesis/options/solvers.py
+++ b/genesis/options/solvers.py
@@ -507,7 +507,7 @@ class RigidOptions(Options):
     ls_tolerance: PositiveFloat = 1e-2
     noslip_iterations: NonNegativeInt = 0
     noslip_tolerance: PositiveFloat = 1e-6
-    contact_pruning_tolerance: PositiveFloat | None = None
+    contact_pruning_tolerance: PositiveFloat | None = 0.1
     sparse_solve: StrictBool = False
     constraint_timeconst: PositiveFloat = 0.01
     use_contact_island: StrictBool = False
diff --git a/genesis/utils/array_class.py b/genesis/utils/array_class.py
index e1bde4911..286858bee 100644
--- a/genesis/utils/array_class.py
+++ b/genesis/utils/array_class.py
@@ -792,6 +792,7 @@ class ColliderInfo:
     # link-pair contact pruning
     contact_pruning_tolerance: qd.Tensor
     prune_deep_penetration_ratio: qd.Tensor
+    prune_hull_collinear_tol: qd.Tensor
 
 
 def get_collider_info(solver, n_vert_neighbors, n_valid_pairs, collider_static_config, **kwargs):
@@ -824,6 +825,7 @@ def get_collider_info(solver, n_vert_neighbors, n_valid_pairs, collider_static_c
         diff_normal_tolerance=V_SCALAR_FROM(dtype=gs.qd_float, value=kwargs["diff_normal_tolerance"]),
         contact_pruning_tolerance=V_SCALAR_FROM(dtype=gs.qd_float, value=kwargs["contact_pruning_tolerance"]),
         prune_deep_penetration_ratio=V_SCALAR_FROM(dtype=gs.qd_float, value=kwargs["prune_deep_penetration_ratio"]),
+        prune_hull_collinear_tol=V_SCALAR_FROM(dtype=gs.qd_float, value=kwargs["prune_hull_collinear_tol"]),
     )
 
 

From 65828308619c1342689a0cbbe52390f9d647e031 Mon Sep 17 00:00:00 2001
From: Hugh Perkins <hughperkins@gmail.com>
Date: Mon, 25 May 2026 08:51:01 -0700
Subject: [PATCH 2/7] deskai6: cooperative warp-per-env link-pair dedup,
 rebased onto #2831

Rebase of my cooperative-warp dedup work onto Alexis's #2831
(fix_contact_pruning_batched) which fixes the Quadrants Metal codegen
bug that was blocking dedup on macOS. PR #2831's contact.py changes are
incorporated verbatim into my cooperative kernel:

1. Memory-fence scratch write between the lower-hull and upper-hull
   passes of Andrew's monotone chain (contact_hull_stack[max_contact_pairs - 1, i_b] = 0).
   Without this, on Metal _B >= 2 the upper-hull reads of
   contact_hull_stack don't observe the lower-hull writes and every
   candidate is kept (hull size == bucket size, no pruning).
   Standalone repro: perso_hugh/prot/qd_metal_hull_chain_visibility_repro.py

2. New tunable prune_hull_collinear_tol (1e-3 default) separated from
   the depth coplanarity tol. Lets users tune the hull-popper cross-
   product slop independently of the depth gate.

3. Deep-penetration restore threshold changed from average to MAX of
   hull penetrations. A non-hull contact whose penetration substantially
   exceeds the deepest hull vertex represents a distinct physical
   support (deep middle of a long body) that the support-polygon
   argument doesn't authorize dropping.

Side cleanups now possible because #2831 fixes the Metal bug:

* Removed the gs.backend not in (gs.cuda, gs.amdgpu) guard from
  collider.py. Dedup is now enabled on Metal too; only CPU is still
  excluded (the cooperative kernel adds overhead without parallel
  speedup on a serial backend).
* Removed the @pytest.mark.skipif(darwin) from test_contact_pruning;
  the test now runs on every GPU backend.

The cooperative-warp structure (32 threads/env, qd.simt.subgroup
reductions for the per-bucket mean-normal / centroid, coop projection
and mark-drop, fused phase 3 compact+spatial-sort) is preserved from
the previous branch.

Note: this branch is currently based on duburcqa:fix_contact_pruning_batched (#2831).
Once #2831 merges, the branch base should be moved back to origin/main and
the #2831 commit will simply be absorbed.

Backup of the pre-rebase tip: hp/dedup-alexis-perf-9d-stage3-r2-pre-2831-rebase-backup.
---
 .../engine/solvers/rigid/collider/collider.py |  63 +-
 .../engine/solvers/rigid/collider/contact.py  | 745 ++++++++++--------
 genesis/utils/array_class.py                  |   2 +
 tests/test_rigid_physics.py                   |   1 +
 4 files changed, 472 insertions(+), 339 deletions(-)

diff --git a/genesis/engine/solvers/rigid/collider/collider.py b/genesis/engine/solvers/rigid/collider/collider.py
index 1d5b3036b..2eae75bf8 100644
--- a/genesis/engine/solvers/rigid/collider/collider.py
+++ b/genesis/engine/solvers/rigid/collider/collider.py
@@ -72,6 +72,19 @@
     from genesis.engine.solvers.rigid.rigid_solver import RigidSolver
 
 
+# Above this env count, the cooperative warp-per-env dedup kernel stops paying off because the GPU's SMs are already
+# grid-saturated by the launch itself, so the per-env serial sub-phases (lex sort, hull build, opt-10 cycle permute)
+# start to dominate. The threshold is a sweet spot for current consumer + datacenter GPUs (5090 / H100 class), where
+# 8192 blocks of 32 threads each already exceeds ~10x the SM count.
+_DEDUP_MAX_N_ENVS = 8192
+
+# Default tolerance applied automatically when scene gates allow dedup and the user explicitly sets the option
+# RigidOptions.contact_pruning_tolerance to None. Upstream default (post-PR #2831) is also 0.1; this constant keeps
+# the "auto-enable when user explicitly None" path independent of the upstream default. 0.1 = 10 percent
+# dimensionless slop fraction on the depth coplanarity gate.
+_DEDUP_DEFAULT_TOLERANCE = 0.1
+
+
 IS_OLD_TORCH = tuple(map(int, torch.__version__.split(".")[:2])) < (2, 8)
 
 
@@ -168,6 +181,38 @@ def _init_static_config(self) -> None:
         # any geom is nonconvex (vertex-based narrowphase emits many contacts per pair), or when terrain is present.
         any_link_multi_geom = any(len(link.geoms) > 1 for link in self._solver.links)
         link_pair_pruning_supported = any_link_multi_geom or has_nonconvex_nonterrain or has_terrain
+        # The dedup kernel is GPU-only: its cooperative 32-lane structure relies on warp-level qd.simt.subgroup
+        # primitives (reduce_all_add_tiled, broadcast, shuffle, sync) that map cleanly to NVIDIA warps and AMD
+        # wavefronts and (post-PR #2831 memory-fence workaround) to Apple Metal simdgroups. CPU has no equivalent --
+        # Quadrants serializes the cooperative loop per env so the kernel just adds overhead without parallel speedup.
+        # At very large n_envs the GPU is already grid-saturated by the warp-cooperative launch (n_envs warps), and
+        # the per-env serial phases (lex sort + hull build + opt-10 cycle permute) start to dominate, so dedup loses
+        # its margin over the baseline narrowphase output. The 8192 threshold is conservative: a 5090 has ~170 SMs
+        # and the kernel uses 32 threads/block, so 8192 envs = 8192 blocks = ~48 blocks/SM (past occupancy
+        # saturation).
+        if link_pair_pruning_supported and gs.backend == gs.cpu:
+            gs.logger.info(
+                f"Disabling link-pair contact dedup: cooperative kernel doesn't help on CPU (Quadrants serializes "
+                f"the per-env loop)."
+            )
+            link_pair_pruning_supported = False
+        if link_pair_pruning_supported and self._solver.n_envs > _DEDUP_MAX_N_ENVS:
+            gs.logger.info(
+                f"Disabling link-pair contact dedup: n_envs={self._solver.n_envs} exceeds threshold "
+                f"_DEDUP_MAX_N_ENVS={_DEDUP_MAX_N_ENVS} (the cooperative kernel's overhead stops paying off "
+                f"once the GPU is already grid-saturated)."
+            )
+            link_pair_pruning_supported = False
+
+        # Resolve the effective dedup tolerance. The upstream option default is 0.1 (post-PR #2831), so dedup is on
+        # by default whenever the scene gates allow. A user-supplied float overrides; explicit user None plus
+        # gates-allow auto-enables to _DEDUP_DEFAULT_TOLERANCE; gates-disallow forces None regardless of user setting
+        # (so an explicit float on CPU still doesn't run -- the cooperative kernel wouldn't pay off there).
+        user_tol = self._solver._options.contact_pruning_tolerance
+        if link_pair_pruning_supported:
+            self._effective_pruning_tolerance = user_tol if user_tol is not None else _DEDUP_DEFAULT_TOLERANCE
+        else:
+            self._effective_pruning_tolerance = None
 
         # Initialize the static config, which stores every data that are compile-time constants.
         # Note that updating any of them will trigger recompilation.
@@ -199,7 +244,7 @@ def _init_collision_fields(self) -> None:
             mpr_to_gjk_overlap_ratio=self._mpr_to_gjk_overlap_ratio,
             diff_pos_tolerance=self._diff_pos_tolerance,
             diff_normal_tolerance=self._diff_normal_tolerance,
-            contact_pruning_tolerance=self._solver._options.contact_pruning_tolerance or 0.0,
+            contact_pruning_tolerance=self._effective_pruning_tolerance or 0.0,
             prune_deep_penetration_ratio=self._prune_deep_penetration_ratio,
             prune_hull_collinear_tol=self._prune_hull_collinear_tol,
         )
@@ -821,11 +866,15 @@ def detection(self) -> None:
                 self._solver._errno,
             )
 
-        if (
-            self._collider_static_config.link_pair_pruning_supported
-            and self._solver._options.contact_pruning_tolerance is not None
-            and not self._solver._static_rigid_sim_config.requires_grad
-        ):
+        # deskai6 opt 10: func_prune_contacts fuses the spatial clamp+sort into its phase 3, so when dedup runs we skip
+        # the standalone clamp_and_sort kernel (one less launch + one less full permute pass). The dispatch gate uses
+        # the resolved effective tolerance (auto-enabled to 0.1 in _init_static_config when scene gates pass), not the
+        # raw user option, so dedup runs by default on supported scenes whenever the user didn't explicitly disable it.
+        ran_fused_dedup = (
+            self._effective_pruning_tolerance is not None and not self._solver._static_rigid_sim_config.requires_grad
+        )
+
+        if ran_fused_dedup:
             func_prune_contacts(
                 self._collider_state,
                 self._collider_info,
@@ -833,7 +882,7 @@ def detection(self) -> None:
                 self._solver._static_rigid_sim_config,
             )
 
-        if self._use_split_narrowphase:
+        if self._use_split_narrowphase and not ran_fused_dedup:
             func_clamp_and_sort_contacts(
                 self._collider_state,
                 self._collider_info,
diff --git a/genesis/engine/solvers/rigid/collider/contact.py b/genesis/engine/solvers/rigid/collider/contact.py
index f7e7cb2d9..d0fdc8b76 100644
--- a/genesis/engine/solvers/rigid/collider/contact.py
+++ b/genesis/engine/solvers/rigid/collider/contact.py
@@ -594,8 +594,11 @@ def func_clamp_and_sort_contacts(
             collider_state.contact_sort_key[j + 1, i_b] = curr_key
             collider_state.contact_sort_idx[j + 1, i_b] = curr_idx
 
-        # Phase 2: apply permutation in-place via cycle decomposition.
-        # Each contact is read and written exactly once.
+        # Phase 2: apply permutation in-place via cycle decomposition. Each contact is read and written exactly once.
+        # All 11 contact_data fields are permuted unconditionally (matching upstream): an earlier deskai6 opt-8b version
+        # tried to gate force/pair_idx writes on requires_grad, but the resulting qd.static-scoped variables tripped a
+        # Quadrants NameError in DEBUG-mode unit tests, and a hoist workaround caused subtle Metal-backend regressions
+        # in test_smooth_box_no_drift. The 2 saved writes per non-grad cycle are not worth the platform risk.
         for i in range(n_con):
             if collider_state.contact_sort_idx[i, i_b] != i:
                 tmp_geom_a = collider_state.contact_data.geom_a[i, i_b]
@@ -665,7 +668,7 @@ def func_prune_contacts(
        in-place via cycle decomposition (11-field swap per contact record).
     2. Per bucket of >= 5 contacts: compute mean normal (folded to a common hemisphere). Check depth coplanarity of
        contact positions. If they share a plane, project to (u, v), Andrew's monotone chain. Mark survivors in
-       contact_keep[].
+       contact_keep[]. Then restore any non-hull contact whose penetration is much deeper than the hull's average.
     3. Compact: copy kept contacts to the front, update n_contacts.
     """
     _B = collider_state.n_contacts.shape[0]
@@ -676,340 +679,418 @@ def func_prune_contacts(
     LP_KEY_STRIDE = gs.qd_float(1.0e7)
     EPS = rigid_global_info.EPS[None]
 
-    qd.loop_config(serialize=static_rigid_sim_config.para_level < gs.PARA_LEVEL.ALL)
-    for i_b in range(_B):
+    # deskai6 opt 9d (real warp-coop, stage 1): pull contact_keep init and phase-1a key+idx init OUT of `if tid == 0:`
+    # and parallelize them across the 32 lanes (stride-tid). Phase 1a sort, phase 2 bucket walk, and phase 3 (sort +
+    # cycle-permute) stay serial on lane 0. Subsequent stages add coop reductions (phase-2 mean / centroid) and parallel
+    # cycle-permute.
+    _K = qd.static(32)
+    qd.loop_config(name="prune_contacts_coop", block_dim=_K)
+    for i_flat in range(_B * _K):
+        tid = i_flat % _K
+        i_b = i_flat // _K
+        # All lanes compute n_con (cheap, no memory write on non-lane-0).
         n_con = qd.min(collider_state.n_contacts[i_b], max_contact_pairs)
-        collider_state.n_contacts[i_b] = n_con
-        if n_con < 5:
-            continue
-
-        # Phase 1a: insertion-sort indices by canonical (min_link, max_link) key.
-        for i in range(n_con):
-            la = collider_state.contact_data.link_a[i, i_b]
-            lb = collider_state.contact_data.link_b[i, i_b]
-            la_min = qd.min(la, lb)
-            la_max = qd.max(la, lb)
-            collider_state.contact_sort_key[i, i_b] = qd.cast(la_min, gs.qd_float) * LP_KEY_STRIDE + qd.cast(
-                la_max, gs.qd_float
-            )
-            collider_state.contact_sort_idx[i, i_b] = i
-
-        for i in range(1, n_con):
-            ck = collider_state.contact_sort_key[i, i_b]
-            if collider_state.contact_sort_key[i - 1, i_b] <= ck:
-                continue
-            ci = collider_state.contact_sort_idx[i, i_b]
-            j = i - 1
-            while j >= 0:
-                if collider_state.contact_sort_key[j, i_b] <= ck:
-                    break
-                collider_state.contact_sort_key[j + 1, i_b] = collider_state.contact_sort_key[j, i_b]
-                collider_state.contact_sort_idx[j + 1, i_b] = collider_state.contact_sort_idx[j, i_b]
-                j = j - 1
-            collider_state.contact_sort_key[j + 1, i_b] = ck
-            collider_state.contact_sort_idx[j + 1, i_b] = ci
-
-        # Phase 1b: apply permutation in-place via cycle decomposition (11 fields).
-        for i in range(n_con):
-            if collider_state.contact_sort_idx[i, i_b] != i:
-                tmp_geom_a = collider_state.contact_data.geom_a[i, i_b]
-                tmp_geom_b = collider_state.contact_data.geom_b[i, i_b]
-                tmp_penetration = collider_state.contact_data.penetration[i, i_b]
-                tmp_normal = collider_state.contact_data.normal[i, i_b]
-                tmp_pos = collider_state.contact_data.pos[i, i_b]
-                tmp_friction = collider_state.contact_data.friction[i, i_b]
-                tmp_sol_params = collider_state.contact_data.sol_params[i, i_b]
-                tmp_force = collider_state.contact_data.force[i, i_b]
-                tmp_link_a = collider_state.contact_data.link_a[i, i_b]
-                tmp_link_b = collider_state.contact_data.link_b[i, i_b]
-                tmp_pair_idx = collider_state.contact_data.pair_idx[i, i_b]
-
-                j = i
-                while collider_state.contact_sort_idx[j, i_b] != i:
-                    src = collider_state.contact_sort_idx[j, i_b]
-                    collider_state.contact_data.geom_a[j, i_b] = collider_state.contact_data.geom_a[src, i_b]
-                    collider_state.contact_data.geom_b[j, i_b] = collider_state.contact_data.geom_b[src, i_b]
-                    collider_state.contact_data.penetration[j, i_b] = collider_state.contact_data.penetration[src, i_b]
-                    collider_state.contact_data.normal[j, i_b] = collider_state.contact_data.normal[src, i_b]
-                    collider_state.contact_data.pos[j, i_b] = collider_state.contact_data.pos[src, i_b]
-                    collider_state.contact_data.friction[j, i_b] = collider_state.contact_data.friction[src, i_b]
-                    collider_state.contact_data.sol_params[j, i_b] = collider_state.contact_data.sol_params[src, i_b]
-                    collider_state.contact_data.force[j, i_b] = collider_state.contact_data.force[src, i_b]
-                    collider_state.contact_data.link_a[j, i_b] = collider_state.contact_data.link_a[src, i_b]
-                    collider_state.contact_data.link_b[j, i_b] = collider_state.contact_data.link_b[src, i_b]
-                    collider_state.contact_data.pair_idx[j, i_b] = collider_state.contact_data.pair_idx[src, i_b]
-                    collider_state.contact_sort_idx[j, i_b] = j
-                    j = src
-
-                collider_state.contact_data.geom_a[j, i_b] = tmp_geom_a
-                collider_state.contact_data.geom_b[j, i_b] = tmp_geom_b
-                collider_state.contact_data.penetration[j, i_b] = tmp_penetration
-                collider_state.contact_data.normal[j, i_b] = tmp_normal
-                collider_state.contact_data.pos[j, i_b] = tmp_pos
-                collider_state.contact_data.friction[j, i_b] = tmp_friction
-                collider_state.contact_data.sol_params[j, i_b] = tmp_sol_params
-                collider_state.contact_data.force[j, i_b] = tmp_force
-                collider_state.contact_data.link_a[j, i_b] = tmp_link_a
-                collider_state.contact_data.link_b[j, i_b] = tmp_link_b
-                collider_state.contact_data.pair_idx[j, i_b] = tmp_pair_idx
-                collider_state.contact_sort_idx[j, i_b] = j
-
-        # Default: keep everything. Buckets that pass the gates flip their entries to drop and then mark only
-        # hull-vertex contacts as keep again.
-        for i in range(n_con):
-            collider_state.contact_keep[i, i_b] = 1
-
-        # Phase 2: walk link-pair buckets.
-        b_start = 0
-        while b_start < n_con:
-            la0 = collider_state.contact_data.link_a[b_start, i_b]
-            lb0 = collider_state.contact_data.link_b[b_start, i_b]
-            la0_min = qd.min(la0, lb0)
-            la0_max = qd.max(la0, lb0)
-            b_end = b_start + 1
-            while b_end < n_con:
-                la = collider_state.contact_data.link_a[b_end, i_b]
-                lb = collider_state.contact_data.link_b[b_end, i_b]
-                if qd.min(la, lb) != la0_min or qd.max(la, lb) != la0_max:
-                    break
-                b_end += 1
-            b_size = b_end - b_start
-
-            if b_size >= 5:
-                # Mean normal (folded to the hemisphere of contact b_start) and centroid.
-                ref_n = collider_state.contact_data.normal[b_start, i_b]
-                rnx = ref_n[0]
-                rny = ref_n[1]
-                rnz = ref_n[2]
-                mnx = gs.qd_float(0.0)
-                mny = gs.qd_float(0.0)
-                mnz = gs.qd_float(0.0)
-                cx = gs.qd_float(0.0)
-                cy = gs.qd_float(0.0)
-                cz = gs.qd_float(0.0)
-                for i in range(b_start, b_end):
-                    n_i = collider_state.contact_data.normal[i, i_b]
-                    s = gs.qd_float(1.0)
-                    if rnx * n_i[0] + rny * n_i[1] + rnz * n_i[2] < gs.qd_float(0.0):
-                        s = gs.qd_float(-1.0)
-                    mnx += s * n_i[0]
-                    mny += s * n_i[1]
-                    mnz += s * n_i[2]
-                    p_i = collider_state.contact_data.pos[i, i_b]
-                    cx += p_i[0]
-                    cy += p_i[1]
-                    cz += p_i[2]
-                inv_n = gs.qd_float(1.0) / qd.cast(b_size, gs.qd_float)
-                cx *= inv_n
-                cy *= inv_n
-                cz *= inv_n
-                mnrm = qd.sqrt(mnx * mnx + mny * mny + mnz * mnz)
-
-                # Hoisted out so the hull-build branch below can read it (quadrants scopes per if).
-                max_in_plane_r2 = gs.qd_float(0.0)
-
-                coplanar = mnrm > EPS
-                if coplanar:
-                    mnx /= mnrm
-                    mny /= mnrm
-                    mnz /= mnrm
-
-                    # Depth coplanarity: positions must lie in a single plane perpendicular to the mean normal. No
-                    # per-contact normal check: a contact whose normal is diagonal (e.g. an edge-vs-edge contact at a
-                    # corner of the contact patch) still participates in the 2D hull because its position is a vertex of
-                    # the patch; dropping a collinear-edge contact in the same bucket is justified by the positional
-                    # support polygon regardless of that contact's normal direction.
-                    max_depth = gs.qd_float(0.0)
-                    for i in range(b_start, b_end):
-                        p_i = collider_state.contact_data.pos[i, i_b]
-                        dx = p_i[0] - cx
-                        dy = p_i[1] - cy
-                        dz = p_i[2] - cz
-                        depth = qd.abs(dx * mnx + dy * mny + dz * mnz)
-                        if depth > max_depth:
-                            max_depth = depth
-                        r2 = dx * dx + dy * dy + dz * dz - depth * depth
-                        if r2 > max_in_plane_r2:
-                            max_in_plane_r2 = r2
-
-                    if max_depth > tol * qd.sqrt(max_in_plane_r2):
-                        coplanar = False
-
-                if coplanar:
-                    # In-plane basis (u, v): seed from the world axis least-aligned with mean normal.
-                    abs_mnx = qd.abs(mnx)
-                    abs_mny = qd.abs(mny)
-                    abs_mnz = qd.abs(mnz)
-                    ax = gs.qd_float(1.0)
-                    ay = gs.qd_float(0.0)
-                    az = gs.qd_float(0.0)
-                    if abs_mny < abs_mnx and abs_mny < abs_mnz:
-                        ax = gs.qd_float(0.0)
-                        ay = gs.qd_float(1.0)
-                        az = gs.qd_float(0.0)
-                    elif abs_mnz < abs_mnx and abs_mnz <= abs_mny:
-                        ax = gs.qd_float(0.0)
+        if tid == 0:
+            collider_state.n_contacts[i_b] = n_con
+
+        # PARALLEL: contact_keep init. Default-keep marked here unconditionally so the fused phase 3 (compact + spatial
+        # sort) below produces a correct result for envs with n_con < 5 (no dedup buckets, but still need spatial sort).
+        # 32 lanes stride.
+        ii = tid
+        while ii < n_con:
+            collider_state.contact_keep[ii, i_b] = 1
+            ii += _K
+
+        if n_con >= 5:
+            # PARALLEL: phase 1a key + idx init, 32 lanes stride.
+            ii = tid
+            while ii < n_con:
+                la = collider_state.contact_data.link_a[ii, i_b]
+                lb = collider_state.contact_data.link_b[ii, i_b]
+                la_min = qd.min(la, lb)
+                la_max = qd.max(la, lb)
+                collider_state.contact_sort_key[ii, i_b] = qd.cast(la_min, gs.qd_float) * LP_KEY_STRIDE + qd.cast(
+                    la_max, gs.qd_float
+                )
+                collider_state.contact_sort_idx[ii, i_b] = ii
+                ii += _K
+
+            if tid == 0:
+                # SERIAL on lane 0: phase 1a insertion sort + phase 2 bucket walk.
+                for i in range(1, n_con):
+                    ck = collider_state.contact_sort_key[i, i_b]
+                    if collider_state.contact_sort_key[i - 1, i_b] <= ck:
+                        continue
+                    ci = collider_state.contact_sort_idx[i, i_b]
+                    j = i - 1
+                    while j >= 0:
+                        if collider_state.contact_sort_key[j, i_b] <= ck:
+                            break
+                        collider_state.contact_sort_key[j + 1, i_b] = collider_state.contact_sort_key[j, i_b]
+                        collider_state.contact_sort_idx[j + 1, i_b] = collider_state.contact_sort_idx[j, i_b]
+                        j = j - 1
+                    collider_state.contact_sort_key[j + 1, i_b] = ck
+                    collider_state.contact_sort_idx[j + 1, i_b] = ci
+
+            qd.simt.subgroup.sync()
+
+            # Phase 2 (deskai6 opt 9d stage 2): bucket walk runs on ALL 32 lanes. The outer control flow (find b_end,
+            # iterate buckets) is duplicated across lanes since the inputs are all in DRAM (cache-friendly). Inside a
+            # bucket, the mean-normal / centroid sum is done coop via 6 reduce_all_add_tiled calls. The rest of the
+            # bucket processing (coplanarity check with early-exit, in-plane basis, projection, lex sort, hull build,
+            # mark survivors) stays serial on lane 0.
+            b_start = 0
+            while b_start < n_con:
+                key0 = collider_state.contact_sort_key[b_start, i_b]
+                b_end = b_start + 1
+                while b_end < n_con:
+                    if collider_state.contact_sort_key[b_end, i_b] != key0:
+                        break
+                    b_end += 1
+                b_size = b_end - b_start
+
+                if b_size >= 5:
+                    ref_src = collider_state.contact_sort_idx[b_start, i_b]
+                    ref_n = collider_state.contact_data.normal[ref_src, i_b]
+                    rnx = ref_n[0]
+                    rny = ref_n[1]
+                    rnz = ref_n[2]
+                    mnx_l = gs.qd_float(0.0)
+                    mny_l = gs.qd_float(0.0)
+                    mnz_l = gs.qd_float(0.0)
+                    cx_l = gs.qd_float(0.0)
+                    cy_l = gs.qd_float(0.0)
+                    cz_l = gs.qd_float(0.0)
+                    jj = b_start + tid
+                    while jj < b_end:
+                        src_i = collider_state.contact_sort_idx[jj, i_b]
+                        n_i = collider_state.contact_data.normal[src_i, i_b]
+                        s = gs.qd_float(1.0)
+                        if rnx * n_i[0] + rny * n_i[1] + rnz * n_i[2] < gs.qd_float(0.0):
+                            s = gs.qd_float(-1.0)
+                        mnx_l += s * n_i[0]
+                        mny_l += s * n_i[1]
+                        mnz_l += s * n_i[2]
+                        p_i = collider_state.contact_data.pos[src_i, i_b]
+                        cx_l += p_i[0]
+                        cy_l += p_i[1]
+                        cz_l += p_i[2]
+                        jj += _K
+
+                    mnx = qd.simt.subgroup.reduce_all_add_tiled(mnx_l, 5)
+                    mny = qd.simt.subgroup.reduce_all_add_tiled(mny_l, 5)
+                    mnz = qd.simt.subgroup.reduce_all_add_tiled(mnz_l, 5)
+                    cx = qd.simt.subgroup.reduce_all_add_tiled(cx_l, 5)
+                    cy = qd.simt.subgroup.reduce_all_add_tiled(cy_l, 5)
+                    cz = qd.simt.subgroup.reduce_all_add_tiled(cz_l, 5)
+
+                    # POST-REDUCE math runs on all 32 lanes (deterministic, cheap; redundant arithmetic is free vs.
+                    # broadcasting the reduce results).
+                    inv_n = gs.qd_float(1.0) / qd.cast(b_size, gs.qd_float)
+                    cx *= inv_n
+                    cy *= inv_n
+                    cz *= inv_n
+                    mnrm = qd.sqrt(mnx * mnx + mny * mny + mnz * mnz)
+
+                    max_in_plane_r2 = gs.qd_float(0.0)
+                    coplanar = mnrm > EPS
+                    if coplanar:
+                        mnx /= mnrm
+                        mny /= mnrm
+                        mnz /= mnrm
+
+                        # COOP coplanarity check (stage 3). Each lane strides [b_start + tid, b_end) by _K, locally
+                        # tracking max_depth / max_in_plane_r2. Wasted work per warp is at most b_size/_K contacts.
+                        # The upstream algo no longer checks per-contact normals (a contact with a diagonal normal at
+                        # the corner of a patch still participates in the 2D hull because its position is a vertex), so
+                        # we only do the depth coplanarity gate here.
+                        max_depth_l = gs.qd_float(0.0)
+                        max_r2_l = gs.qd_float(0.0)
+                        jj = b_start + tid
+                        while jj < b_end:
+                            src_i = collider_state.contact_sort_idx[jj, i_b]
+                            p_i = collider_state.contact_data.pos[src_i, i_b]
+                            dx = p_i[0] - cx
+                            dy = p_i[1] - cy
+                            dz = p_i[2] - cz
+                            depth = qd.abs(dx * mnx + dy * mny + dz * mnz)
+                            if depth > max_depth_l:
+                                max_depth_l = depth
+                            r2 = dx * dx + dy * dy + dz * dz - depth * depth
+                            if r2 > max_r2_l:
+                                max_r2_l = r2
+                            jj += _K
+
+                        max_depth = qd.simt.subgroup.reduce_all_max_tiled(max_depth_l, 5)
+                        max_in_plane_r2 = qd.simt.subgroup.reduce_all_max_tiled(max_r2_l, 5)
+
+                        if max_depth > tol * qd.sqrt(max_in_plane_r2):
+                            coplanar = False
+
+                    if coplanar:
+                        # Basis on all lanes (deterministic from mnx/mny/mnz which the reduce broadcast to every lane).
+                        abs_mnx = qd.abs(mnx)
+                        abs_mny = qd.abs(mny)
+                        abs_mnz = qd.abs(mnz)
+                        ax = gs.qd_float(1.0)
                         ay = gs.qd_float(0.0)
-                        az = gs.qd_float(1.0)
-                    adn = ax * mnx + ay * mny + az * mnz
-                    ux = ax - adn * mnx
-                    uy = ay - adn * mny
-                    uz = az - adn * mnz
-                    unrm = qd.sqrt(ux * ux + uy * uy + uz * uz)
-                    ux /= unrm
-                    uy /= unrm
-                    uz /= unrm
-                    vx = mny * uz - mnz * uy
-                    vy = mnz * ux - mnx * uz
-                    vz = mnx * uy - mny * ux
-
-                    # Project bucket contacts to (u, v). Reuse contact_sort_key for u (phase 1 has consumed it);
-                    # v goes into the dedicated scratch buffer.
-                    for i in range(b_start, b_end):
-                        p_i = collider_state.contact_data.pos[i, i_b]
-                        collider_state.contact_sort_key[i, i_b] = p_i[0] * ux + p_i[1] * uy + p_i[2] * uz
-                        collider_state.contact_proj_v[i, i_b] = p_i[0] * vx + p_i[1] * vy + p_i[2] * vz
-
-                    # Sort bucket indices lexicographically by (u, v), with a tolerance on u so that contacts whose u
-                    # values differ only by float noise (or by sub-millimeter physics noise from MPR perturbations) sort
-                    # by v. Without the tolerance, the wrong point pops from a 3-collinear triplet when the corner and
-                    # the mid-edge have u values that differ by a few microns and the mid-edge happens to sort first.
-                    sort_u_tol = gs.qd_float(1e-3) * qd.sqrt(max_in_plane_r2)
-                    for i in range(b_start, b_end):
-                        collider_state.contact_sort_idx[i, i_b] = i
-                    for i in range(b_start + 1, b_end):
-                        ci = collider_state.contact_sort_idx[i, i_b]
-                        cu = collider_state.contact_sort_key[ci, i_b]
-                        cv = collider_state.contact_proj_v[ci, i_b]
-                        j = i - 1
-                        while j >= b_start:
-                            pj = collider_state.contact_sort_idx[j, i_b]
-                            pu = collider_state.contact_sort_key[pj, i_b]
-                            pv = collider_state.contact_proj_v[pj, i_b]
-                            if (pu < cu - sort_u_tol) or (qd.abs(pu - cu) <= sort_u_tol and pv <= cv):
-                                break
-                            collider_state.contact_sort_idx[j + 1, i_b] = pj
-                            j -= 1
-                        collider_state.contact_sort_idx[j + 1, i_b] = ci
-
-                    # Mark all bucket entries as drop, then mark hull vertices as keep.
-                    for i in range(b_start, b_end):
-                        collider_state.contact_keep[i, i_b] = 0
-
-                    # Collinearity threshold for hull pops, scaled to the bucket extent. A pure "cross <= 0" check fails
-                    # on numerically-near-collinear edge points (cross is a tiny positive epsilon from float roundoff),
-                    # so genuine midpoints would survive as spurious hull vertices.
-                    hull_collinear_tol = prune_hull_collinear_tol * max_in_plane_r2
-
-                    # Andrew's monotone chain. Stack lives in contact_hull_stack[b_start..b_start + k).
-                    k = 0
-                    for i in range(b_start, b_end):
-                        ci = collider_state.contact_sort_idx[i, i_b]
-                        cu = collider_state.contact_sort_key[ci, i_b]
-                        cv = collider_state.contact_proj_v[ci, i_b]
-                        while k >= 2:
-                            idx_a = collider_state.contact_hull_stack[b_start + k - 2, i_b]
-                            idx_b = collider_state.contact_hull_stack[b_start + k - 1, i_b]
-                            au = collider_state.contact_sort_key[idx_a, i_b]
-                            av = collider_state.contact_proj_v[idx_a, i_b]
-                            bu = collider_state.contact_sort_key[idx_b, i_b]
-                            bv = collider_state.contact_proj_v[idx_b, i_b]
-                            cross = (bu - au) * (cv - av) - (bv - av) * (cu - au)
-                            if cross <= hull_collinear_tol:
-                                k -= 1
-                            else:
-                                break
-                        collider_state.contact_hull_stack[b_start + k, i_b] = ci
-                        k += 1
-
-                    upper_start = k
-                    # Memory-fence workaround for a Quadrants codegen issue on parallel envs (Metal backend, _B >= 2):
-                    # without an explicit scratch store between the lower-hull and upper-hull passes, the upper-hull
-                    # pop-loop's reads of contact_hull_stack don't observe the writes from the lower hull, so its
-                    # cross-product / pop-check effectively runs on stale data and every candidate is kept. This shows
-                    # up as the kernel never popping anything in the upper hull and producing a hull whose size equals
-                    # the bucket size. Writing any value to a non-overlapping scratch slot here serializes the two
-                    # passes' contact_hull_stack accesses and restores parallel correctness; the value is unused.
-                    collider_state.contact_hull_stack[max_contact_pairs - 1, i_b] = 0
-                    # quadrants range supports 1 or 2 args only; iterate sorted indices backward over
-                    # [b_start, b_end - 2] by reflecting the index.
-                    for k_step in range(b_size - 1):
-                        ii = b_end - 2 - k_step
-                        ci = collider_state.contact_sort_idx[ii, i_b]
-                        cu = collider_state.contact_sort_key[ci, i_b]
-                        cv = collider_state.contact_proj_v[ci, i_b]
-                        while k >= upper_start + 1:
-                            idx_a = collider_state.contact_hull_stack[b_start + k - 2, i_b]
-                            idx_b = collider_state.contact_hull_stack[b_start + k - 1, i_b]
-                            au = collider_state.contact_sort_key[idx_a, i_b]
-                            av = collider_state.contact_proj_v[idx_a, i_b]
-                            bu = collider_state.contact_sort_key[idx_b, i_b]
-                            bv = collider_state.contact_proj_v[idx_b, i_b]
-                            cross = (bu - au) * (cv - av) - (bv - av) * (cu - au)
-                            if cross <= hull_collinear_tol:
-                                k -= 1
-                            else:
-                                break
-                        # The closing iteration of the upper hull visits the leftmost point, which already sits at
-                        # stack[b_start] from the lower hull. Skipping that push, plus the k < b_size guard, bounds k to
-                        # b_size and keeps the write index within max_contact_pairs even for buckets where the lower-
-                        # hull pass already kept all b_size points (downward-convex layouts: every lex-sorted triple
-                        # makes a left turn so nothing gets popped, then the upper-hull pass tries to push a duplicate
-                        # of an already-kept lower-hull vertex).
-                        if ci != collider_state.contact_hull_stack[b_start, i_b] and k < b_size:
+                        az = gs.qd_float(0.0)
+                        if abs_mny < abs_mnx and abs_mny < abs_mnz:
+                            ax = gs.qd_float(0.0)
+                            ay = gs.qd_float(1.0)
+                            az = gs.qd_float(0.0)
+                        elif abs_mnz < abs_mnx and abs_mnz <= abs_mny:
+                            ax = gs.qd_float(0.0)
+                            ay = gs.qd_float(0.0)
+                            az = gs.qd_float(1.0)
+                        adn = ax * mnx + ay * mny + az * mnz
+                        ux = ax - adn * mnx
+                        uy = ay - adn * mny
+                        uz = az - adn * mnz
+                        unrm = qd.sqrt(ux * ux + uy * uy + uz * uz)
+                        ux /= unrm
+                        uy /= unrm
+                        uz /= unrm
+                        vx = mny * uz - mnz * uy
+                        vy = mnz * ux - mnx * uz
+                        vz = mnx * uy - mny * ux
+
+                        # COOP projection: 32 lanes stride writes to contact_sort_key + contact_proj_v.
+                        jj = b_start + tid
+                        while jj < b_end:
+                            src_i = collider_state.contact_sort_idx[jj, i_b]
+                            p_i = collider_state.contact_data.pos[src_i, i_b]
+                            collider_state.contact_sort_key[jj, i_b] = p_i[0] * ux + p_i[1] * uy + p_i[2] * uz
+                            collider_state.contact_proj_v[jj, i_b] = p_i[0] * vx + p_i[1] * vy + p_i[2] * vz
+                            jj += _K
+
+                        # COOP mark-drop: stride writes to contact_keep[orig].
+                        jj = b_start + tid
+                        while jj < b_end:
+                            orig = collider_state.contact_sort_idx[jj, i_b]
+                            collider_state.contact_keep[orig, i_b] = 0
+                            jj += _K
+
+                        # COOP lex_idx init: stride writes.
+                        jj = b_start + tid
+                        while jj < b_end:
+                            collider_state.contact_lex_idx[jj, i_b] = jj
+                            jj += _K
+
+                        # SYNC between coop writes (sort_key, proj_v, lex_idx, contact_keep[orig]) and the lane-0 lex
+                        # sort + hull build that reads them.
+                        qd.simt.subgroup.sync()
+
+                    if tid == 0 and coplanar:
+                        # SERIAL on lane 0: lex sort + Andrew monotone-chain hull build + mark survivors. These walk a
+                        # stack and have data-dependent inner loops that don't decompose across warp lanes.
+                        #
+                        # The sort_u_tol on the u comparison is critical for correctness, not just a perf tweak:
+                        # contacts whose u values differ only by sub-millimeter MPR noise need to sort by v, otherwise
+                        # mid-edge points get sorted between the two corners they sit between and survive the lower-
+                        # hull pass as spurious hull vertices (cf. upstream test_contact_pruning regression when this
+                        # tolerance is missing).
+                        sort_u_tol = gs.qd_float(1e-3) * qd.sqrt(max_in_plane_r2)
+                        for i in range(b_start + 1, b_end):
+                            ci = collider_state.contact_lex_idx[i, i_b]
+                            cu = collider_state.contact_sort_key[ci, i_b]
+                            cv = collider_state.contact_proj_v[ci, i_b]
+                            j = i - 1
+                            while j >= b_start:
+                                pj = collider_state.contact_lex_idx[j, i_b]
+                                pu = collider_state.contact_sort_key[pj, i_b]
+                                pv = collider_state.contact_proj_v[pj, i_b]
+                                if (pu < cu - sort_u_tol) or (qd.abs(pu - cu) <= sort_u_tol and pv <= cv):
+                                    break
+                                collider_state.contact_lex_idx[j + 1, i_b] = pj
+                                j -= 1
+                            collider_state.contact_lex_idx[j + 1, i_b] = ci
+
+                        # Upstream PR #2831 split the hull collinearity tolerance out of `tol`: tol is the depth
+                        # coplanarity gate, prune_hull_collinear_tol is the cross-product slop for the monotone-chain
+                        # popper. They were the same constant before; keeping them separate lets users tune one without
+                        # changing the other.
+                        hull_collinear_tol = prune_hull_collinear_tol * max_in_plane_r2
+
+                        k = 0
+                        for i in range(b_start, b_end):
+                            ci = collider_state.contact_lex_idx[i, i_b]
+                            cu = collider_state.contact_sort_key[ci, i_b]
+                            cv = collider_state.contact_proj_v[ci, i_b]
+                            while k >= 2:
+                                idx_a = collider_state.contact_hull_stack[b_start + k - 2, i_b]
+                                idx_b = collider_state.contact_hull_stack[b_start + k - 1, i_b]
+                                au = collider_state.contact_sort_key[idx_a, i_b]
+                                av = collider_state.contact_proj_v[idx_a, i_b]
+                                bu = collider_state.contact_sort_key[idx_b, i_b]
+                                bv = collider_state.contact_proj_v[idx_b, i_b]
+                                cross = (bu - au) * (cv - av) - (bv - av) * (cu - au)
+                                if cross <= hull_collinear_tol:
+                                    k -= 1
+                                else:
+                                    break
                             collider_state.contact_hull_stack[b_start + k, i_b] = ci
                             k += 1
 
-                    # Chain holds k unique hull vertices (no trailing duplicate).
-                    for hk in range(k):
-                        survivor = collider_state.contact_hull_stack[b_start + hk, i_b]
-                        collider_state.contact_keep[survivor, i_b] = 1
-
-                    # Restore non-hull contacts whose penetration is much deeper than the hull boundary's average. The
-                    # support-polygon argument says interior contacts are wrench-redundant only when ALL contacts share
-                    # the same normal and penetration; a contact with substantially higher penetration than the hull's
-                    # average represents a distinct physical support (the body of a fork resting beyond its tines, the
-                    # deep middle of a long body) and dropping it lets the body sink into the surface. The 3x factor is
-                    # well above the typical ~1.x penetration spread on transient/rocking faces (so non-uniform-
-                    # penetration buckets like irregular mesh contacts keep only the hull) but well below the deep
-                    # interior penetrations seen when a non-flat body rests inside its convex envelope (so genuine deep
-                    # supports are restored).
-                    hull_pen_max = gs.qd_float(0.0)
-                    for hk in range(k):
-                        survivor = collider_state.contact_hull_stack[b_start + hk, i_b]
-                        p = collider_state.contact_data.penetration[survivor, i_b]
-                        if p > hull_pen_max:
-                            hull_pen_max = p
-                    deep_keep_threshold = prune_deep_penetration_ratio * hull_pen_max
-                    for i in range(b_start, b_end):
-                        if collider_state.contact_keep[i, i_b] == 0:
-                            if collider_state.contact_data.penetration[i, i_b] > deep_keep_threshold:
-                                collider_state.contact_keep[i, i_b] = 1
-
-            b_start = b_end
-
-        # Phase 3: compact kept contacts to the front.
-        write = 0
-        for read in range(n_con):
-            if collider_state.contact_keep[read, i_b] != 0:
-                if write != read:
-                    collider_state.contact_data.geom_a[write, i_b] = collider_state.contact_data.geom_a[read, i_b]
-                    collider_state.contact_data.geom_b[write, i_b] = collider_state.contact_data.geom_b[read, i_b]
-                    collider_state.contact_data.penetration[write, i_b] = collider_state.contact_data.penetration[
-                        read, i_b
-                    ]
-                    collider_state.contact_data.normal[write, i_b] = collider_state.contact_data.normal[read, i_b]
-                    collider_state.contact_data.pos[write, i_b] = collider_state.contact_data.pos[read, i_b]
-                    collider_state.contact_data.friction[write, i_b] = collider_state.contact_data.friction[read, i_b]
-                    collider_state.contact_data.sol_params[write, i_b] = collider_state.contact_data.sol_params[
-                        read, i_b
-                    ]
-                    collider_state.contact_data.force[write, i_b] = collider_state.contact_data.force[read, i_b]
-                    collider_state.contact_data.link_a[write, i_b] = collider_state.contact_data.link_a[read, i_b]
-                    collider_state.contact_data.link_b[write, i_b] = collider_state.contact_data.link_b[read, i_b]
-                    collider_state.contact_data.pair_idx[write, i_b] = collider_state.contact_data.pair_idx[read, i_b]
-                write += 1
-        collider_state.n_contacts[i_b] = write
+                        upper_start = k
+                        # Memory-fence workaround for a Quadrants codegen issue on Metal _B >= 2 (genesis PR #2831):
+                        # without an explicit scratch store between the lower- and upper-hull lane-0 ``for`` passes,
+                        # the upper-hull pop-loop's reads of contact_hull_stack don't observe the lower-hull writes,
+                        # so the cross-product check effectively runs on stale data and every candidate is kept (hull
+                        # size == bucket size, no pruning). Writing any value to a non-overlapping slot here forces
+                        # write-then-read ordering on the shared buffer. The value is unused. See
+                        # perso_hugh/prot/qd_metal_hull_chain_visibility_repro.py for a standalone reproduction.
+                        collider_state.contact_hull_stack[max_contact_pairs - 1, i_b] = 0
+                        for k_step in range(b_size - 1):
+                            ii_lex = b_end - 2 - k_step
+                            ci = collider_state.contact_lex_idx[ii_lex, i_b]
+                            cu = collider_state.contact_sort_key[ci, i_b]
+                            cv = collider_state.contact_proj_v[ci, i_b]
+                            while k >= upper_start + 1:
+                                idx_a = collider_state.contact_hull_stack[b_start + k - 2, i_b]
+                                idx_b = collider_state.contact_hull_stack[b_start + k - 1, i_b]
+                                au = collider_state.contact_sort_key[idx_a, i_b]
+                                av = collider_state.contact_proj_v[idx_a, i_b]
+                                bu = collider_state.contact_sort_key[idx_b, i_b]
+                                bv = collider_state.contact_proj_v[idx_b, i_b]
+                                cross = (bu - au) * (cv - av) - (bv - av) * (cu - au)
+                                if cross <= hull_collinear_tol:
+                                    k -= 1
+                                else:
+                                    break
+                            if ci != collider_state.contact_hull_stack[b_start, i_b] and k < b_size:
+                                collider_state.contact_hull_stack[b_start + k, i_b] = ci
+                                k += 1
+
+                        for hk in range(k):
+                            survivor_sort = collider_state.contact_hull_stack[b_start + hk, i_b]
+                            survivor_orig = collider_state.contact_sort_idx[survivor_sort, i_b]
+                            collider_state.contact_keep[survivor_orig, i_b] = 1
+
+                        # Restore non-hull contacts whose penetration is much deeper than the hull boundary's max
+                        # (PR #2831 switched from avg to max). Rationale: a contact whose penetration substantially
+                        # exceeds the hull's deepest vertex represents a distinct physical support (deep body of a
+                        # fork beyond its tines, deep middle of a long body) that the support-polygon argument
+                        # doesn't actually authorize dropping (the argument only holds when all contacts share
+                        # normal AND penetration). The 3x factor over the hull max is well above the typical ~1.x
+                        # penetration spread on transient/rocking faces but well below the deep interior penetrations
+                        # seen when a non-flat body rests inside its convex envelope. Indices here live in orig-space
+                        # (because the cycle-permute is fused into the phase 3 below in opt 10 -- contact_data is
+                        # still in pre-sort order, so we translate sort-space hull/bucket indices through
+                        # contact_sort_idx).
+                        hull_pen_max = gs.qd_float(0.0)
+                        for hk in range(k):
+                            survivor_sort = collider_state.contact_hull_stack[b_start + hk, i_b]
+                            survivor_orig = collider_state.contact_sort_idx[survivor_sort, i_b]
+                            p = collider_state.contact_data.penetration[survivor_orig, i_b]
+                            if p > hull_pen_max:
+                                hull_pen_max = p
+                        deep_keep_threshold = prune_deep_penetration_ratio * hull_pen_max
+                        for jj_idx in range(b_start, b_end):
+                            orig = collider_state.contact_sort_idx[jj_idx, i_b]
+                            if collider_state.contact_keep[orig, i_b] == 0:
+                                if collider_state.contact_data.penetration[orig, i_b] > deep_keep_threshold:
+                                    collider_state.contact_keep[orig, i_b] = 1
+
+                b_start = b_end
+
+        if tid == 0:
+            # Phase 3 (deskai6 opt 10): FUSED compact + spatial sort. Replaces:
+            #   - dedup phase 3 (compact based on contact_keep)
+            #   - func_clamp_and_sort_contacts (assign group-x sort key + insertion sort + cycle-permute)
+            # with a single permutation pass: compute a sort key per slot that pushes dropped contacts past the end
+            # (sentinel +inf) and orders kept contacts by their geom-pair group's x-pos (anchored to the first kept
+            # contact in the group, preserving narrowphase intra-group order via stable sort). Then sort sort-keys +
+            # sort-indices, then cycle-permute the 9 contact_data fields exactly once.
+            #
+            # On dex_hand this saves ~25 displaced contacts worth of field copies that used to be done in dedup phase 3
+            # before clamp_and_sort re-permuted everything. Same 9 fields per contact (force & pair_idx still skipped
+            # per opt 8a; clamp_and_sort still runs in the grad path so its own opt 8b gate stays in place).
+            SENTINEL_BIG = gs.qd_float(1e30)
+            group_key = gs.qd_float(0.0)
+            prev_ga = -1
+            prev_gb = -1
+            for i in range(n_con):
+                if collider_state.contact_keep[i, i_b] != 0:
+                    ga = collider_state.contact_data.geom_a[i, i_b]
+                    gb = collider_state.contact_data.geom_b[i, i_b]
+                    if ga != prev_ga or gb != prev_gb:
+                        group_key = collider_state.contact_data.pos[i, i_b][0]
+                        prev_ga = ga
+                        prev_gb = gb
+                    collider_state.contact_sort_key[i, i_b] = group_key
+                else:
+                    collider_state.contact_sort_key[i, i_b] = SENTINEL_BIG
+                collider_state.contact_sort_idx[i, i_b] = i
+
+            for i in range(1, n_con):
+                ck = collider_state.contact_sort_key[i, i_b]
+                if collider_state.contact_sort_key[i - 1, i_b] <= ck:
+                    continue
+                ci = collider_state.contact_sort_idx[i, i_b]
+                j = i - 1
+                while j >= 0:
+                    if collider_state.contact_sort_key[j, i_b] <= ck:
+                        break
+                    collider_state.contact_sort_key[j + 1, i_b] = collider_state.contact_sort_key[j, i_b]
+                    collider_state.contact_sort_idx[j + 1, i_b] = collider_state.contact_sort_idx[j, i_b]
+                    j = j - 1
+                collider_state.contact_sort_key[j + 1, i_b] = ck
+                collider_state.contact_sort_idx[j + 1, i_b] = ci
+
+            # n_contacts is the count of non-sentinel sort keys (the dropped contacts sit at the tail and are ignored
+            # by downstream consumers via the new n_contacts).
+            n_kept = 0
+            for i in range(n_con):
+                if collider_state.contact_sort_key[i, i_b] < SENTINEL_BIG:
+                    n_kept += 1
+                else:
+                    break
+            collider_state.n_contacts[i_b] = n_kept
+
+            # Cycle-permute contact_data fields into their sorted positions. Only n_kept positions need correct values;
+            # the tail (dropped) slots are scratched but not read again this step.
+            for i in range(n_kept):
+                if collider_state.contact_sort_idx[i, i_b] != i:
+                    tmp_geom_a = collider_state.contact_data.geom_a[i, i_b]
+                    tmp_geom_b = collider_state.contact_data.geom_b[i, i_b]
+                    tmp_penetration = collider_state.contact_data.penetration[i, i_b]
+                    tmp_normal = collider_state.contact_data.normal[i, i_b]
+                    tmp_pos = collider_state.contact_data.pos[i, i_b]
+                    tmp_friction = collider_state.contact_data.friction[i, i_b]
+                    tmp_sol_params = collider_state.contact_data.sol_params[i, i_b]
+                    tmp_link_a = collider_state.contact_data.link_a[i, i_b]
+                    tmp_link_b = collider_state.contact_data.link_b[i, i_b]
+
+                    j = i
+                    while collider_state.contact_sort_idx[j, i_b] != i:
+                        src = collider_state.contact_sort_idx[j, i_b]
+                        collider_state.contact_data.geom_a[j, i_b] = collider_state.contact_data.geom_a[src, i_b]
+                        collider_state.contact_data.geom_b[j, i_b] = collider_state.contact_data.geom_b[src, i_b]
+                        collider_state.contact_data.penetration[j, i_b] = collider_state.contact_data.penetration[
+                            src, i_b
+                        ]
+                        collider_state.contact_data.normal[j, i_b] = collider_state.contact_data.normal[src, i_b]
+                        collider_state.contact_data.pos[j, i_b] = collider_state.contact_data.pos[src, i_b]
+                        collider_state.contact_data.friction[j, i_b] = collider_state.contact_data.friction[src, i_b]
+                        collider_state.contact_data.sol_params[j, i_b] = collider_state.contact_data.sol_params[
+                            src, i_b
+                        ]
+                        collider_state.contact_data.link_a[j, i_b] = collider_state.contact_data.link_a[src, i_b]
+                        collider_state.contact_data.link_b[j, i_b] = collider_state.contact_data.link_b[src, i_b]
+                        collider_state.contact_sort_idx[j, i_b] = j
+                        j = src
+
+                    collider_state.contact_data.geom_a[j, i_b] = tmp_geom_a
+                    collider_state.contact_data.geom_b[j, i_b] = tmp_geom_b
+                    collider_state.contact_data.penetration[j, i_b] = tmp_penetration
+                    collider_state.contact_data.normal[j, i_b] = tmp_normal
+                    collider_state.contact_data.pos[j, i_b] = tmp_pos
+                    collider_state.contact_data.friction[j, i_b] = tmp_friction
+                    collider_state.contact_data.sol_params[j, i_b] = tmp_sol_params
+                    collider_state.contact_data.link_a[j, i_b] = tmp_link_a
+                    collider_state.contact_data.link_b[j, i_b] = tmp_link_b
+                    collider_state.contact_sort_idx[j, i_b] = j
 
 
 @qd.kernel
diff --git a/genesis/utils/array_class.py b/genesis/utils/array_class.py
index 286858bee..466efaa3e 100644
--- a/genesis/utils/array_class.py
+++ b/genesis/utils/array_class.py
@@ -698,6 +698,7 @@ class ColliderState:
     contact_proj_v: qd.Tensor
     contact_keep: qd.Tensor
     contact_hull_stack: qd.Tensor
+    contact_lex_idx: qd.Tensor
 
 
 def get_collider_state(
@@ -760,6 +761,7 @@ def get_collider_state(
         contact_proj_v=V(dtype=gs.qd_float, shape=prune_shape),
         contact_keep=V(dtype=gs.qd_int, shape=prune_shape),
         contact_hull_stack=V(dtype=gs.qd_int, shape=prune_shape),
+        contact_lex_idx=V(dtype=gs.qd_int, shape=prune_shape),
     )
 
 
diff --git a/tests/test_rigid_physics.py b/tests/test_rigid_physics.py
index 45cd1d1b9..e2d3620bb 100644
--- a/tests/test_rigid_physics.py
+++ b/tests/test_rigid_physics.py
@@ -1374,6 +1374,7 @@ def test_contact_dedup(surface_kind, show_viewer):
 
 
 @pytest.mark.required
+@pytest.mark.parametrize("backend", [gs.gpu])
 def test_contact_pruning(show_viewer):
     GEOM_HALF_SIZE = 0.1
     MARGIN = 1e-4

From edec9907584040357c84dc075c44b513ef5c12c2 Mon Sep 17 00:00:00 2001
From: Hugh Perkins <hughperkins@gmail.com>
Date: Mon, 25 May 2026 09:11:31 -0700
Subject: [PATCH 3/7] chore: fix F541 f-string without placeholders in CPU
 dedup-gate log

---
 genesis/engine/solvers/rigid/collider/collider.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/genesis/engine/solvers/rigid/collider/collider.py b/genesis/engine/solvers/rigid/collider/collider.py
index 2eae75bf8..843d7fbd7 100644
--- a/genesis/engine/solvers/rigid/collider/collider.py
+++ b/genesis/engine/solvers/rigid/collider/collider.py
@@ -192,8 +192,8 @@ def _init_static_config(self) -> None:
         # saturation).
         if link_pair_pruning_supported and gs.backend == gs.cpu:
             gs.logger.info(
-                f"Disabling link-pair contact dedup: cooperative kernel doesn't help on CPU (Quadrants serializes "
-                f"the per-env loop)."
+                "Disabling link-pair contact dedup: cooperative kernel doesn't help on CPU (Quadrants serializes "
+                "the per-env loop)."
             )
             link_pair_pruning_supported = False
         if link_pair_pruning_supported and self._solver.n_envs > _DEDUP_MAX_N_ENVS:

From c5a35798876c96a1eb1a7c4798584ad0621be2e7 Mon Sep 17 00:00:00 2001
From: Alexis Duburcq <alexis.duburcq@gmail.com>
Date: Mon, 25 May 2026 19:35:51 +0200
Subject: [PATCH 4/7] Fuse pruning and clamp + sort.

---
 .../engine/solvers/rigid/collider/collider.py |  28 +-
 .../engine/solvers/rigid/collider/contact.py  | 885 +++++++++---------
 2 files changed, 458 insertions(+), 455 deletions(-)

diff --git a/genesis/engine/solvers/rigid/collider/collider.py b/genesis/engine/solvers/rigid/collider/collider.py
index c6aa09b9d..aaba5d891 100644
--- a/genesis/engine/solvers/rigid/collider/collider.py
+++ b/genesis/engine/solvers/rigid/collider/collider.py
@@ -47,8 +47,7 @@
     func_contact_orthogonals,
     func_rotate_frame,
     func_set_upstream_grad,
-    func_clamp_and_sort_contacts,
-    func_prune_contacts,
+    func_clamp_prune_and_sort_contacts,
 )
 from . import narrowphase
 from .narrowphase import (
@@ -819,24 +818,13 @@ def detection(self) -> None:
                 self._solver._errno,
             )
 
-        if (
-            self._collider_static_config.link_pair_pruning_supported
-            and self._solver._options.contact_pruning_tolerance is not None
-            and not self._solver._static_rigid_sim_config.requires_grad
-        ):
-            func_prune_contacts(
-                self._collider_state,
-                self._collider_info,
-                self._solver._rigid_global_info,
-                self._solver._static_rigid_sim_config,
-            )
-
-        if self._use_split_narrowphase:
-            func_clamp_and_sort_contacts(
-                self._collider_state,
-                self._collider_info,
-                self._solver._static_rigid_sim_config,
-            )
+        func_clamp_prune_and_sort_contacts(
+            self._collider_state,
+            self._collider_info,
+            self._solver._rigid_global_info,
+            self._solver._static_rigid_sim_config,
+            self._collider_static_config,
+        )
 
     def get_contacts(self, as_tensor: bool = True, to_torch: bool = True, keep_batch_dim: bool = False):
         # Early return if already pre-computed
diff --git a/genesis/engine/solvers/rigid/collider/contact.py b/genesis/engine/solvers/rigid/collider/contact.py
index 0fa8940f5..0f682df8f 100644
--- a/genesis/engine/solvers/rigid/collider/contact.py
+++ b/genesis/engine/solvers/rigid/collider/contact.py
@@ -536,119 +536,23 @@ def func_rotate_frame(
 
 
 @qd.kernel(fastcache=True)
-def func_clamp_and_sort_contacts(
+def func_clamp_prune_and_sort_contacts(
     collider_state: array_class.ColliderState,
     collider_info: array_class.ColliderInfo,
+    rigid_global_info: array_class.RigidGlobalInfo,
     static_rigid_sim_config: qd.template(),
+    collider_static_config: qd.template(),
 ):
-    """Sort contacts spatially by x-coordinate, moving entire geom-pair groups as units, while clamping number of
-    contacts to avoid unbounded memory access.
-
-    When contacts are added with use_atomic=True from parallel narrowphase kernels, the counter may exceed the array
-    bounds even though only in-bounds entries are actually written. The clamp brings the counter back to the valid
-    range before sorting.
-
-    Contacts from the same geom pair are contiguous after narrowphase. We assign every contact in a group the
-    x-position of the group's first contact. The stable insertion sort then reorders groups spatially while preserving
-    the narrowphase ordering within each group.
+    """Merged clamp + (optional) link-pair pruning + (optional) x-position sort, in one per-env loop pass.
 
-    Two-phase approach to minimise memory traffic:
-    1. Insertion sort on a compact (key, index) pair — 8 bytes per swap instead of moving all 11 contact fields
-    2. In-place cycle-following permutation that moves each contact record exactly once
-    """
-    _B = collider_state.n_contacts.shape[0]
-    max_contact_pairs = collider_info.max_contact_pairs[None]
+    Phases per env (gated at compile time by ``collider_static_config``):
+    - Always: clamp ``n_contacts`` to ``max_contact_pairs`` to absorb the parallel-narrowphase overcount.
+    - If ``link_pair_pruning_supported and not requires_grad``: prune redundant contacts via 2D convex hull on the
+      contact-patch plane (skipped at runtime when ``contact_pruning_tolerance`` is 0).
+    - If ``has_non_box_plane_convex_convex and backend != cpu``: spatial sort by x-position with geom-pair groups
+      treated as units (provides spatial locality for the constraint solver).
 
-    qd.loop_config(serialize=static_rigid_sim_config.para_level < gs.PARA_LEVEL.ALL)
-    for i_b in range(_B):
-        n_con = qd.min(collider_state.n_contacts[i_b], max_contact_pairs)
-        collider_state.n_contacts[i_b] = n_con
-
-        # Phase 1: initialise and insertion-sort the (key, idx) arrays.
-        group_key = gs.qd_float(0.0)
-        for i in range(n_con):
-            ga = collider_state.contact_data.geom_a[i, i_b]
-            gb = collider_state.contact_data.geom_b[i, i_b]
-            if (
-                i == 0
-                or ga != collider_state.contact_data.geom_a[i - 1, i_b]
-                or gb != collider_state.contact_data.geom_b[i - 1, i_b]
-            ):
-                group_key = collider_state.contact_data.pos[i, i_b][0]
-            collider_state.contact_sort_key[i, i_b] = group_key
-            collider_state.contact_sort_idx[i, i_b] = i
-
-        for i in range(1, n_con):
-            curr_key = collider_state.contact_sort_key[i, i_b]
-            if collider_state.contact_sort_key[i - 1, i_b] <= curr_key:
-                continue
-
-            curr_idx = collider_state.contact_sort_idx[i, i_b]
-            j = i - 1
-            while j >= 0:
-                if collider_state.contact_sort_key[j, i_b] <= curr_key:
-                    break
-                collider_state.contact_sort_key[j + 1, i_b] = collider_state.contact_sort_key[j, i_b]
-                collider_state.contact_sort_idx[j + 1, i_b] = collider_state.contact_sort_idx[j, i_b]
-                j = j - 1
-            collider_state.contact_sort_key[j + 1, i_b] = curr_key
-            collider_state.contact_sort_idx[j + 1, i_b] = curr_idx
-
-        # Phase 2: apply permutation in-place via cycle decomposition.
-        # Each contact is read and written exactly once.
-        for i in range(n_con):
-            if collider_state.contact_sort_idx[i, i_b] != i:
-                tmp_geom_a = collider_state.contact_data.geom_a[i, i_b]
-                tmp_geom_b = collider_state.contact_data.geom_b[i, i_b]
-                tmp_penetration = collider_state.contact_data.penetration[i, i_b]
-                tmp_normal = collider_state.contact_data.normal[i, i_b]
-                tmp_pos = collider_state.contact_data.pos[i, i_b]
-                tmp_friction = collider_state.contact_data.friction[i, i_b]
-                tmp_sol_params = collider_state.contact_data.sol_params[i, i_b]
-                tmp_force = collider_state.contact_data.force[i, i_b]
-                tmp_link_a = collider_state.contact_data.link_a[i, i_b]
-                tmp_link_b = collider_state.contact_data.link_b[i, i_b]
-                tmp_pair_idx = collider_state.contact_data.pair_idx[i, i_b]
-
-                j = i
-                while collider_state.contact_sort_idx[j, i_b] != i:
-                    src = collider_state.contact_sort_idx[j, i_b]
-                    collider_state.contact_data.geom_a[j, i_b] = collider_state.contact_data.geom_a[src, i_b]
-                    collider_state.contact_data.geom_b[j, i_b] = collider_state.contact_data.geom_b[src, i_b]
-                    collider_state.contact_data.penetration[j, i_b] = collider_state.contact_data.penetration[src, i_b]
-                    collider_state.contact_data.normal[j, i_b] = collider_state.contact_data.normal[src, i_b]
-                    collider_state.contact_data.pos[j, i_b] = collider_state.contact_data.pos[src, i_b]
-                    collider_state.contact_data.friction[j, i_b] = collider_state.contact_data.friction[src, i_b]
-                    collider_state.contact_data.sol_params[j, i_b] = collider_state.contact_data.sol_params[src, i_b]
-                    collider_state.contact_data.force[j, i_b] = collider_state.contact_data.force[src, i_b]
-                    collider_state.contact_data.link_a[j, i_b] = collider_state.contact_data.link_a[src, i_b]
-                    collider_state.contact_data.link_b[j, i_b] = collider_state.contact_data.link_b[src, i_b]
-                    collider_state.contact_data.pair_idx[j, i_b] = collider_state.contact_data.pair_idx[src, i_b]
-                    collider_state.contact_sort_idx[j, i_b] = j
-                    j = src
-
-                collider_state.contact_data.geom_a[j, i_b] = tmp_geom_a
-                collider_state.contact_data.geom_b[j, i_b] = tmp_geom_b
-                collider_state.contact_data.penetration[j, i_b] = tmp_penetration
-                collider_state.contact_data.normal[j, i_b] = tmp_normal
-                collider_state.contact_data.pos[j, i_b] = tmp_pos
-                collider_state.contact_data.friction[j, i_b] = tmp_friction
-                collider_state.contact_data.sol_params[j, i_b] = tmp_sol_params
-                collider_state.contact_data.force[j, i_b] = tmp_force
-                collider_state.contact_data.link_a[j, i_b] = tmp_link_a
-                collider_state.contact_data.link_b[j, i_b] = tmp_link_b
-                collider_state.contact_data.pair_idx[j, i_b] = tmp_pair_idx
-                collider_state.contact_sort_idx[j, i_b] = j
-
-
-@qd.kernel(fastcache=True)
-def func_prune_contacts(
-    collider_state: array_class.ColliderState,
-    collider_info: array_class.ColliderInfo,
-    rigid_global_info: array_class.RigidGlobalInfo,
-    static_rigid_sim_config: qd.template(),
-):
-    """Prune redundant contacts per link-pair via support-polygon (2D convex hull on the contact-patch plane).
+    The pruning logic groups contacts by canonical (min(link_a, link_b), max(link_a, link_b)).
 
     Operates after func_clamp_and_sort_contacts. Groups contacts by canonical (min(link_a, link_b), max(link_a,
     link_b)) and, for each bucket of >= 5 contacts whose positions lie in a single plane (perpendicular to the
@@ -679,335 +583,446 @@ def func_prune_contacts(
     for i_b in range(_B):
         n_con = qd.min(collider_state.n_contacts[i_b], max_contact_pairs)
         collider_state.n_contacts[i_b] = n_con
-        if n_con < 5:
-            continue
-
-        # Phase 1a: insertion-sort indices by canonical (min_link, max_link) key.
-        for i in range(n_con):
-            la = collider_state.contact_data.link_a[i, i_b]
-            lb = collider_state.contact_data.link_b[i, i_b]
-            la_min = qd.min(la, lb)
-            la_max = qd.max(la, lb)
-            collider_state.contact_sort_key[i, i_b] = qd.cast(la_min, gs.qd_float) * LP_KEY_STRIDE + qd.cast(
-                la_max, gs.qd_float
-            )
-            collider_state.contact_sort_idx[i, i_b] = i
-
-        for i in range(1, n_con):
-            ck = collider_state.contact_sort_key[i, i_b]
-            if collider_state.contact_sort_key[i - 1, i_b] <= ck:
-                continue
-            ci = collider_state.contact_sort_idx[i, i_b]
-            j = i - 1
-            while j >= 0:
-                if collider_state.contact_sort_key[j, i_b] <= ck:
-                    break
-                collider_state.contact_sort_key[j + 1, i_b] = collider_state.contact_sort_key[j, i_b]
-                collider_state.contact_sort_idx[j + 1, i_b] = collider_state.contact_sort_idx[j, i_b]
-                j = j - 1
-            collider_state.contact_sort_key[j + 1, i_b] = ck
-            collider_state.contact_sort_idx[j + 1, i_b] = ci
-
-        # Phase 1b: apply permutation in-place via cycle decomposition (11 fields).
-        for i in range(n_con):
-            if collider_state.contact_sort_idx[i, i_b] != i:
-                tmp_geom_a = collider_state.contact_data.geom_a[i, i_b]
-                tmp_geom_b = collider_state.contact_data.geom_b[i, i_b]
-                tmp_penetration = collider_state.contact_data.penetration[i, i_b]
-                tmp_normal = collider_state.contact_data.normal[i, i_b]
-                tmp_pos = collider_state.contact_data.pos[i, i_b]
-                tmp_friction = collider_state.contact_data.friction[i, i_b]
-                tmp_sol_params = collider_state.contact_data.sol_params[i, i_b]
-                tmp_force = collider_state.contact_data.force[i, i_b]
-                tmp_link_a = collider_state.contact_data.link_a[i, i_b]
-                tmp_link_b = collider_state.contact_data.link_b[i, i_b]
-                tmp_pair_idx = collider_state.contact_data.pair_idx[i, i_b]
-
-                j = i
-                while collider_state.contact_sort_idx[j, i_b] != i:
-                    src = collider_state.contact_sort_idx[j, i_b]
-                    collider_state.contact_data.geom_a[j, i_b] = collider_state.contact_data.geom_a[src, i_b]
-                    collider_state.contact_data.geom_b[j, i_b] = collider_state.contact_data.geom_b[src, i_b]
-                    collider_state.contact_data.penetration[j, i_b] = collider_state.contact_data.penetration[src, i_b]
-                    collider_state.contact_data.normal[j, i_b] = collider_state.contact_data.normal[src, i_b]
-                    collider_state.contact_data.pos[j, i_b] = collider_state.contact_data.pos[src, i_b]
-                    collider_state.contact_data.friction[j, i_b] = collider_state.contact_data.friction[src, i_b]
-                    collider_state.contact_data.sol_params[j, i_b] = collider_state.contact_data.sol_params[src, i_b]
-                    collider_state.contact_data.force[j, i_b] = collider_state.contact_data.force[src, i_b]
-                    collider_state.contact_data.link_a[j, i_b] = collider_state.contact_data.link_a[src, i_b]
-                    collider_state.contact_data.link_b[j, i_b] = collider_state.contact_data.link_b[src, i_b]
-                    collider_state.contact_data.pair_idx[j, i_b] = collider_state.contact_data.pair_idx[src, i_b]
+
+        # === Pruning phase (link-pair support polygon). Gated by static config: only emitted when the
+        # scene has multi-geom links / nonconvex / terrain, and not in autodiff mode. Skipped at runtime
+        # when contact_pruning_tolerance is 0.
+        if qd.static(collider_static_config.link_pair_pruning_supported and not static_rigid_sim_config.requires_grad):
+            if n_con >= 5 and tol > gs.qd_float(0.0):
+                # Phase 1a: insertion-sort indices by canonical (min_link, max_link) key.
+                for i in range(n_con):
+                    la = collider_state.contact_data.link_a[i, i_b]
+                    lb = collider_state.contact_data.link_b[i, i_b]
+                    la_min = qd.min(la, lb)
+                    la_max = qd.max(la, lb)
+                    collider_state.contact_sort_key[i, i_b] = qd.cast(la_min, gs.qd_float) * LP_KEY_STRIDE + qd.cast(
+                        la_max, gs.qd_float
+                    )
+                    collider_state.contact_sort_idx[i, i_b] = i
+
+                for i in range(1, n_con):
+                    ck = collider_state.contact_sort_key[i, i_b]
+                    if collider_state.contact_sort_key[i - 1, i_b] <= ck:
+                        continue
+                    ci = collider_state.contact_sort_idx[i, i_b]
+                    j = i - 1
+                    while j >= 0:
+                        if collider_state.contact_sort_key[j, i_b] <= ck:
+                            break
+                        collider_state.contact_sort_key[j + 1, i_b] = collider_state.contact_sort_key[j, i_b]
+                        collider_state.contact_sort_idx[j + 1, i_b] = collider_state.contact_sort_idx[j, i_b]
+                        j = j - 1
+                    collider_state.contact_sort_key[j + 1, i_b] = ck
+                    collider_state.contact_sort_idx[j + 1, i_b] = ci
+
+                # Phase 1b: apply permutation in-place via cycle decomposition (11 fields).
+                for i in range(n_con):
+                    if collider_state.contact_sort_idx[i, i_b] != i:
+                        tmp_geom_a = collider_state.contact_data.geom_a[i, i_b]
+                        tmp_geom_b = collider_state.contact_data.geom_b[i, i_b]
+                        tmp_penetration = collider_state.contact_data.penetration[i, i_b]
+                        tmp_normal = collider_state.contact_data.normal[i, i_b]
+                        tmp_pos = collider_state.contact_data.pos[i, i_b]
+                        tmp_friction = collider_state.contact_data.friction[i, i_b]
+                        tmp_sol_params = collider_state.contact_data.sol_params[i, i_b]
+                        tmp_force = collider_state.contact_data.force[i, i_b]
+                        tmp_link_a = collider_state.contact_data.link_a[i, i_b]
+                        tmp_link_b = collider_state.contact_data.link_b[i, i_b]
+                        tmp_pair_idx = collider_state.contact_data.pair_idx[i, i_b]
+
+                        j = i
+                        while collider_state.contact_sort_idx[j, i_b] != i:
+                            src = collider_state.contact_sort_idx[j, i_b]
+                            collider_state.contact_data.geom_a[j, i_b] = collider_state.contact_data.geom_a[src, i_b]
+                            collider_state.contact_data.geom_b[j, i_b] = collider_state.contact_data.geom_b[src, i_b]
+                            collider_state.contact_data.penetration[j, i_b] = collider_state.contact_data.penetration[
+                                src, i_b
+                            ]
+                            collider_state.contact_data.normal[j, i_b] = collider_state.contact_data.normal[src, i_b]
+                            collider_state.contact_data.pos[j, i_b] = collider_state.contact_data.pos[src, i_b]
+                            collider_state.contact_data.friction[j, i_b] = collider_state.contact_data.friction[
+                                src, i_b
+                            ]
+                            collider_state.contact_data.sol_params[j, i_b] = collider_state.contact_data.sol_params[
+                                src, i_b
+                            ]
+                            collider_state.contact_data.force[j, i_b] = collider_state.contact_data.force[src, i_b]
+                            collider_state.contact_data.link_a[j, i_b] = collider_state.contact_data.link_a[src, i_b]
+                            collider_state.contact_data.link_b[j, i_b] = collider_state.contact_data.link_b[src, i_b]
+                            collider_state.contact_data.pair_idx[j, i_b] = collider_state.contact_data.pair_idx[
+                                src, i_b
+                            ]
+                            collider_state.contact_sort_idx[j, i_b] = j
+                            j = src
+
+                        collider_state.contact_data.geom_a[j, i_b] = tmp_geom_a
+                        collider_state.contact_data.geom_b[j, i_b] = tmp_geom_b
+                        collider_state.contact_data.penetration[j, i_b] = tmp_penetration
+                        collider_state.contact_data.normal[j, i_b] = tmp_normal
+                        collider_state.contact_data.pos[j, i_b] = tmp_pos
+                        collider_state.contact_data.friction[j, i_b] = tmp_friction
+                        collider_state.contact_data.sol_params[j, i_b] = tmp_sol_params
+                        collider_state.contact_data.force[j, i_b] = tmp_force
+                        collider_state.contact_data.link_a[j, i_b] = tmp_link_a
+                        collider_state.contact_data.link_b[j, i_b] = tmp_link_b
+                        collider_state.contact_data.pair_idx[j, i_b] = tmp_pair_idx
+                        collider_state.contact_sort_idx[j, i_b] = j
+
+                # Default: keep everything. Buckets that pass the gates flip their entries to drop and then mark only
+                # hull-vertex contacts as keep again.
+                for i in range(n_con):
+                    collider_state.contact_keep[i, i_b] = 1
+
+                # Phase 2: walk link-pair buckets.
+                b_start = 0
+                while b_start < n_con:
+                    la0 = collider_state.contact_data.link_a[b_start, i_b]
+                    lb0 = collider_state.contact_data.link_b[b_start, i_b]
+                    la0_min = qd.min(la0, lb0)
+                    la0_max = qd.max(la0, lb0)
+                    b_end = b_start + 1
+                    while b_end < n_con:
+                        la = collider_state.contact_data.link_a[b_end, i_b]
+                        lb = collider_state.contact_data.link_b[b_end, i_b]
+                        if qd.min(la, lb) != la0_min or qd.max(la, lb) != la0_max:
+                            break
+                        b_end += 1
+                    b_size = b_end - b_start
+
+                    if b_size >= 5:
+                        # Mean normal (folded to the hemisphere of contact b_start) and centroid.
+                        ref_n = collider_state.contact_data.normal[b_start, i_b]
+                        rnx = ref_n[0]
+                        rny = ref_n[1]
+                        rnz = ref_n[2]
+                        mnx = gs.qd_float(0.0)
+                        mny = gs.qd_float(0.0)
+                        mnz = gs.qd_float(0.0)
+                        cx = gs.qd_float(0.0)
+                        cy = gs.qd_float(0.0)
+                        cz = gs.qd_float(0.0)
+                        for i in range(b_start, b_end):
+                            n_i = collider_state.contact_data.normal[i, i_b]
+                            s = gs.qd_float(1.0)
+                            if rnx * n_i[0] + rny * n_i[1] + rnz * n_i[2] < gs.qd_float(0.0):
+                                s = gs.qd_float(-1.0)
+                            mnx += s * n_i[0]
+                            mny += s * n_i[1]
+                            mnz += s * n_i[2]
+                            p_i = collider_state.contact_data.pos[i, i_b]
+                            cx += p_i[0]
+                            cy += p_i[1]
+                            cz += p_i[2]
+                        inv_n = gs.qd_float(1.0) / qd.cast(b_size, gs.qd_float)
+                        cx *= inv_n
+                        cy *= inv_n
+                        cz *= inv_n
+                        mnrm = qd.sqrt(mnx * mnx + mny * mny + mnz * mnz)
+
+                        # Hoisted out so the hull-build branch below can read it (quadrants scopes per if).
+                        max_in_plane_r2 = gs.qd_float(0.0)
+
+                        coplanar = mnrm > EPS
+                        if coplanar:
+                            mnx /= mnrm
+                            mny /= mnrm
+                            mnz /= mnrm
+
+                            # Depth coplanarity: positions must lie in a single plane perpendicular to the mean normal. No
+                            # per-contact normal check: a contact whose normal is diagonal (e.g. an edge-vs-edge contact at a
+                            # corner of the contact patch) still participates in the 2D hull because its position is a vertex of
+                            # the patch; dropping a collinear-edge contact in the same bucket is justified by the positional
+                            # support polygon regardless of that contact's normal direction.
+                            max_depth = gs.qd_float(0.0)
+                            for i in range(b_start, b_end):
+                                p_i = collider_state.contact_data.pos[i, i_b]
+                                dx = p_i[0] - cx
+                                dy = p_i[1] - cy
+                                dz = p_i[2] - cz
+                                depth = qd.abs(dx * mnx + dy * mny + dz * mnz)
+                                if depth > max_depth:
+                                    max_depth = depth
+                                r2 = dx * dx + dy * dy + dz * dz - depth * depth
+                                if r2 > max_in_plane_r2:
+                                    max_in_plane_r2 = r2
+
+                            if max_depth > tol * qd.sqrt(max_in_plane_r2):
+                                coplanar = False
+
+                        if coplanar:
+                            # In-plane basis (u, v): seed from the world axis least-aligned with mean normal.
+                            abs_mnx = qd.abs(mnx)
+                            abs_mny = qd.abs(mny)
+                            abs_mnz = qd.abs(mnz)
+                            ax = gs.qd_float(1.0)
+                            ay = gs.qd_float(0.0)
+                            az = gs.qd_float(0.0)
+                            if abs_mny < abs_mnx and abs_mny < abs_mnz:
+                                ax = gs.qd_float(0.0)
+                                ay = gs.qd_float(1.0)
+                                az = gs.qd_float(0.0)
+                            elif abs_mnz < abs_mnx and abs_mnz <= abs_mny:
+                                ax = gs.qd_float(0.0)
+                                ay = gs.qd_float(0.0)
+                                az = gs.qd_float(1.0)
+                            adn = ax * mnx + ay * mny + az * mnz
+                            ux = ax - adn * mnx
+                            uy = ay - adn * mny
+                            uz = az - adn * mnz
+                            unrm = qd.sqrt(ux * ux + uy * uy + uz * uz)
+                            ux /= unrm
+                            uy /= unrm
+                            uz /= unrm
+                            vx = mny * uz - mnz * uy
+                            vy = mnz * ux - mnx * uz
+                            vz = mnx * uy - mny * ux
+
+                            # Project bucket contacts to (u, v). Reuse contact_sort_key for u (phase 1 has consumed it);
+                            # v goes into the dedicated scratch buffer.
+                            for i in range(b_start, b_end):
+                                p_i = collider_state.contact_data.pos[i, i_b]
+                                collider_state.contact_sort_key[i, i_b] = p_i[0] * ux + p_i[1] * uy + p_i[2] * uz
+                                collider_state.contact_proj_v[i, i_b] = p_i[0] * vx + p_i[1] * vy + p_i[2] * vz
+
+                            # Sort bucket indices lexicographically by (u, v), with a tolerance on u so that contacts whose u
+                            # values differ only by float noise (or by sub-millimeter physics noise from MPR perturbations) sort
+                            # by v. Without the tolerance, the wrong point pops from a 3-collinear triplet when the corner and
+                            # the mid-edge have u values that differ by a few microns and the mid-edge happens to sort first.
+                            sort_u_tol = gs.qd_float(1e-3) * qd.sqrt(max_in_plane_r2)
+                            for i in range(b_start, b_end):
+                                collider_state.contact_sort_idx[i, i_b] = i
+                            for i in range(b_start + 1, b_end):
+                                ci = collider_state.contact_sort_idx[i, i_b]
+                                cu = collider_state.contact_sort_key[ci, i_b]
+                                cv = collider_state.contact_proj_v[ci, i_b]
+                                j = i - 1
+                                while j >= b_start:
+                                    pj = collider_state.contact_sort_idx[j, i_b]
+                                    pu = collider_state.contact_sort_key[pj, i_b]
+                                    pv = collider_state.contact_proj_v[pj, i_b]
+                                    if (pu < cu - sort_u_tol) or (qd.abs(pu - cu) <= sort_u_tol and pv <= cv):
+                                        break
+                                    collider_state.contact_sort_idx[j + 1, i_b] = pj
+                                    j -= 1
+                                collider_state.contact_sort_idx[j + 1, i_b] = ci
+
+                            # Mark all bucket entries as drop, then mark hull vertices as keep.
+                            for i in range(b_start, b_end):
+                                collider_state.contact_keep[i, i_b] = 0
+
+                            # Collinearity threshold for hull pops, scaled to the bucket extent. A pure "cross <= 0" check fails
+                            # on numerically-near-collinear edge points (cross is a tiny positive epsilon from float roundoff),
+                            # so genuine midpoints would survive as spurious hull vertices.
+                            hull_collinear_tol = tol * max_in_plane_r2
+
+                            # Andrew's monotone chain. Stack lives in contact_hull_stack[b_start..b_start + k).
+                            k = 0
+                            for i in range(b_start, b_end):
+                                ci = collider_state.contact_sort_idx[i, i_b]
+                                cu = collider_state.contact_sort_key[ci, i_b]
+                                cv = collider_state.contact_proj_v[ci, i_b]
+                                while k >= 2:
+                                    idx_a = collider_state.contact_hull_stack[b_start + k - 2, i_b]
+                                    idx_b = collider_state.contact_hull_stack[b_start + k - 1, i_b]
+                                    au = collider_state.contact_sort_key[idx_a, i_b]
+                                    av = collider_state.contact_proj_v[idx_a, i_b]
+                                    bu = collider_state.contact_sort_key[idx_b, i_b]
+                                    bv = collider_state.contact_proj_v[idx_b, i_b]
+                                    cross = (bu - au) * (cv - av) - (bv - av) * (cu - au)
+                                    if cross <= hull_collinear_tol:
+                                        k -= 1
+                                    else:
+                                        break
+                                collider_state.contact_hull_stack[b_start + k, i_b] = ci
+                                k += 1
+
+                            upper_start = k
+                            # Memory-fence for a Quadrants codegen issue on parallel envs (Metal backend, _B >= 2): without an
+                            # explicit barrier between the lower-hull and upper-hull passes, the upper-hull pop-loop's reads
+                            # of contact_hull_stack don't observe the writes from the lower hull, so its cross-product /
+                            # pop-check effectively runs on stale data and every candidate is kept, producing a hull whose
+                            # size equals the bucket size.
+                            if qd.static(static_rigid_sim_config.backend == gs.metal):
+                                qd.simt.block.sync()
+                            # quadrants range supports 1 or 2 args only; iterate sorted indices backward over
+                            # [b_start, b_end - 2] by reflecting the index.
+                            for k_step in range(b_size - 1):
+                                ii = b_end - 2 - k_step
+                                ci = collider_state.contact_sort_idx[ii, i_b]
+                                cu = collider_state.contact_sort_key[ci, i_b]
+                                cv = collider_state.contact_proj_v[ci, i_b]
+                                while k >= upper_start + 1:
+                                    idx_a = collider_state.contact_hull_stack[b_start + k - 2, i_b]
+                                    idx_b = collider_state.contact_hull_stack[b_start + k - 1, i_b]
+                                    au = collider_state.contact_sort_key[idx_a, i_b]
+                                    av = collider_state.contact_proj_v[idx_a, i_b]
+                                    bu = collider_state.contact_sort_key[idx_b, i_b]
+                                    bv = collider_state.contact_proj_v[idx_b, i_b]
+                                    cross = (bu - au) * (cv - av) - (bv - av) * (cu - au)
+                                    if cross <= hull_collinear_tol:
+                                        k -= 1
+                                    else:
+                                        break
+                                # The closing iteration of the upper hull visits the leftmost point, which already sits at
+                                # stack[b_start] from the lower hull. Skipping that push, plus the k < b_size guard, bounds k to
+                                # b_size and keeps the write index within max_contact_pairs even for buckets where the lower-
+                                # hull pass already kept all b_size points (downward-convex layouts: every lex-sorted triple
+                                # makes a left turn so nothing gets popped, then the upper-hull pass tries to push a duplicate
+                                # of an already-kept lower-hull vertex).
+                                if ci != collider_state.contact_hull_stack[b_start, i_b] and k < b_size:
+                                    collider_state.contact_hull_stack[b_start + k, i_b] = ci
+                                    k += 1
+
+                            # Chain holds k unique hull vertices (no trailing duplicate).
+                            for hk in range(k):
+                                survivor = collider_state.contact_hull_stack[b_start + hk, i_b]
+                                collider_state.contact_keep[survivor, i_b] = 1
+
+                            # Restore non-hull contacts whose penetration is much deeper than the hull boundary's average. The
+                            # support-polygon argument says interior contacts are wrench-redundant only when ALL contacts share
+                            # the same normal and penetration; a contact with substantially higher penetration than the hull's
+                            # average represents a distinct physical support (the body of a fork resting beyond its tines, the
+                            # deep middle of a long body) and dropping it lets the body sink into the surface. The 3x factor is
+                            # well above the typical ~1.x penetration spread on transient/rocking faces (so non-uniform-
+                            # penetration buckets like irregular mesh contacts keep only the hull) but well below the deep
+                            # interior penetrations seen when a non-flat body rests inside its convex envelope (so genuine deep
+                            # supports are restored).
+                            hull_pen_max = gs.qd_float(0.0)
+                            for hk in range(k):
+                                survivor = collider_state.contact_hull_stack[b_start + hk, i_b]
+                                p = collider_state.contact_data.penetration[survivor, i_b]
+                                if p > hull_pen_max:
+                                    hull_pen_max = p
+                            deep_keep_threshold = prune_deep_penetration_ratio * hull_pen_max
+                            for i in range(b_start, b_end):
+                                if collider_state.contact_keep[i, i_b] == 0:
+                                    if collider_state.contact_data.penetration[i, i_b] > deep_keep_threshold:
+                                        collider_state.contact_keep[i, i_b] = 1
+
+                    b_start = b_end
+
+                # Phase 3: compact kept contacts to the front.
+                write = 0
+                for read in range(n_con):
+                    if collider_state.contact_keep[read, i_b] != 0:
+                        if write != read:
+                            collider_state.contact_data.geom_a[write, i_b] = collider_state.contact_data.geom_a[
+                                read, i_b
+                            ]
+                            collider_state.contact_data.geom_b[write, i_b] = collider_state.contact_data.geom_b[
+                                read, i_b
+                            ]
+                            collider_state.contact_data.penetration[write, i_b] = (
+                                collider_state.contact_data.penetration[read, i_b]
+                            )
+                            collider_state.contact_data.normal[write, i_b] = collider_state.contact_data.normal[
+                                read, i_b
+                            ]
+                            collider_state.contact_data.pos[write, i_b] = collider_state.contact_data.pos[read, i_b]
+                            collider_state.contact_data.friction[write, i_b] = collider_state.contact_data.friction[
+                                read, i_b
+                            ]
+                            collider_state.contact_data.sol_params[write, i_b] = collider_state.contact_data.sol_params[
+                                read, i_b
+                            ]
+                            collider_state.contact_data.force[write, i_b] = collider_state.contact_data.force[read, i_b]
+                            collider_state.contact_data.link_a[write, i_b] = collider_state.contact_data.link_a[
+                                read, i_b
+                            ]
+                            collider_state.contact_data.link_b[write, i_b] = collider_state.contact_data.link_b[
+                                read, i_b
+                            ]
+                            collider_state.contact_data.pair_idx[write, i_b] = collider_state.contact_data.pair_idx[
+                                read, i_b
+                            ]
+                        write += 1
+                collider_state.n_contacts[i_b] = write
+
+        # === Spatial sort by x-position with geom-pair grouping. Gated on the same condition that decided
+        # whether to run the split narrowphase (Genesis Collider._use_split_narrowphase).
+        if qd.static(
+            collider_static_config.has_non_box_plane_convex_convex and static_rigid_sim_config.backend != gs.cpu
+        ):
+            n_con = collider_state.n_contacts[i_b]
+            # Phase 1: initialise and insertion-sort the (key, idx) arrays.
+            group_key = gs.qd_float(0.0)
+            for i in range(n_con):
+                ga = collider_state.contact_data.geom_a[i, i_b]
+                gb = collider_state.contact_data.geom_b[i, i_b]
+                if (
+                    i == 0
+                    or ga != collider_state.contact_data.geom_a[i - 1, i_b]
+                    or gb != collider_state.contact_data.geom_b[i - 1, i_b]
+                ):
+                    group_key = collider_state.contact_data.pos[i, i_b][0]
+                collider_state.contact_sort_key[i, i_b] = group_key
+                collider_state.contact_sort_idx[i, i_b] = i
+
+            for i in range(1, n_con):
+                curr_key = collider_state.contact_sort_key[i, i_b]
+                if collider_state.contact_sort_key[i - 1, i_b] <= curr_key:
+                    continue
+
+                curr_idx = collider_state.contact_sort_idx[i, i_b]
+                j = i - 1
+                while j >= 0:
+                    if collider_state.contact_sort_key[j, i_b] <= curr_key:
+                        break
+                    collider_state.contact_sort_key[j + 1, i_b] = collider_state.contact_sort_key[j, i_b]
+                    collider_state.contact_sort_idx[j + 1, i_b] = collider_state.contact_sort_idx[j, i_b]
+                    j = j - 1
+                collider_state.contact_sort_key[j + 1, i_b] = curr_key
+                collider_state.contact_sort_idx[j + 1, i_b] = curr_idx
+
+            # Phase 2: apply permutation in-place via cycle decomposition.
+            # Each contact is read and written exactly once.
+            for i in range(n_con):
+                if collider_state.contact_sort_idx[i, i_b] != i:
+                    tmp_geom_a = collider_state.contact_data.geom_a[i, i_b]
+                    tmp_geom_b = collider_state.contact_data.geom_b[i, i_b]
+                    tmp_penetration = collider_state.contact_data.penetration[i, i_b]
+                    tmp_normal = collider_state.contact_data.normal[i, i_b]
+                    tmp_pos = collider_state.contact_data.pos[i, i_b]
+                    tmp_friction = collider_state.contact_data.friction[i, i_b]
+                    tmp_sol_params = collider_state.contact_data.sol_params[i, i_b]
+                    tmp_force = collider_state.contact_data.force[i, i_b]
+                    tmp_link_a = collider_state.contact_data.link_a[i, i_b]
+                    tmp_link_b = collider_state.contact_data.link_b[i, i_b]
+                    tmp_pair_idx = collider_state.contact_data.pair_idx[i, i_b]
+
+                    j = i
+                    while collider_state.contact_sort_idx[j, i_b] != i:
+                        src = collider_state.contact_sort_idx[j, i_b]
+                        collider_state.contact_data.geom_a[j, i_b] = collider_state.contact_data.geom_a[src, i_b]
+                        collider_state.contact_data.geom_b[j, i_b] = collider_state.contact_data.geom_b[src, i_b]
+                        collider_state.contact_data.penetration[j, i_b] = collider_state.contact_data.penetration[
+                            src, i_b
+                        ]
+                        collider_state.contact_data.normal[j, i_b] = collider_state.contact_data.normal[src, i_b]
+                        collider_state.contact_data.pos[j, i_b] = collider_state.contact_data.pos[src, i_b]
+                        collider_state.contact_data.friction[j, i_b] = collider_state.contact_data.friction[src, i_b]
+                        collider_state.contact_data.sol_params[j, i_b] = collider_state.contact_data.sol_params[
+                            src, i_b
+                        ]
+                        collider_state.contact_data.force[j, i_b] = collider_state.contact_data.force[src, i_b]
+                        collider_state.contact_data.link_a[j, i_b] = collider_state.contact_data.link_a[src, i_b]
+                        collider_state.contact_data.link_b[j, i_b] = collider_state.contact_data.link_b[src, i_b]
+                        collider_state.contact_data.pair_idx[j, i_b] = collider_state.contact_data.pair_idx[src, i_b]
+                        collider_state.contact_sort_idx[j, i_b] = j
+                        j = src
+
+                    collider_state.contact_data.geom_a[j, i_b] = tmp_geom_a
+                    collider_state.contact_data.geom_b[j, i_b] = tmp_geom_b
+                    collider_state.contact_data.penetration[j, i_b] = tmp_penetration
+                    collider_state.contact_data.normal[j, i_b] = tmp_normal
+                    collider_state.contact_data.pos[j, i_b] = tmp_pos
+                    collider_state.contact_data.friction[j, i_b] = tmp_friction
+                    collider_state.contact_data.sol_params[j, i_b] = tmp_sol_params
+                    collider_state.contact_data.force[j, i_b] = tmp_force
+                    collider_state.contact_data.link_a[j, i_b] = tmp_link_a
+                    collider_state.contact_data.link_b[j, i_b] = tmp_link_b
+                    collider_state.contact_data.pair_idx[j, i_b] = tmp_pair_idx
                     collider_state.contact_sort_idx[j, i_b] = j
-                    j = src
-
-                collider_state.contact_data.geom_a[j, i_b] = tmp_geom_a
-                collider_state.contact_data.geom_b[j, i_b] = tmp_geom_b
-                collider_state.contact_data.penetration[j, i_b] = tmp_penetration
-                collider_state.contact_data.normal[j, i_b] = tmp_normal
-                collider_state.contact_data.pos[j, i_b] = tmp_pos
-                collider_state.contact_data.friction[j, i_b] = tmp_friction
-                collider_state.contact_data.sol_params[j, i_b] = tmp_sol_params
-                collider_state.contact_data.force[j, i_b] = tmp_force
-                collider_state.contact_data.link_a[j, i_b] = tmp_link_a
-                collider_state.contact_data.link_b[j, i_b] = tmp_link_b
-                collider_state.contact_data.pair_idx[j, i_b] = tmp_pair_idx
-                collider_state.contact_sort_idx[j, i_b] = j
-
-        # Default: keep everything. Buckets that pass the gates flip their entries to drop and then mark only
-        # hull-vertex contacts as keep again.
-        for i in range(n_con):
-            collider_state.contact_keep[i, i_b] = 1
-
-        # Phase 2: walk link-pair buckets.
-        b_start = 0
-        while b_start < n_con:
-            la0 = collider_state.contact_data.link_a[b_start, i_b]
-            lb0 = collider_state.contact_data.link_b[b_start, i_b]
-            la0_min = qd.min(la0, lb0)
-            la0_max = qd.max(la0, lb0)
-            b_end = b_start + 1
-            while b_end < n_con:
-                la = collider_state.contact_data.link_a[b_end, i_b]
-                lb = collider_state.contact_data.link_b[b_end, i_b]
-                if qd.min(la, lb) != la0_min or qd.max(la, lb) != la0_max:
-                    break
-                b_end += 1
-            b_size = b_end - b_start
-
-            if b_size >= 5:
-                # Mean normal (folded to the hemisphere of contact b_start) and centroid.
-                ref_n = collider_state.contact_data.normal[b_start, i_b]
-                rnx = ref_n[0]
-                rny = ref_n[1]
-                rnz = ref_n[2]
-                mnx = gs.qd_float(0.0)
-                mny = gs.qd_float(0.0)
-                mnz = gs.qd_float(0.0)
-                cx = gs.qd_float(0.0)
-                cy = gs.qd_float(0.0)
-                cz = gs.qd_float(0.0)
-                for i in range(b_start, b_end):
-                    n_i = collider_state.contact_data.normal[i, i_b]
-                    s = gs.qd_float(1.0)
-                    if rnx * n_i[0] + rny * n_i[1] + rnz * n_i[2] < gs.qd_float(0.0):
-                        s = gs.qd_float(-1.0)
-                    mnx += s * n_i[0]
-                    mny += s * n_i[1]
-                    mnz += s * n_i[2]
-                    p_i = collider_state.contact_data.pos[i, i_b]
-                    cx += p_i[0]
-                    cy += p_i[1]
-                    cz += p_i[2]
-                inv_n = gs.qd_float(1.0) / qd.cast(b_size, gs.qd_float)
-                cx *= inv_n
-                cy *= inv_n
-                cz *= inv_n
-                mnrm = qd.sqrt(mnx * mnx + mny * mny + mnz * mnz)
-
-                # Hoisted out so the hull-build branch below can read it (quadrants scopes per if).
-                max_in_plane_r2 = gs.qd_float(0.0)
-
-                coplanar = mnrm > EPS
-                if coplanar:
-                    mnx /= mnrm
-                    mny /= mnrm
-                    mnz /= mnrm
-
-                    # Depth coplanarity: positions must lie in a single plane perpendicular to the mean normal. No
-                    # per-contact normal check: a contact whose normal is diagonal (e.g. an edge-vs-edge contact at a
-                    # corner of the contact patch) still participates in the 2D hull because its position is a vertex of
-                    # the patch; dropping a collinear-edge contact in the same bucket is justified by the positional
-                    # support polygon regardless of that contact's normal direction.
-                    max_depth = gs.qd_float(0.0)
-                    for i in range(b_start, b_end):
-                        p_i = collider_state.contact_data.pos[i, i_b]
-                        dx = p_i[0] - cx
-                        dy = p_i[1] - cy
-                        dz = p_i[2] - cz
-                        depth = qd.abs(dx * mnx + dy * mny + dz * mnz)
-                        if depth > max_depth:
-                            max_depth = depth
-                        r2 = dx * dx + dy * dy + dz * dz - depth * depth
-                        if r2 > max_in_plane_r2:
-                            max_in_plane_r2 = r2
-
-                    if max_depth > tol * qd.sqrt(max_in_plane_r2):
-                        coplanar = False
-
-                if coplanar:
-                    # In-plane basis (u, v): seed from the world axis least-aligned with mean normal.
-                    abs_mnx = qd.abs(mnx)
-                    abs_mny = qd.abs(mny)
-                    abs_mnz = qd.abs(mnz)
-                    ax = gs.qd_float(1.0)
-                    ay = gs.qd_float(0.0)
-                    az = gs.qd_float(0.0)
-                    if abs_mny < abs_mnx and abs_mny < abs_mnz:
-                        ax = gs.qd_float(0.0)
-                        ay = gs.qd_float(1.0)
-                        az = gs.qd_float(0.0)
-                    elif abs_mnz < abs_mnx and abs_mnz <= abs_mny:
-                        ax = gs.qd_float(0.0)
-                        ay = gs.qd_float(0.0)
-                        az = gs.qd_float(1.0)
-                    adn = ax * mnx + ay * mny + az * mnz
-                    ux = ax - adn * mnx
-                    uy = ay - adn * mny
-                    uz = az - adn * mnz
-                    unrm = qd.sqrt(ux * ux + uy * uy + uz * uz)
-                    ux /= unrm
-                    uy /= unrm
-                    uz /= unrm
-                    vx = mny * uz - mnz * uy
-                    vy = mnz * ux - mnx * uz
-                    vz = mnx * uy - mny * ux
-
-                    # Project bucket contacts to (u, v). Reuse contact_sort_key for u (phase 1 has consumed it);
-                    # v goes into the dedicated scratch buffer.
-                    for i in range(b_start, b_end):
-                        p_i = collider_state.contact_data.pos[i, i_b]
-                        collider_state.contact_sort_key[i, i_b] = p_i[0] * ux + p_i[1] * uy + p_i[2] * uz
-                        collider_state.contact_proj_v[i, i_b] = p_i[0] * vx + p_i[1] * vy + p_i[2] * vz
-
-                    # Sort bucket indices lexicographically by (u, v), with a tolerance on u so that contacts whose u
-                    # values differ only by float noise (or by sub-millimeter physics noise from MPR perturbations) sort
-                    # by v. Without the tolerance, the wrong point pops from a 3-collinear triplet when the corner and
-                    # the mid-edge have u values that differ by a few microns and the mid-edge happens to sort first.
-                    sort_u_tol = gs.qd_float(1e-3) * qd.sqrt(max_in_plane_r2)
-                    for i in range(b_start, b_end):
-                        collider_state.contact_sort_idx[i, i_b] = i
-                    for i in range(b_start + 1, b_end):
-                        ci = collider_state.contact_sort_idx[i, i_b]
-                        cu = collider_state.contact_sort_key[ci, i_b]
-                        cv = collider_state.contact_proj_v[ci, i_b]
-                        j = i - 1
-                        while j >= b_start:
-                            pj = collider_state.contact_sort_idx[j, i_b]
-                            pu = collider_state.contact_sort_key[pj, i_b]
-                            pv = collider_state.contact_proj_v[pj, i_b]
-                            if (pu < cu - sort_u_tol) or (qd.abs(pu - cu) <= sort_u_tol and pv <= cv):
-                                break
-                            collider_state.contact_sort_idx[j + 1, i_b] = pj
-                            j -= 1
-                        collider_state.contact_sort_idx[j + 1, i_b] = ci
-
-                    # Mark all bucket entries as drop, then mark hull vertices as keep.
-                    for i in range(b_start, b_end):
-                        collider_state.contact_keep[i, i_b] = 0
-
-                    # Collinearity threshold for hull pops, scaled to the bucket extent. A pure "cross <= 0" check fails
-                    # on numerically-near-collinear edge points (cross is a tiny positive epsilon from float roundoff),
-                    # so genuine midpoints would survive as spurious hull vertices.
-                    hull_collinear_tol = tol * max_in_plane_r2
-
-                    # Andrew's monotone chain. Stack lives in contact_hull_stack[b_start..b_start + k).
-                    k = 0
-                    for i in range(b_start, b_end):
-                        ci = collider_state.contact_sort_idx[i, i_b]
-                        cu = collider_state.contact_sort_key[ci, i_b]
-                        cv = collider_state.contact_proj_v[ci, i_b]
-                        while k >= 2:
-                            idx_a = collider_state.contact_hull_stack[b_start + k - 2, i_b]
-                            idx_b = collider_state.contact_hull_stack[b_start + k - 1, i_b]
-                            au = collider_state.contact_sort_key[idx_a, i_b]
-                            av = collider_state.contact_proj_v[idx_a, i_b]
-                            bu = collider_state.contact_sort_key[idx_b, i_b]
-                            bv = collider_state.contact_proj_v[idx_b, i_b]
-                            cross = (bu - au) * (cv - av) - (bv - av) * (cu - au)
-                            if cross <= hull_collinear_tol:
-                                k -= 1
-                            else:
-                                break
-                        collider_state.contact_hull_stack[b_start + k, i_b] = ci
-                        k += 1
-
-                    upper_start = k
-                    # Memory-fence for a Quadrants codegen issue on parallel envs (Metal backend, _B >= 2): without an
-                    # explicit barrier between the lower-hull and upper-hull passes, the upper-hull pop-loop's reads
-                    # of contact_hull_stack don't observe the writes from the lower hull, so its cross-product /
-                    # pop-check effectively runs on stale data and every candidate is kept, producing a hull whose
-                    # size equals the bucket size.
-                    if qd.static(static_rigid_sim_config.backend == gs.metal):
-                        qd.simt.block.sync()
-                    # quadrants range supports 1 or 2 args only; iterate sorted indices backward over
-                    # [b_start, b_end - 2] by reflecting the index.
-                    for k_step in range(b_size - 1):
-                        ii = b_end - 2 - k_step
-                        ci = collider_state.contact_sort_idx[ii, i_b]
-                        cu = collider_state.contact_sort_key[ci, i_b]
-                        cv = collider_state.contact_proj_v[ci, i_b]
-                        while k >= upper_start + 1:
-                            idx_a = collider_state.contact_hull_stack[b_start + k - 2, i_b]
-                            idx_b = collider_state.contact_hull_stack[b_start + k - 1, i_b]
-                            au = collider_state.contact_sort_key[idx_a, i_b]
-                            av = collider_state.contact_proj_v[idx_a, i_b]
-                            bu = collider_state.contact_sort_key[idx_b, i_b]
-                            bv = collider_state.contact_proj_v[idx_b, i_b]
-                            cross = (bu - au) * (cv - av) - (bv - av) * (cu - au)
-                            if cross <= hull_collinear_tol:
-                                k -= 1
-                            else:
-                                break
-                        # The closing iteration of the upper hull visits the leftmost point, which already sits at
-                        # stack[b_start] from the lower hull. Skipping that push, plus the k < b_size guard, bounds k to
-                        # b_size and keeps the write index within max_contact_pairs even for buckets where the lower-
-                        # hull pass already kept all b_size points (downward-convex layouts: every lex-sorted triple
-                        # makes a left turn so nothing gets popped, then the upper-hull pass tries to push a duplicate
-                        # of an already-kept lower-hull vertex).
-                        if ci != collider_state.contact_hull_stack[b_start, i_b] and k < b_size:
-                            collider_state.contact_hull_stack[b_start + k, i_b] = ci
-                            k += 1
-
-                    # Chain holds k unique hull vertices (no trailing duplicate).
-                    for hk in range(k):
-                        survivor = collider_state.contact_hull_stack[b_start + hk, i_b]
-                        collider_state.contact_keep[survivor, i_b] = 1
-
-                    # Restore non-hull contacts whose penetration is much deeper than the hull boundary's average. The
-                    # support-polygon argument says interior contacts are wrench-redundant only when ALL contacts share
-                    # the same normal and penetration; a contact with substantially higher penetration than the hull's
-                    # average represents a distinct physical support (the body of a fork resting beyond its tines, the
-                    # deep middle of a long body) and dropping it lets the body sink into the surface. The 3x factor is
-                    # well above the typical ~1.x penetration spread on transient/rocking faces (so non-uniform-
-                    # penetration buckets like irregular mesh contacts keep only the hull) but well below the deep
-                    # interior penetrations seen when a non-flat body rests inside its convex envelope (so genuine deep
-                    # supports are restored).
-                    hull_pen_max = gs.qd_float(0.0)
-                    for hk in range(k):
-                        survivor = collider_state.contact_hull_stack[b_start + hk, i_b]
-                        p = collider_state.contact_data.penetration[survivor, i_b]
-                        if p > hull_pen_max:
-                            hull_pen_max = p
-                    deep_keep_threshold = prune_deep_penetration_ratio * hull_pen_max
-                    for i in range(b_start, b_end):
-                        if collider_state.contact_keep[i, i_b] == 0:
-                            if collider_state.contact_data.penetration[i, i_b] > deep_keep_threshold:
-                                collider_state.contact_keep[i, i_b] = 1
-
-            b_start = b_end
-
-        # Phase 3: compact kept contacts to the front.
-        write = 0
-        for read in range(n_con):
-            if collider_state.contact_keep[read, i_b] != 0:
-                if write != read:
-                    collider_state.contact_data.geom_a[write, i_b] = collider_state.contact_data.geom_a[read, i_b]
-                    collider_state.contact_data.geom_b[write, i_b] = collider_state.contact_data.geom_b[read, i_b]
-                    collider_state.contact_data.penetration[write, i_b] = collider_state.contact_data.penetration[
-                        read, i_b
-                    ]
-                    collider_state.contact_data.normal[write, i_b] = collider_state.contact_data.normal[read, i_b]
-                    collider_state.contact_data.pos[write, i_b] = collider_state.contact_data.pos[read, i_b]
-                    collider_state.contact_data.friction[write, i_b] = collider_state.contact_data.friction[read, i_b]
-                    collider_state.contact_data.sol_params[write, i_b] = collider_state.contact_data.sol_params[
-                        read, i_b
-                    ]
-                    collider_state.contact_data.force[write, i_b] = collider_state.contact_data.force[read, i_b]
-                    collider_state.contact_data.link_a[write, i_b] = collider_state.contact_data.link_a[read, i_b]
-                    collider_state.contact_data.link_b[write, i_b] = collider_state.contact_data.link_b[read, i_b]
-                    collider_state.contact_data.pair_idx[write, i_b] = collider_state.contact_data.pair_idx[read, i_b]
-                write += 1
-        collider_state.n_contacts[i_b] = write
 
 
 @qd.kernel

From ec50db09c830d9f93e74d52c3977f3170736e817 Mon Sep 17 00:00:00 2001
From: Hugh Perkins <hughperkins@gmail.com>
Date: Mon, 25 May 2026 11:24:55 -0700
Subject: [PATCH 5/7] Remove stale prune_hull_collinear_tol after fused-kernel
 merge (Alexis collapsed it into tol*max_in_plane_r2)

---
 genesis/engine/solvers/rigid/collider/collider.py | 2 --
 genesis/utils/array_class.py                      | 2 --
 2 files changed, 4 deletions(-)

diff --git a/genesis/engine/solvers/rigid/collider/collider.py b/genesis/engine/solvers/rigid/collider/collider.py
index aa8c95294..aaba5d891 100644
--- a/genesis/engine/solvers/rigid/collider/collider.py
+++ b/genesis/engine/solvers/rigid/collider/collider.py
@@ -89,7 +89,6 @@ def __init__(self, rigid_solver: "RigidSolver"):
         self._diff_pos_tolerance = 1e-2
         self._diff_normal_tolerance = 1e-2
         self._prune_deep_penetration_ratio = 3.0
-        self._prune_hull_collinear_tol = 1e-3
 
         self._init_static_config()
         self._use_split_narrowphase = (
@@ -200,7 +199,6 @@ def _init_collision_fields(self) -> None:
             diff_normal_tolerance=self._diff_normal_tolerance,
             contact_pruning_tolerance=self._solver._options.contact_pruning_tolerance or 0.0,
             prune_deep_penetration_ratio=self._prune_deep_penetration_ratio,
-            prune_hull_collinear_tol=self._prune_hull_collinear_tol,
         )
         self._init_collision_pair_idx(self._collision_pair_idx)
         self._init_valid_pairs()
diff --git a/genesis/utils/array_class.py b/genesis/utils/array_class.py
index 466efaa3e..8daa2bf3f 100644
--- a/genesis/utils/array_class.py
+++ b/genesis/utils/array_class.py
@@ -794,7 +794,6 @@ class ColliderInfo:
     # link-pair contact pruning
     contact_pruning_tolerance: qd.Tensor
     prune_deep_penetration_ratio: qd.Tensor
-    prune_hull_collinear_tol: qd.Tensor
 
 
 def get_collider_info(solver, n_vert_neighbors, n_valid_pairs, collider_static_config, **kwargs):
@@ -827,7 +826,6 @@ def get_collider_info(solver, n_vert_neighbors, n_valid_pairs, collider_static_c
         diff_normal_tolerance=V_SCALAR_FROM(dtype=gs.qd_float, value=kwargs["diff_normal_tolerance"]),
         contact_pruning_tolerance=V_SCALAR_FROM(dtype=gs.qd_float, value=kwargs["contact_pruning_tolerance"]),
         prune_deep_penetration_ratio=V_SCALAR_FROM(dtype=gs.qd_float, value=kwargs["prune_deep_penetration_ratio"]),
-        prune_hull_collinear_tol=V_SCALAR_FROM(dtype=gs.qd_float, value=kwargs["prune_hull_collinear_tol"]),
     )
 
 

From 6f16ba75ec9a446f1b05d430165a4c463d88f996 Mon Sep 17 00:00:00 2001
From: Hugh Perkins <hughperkins@gmail.com>
Date: Mon, 25 May 2026 12:00:43 -0700
Subject: [PATCH 6/7] Restore cooperative warp-per-env dedup kernel for GPU;
 serial fused kernel for CPU

Re-add `func_prune_contacts_coop`, the 32-lane warp-cooperative variant of
the fused clamp+prune+sort path. Same pruning logic as Alexis's serial
`func_clamp_prune_and_sort_contacts` (depth coplanarity gate, Andrew's
monotone chain, hull-pen-max deep-penetration restore, Metal memory-fence
workaround), but the per-env work is spread across 32 warp lanes:

  - PARALLEL: contact_keep init, phase-1a (link-pair) key+idx init,
    phase-2 mean-normal / centroid reduction via reduce_all_add_tiled,
    coplanarity check reduction via reduce_all_max_tiled, in-plane
    projection writes.
  - SERIAL on lane 0: insertion sorts (phase 1a + lex), bucket walk,
    monotone chain, hull mark + deep-penetration restore, and the final
    fused compact + spatial sort with sentinel +inf for dropped contacts.

Dispatched only when gs.backend != gs.cpu AND scene is dedup-eligible AND
not requires_grad AND contact_pruning_tolerance > 0. Everything else
(CPU, autodiff, ineligible scenes) falls back to the serial fused kernel,
which has internal qd.static gates that no-op the prune phases when they
don't apply. So dedup still runs on every backend as before; the change
just upgrades the GPU path to coop.
---
 .../engine/solvers/rigid/collider/collider.py |  33 +-
 .../engine/solvers/rigid/collider/contact.py  | 447 ++++++++++++++++++
 2 files changed, 474 insertions(+), 6 deletions(-)

diff --git a/genesis/engine/solvers/rigid/collider/collider.py b/genesis/engine/solvers/rigid/collider/collider.py
index aaba5d891..86ed55d95 100644
--- a/genesis/engine/solvers/rigid/collider/collider.py
+++ b/genesis/engine/solvers/rigid/collider/collider.py
@@ -48,6 +48,7 @@
     func_rotate_frame,
     func_set_upstream_grad,
     func_clamp_prune_and_sort_contacts,
+    func_prune_contacts_coop,
 )
 from . import narrowphase
 from .narrowphase import (
@@ -818,13 +819,33 @@ def detection(self) -> None:
                 self._solver._errno,
             )
 
-        func_clamp_prune_and_sort_contacts(
-            self._collider_state,
-            self._collider_info,
-            self._solver._rigid_global_info,
-            self._solver._static_rigid_sim_config,
-            self._collider_static_config,
+        # On GPU backends, when the scene is dedup-eligible and we're not in autodiff mode, dispatch the cooperative
+        # warp-per-env kernel (32 lanes/block; parallel reductions + lex-stride writes; serial sorts + hull build on
+        # lane 0; fused compact+spatial-sort in the final phase). This beats the serial fused kernel on dex_hand /
+        # g1_fall by ~2-3% by spreading the per-env work across the warp instead of running one env per block thread.
+        # Everything else (CPU, autodiff, scenes where link_pair_pruning_supported=False) falls through to the serial
+        # fused kernel, which has internal qd.static gates that drop the prune phases when they're not eligible.
+        ran_fused_dedup_coop = (
+            gs.backend != gs.cpu
+            and self._collider_static_config.link_pair_pruning_supported
+            and not self._solver._static_rigid_sim_config.requires_grad
+            and (self._solver._options.contact_pruning_tolerance or 0.0) > 0.0
         )
+        if ran_fused_dedup_coop:
+            func_prune_contacts_coop(
+                self._collider_state,
+                self._collider_info,
+                self._solver._rigid_global_info,
+                self._solver._static_rigid_sim_config,
+            )
+        else:
+            func_clamp_prune_and_sort_contacts(
+                self._collider_state,
+                self._collider_info,
+                self._solver._rigid_global_info,
+                self._solver._static_rigid_sim_config,
+                self._collider_static_config,
+            )
 
     def get_contacts(self, as_tensor: bool = True, to_torch: bool = True, keep_batch_dim: bool = False):
         # Early return if already pre-computed
diff --git a/genesis/engine/solvers/rigid/collider/contact.py b/genesis/engine/solvers/rigid/collider/contact.py
index 0f682df8f..169002bc3 100644
--- a/genesis/engine/solvers/rigid/collider/contact.py
+++ b/genesis/engine/solvers/rigid/collider/contact.py
@@ -1025,6 +1025,453 @@ def func_clamp_prune_and_sort_contacts(
                     collider_state.contact_sort_idx[j, i_b] = j
 
 
+@qd.kernel(fastcache=True)
+def func_prune_contacts_coop(
+    collider_state: array_class.ColliderState,
+    collider_info: array_class.ColliderInfo,
+    rigid_global_info: array_class.RigidGlobalInfo,
+    static_rigid_sim_config: qd.template(),
+):
+    """GPU-only cooperative warp-per-env variant of the fused clamp+prune+sort kernel.
+
+    Pruning logic matches Alexis's serial `func_clamp_prune_and_sort_contacts` (same depth coplanarity gate, same
+    Andrew's monotone chain, same hull-pen-max deep-penetration restore, same Metal memory-fence workaround). The
+    difference is the per-env work is spread across 32 warp lanes:
+      - PARALLEL: contact_keep init, phase-1a (link-pair) key+idx init, phase-2 mean-normal / centroid reduction
+        (via qd.simt.subgroup.reduce_all_add_tiled), coplanarity check reduction (via reduce_all_max_tiled), in-plane
+        projection writes.
+      - SERIAL on lane 0: insertion sorts (phase 1a + lex), bucket walk control, Andrew's monotone chain, hull-mark,
+        deep-penetration restore, and the final fused compact + spatial sort.
+
+    The fused final phase (lane 0) replaces both dedup phase-3 compact and the standalone func_clamp_and_sort_contacts
+    with a single permutation pass: dropped contacts get a sentinel +inf sort key (pushed past the end), kept contacts
+    get the geom-pair group's x-pos (preserving narrowphase intra-group order via stable insertion sort), then one
+    cycle-decomposition permute moves each kept record exactly once. Saves one full kernel launch + one full 11-field
+    permute pass vs the serial fused kernel on dex_hand / g1_fall workloads.
+
+    This kernel is dispatched only when gs.backend is GPU AND link_pair_pruning_supported AND not requires_grad AND
+    contact_pruning_tolerance > 0. Everything else falls through to the serial `func_clamp_prune_and_sort_contacts`.
+    """
+    _B = collider_state.n_contacts.shape[0]
+    max_contact_pairs = collider_info.max_contact_pairs[None]
+    tol = collider_info.contact_pruning_tolerance[None]
+    prune_deep_penetration_ratio = collider_info.prune_deep_penetration_ratio[None]
+    LP_KEY_STRIDE = gs.qd_float(1.0e7)
+    EPS = rigid_global_info.EPS[None]
+
+    # deskai6 opt 9d (real warp-coop, stage 1): pull contact_keep init and phase-1a key+idx init OUT of `if tid == 0:`
+    # and parallelize them across the 32 lanes (stride-tid). Phase 1a sort, phase 2 bucket walk, and phase 3 (sort +
+    # cycle-permute) stay serial on lane 0. Subsequent stages add coop reductions (phase-2 mean / centroid) and parallel
+    # cycle-permute.
+    _K = qd.static(32)
+    qd.loop_config(name="prune_contacts_coop", block_dim=_K)
+    for i_flat in range(_B * _K):
+        tid = i_flat % _K
+        i_b = i_flat // _K
+        # All lanes compute n_con (cheap, no memory write on non-lane-0).
+        n_con = qd.min(collider_state.n_contacts[i_b], max_contact_pairs)
+        if tid == 0:
+            collider_state.n_contacts[i_b] = n_con
+
+        # PARALLEL: contact_keep init. Default-keep marked here unconditionally so the fused phase 3 (compact + spatial
+        # sort) below produces a correct result for envs with n_con < 5 (no dedup buckets, but still need spatial sort).
+        # 32 lanes stride.
+        ii = tid
+        while ii < n_con:
+            collider_state.contact_keep[ii, i_b] = 1
+            ii += _K
+
+        if n_con >= 5:
+            # PARALLEL: phase 1a key + idx init, 32 lanes stride.
+            ii = tid
+            while ii < n_con:
+                la = collider_state.contact_data.link_a[ii, i_b]
+                lb = collider_state.contact_data.link_b[ii, i_b]
+                la_min = qd.min(la, lb)
+                la_max = qd.max(la, lb)
+                collider_state.contact_sort_key[ii, i_b] = qd.cast(la_min, gs.qd_float) * LP_KEY_STRIDE + qd.cast(
+                    la_max, gs.qd_float
+                )
+                collider_state.contact_sort_idx[ii, i_b] = ii
+                ii += _K
+
+            if tid == 0:
+                # SERIAL on lane 0: phase 1a insertion sort + phase 2 bucket walk.
+                for i in range(1, n_con):
+                    ck = collider_state.contact_sort_key[i, i_b]
+                    if collider_state.contact_sort_key[i - 1, i_b] <= ck:
+                        continue
+                    ci = collider_state.contact_sort_idx[i, i_b]
+                    j = i - 1
+                    while j >= 0:
+                        if collider_state.contact_sort_key[j, i_b] <= ck:
+                            break
+                        collider_state.contact_sort_key[j + 1, i_b] = collider_state.contact_sort_key[j, i_b]
+                        collider_state.contact_sort_idx[j + 1, i_b] = collider_state.contact_sort_idx[j, i_b]
+                        j = j - 1
+                    collider_state.contact_sort_key[j + 1, i_b] = ck
+                    collider_state.contact_sort_idx[j + 1, i_b] = ci
+
+            qd.simt.subgroup.sync()
+
+            # Phase 2 (deskai6 opt 9d stage 2): bucket walk runs on ALL 32 lanes. The outer control flow (find b_end,
+            # iterate buckets) is duplicated across lanes since the inputs are all in DRAM (cache-friendly). Inside a
+            # bucket, the mean-normal / centroid sum is done coop via 6 reduce_all_add_tiled calls. The rest of the
+            # bucket processing (coplanarity check with early-exit, in-plane basis, projection, lex sort, hull build,
+            # mark survivors) stays serial on lane 0.
+            b_start = 0
+            while b_start < n_con:
+                key0 = collider_state.contact_sort_key[b_start, i_b]
+                b_end = b_start + 1
+                while b_end < n_con:
+                    if collider_state.contact_sort_key[b_end, i_b] != key0:
+                        break
+                    b_end += 1
+                b_size = b_end - b_start
+
+                if b_size >= 5:
+                    ref_src = collider_state.contact_sort_idx[b_start, i_b]
+                    ref_n = collider_state.contact_data.normal[ref_src, i_b]
+                    rnx = ref_n[0]
+                    rny = ref_n[1]
+                    rnz = ref_n[2]
+                    mnx_l = gs.qd_float(0.0)
+                    mny_l = gs.qd_float(0.0)
+                    mnz_l = gs.qd_float(0.0)
+                    cx_l = gs.qd_float(0.0)
+                    cy_l = gs.qd_float(0.0)
+                    cz_l = gs.qd_float(0.0)
+                    jj = b_start + tid
+                    while jj < b_end:
+                        src_i = collider_state.contact_sort_idx[jj, i_b]
+                        n_i = collider_state.contact_data.normal[src_i, i_b]
+                        s = gs.qd_float(1.0)
+                        if rnx * n_i[0] + rny * n_i[1] + rnz * n_i[2] < gs.qd_float(0.0):
+                            s = gs.qd_float(-1.0)
+                        mnx_l += s * n_i[0]
+                        mny_l += s * n_i[1]
+                        mnz_l += s * n_i[2]
+                        p_i = collider_state.contact_data.pos[src_i, i_b]
+                        cx_l += p_i[0]
+                        cy_l += p_i[1]
+                        cz_l += p_i[2]
+                        jj += _K
+
+                    mnx = qd.simt.subgroup.reduce_all_add_tiled(mnx_l, 5)
+                    mny = qd.simt.subgroup.reduce_all_add_tiled(mny_l, 5)
+                    mnz = qd.simt.subgroup.reduce_all_add_tiled(mnz_l, 5)
+                    cx = qd.simt.subgroup.reduce_all_add_tiled(cx_l, 5)
+                    cy = qd.simt.subgroup.reduce_all_add_tiled(cy_l, 5)
+                    cz = qd.simt.subgroup.reduce_all_add_tiled(cz_l, 5)
+
+                    # POST-REDUCE math runs on all 32 lanes (deterministic, cheap; redundant arithmetic is free vs.
+                    # broadcasting the reduce results).
+                    inv_n = gs.qd_float(1.0) / qd.cast(b_size, gs.qd_float)
+                    cx *= inv_n
+                    cy *= inv_n
+                    cz *= inv_n
+                    mnrm = qd.sqrt(mnx * mnx + mny * mny + mnz * mnz)
+
+                    max_in_plane_r2 = gs.qd_float(0.0)
+                    coplanar = mnrm > EPS
+                    if coplanar:
+                        mnx /= mnrm
+                        mny /= mnrm
+                        mnz /= mnrm
+
+                        # COOP coplanarity check (stage 3). Each lane strides [b_start + tid, b_end) by _K, locally
+                        # tracking max_depth / max_in_plane_r2. Wasted work per warp is at most b_size/_K contacts.
+                        # The upstream algo no longer checks per-contact normals (a contact with a diagonal normal at
+                        # the corner of a patch still participates in the 2D hull because its position is a vertex), so
+                        # we only do the depth coplanarity gate here.
+                        max_depth_l = gs.qd_float(0.0)
+                        max_r2_l = gs.qd_float(0.0)
+                        jj = b_start + tid
+                        while jj < b_end:
+                            src_i = collider_state.contact_sort_idx[jj, i_b]
+                            p_i = collider_state.contact_data.pos[src_i, i_b]
+                            dx = p_i[0] - cx
+                            dy = p_i[1] - cy
+                            dz = p_i[2] - cz
+                            depth = qd.abs(dx * mnx + dy * mny + dz * mnz)
+                            if depth > max_depth_l:
+                                max_depth_l = depth
+                            r2 = dx * dx + dy * dy + dz * dz - depth * depth
+                            if r2 > max_r2_l:
+                                max_r2_l = r2
+                            jj += _K
+
+                        max_depth = qd.simt.subgroup.reduce_all_max_tiled(max_depth_l, 5)
+                        max_in_plane_r2 = qd.simt.subgroup.reduce_all_max_tiled(max_r2_l, 5)
+
+                        if max_depth > tol * qd.sqrt(max_in_plane_r2):
+                            coplanar = False
+
+                    if coplanar:
+                        # Basis on all lanes (deterministic from mnx/mny/mnz which the reduce broadcast to every lane).
+                        abs_mnx = qd.abs(mnx)
+                        abs_mny = qd.abs(mny)
+                        abs_mnz = qd.abs(mnz)
+                        ax = gs.qd_float(1.0)
+                        ay = gs.qd_float(0.0)
+                        az = gs.qd_float(0.0)
+                        if abs_mny < abs_mnx and abs_mny < abs_mnz:
+                            ax = gs.qd_float(0.0)
+                            ay = gs.qd_float(1.0)
+                            az = gs.qd_float(0.0)
+                        elif abs_mnz < abs_mnx and abs_mnz <= abs_mny:
+                            ax = gs.qd_float(0.0)
+                            ay = gs.qd_float(0.0)
+                            az = gs.qd_float(1.0)
+                        adn = ax * mnx + ay * mny + az * mnz
+                        ux = ax - adn * mnx
+                        uy = ay - adn * mny
+                        uz = az - adn * mnz
+                        unrm = qd.sqrt(ux * ux + uy * uy + uz * uz)
+                        ux /= unrm
+                        uy /= unrm
+                        uz /= unrm
+                        vx = mny * uz - mnz * uy
+                        vy = mnz * ux - mnx * uz
+                        vz = mnx * uy - mny * ux
+
+                        # COOP projection: 32 lanes stride writes to contact_sort_key + contact_proj_v.
+                        jj = b_start + tid
+                        while jj < b_end:
+                            src_i = collider_state.contact_sort_idx[jj, i_b]
+                            p_i = collider_state.contact_data.pos[src_i, i_b]
+                            collider_state.contact_sort_key[jj, i_b] = p_i[0] * ux + p_i[1] * uy + p_i[2] * uz
+                            collider_state.contact_proj_v[jj, i_b] = p_i[0] * vx + p_i[1] * vy + p_i[2] * vz
+                            jj += _K
+
+                        # COOP mark-drop: stride writes to contact_keep[orig].
+                        jj = b_start + tid
+                        while jj < b_end:
+                            orig = collider_state.contact_sort_idx[jj, i_b]
+                            collider_state.contact_keep[orig, i_b] = 0
+                            jj += _K
+
+                        # COOP lex_idx init: stride writes.
+                        jj = b_start + tid
+                        while jj < b_end:
+                            collider_state.contact_lex_idx[jj, i_b] = jj
+                            jj += _K
+
+                        # SYNC between coop writes (sort_key, proj_v, lex_idx, contact_keep[orig]) and the lane-0 lex
+                        # sort + hull build that reads them.
+                        qd.simt.subgroup.sync()
+
+                    if tid == 0 and coplanar:
+                        # SERIAL on lane 0: lex sort + Andrew monotone-chain hull build + mark survivors. These walk a
+                        # stack and have data-dependent inner loops that don't decompose across warp lanes.
+                        #
+                        # The sort_u_tol on the u comparison is critical for correctness, not just a perf tweak:
+                        # contacts whose u values differ only by sub-millimeter MPR noise need to sort by v, otherwise
+                        # mid-edge points get sorted between the two corners they sit between and survive the lower-
+                        # hull pass as spurious hull vertices (cf. upstream test_contact_pruning regression when this
+                        # tolerance is missing).
+                        sort_u_tol = gs.qd_float(1e-3) * qd.sqrt(max_in_plane_r2)
+                        for i in range(b_start + 1, b_end):
+                            ci = collider_state.contact_lex_idx[i, i_b]
+                            cu = collider_state.contact_sort_key[ci, i_b]
+                            cv = collider_state.contact_proj_v[ci, i_b]
+                            j = i - 1
+                            while j >= b_start:
+                                pj = collider_state.contact_lex_idx[j, i_b]
+                                pu = collider_state.contact_sort_key[pj, i_b]
+                                pv = collider_state.contact_proj_v[pj, i_b]
+                                if (pu < cu - sort_u_tol) or (qd.abs(pu - cu) <= sort_u_tol and pv <= cv):
+                                    break
+                                collider_state.contact_lex_idx[j + 1, i_b] = pj
+                                j -= 1
+                            collider_state.contact_lex_idx[j + 1, i_b] = ci
+
+                        # Collinearity threshold for hull pops, scaled to the bucket extent. A pure "cross <= 0" check
+                        # fails on numerically-near-collinear edge points (cross is a tiny positive epsilon from float
+                        # roundoff), so genuine midpoints would survive as spurious hull vertices.
+                        hull_collinear_tol = tol * max_in_plane_r2
+
+                        k = 0
+                        for i in range(b_start, b_end):
+                            ci = collider_state.contact_lex_idx[i, i_b]
+                            cu = collider_state.contact_sort_key[ci, i_b]
+                            cv = collider_state.contact_proj_v[ci, i_b]
+                            while k >= 2:
+                                idx_a = collider_state.contact_hull_stack[b_start + k - 2, i_b]
+                                idx_b = collider_state.contact_hull_stack[b_start + k - 1, i_b]
+                                au = collider_state.contact_sort_key[idx_a, i_b]
+                                av = collider_state.contact_proj_v[idx_a, i_b]
+                                bu = collider_state.contact_sort_key[idx_b, i_b]
+                                bv = collider_state.contact_proj_v[idx_b, i_b]
+                                cross = (bu - au) * (cv - av) - (bv - av) * (cu - au)
+                                if cross <= hull_collinear_tol:
+                                    k -= 1
+                                else:
+                                    break
+                            collider_state.contact_hull_stack[b_start + k, i_b] = ci
+                            k += 1
+
+                        upper_start = k
+                        # Memory-fence workaround for a Quadrants codegen issue on Metal _B >= 2 (genesis PR #2831):
+                        # without an explicit scratch store between the lower- and upper-hull lane-0 ``for`` passes,
+                        # the upper-hull pop-loop's reads of contact_hull_stack don't observe the lower-hull writes,
+                        # so the cross-product check effectively runs on stale data and every candidate is kept (hull
+                        # size == bucket size, no pruning). Writing any value to a non-overlapping slot here forces
+                        # write-then-read ordering on the shared buffer. The value is unused. See
+                        # perso_hugh/prot/qd_metal_hull_chain_visibility_repro.py for a standalone reproduction.
+                        collider_state.contact_hull_stack[max_contact_pairs - 1, i_b] = 0
+                        for k_step in range(b_size - 1):
+                            ii_lex = b_end - 2 - k_step
+                            ci = collider_state.contact_lex_idx[ii_lex, i_b]
+                            cu = collider_state.contact_sort_key[ci, i_b]
+                            cv = collider_state.contact_proj_v[ci, i_b]
+                            while k >= upper_start + 1:
+                                idx_a = collider_state.contact_hull_stack[b_start + k - 2, i_b]
+                                idx_b = collider_state.contact_hull_stack[b_start + k - 1, i_b]
+                                au = collider_state.contact_sort_key[idx_a, i_b]
+                                av = collider_state.contact_proj_v[idx_a, i_b]
+                                bu = collider_state.contact_sort_key[idx_b, i_b]
+                                bv = collider_state.contact_proj_v[idx_b, i_b]
+                                cross = (bu - au) * (cv - av) - (bv - av) * (cu - au)
+                                if cross <= hull_collinear_tol:
+                                    k -= 1
+                                else:
+                                    break
+                            if ci != collider_state.contact_hull_stack[b_start, i_b] and k < b_size:
+                                collider_state.contact_hull_stack[b_start + k, i_b] = ci
+                                k += 1
+
+                        for hk in range(k):
+                            survivor_sort = collider_state.contact_hull_stack[b_start + hk, i_b]
+                            survivor_orig = collider_state.contact_sort_idx[survivor_sort, i_b]
+                            collider_state.contact_keep[survivor_orig, i_b] = 1
+
+                        # Restore non-hull contacts whose penetration is much deeper than the hull boundary's max
+                        # (PR #2831 switched from avg to max). Rationale: a contact whose penetration substantially
+                        # exceeds the hull's deepest vertex represents a distinct physical support (deep body of a
+                        # fork beyond its tines, deep middle of a long body) that the support-polygon argument
+                        # doesn't actually authorize dropping (the argument only holds when all contacts share
+                        # normal AND penetration). The 3x factor over the hull max is well above the typical ~1.x
+                        # penetration spread on transient/rocking faces but well below the deep interior penetrations
+                        # seen when a non-flat body rests inside its convex envelope. Indices here live in orig-space
+                        # (because the cycle-permute is fused into the phase 3 below in opt 10 -- contact_data is
+                        # still in pre-sort order, so we translate sort-space hull/bucket indices through
+                        # contact_sort_idx).
+                        hull_pen_max = gs.qd_float(0.0)
+                        for hk in range(k):
+                            survivor_sort = collider_state.contact_hull_stack[b_start + hk, i_b]
+                            survivor_orig = collider_state.contact_sort_idx[survivor_sort, i_b]
+                            p = collider_state.contact_data.penetration[survivor_orig, i_b]
+                            if p > hull_pen_max:
+                                hull_pen_max = p
+                        deep_keep_threshold = prune_deep_penetration_ratio * hull_pen_max
+                        for jj_idx in range(b_start, b_end):
+                            orig = collider_state.contact_sort_idx[jj_idx, i_b]
+                            if collider_state.contact_keep[orig, i_b] == 0:
+                                if collider_state.contact_data.penetration[orig, i_b] > deep_keep_threshold:
+                                    collider_state.contact_keep[orig, i_b] = 1
+
+                b_start = b_end
+
+        if tid == 0:
+            # Phase 3 (deskai6 opt 10): FUSED compact + spatial sort. Replaces:
+            #   - dedup phase 3 (compact based on contact_keep)
+            #   - func_clamp_and_sort_contacts (assign group-x sort key + insertion sort + cycle-permute)
+            # with a single permutation pass: compute a sort key per slot that pushes dropped contacts past the end
+            # (sentinel +inf) and orders kept contacts by their geom-pair group's x-pos (anchored to the first kept
+            # contact in the group, preserving narrowphase intra-group order via stable sort). Then sort sort-keys +
+            # sort-indices, then cycle-permute the 9 contact_data fields exactly once.
+            #
+            # On dex_hand this saves ~25 displaced contacts worth of field copies that used to be done in dedup phase 3
+            # before clamp_and_sort re-permuted everything. Same 9 fields per contact (force & pair_idx still skipped
+            # per opt 8a; clamp_and_sort still runs in the grad path so its own opt 8b gate stays in place).
+            SENTINEL_BIG = gs.qd_float(1e30)
+            group_key = gs.qd_float(0.0)
+            prev_ga = -1
+            prev_gb = -1
+            for i in range(n_con):
+                if collider_state.contact_keep[i, i_b] != 0:
+                    ga = collider_state.contact_data.geom_a[i, i_b]
+                    gb = collider_state.contact_data.geom_b[i, i_b]
+                    if ga != prev_ga or gb != prev_gb:
+                        group_key = collider_state.contact_data.pos[i, i_b][0]
+                        prev_ga = ga
+                        prev_gb = gb
+                    collider_state.contact_sort_key[i, i_b] = group_key
+                else:
+                    collider_state.contact_sort_key[i, i_b] = SENTINEL_BIG
+                collider_state.contact_sort_idx[i, i_b] = i
+
+            for i in range(1, n_con):
+                ck = collider_state.contact_sort_key[i, i_b]
+                if collider_state.contact_sort_key[i - 1, i_b] <= ck:
+                    continue
+                ci = collider_state.contact_sort_idx[i, i_b]
+                j = i - 1
+                while j >= 0:
+                    if collider_state.contact_sort_key[j, i_b] <= ck:
+                        break
+                    collider_state.contact_sort_key[j + 1, i_b] = collider_state.contact_sort_key[j, i_b]
+                    collider_state.contact_sort_idx[j + 1, i_b] = collider_state.contact_sort_idx[j, i_b]
+                    j = j - 1
+                collider_state.contact_sort_key[j + 1, i_b] = ck
+                collider_state.contact_sort_idx[j + 1, i_b] = ci
+
+            # n_contacts is the count of non-sentinel sort keys (the dropped contacts sit at the tail and are ignored
+            # by downstream consumers via the new n_contacts).
+            n_kept = 0
+            for i in range(n_con):
+                if collider_state.contact_sort_key[i, i_b] < SENTINEL_BIG:
+                    n_kept += 1
+                else:
+                    break
+            collider_state.n_contacts[i_b] = n_kept
+
+            # Cycle-permute contact_data fields into their sorted positions. Only n_kept positions need correct values;
+            # the tail (dropped) slots are scratched but not read again this step.
+            for i in range(n_kept):
+                if collider_state.contact_sort_idx[i, i_b] != i:
+                    tmp_geom_a = collider_state.contact_data.geom_a[i, i_b]
+                    tmp_geom_b = collider_state.contact_data.geom_b[i, i_b]
+                    tmp_penetration = collider_state.contact_data.penetration[i, i_b]
+                    tmp_normal = collider_state.contact_data.normal[i, i_b]
+                    tmp_pos = collider_state.contact_data.pos[i, i_b]
+                    tmp_friction = collider_state.contact_data.friction[i, i_b]
+                    tmp_sol_params = collider_state.contact_data.sol_params[i, i_b]
+                    tmp_link_a = collider_state.contact_data.link_a[i, i_b]
+                    tmp_link_b = collider_state.contact_data.link_b[i, i_b]
+
+                    j = i
+                    while collider_state.contact_sort_idx[j, i_b] != i:
+                        src = collider_state.contact_sort_idx[j, i_b]
+                        collider_state.contact_data.geom_a[j, i_b] = collider_state.contact_data.geom_a[src, i_b]
+                        collider_state.contact_data.geom_b[j, i_b] = collider_state.contact_data.geom_b[src, i_b]
+                        collider_state.contact_data.penetration[j, i_b] = collider_state.contact_data.penetration[
+                            src, i_b
+                        ]
+                        collider_state.contact_data.normal[j, i_b] = collider_state.contact_data.normal[src, i_b]
+                        collider_state.contact_data.pos[j, i_b] = collider_state.contact_data.pos[src, i_b]
+                        collider_state.contact_data.friction[j, i_b] = collider_state.contact_data.friction[src, i_b]
+                        collider_state.contact_data.sol_params[j, i_b] = collider_state.contact_data.sol_params[
+                            src, i_b
+                        ]
+                        collider_state.contact_data.link_a[j, i_b] = collider_state.contact_data.link_a[src, i_b]
+                        collider_state.contact_data.link_b[j, i_b] = collider_state.contact_data.link_b[src, i_b]
+                        collider_state.contact_sort_idx[j, i_b] = j
+                        j = src
+
+                    collider_state.contact_data.geom_a[j, i_b] = tmp_geom_a
+                    collider_state.contact_data.geom_b[j, i_b] = tmp_geom_b
+                    collider_state.contact_data.penetration[j, i_b] = tmp_penetration
+                    collider_state.contact_data.normal[j, i_b] = tmp_normal
+                    collider_state.contact_data.pos[j, i_b] = tmp_pos
+                    collider_state.contact_data.friction[j, i_b] = tmp_friction
+                    collider_state.contact_data.sol_params[j, i_b] = tmp_sol_params
+                    collider_state.contact_data.link_a[j, i_b] = tmp_link_a
+                    collider_state.contact_data.link_b[j, i_b] = tmp_link_b
+                    collider_state.contact_sort_idx[j, i_b] = j
+
+
 @qd.kernel
 def func_set_upstream_grad(
     dL_dposition: qd.types.ndarray(),

From bf38466a1717deda1158cc4cdd9dc9744a19ee46 Mon Sep 17 00:00:00 2001
From: Hugh Perkins <hughperkins@gmail.com>
Date: Mon, 25 May 2026 12:30:32 -0700
Subject: [PATCH 7/7] Gate cooperative dedup kernel on n_envs <= gpu_cores / 2

The cooperative warp-per-env dedup kernel (func_prune_contacts_coop)
only beats the serial fused kernel while the GPU has spare occupancy.
Once _B exceeds half the GPU's CUDA-core count the coop launch fills
the device, the warp-cooperation overhead stops paying off, and
Alexis's serial fused kernel (one thread per env) is faster. The
half-core threshold also leaves headroom for the other kernels
already sharing the SMs (narrowphase MPR/GJK, constraint solver) so
we don't push them into second-wave scheduling on tight scenes like
dex_hand and g1_fall.

Concrete thresholds (gpu_cores / 2):
- RTX 5090 (170 SMs x 128):       _B <= 10880
- RTX 6000 Blackwell (188 x 128): _B <= 12032
- MI300X (304 CUs x 64):          _B <=  9728
- Fallback (16384):               _B <=  8192

Factored the per-backend gpu_cores computation out of the
_use_split_narrowphase branch into a single Collider._gpu_cores
field so both the contact0 chunking heuristic and the new dispatch
gate share the same number. No behavior change for split-narrowphase
chunking.
---
 .../engine/solvers/rigid/collider/collider.py | 46 +++++++++++--------
 1 file changed, 27 insertions(+), 19 deletions(-)

diff --git a/genesis/engine/solvers/rigid/collider/collider.py b/genesis/engine/solvers/rigid/collider/collider.py
index 86ed55d95..02eb074f7 100644
--- a/genesis/engine/solvers/rigid/collider/collider.py
+++ b/genesis/engine/solvers/rigid/collider/collider.py
@@ -221,27 +221,30 @@ def _init_collision_fields(self) -> None:
         # 'contact_data_cache' is not used in Quadrants kernels, so keep it outside of the collider state / info
         self._contact_data_cache: dict[tuple[bool, bool], dict[str, torch.Tensor | tuple[torch.Tensor]]] = {}
 
-        # Contact0 & multicontact scratch states only needed when split narrowphase is active.
+        # GPU core count (used by split-narrowphase chunking + the cooperative dedup dispatch gate).
         # FIXME: Quadrants should expose a unified API to query GPU core count across all backends.
         # Falling back to upper bound for backends where torch.cuda is unavailable (e.g., CPU-only torch). Benchmarks
         # on RTX 6000 Blackwell (Genesis-Embodied-AI/Genesis#2616) showed that switching from hardcoded 40000 threads
         # to hardware-derived 21760 had marginal performance impact, so it should be fine.
+        if torch.cuda.is_available():
+            gpu_props = torch.cuda.get_device_properties(torch.cuda.current_device())
+            # NVIDIA: 128 CUDA cores per SM. AMD/ROCm: 64 stream processors per CU.
+            cores_per_unit = 64 if torch.version.hip else 128
+            gpu_cores = gpu_props.multi_processor_count * cores_per_unit
+        elif gs.backend == gs.metal:
+            # Upper-bound estimate for Apple Silicon: 40 GPU cores, each GPU core having 128 ALUs
+            cores_per_unit = 128
+            gpu_cores = 5120
+        else:
+            # Using AMD GPU as a baseline. AMD MI350X has 256 SM (so-called Compute Units) with 64 cores each.
+            # See: https://www.amd.com/en/products/accelerators/instinct/mi350/mi350x.html
+            # For comparison, RTX6000 Blackwell boasts 188 SMs, compared to 170 SMs for RTX5090 with 128 cores each.
+            cores_per_unit = 64
+            gpu_cores = 16384
+        self._gpu_cores = gpu_cores
+
+        # Contact0 & multicontact scratch states only needed when split narrowphase is active.
         if self._use_split_narrowphase:
-            if torch.cuda.is_available():
-                gpu_props = torch.cuda.get_device_properties(torch.cuda.current_device())
-                # NVIDIA: 128 CUDA cores per SM. AMD/ROCm: 64 stream processors per CU.
-                cores_per_unit = 64 if torch.version.hip else 128
-                gpu_cores = gpu_props.multi_processor_count * cores_per_unit
-            elif gs.backend == gs.metal:
-                # Upper-bound estimate for Apple Silicon: 40 GPU cores, each GPU core having 128 ALUs
-                cores_per_unit = 128
-                gpu_cores = 5120
-            else:
-                # Using AMD GPU as a baseline. AMD MI350X has 256 SM (so-called Compute Units) with 64 cores each.
-                # See: https://www.amd.com/en/products/accelerators/instinct/mi350/mi350x.html
-                # For comparison, RTX6000 Blackwell boasts 188 SMs, compared to 170 SMs for RTX5090 with 128 cores each.
-                cores_per_unit = 64
-                gpu_cores = 16384
             self._contact0_n_chunks = max(1, math.ceil(gpu_cores / self._solver._B))
             self._contact0_grid_size = self._solver._B * self._contact0_n_chunks
             self._contact0_mpr_state = array_class.get_mpr_state(self._contact0_grid_size)
@@ -822,14 +825,19 @@ def detection(self) -> None:
         # On GPU backends, when the scene is dedup-eligible and we're not in autodiff mode, dispatch the cooperative
         # warp-per-env kernel (32 lanes/block; parallel reductions + lex-stride writes; serial sorts + hull build on
         # lane 0; fused compact+spatial-sort in the final phase). This beats the serial fused kernel on dex_hand /
-        # g1_fall by ~2-3% by spreading the per-env work across the warp instead of running one env per block thread.
-        # Everything else (CPU, autodiff, scenes where link_pair_pruning_supported=False) falls through to the serial
-        # fused kernel, which has internal qd.static gates that drop the prune phases when they're not eligible.
+        # g1_fall by spreading the per-env work across the warp instead of running one env per block thread, but only
+        # while the GPU has spare occupancy. Once n_envs exceeds half the GPU's CUDA core count, the coop launch fills
+        # the device and the warp-cooperation overhead stops paying off — the serial fused kernel (one thread per env)
+        # wins. The half-core threshold leaves room for the existing kernels already sharing the SMs (narrowphase,
+        # constraint solver) so we don't push them to second-wave scheduling. Everything else (CPU, autodiff, scenes
+        # where link_pair_pruning_supported=False, oversubscribed n_envs) falls through to the serial fused kernel,
+        # which has internal qd.static gates that drop the prune phases when they're not eligible.
         ran_fused_dedup_coop = (
             gs.backend != gs.cpu
             and self._collider_static_config.link_pair_pruning_supported
             and not self._solver._static_rigid_sim_config.requires_grad
             and (self._solver._options.contact_pruning_tolerance or 0.0) > 0.0
+            and self._solver._B * 2 <= self._gpu_cores
         )
         if ran_fused_dedup_coop:
             func_prune_contacts_coop(