From 1957db683d5ca83f080ee36b0425c859aee45583 Mon Sep 17 00:00:00 2001 From: Alexis Duburcq Date: Mon, 25 May 2026 17:32:18 +0200 Subject: [PATCH 1/7] Fix contact point pruning in batched sim. --- .../engine/solvers/rigid/collider/collider.py | 2 ++ .../engine/solvers/rigid/collider/contact.py | 20 ++++++++++++++----- genesis/options/solvers.py | 2 +- genesis/utils/array_class.py | 2 ++ 4 files changed, 20 insertions(+), 6 deletions(-) diff --git a/genesis/engine/solvers/rigid/collider/collider.py b/genesis/engine/solvers/rigid/collider/collider.py index c6aa09b9d..1d5b3036b 100644 --- a/genesis/engine/solvers/rigid/collider/collider.py +++ b/genesis/engine/solvers/rigid/collider/collider.py @@ -90,6 +90,7 @@ def __init__(self, rigid_solver: "RigidSolver"): self._diff_pos_tolerance = 1e-2 self._diff_normal_tolerance = 1e-2 self._prune_deep_penetration_ratio = 3.0 + self._prune_hull_collinear_tol = 1e-3 self._init_static_config() self._use_split_narrowphase = ( @@ -200,6 +201,7 @@ def _init_collision_fields(self) -> None: diff_normal_tolerance=self._diff_normal_tolerance, contact_pruning_tolerance=self._solver._options.contact_pruning_tolerance or 0.0, prune_deep_penetration_ratio=self._prune_deep_penetration_ratio, + prune_hull_collinear_tol=self._prune_hull_collinear_tol, ) self._init_collision_pair_idx(self._collision_pair_idx) self._init_valid_pairs() diff --git a/genesis/engine/solvers/rigid/collider/contact.py b/genesis/engine/solvers/rigid/collider/contact.py index 1379b1f26..f7e7cb2d9 100644 --- a/genesis/engine/solvers/rigid/collider/contact.py +++ b/genesis/engine/solvers/rigid/collider/contact.py @@ -672,6 +672,7 @@ def func_prune_contacts( max_contact_pairs = collider_info.max_contact_pairs[None] tol = collider_info.contact_pruning_tolerance[None] prune_deep_penetration_ratio = collider_info.prune_deep_penetration_ratio[None] + prune_hull_collinear_tol = collider_info.prune_hull_collinear_tol[None] LP_KEY_STRIDE = gs.qd_float(1.0e7) EPS = rigid_global_info.EPS[None] @@ -898,7 +899,7 @@ def func_prune_contacts( # Collinearity threshold for hull pops, scaled to the bucket extent. A pure "cross <= 0" check fails # on numerically-near-collinear edge points (cross is a tiny positive epsilon from float roundoff), # so genuine midpoints would survive as spurious hull vertices. - hull_collinear_tol = tol * max_in_plane_r2 + hull_collinear_tol = prune_hull_collinear_tol * max_in_plane_r2 # Andrew's monotone chain. Stack lives in contact_hull_stack[b_start..b_start + k). k = 0 @@ -922,6 +923,14 @@ def func_prune_contacts( k += 1 upper_start = k + # Memory-fence workaround for a Quadrants codegen issue on parallel envs (Metal backend, _B >= 2): + # without an explicit scratch store between the lower-hull and upper-hull passes, the upper-hull + # pop-loop's reads of contact_hull_stack don't observe the writes from the lower hull, so its + # cross-product / pop-check effectively runs on stale data and every candidate is kept. This shows + # up as the kernel never popping anything in the upper hull and producing a hull whose size equals + # the bucket size. Writing any value to a non-overlapping scratch slot here serializes the two + # passes' contact_hull_stack accesses and restores parallel correctness; the value is unused. + collider_state.contact_hull_stack[max_contact_pairs - 1, i_b] = 0 # quadrants range supports 1 or 2 args only; iterate sorted indices backward over # [b_start, b_end - 2] by reflecting the index. for k_step in range(b_size - 1): @@ -965,12 +974,13 @@ def func_prune_contacts( # penetration buckets like irregular mesh contacts keep only the hull) but well below the deep # interior penetrations seen when a non-flat body rests inside its convex envelope (so genuine deep # supports are restored). - hull_pen_sum = gs.qd_float(0.0) + hull_pen_max = gs.qd_float(0.0) for hk in range(k): survivor = collider_state.contact_hull_stack[b_start + hk, i_b] - hull_pen_sum = hull_pen_sum + collider_state.contact_data.penetration[survivor, i_b] - hull_pen_avg = hull_pen_sum / qd.cast(k, gs.qd_float) - deep_keep_threshold = prune_deep_penetration_ratio * hull_pen_avg + p = collider_state.contact_data.penetration[survivor, i_b] + if p > hull_pen_max: + hull_pen_max = p + deep_keep_threshold = prune_deep_penetration_ratio * hull_pen_max for i in range(b_start, b_end): if collider_state.contact_keep[i, i_b] == 0: if collider_state.contact_data.penetration[i, i_b] > deep_keep_threshold: diff --git a/genesis/options/solvers.py b/genesis/options/solvers.py index c34045606..691e0b727 100644 --- a/genesis/options/solvers.py +++ b/genesis/options/solvers.py @@ -507,7 +507,7 @@ class RigidOptions(Options): ls_tolerance: PositiveFloat = 1e-2 noslip_iterations: NonNegativeInt = 0 noslip_tolerance: PositiveFloat = 1e-6 - contact_pruning_tolerance: PositiveFloat | None = None + contact_pruning_tolerance: PositiveFloat | None = 0.1 sparse_solve: StrictBool = False constraint_timeconst: PositiveFloat = 0.01 use_contact_island: StrictBool = False diff --git a/genesis/utils/array_class.py b/genesis/utils/array_class.py index e1bde4911..286858bee 100644 --- a/genesis/utils/array_class.py +++ b/genesis/utils/array_class.py @@ -792,6 +792,7 @@ class ColliderInfo: # link-pair contact pruning contact_pruning_tolerance: qd.Tensor prune_deep_penetration_ratio: qd.Tensor + prune_hull_collinear_tol: qd.Tensor def get_collider_info(solver, n_vert_neighbors, n_valid_pairs, collider_static_config, **kwargs): @@ -824,6 +825,7 @@ def get_collider_info(solver, n_vert_neighbors, n_valid_pairs, collider_static_c diff_normal_tolerance=V_SCALAR_FROM(dtype=gs.qd_float, value=kwargs["diff_normal_tolerance"]), contact_pruning_tolerance=V_SCALAR_FROM(dtype=gs.qd_float, value=kwargs["contact_pruning_tolerance"]), prune_deep_penetration_ratio=V_SCALAR_FROM(dtype=gs.qd_float, value=kwargs["prune_deep_penetration_ratio"]), + prune_hull_collinear_tol=V_SCALAR_FROM(dtype=gs.qd_float, value=kwargs["prune_hull_collinear_tol"]), ) From 65828308619c1342689a0cbbe52390f9d647e031 Mon Sep 17 00:00:00 2001 From: Hugh Perkins Date: Mon, 25 May 2026 08:51:01 -0700 Subject: [PATCH 2/7] deskai6: cooperative warp-per-env link-pair dedup, rebased onto #2831 Rebase of my cooperative-warp dedup work onto Alexis's #2831 (fix_contact_pruning_batched) which fixes the Quadrants Metal codegen bug that was blocking dedup on macOS. PR #2831's contact.py changes are incorporated verbatim into my cooperative kernel: 1. Memory-fence scratch write between the lower-hull and upper-hull passes of Andrew's monotone chain (contact_hull_stack[max_contact_pairs - 1, i_b] = 0). Without this, on Metal _B >= 2 the upper-hull reads of contact_hull_stack don't observe the lower-hull writes and every candidate is kept (hull size == bucket size, no pruning). Standalone repro: perso_hugh/prot/qd_metal_hull_chain_visibility_repro.py 2. New tunable prune_hull_collinear_tol (1e-3 default) separated from the depth coplanarity tol. Lets users tune the hull-popper cross- product slop independently of the depth gate. 3. Deep-penetration restore threshold changed from average to MAX of hull penetrations. A non-hull contact whose penetration substantially exceeds the deepest hull vertex represents a distinct physical support (deep middle of a long body) that the support-polygon argument doesn't authorize dropping. Side cleanups now possible because #2831 fixes the Metal bug: * Removed the gs.backend not in (gs.cuda, gs.amdgpu) guard from collider.py. Dedup is now enabled on Metal too; only CPU is still excluded (the cooperative kernel adds overhead without parallel speedup on a serial backend). * Removed the @pytest.mark.skipif(darwin) from test_contact_pruning; the test now runs on every GPU backend. The cooperative-warp structure (32 threads/env, qd.simt.subgroup reductions for the per-bucket mean-normal / centroid, coop projection and mark-drop, fused phase 3 compact+spatial-sort) is preserved from the previous branch. Note: this branch is currently based on duburcqa:fix_contact_pruning_batched (#2831). Once #2831 merges, the branch base should be moved back to origin/main and the #2831 commit will simply be absorbed. Backup of the pre-rebase tip: hp/dedup-alexis-perf-9d-stage3-r2-pre-2831-rebase-backup. --- .../engine/solvers/rigid/collider/collider.py | 63 +- .../engine/solvers/rigid/collider/contact.py | 745 ++++++++++-------- genesis/utils/array_class.py | 2 + tests/test_rigid_physics.py | 1 + 4 files changed, 472 insertions(+), 339 deletions(-) diff --git a/genesis/engine/solvers/rigid/collider/collider.py b/genesis/engine/solvers/rigid/collider/collider.py index 1d5b3036b..2eae75bf8 100644 --- a/genesis/engine/solvers/rigid/collider/collider.py +++ b/genesis/engine/solvers/rigid/collider/collider.py @@ -72,6 +72,19 @@ from genesis.engine.solvers.rigid.rigid_solver import RigidSolver +# Above this env count, the cooperative warp-per-env dedup kernel stops paying off because the GPU's SMs are already +# grid-saturated by the launch itself, so the per-env serial sub-phases (lex sort, hull build, opt-10 cycle permute) +# start to dominate. The threshold is a sweet spot for current consumer + datacenter GPUs (5090 / H100 class), where +# 8192 blocks of 32 threads each already exceeds ~10x the SM count. +_DEDUP_MAX_N_ENVS = 8192 + +# Default tolerance applied automatically when scene gates allow dedup and the user explicitly sets the option +# RigidOptions.contact_pruning_tolerance to None. Upstream default (post-PR #2831) is also 0.1; this constant keeps +# the "auto-enable when user explicitly None" path independent of the upstream default. 0.1 = 10 percent +# dimensionless slop fraction on the depth coplanarity gate. +_DEDUP_DEFAULT_TOLERANCE = 0.1 + + IS_OLD_TORCH = tuple(map(int, torch.__version__.split(".")[:2])) < (2, 8) @@ -168,6 +181,38 @@ def _init_static_config(self) -> None: # any geom is nonconvex (vertex-based narrowphase emits many contacts per pair), or when terrain is present. any_link_multi_geom = any(len(link.geoms) > 1 for link in self._solver.links) link_pair_pruning_supported = any_link_multi_geom or has_nonconvex_nonterrain or has_terrain + # The dedup kernel is GPU-only: its cooperative 32-lane structure relies on warp-level qd.simt.subgroup + # primitives (reduce_all_add_tiled, broadcast, shuffle, sync) that map cleanly to NVIDIA warps and AMD + # wavefronts and (post-PR #2831 memory-fence workaround) to Apple Metal simdgroups. CPU has no equivalent -- + # Quadrants serializes the cooperative loop per env so the kernel just adds overhead without parallel speedup. + # At very large n_envs the GPU is already grid-saturated by the warp-cooperative launch (n_envs warps), and + # the per-env serial phases (lex sort + hull build + opt-10 cycle permute) start to dominate, so dedup loses + # its margin over the baseline narrowphase output. The 8192 threshold is conservative: a 5090 has ~170 SMs + # and the kernel uses 32 threads/block, so 8192 envs = 8192 blocks = ~48 blocks/SM (past occupancy + # saturation). + if link_pair_pruning_supported and gs.backend == gs.cpu: + gs.logger.info( + f"Disabling link-pair contact dedup: cooperative kernel doesn't help on CPU (Quadrants serializes " + f"the per-env loop)." + ) + link_pair_pruning_supported = False + if link_pair_pruning_supported and self._solver.n_envs > _DEDUP_MAX_N_ENVS: + gs.logger.info( + f"Disabling link-pair contact dedup: n_envs={self._solver.n_envs} exceeds threshold " + f"_DEDUP_MAX_N_ENVS={_DEDUP_MAX_N_ENVS} (the cooperative kernel's overhead stops paying off " + f"once the GPU is already grid-saturated)." + ) + link_pair_pruning_supported = False + + # Resolve the effective dedup tolerance. The upstream option default is 0.1 (post-PR #2831), so dedup is on + # by default whenever the scene gates allow. A user-supplied float overrides; explicit user None plus + # gates-allow auto-enables to _DEDUP_DEFAULT_TOLERANCE; gates-disallow forces None regardless of user setting + # (so an explicit float on CPU still doesn't run -- the cooperative kernel wouldn't pay off there). + user_tol = self._solver._options.contact_pruning_tolerance + if link_pair_pruning_supported: + self._effective_pruning_tolerance = user_tol if user_tol is not None else _DEDUP_DEFAULT_TOLERANCE + else: + self._effective_pruning_tolerance = None # Initialize the static config, which stores every data that are compile-time constants. # Note that updating any of them will trigger recompilation. @@ -199,7 +244,7 @@ def _init_collision_fields(self) -> None: mpr_to_gjk_overlap_ratio=self._mpr_to_gjk_overlap_ratio, diff_pos_tolerance=self._diff_pos_tolerance, diff_normal_tolerance=self._diff_normal_tolerance, - contact_pruning_tolerance=self._solver._options.contact_pruning_tolerance or 0.0, + contact_pruning_tolerance=self._effective_pruning_tolerance or 0.0, prune_deep_penetration_ratio=self._prune_deep_penetration_ratio, prune_hull_collinear_tol=self._prune_hull_collinear_tol, ) @@ -821,11 +866,15 @@ def detection(self) -> None: self._solver._errno, ) - if ( - self._collider_static_config.link_pair_pruning_supported - and self._solver._options.contact_pruning_tolerance is not None - and not self._solver._static_rigid_sim_config.requires_grad - ): + # deskai6 opt 10: func_prune_contacts fuses the spatial clamp+sort into its phase 3, so when dedup runs we skip + # the standalone clamp_and_sort kernel (one less launch + one less full permute pass). The dispatch gate uses + # the resolved effective tolerance (auto-enabled to 0.1 in _init_static_config when scene gates pass), not the + # raw user option, so dedup runs by default on supported scenes whenever the user didn't explicitly disable it. + ran_fused_dedup = ( + self._effective_pruning_tolerance is not None and not self._solver._static_rigid_sim_config.requires_grad + ) + + if ran_fused_dedup: func_prune_contacts( self._collider_state, self._collider_info, @@ -833,7 +882,7 @@ def detection(self) -> None: self._solver._static_rigid_sim_config, ) - if self._use_split_narrowphase: + if self._use_split_narrowphase and not ran_fused_dedup: func_clamp_and_sort_contacts( self._collider_state, self._collider_info, diff --git a/genesis/engine/solvers/rigid/collider/contact.py b/genesis/engine/solvers/rigid/collider/contact.py index f7e7cb2d9..d0fdc8b76 100644 --- a/genesis/engine/solvers/rigid/collider/contact.py +++ b/genesis/engine/solvers/rigid/collider/contact.py @@ -594,8 +594,11 @@ def func_clamp_and_sort_contacts( collider_state.contact_sort_key[j + 1, i_b] = curr_key collider_state.contact_sort_idx[j + 1, i_b] = curr_idx - # Phase 2: apply permutation in-place via cycle decomposition. - # Each contact is read and written exactly once. + # Phase 2: apply permutation in-place via cycle decomposition. Each contact is read and written exactly once. + # All 11 contact_data fields are permuted unconditionally (matching upstream): an earlier deskai6 opt-8b version + # tried to gate force/pair_idx writes on requires_grad, but the resulting qd.static-scoped variables tripped a + # Quadrants NameError in DEBUG-mode unit tests, and a hoist workaround caused subtle Metal-backend regressions + # in test_smooth_box_no_drift. The 2 saved writes per non-grad cycle are not worth the platform risk. for i in range(n_con): if collider_state.contact_sort_idx[i, i_b] != i: tmp_geom_a = collider_state.contact_data.geom_a[i, i_b] @@ -665,7 +668,7 @@ def func_prune_contacts( in-place via cycle decomposition (11-field swap per contact record). 2. Per bucket of >= 5 contacts: compute mean normal (folded to a common hemisphere). Check depth coplanarity of contact positions. If they share a plane, project to (u, v), Andrew's monotone chain. Mark survivors in - contact_keep[]. + contact_keep[]. Then restore any non-hull contact whose penetration is much deeper than the hull's average. 3. Compact: copy kept contacts to the front, update n_contacts. """ _B = collider_state.n_contacts.shape[0] @@ -676,340 +679,418 @@ def func_prune_contacts( LP_KEY_STRIDE = gs.qd_float(1.0e7) EPS = rigid_global_info.EPS[None] - qd.loop_config(serialize=static_rigid_sim_config.para_level < gs.PARA_LEVEL.ALL) - for i_b in range(_B): + # deskai6 opt 9d (real warp-coop, stage 1): pull contact_keep init and phase-1a key+idx init OUT of `if tid == 0:` + # and parallelize them across the 32 lanes (stride-tid). Phase 1a sort, phase 2 bucket walk, and phase 3 (sort + + # cycle-permute) stay serial on lane 0. Subsequent stages add coop reductions (phase-2 mean / centroid) and parallel + # cycle-permute. + _K = qd.static(32) + qd.loop_config(name="prune_contacts_coop", block_dim=_K) + for i_flat in range(_B * _K): + tid = i_flat % _K + i_b = i_flat // _K + # All lanes compute n_con (cheap, no memory write on non-lane-0). n_con = qd.min(collider_state.n_contacts[i_b], max_contact_pairs) - collider_state.n_contacts[i_b] = n_con - if n_con < 5: - continue - - # Phase 1a: insertion-sort indices by canonical (min_link, max_link) key. - for i in range(n_con): - la = collider_state.contact_data.link_a[i, i_b] - lb = collider_state.contact_data.link_b[i, i_b] - la_min = qd.min(la, lb) - la_max = qd.max(la, lb) - collider_state.contact_sort_key[i, i_b] = qd.cast(la_min, gs.qd_float) * LP_KEY_STRIDE + qd.cast( - la_max, gs.qd_float - ) - collider_state.contact_sort_idx[i, i_b] = i - - for i in range(1, n_con): - ck = collider_state.contact_sort_key[i, i_b] - if collider_state.contact_sort_key[i - 1, i_b] <= ck: - continue - ci = collider_state.contact_sort_idx[i, i_b] - j = i - 1 - while j >= 0: - if collider_state.contact_sort_key[j, i_b] <= ck: - break - collider_state.contact_sort_key[j + 1, i_b] = collider_state.contact_sort_key[j, i_b] - collider_state.contact_sort_idx[j + 1, i_b] = collider_state.contact_sort_idx[j, i_b] - j = j - 1 - collider_state.contact_sort_key[j + 1, i_b] = ck - collider_state.contact_sort_idx[j + 1, i_b] = ci - - # Phase 1b: apply permutation in-place via cycle decomposition (11 fields). - for i in range(n_con): - if collider_state.contact_sort_idx[i, i_b] != i: - tmp_geom_a = collider_state.contact_data.geom_a[i, i_b] - tmp_geom_b = collider_state.contact_data.geom_b[i, i_b] - tmp_penetration = collider_state.contact_data.penetration[i, i_b] - tmp_normal = collider_state.contact_data.normal[i, i_b] - tmp_pos = collider_state.contact_data.pos[i, i_b] - tmp_friction = collider_state.contact_data.friction[i, i_b] - tmp_sol_params = collider_state.contact_data.sol_params[i, i_b] - tmp_force = collider_state.contact_data.force[i, i_b] - tmp_link_a = collider_state.contact_data.link_a[i, i_b] - tmp_link_b = collider_state.contact_data.link_b[i, i_b] - tmp_pair_idx = collider_state.contact_data.pair_idx[i, i_b] - - j = i - while collider_state.contact_sort_idx[j, i_b] != i: - src = collider_state.contact_sort_idx[j, i_b] - collider_state.contact_data.geom_a[j, i_b] = collider_state.contact_data.geom_a[src, i_b] - collider_state.contact_data.geom_b[j, i_b] = collider_state.contact_data.geom_b[src, i_b] - collider_state.contact_data.penetration[j, i_b] = collider_state.contact_data.penetration[src, i_b] - collider_state.contact_data.normal[j, i_b] = collider_state.contact_data.normal[src, i_b] - collider_state.contact_data.pos[j, i_b] = collider_state.contact_data.pos[src, i_b] - collider_state.contact_data.friction[j, i_b] = collider_state.contact_data.friction[src, i_b] - collider_state.contact_data.sol_params[j, i_b] = collider_state.contact_data.sol_params[src, i_b] - collider_state.contact_data.force[j, i_b] = collider_state.contact_data.force[src, i_b] - collider_state.contact_data.link_a[j, i_b] = collider_state.contact_data.link_a[src, i_b] - collider_state.contact_data.link_b[j, i_b] = collider_state.contact_data.link_b[src, i_b] - collider_state.contact_data.pair_idx[j, i_b] = collider_state.contact_data.pair_idx[src, i_b] - collider_state.contact_sort_idx[j, i_b] = j - j = src - - collider_state.contact_data.geom_a[j, i_b] = tmp_geom_a - collider_state.contact_data.geom_b[j, i_b] = tmp_geom_b - collider_state.contact_data.penetration[j, i_b] = tmp_penetration - collider_state.contact_data.normal[j, i_b] = tmp_normal - collider_state.contact_data.pos[j, i_b] = tmp_pos - collider_state.contact_data.friction[j, i_b] = tmp_friction - collider_state.contact_data.sol_params[j, i_b] = tmp_sol_params - collider_state.contact_data.force[j, i_b] = tmp_force - collider_state.contact_data.link_a[j, i_b] = tmp_link_a - collider_state.contact_data.link_b[j, i_b] = tmp_link_b - collider_state.contact_data.pair_idx[j, i_b] = tmp_pair_idx - collider_state.contact_sort_idx[j, i_b] = j - - # Default: keep everything. Buckets that pass the gates flip their entries to drop and then mark only - # hull-vertex contacts as keep again. - for i in range(n_con): - collider_state.contact_keep[i, i_b] = 1 - - # Phase 2: walk link-pair buckets. - b_start = 0 - while b_start < n_con: - la0 = collider_state.contact_data.link_a[b_start, i_b] - lb0 = collider_state.contact_data.link_b[b_start, i_b] - la0_min = qd.min(la0, lb0) - la0_max = qd.max(la0, lb0) - b_end = b_start + 1 - while b_end < n_con: - la = collider_state.contact_data.link_a[b_end, i_b] - lb = collider_state.contact_data.link_b[b_end, i_b] - if qd.min(la, lb) != la0_min or qd.max(la, lb) != la0_max: - break - b_end += 1 - b_size = b_end - b_start - - if b_size >= 5: - # Mean normal (folded to the hemisphere of contact b_start) and centroid. - ref_n = collider_state.contact_data.normal[b_start, i_b] - rnx = ref_n[0] - rny = ref_n[1] - rnz = ref_n[2] - mnx = gs.qd_float(0.0) - mny = gs.qd_float(0.0) - mnz = gs.qd_float(0.0) - cx = gs.qd_float(0.0) - cy = gs.qd_float(0.0) - cz = gs.qd_float(0.0) - for i in range(b_start, b_end): - n_i = collider_state.contact_data.normal[i, i_b] - s = gs.qd_float(1.0) - if rnx * n_i[0] + rny * n_i[1] + rnz * n_i[2] < gs.qd_float(0.0): - s = gs.qd_float(-1.0) - mnx += s * n_i[0] - mny += s * n_i[1] - mnz += s * n_i[2] - p_i = collider_state.contact_data.pos[i, i_b] - cx += p_i[0] - cy += p_i[1] - cz += p_i[2] - inv_n = gs.qd_float(1.0) / qd.cast(b_size, gs.qd_float) - cx *= inv_n - cy *= inv_n - cz *= inv_n - mnrm = qd.sqrt(mnx * mnx + mny * mny + mnz * mnz) - - # Hoisted out so the hull-build branch below can read it (quadrants scopes per if). - max_in_plane_r2 = gs.qd_float(0.0) - - coplanar = mnrm > EPS - if coplanar: - mnx /= mnrm - mny /= mnrm - mnz /= mnrm - - # Depth coplanarity: positions must lie in a single plane perpendicular to the mean normal. No - # per-contact normal check: a contact whose normal is diagonal (e.g. an edge-vs-edge contact at a - # corner of the contact patch) still participates in the 2D hull because its position is a vertex of - # the patch; dropping a collinear-edge contact in the same bucket is justified by the positional - # support polygon regardless of that contact's normal direction. - max_depth = gs.qd_float(0.0) - for i in range(b_start, b_end): - p_i = collider_state.contact_data.pos[i, i_b] - dx = p_i[0] - cx - dy = p_i[1] - cy - dz = p_i[2] - cz - depth = qd.abs(dx * mnx + dy * mny + dz * mnz) - if depth > max_depth: - max_depth = depth - r2 = dx * dx + dy * dy + dz * dz - depth * depth - if r2 > max_in_plane_r2: - max_in_plane_r2 = r2 - - if max_depth > tol * qd.sqrt(max_in_plane_r2): - coplanar = False - - if coplanar: - # In-plane basis (u, v): seed from the world axis least-aligned with mean normal. - abs_mnx = qd.abs(mnx) - abs_mny = qd.abs(mny) - abs_mnz = qd.abs(mnz) - ax = gs.qd_float(1.0) - ay = gs.qd_float(0.0) - az = gs.qd_float(0.0) - if abs_mny < abs_mnx and abs_mny < abs_mnz: - ax = gs.qd_float(0.0) - ay = gs.qd_float(1.0) - az = gs.qd_float(0.0) - elif abs_mnz < abs_mnx and abs_mnz <= abs_mny: - ax = gs.qd_float(0.0) + if tid == 0: + collider_state.n_contacts[i_b] = n_con + + # PARALLEL: contact_keep init. Default-keep marked here unconditionally so the fused phase 3 (compact + spatial + # sort) below produces a correct result for envs with n_con < 5 (no dedup buckets, but still need spatial sort). + # 32 lanes stride. + ii = tid + while ii < n_con: + collider_state.contact_keep[ii, i_b] = 1 + ii += _K + + if n_con >= 5: + # PARALLEL: phase 1a key + idx init, 32 lanes stride. + ii = tid + while ii < n_con: + la = collider_state.contact_data.link_a[ii, i_b] + lb = collider_state.contact_data.link_b[ii, i_b] + la_min = qd.min(la, lb) + la_max = qd.max(la, lb) + collider_state.contact_sort_key[ii, i_b] = qd.cast(la_min, gs.qd_float) * LP_KEY_STRIDE + qd.cast( + la_max, gs.qd_float + ) + collider_state.contact_sort_idx[ii, i_b] = ii + ii += _K + + if tid == 0: + # SERIAL on lane 0: phase 1a insertion sort + phase 2 bucket walk. + for i in range(1, n_con): + ck = collider_state.contact_sort_key[i, i_b] + if collider_state.contact_sort_key[i - 1, i_b] <= ck: + continue + ci = collider_state.contact_sort_idx[i, i_b] + j = i - 1 + while j >= 0: + if collider_state.contact_sort_key[j, i_b] <= ck: + break + collider_state.contact_sort_key[j + 1, i_b] = collider_state.contact_sort_key[j, i_b] + collider_state.contact_sort_idx[j + 1, i_b] = collider_state.contact_sort_idx[j, i_b] + j = j - 1 + collider_state.contact_sort_key[j + 1, i_b] = ck + collider_state.contact_sort_idx[j + 1, i_b] = ci + + qd.simt.subgroup.sync() + + # Phase 2 (deskai6 opt 9d stage 2): bucket walk runs on ALL 32 lanes. The outer control flow (find b_end, + # iterate buckets) is duplicated across lanes since the inputs are all in DRAM (cache-friendly). Inside a + # bucket, the mean-normal / centroid sum is done coop via 6 reduce_all_add_tiled calls. The rest of the + # bucket processing (coplanarity check with early-exit, in-plane basis, projection, lex sort, hull build, + # mark survivors) stays serial on lane 0. + b_start = 0 + while b_start < n_con: + key0 = collider_state.contact_sort_key[b_start, i_b] + b_end = b_start + 1 + while b_end < n_con: + if collider_state.contact_sort_key[b_end, i_b] != key0: + break + b_end += 1 + b_size = b_end - b_start + + if b_size >= 5: + ref_src = collider_state.contact_sort_idx[b_start, i_b] + ref_n = collider_state.contact_data.normal[ref_src, i_b] + rnx = ref_n[0] + rny = ref_n[1] + rnz = ref_n[2] + mnx_l = gs.qd_float(0.0) + mny_l = gs.qd_float(0.0) + mnz_l = gs.qd_float(0.0) + cx_l = gs.qd_float(0.0) + cy_l = gs.qd_float(0.0) + cz_l = gs.qd_float(0.0) + jj = b_start + tid + while jj < b_end: + src_i = collider_state.contact_sort_idx[jj, i_b] + n_i = collider_state.contact_data.normal[src_i, i_b] + s = gs.qd_float(1.0) + if rnx * n_i[0] + rny * n_i[1] + rnz * n_i[2] < gs.qd_float(0.0): + s = gs.qd_float(-1.0) + mnx_l += s * n_i[0] + mny_l += s * n_i[1] + mnz_l += s * n_i[2] + p_i = collider_state.contact_data.pos[src_i, i_b] + cx_l += p_i[0] + cy_l += p_i[1] + cz_l += p_i[2] + jj += _K + + mnx = qd.simt.subgroup.reduce_all_add_tiled(mnx_l, 5) + mny = qd.simt.subgroup.reduce_all_add_tiled(mny_l, 5) + mnz = qd.simt.subgroup.reduce_all_add_tiled(mnz_l, 5) + cx = qd.simt.subgroup.reduce_all_add_tiled(cx_l, 5) + cy = qd.simt.subgroup.reduce_all_add_tiled(cy_l, 5) + cz = qd.simt.subgroup.reduce_all_add_tiled(cz_l, 5) + + # POST-REDUCE math runs on all 32 lanes (deterministic, cheap; redundant arithmetic is free vs. + # broadcasting the reduce results). + inv_n = gs.qd_float(1.0) / qd.cast(b_size, gs.qd_float) + cx *= inv_n + cy *= inv_n + cz *= inv_n + mnrm = qd.sqrt(mnx * mnx + mny * mny + mnz * mnz) + + max_in_plane_r2 = gs.qd_float(0.0) + coplanar = mnrm > EPS + if coplanar: + mnx /= mnrm + mny /= mnrm + mnz /= mnrm + + # COOP coplanarity check (stage 3). Each lane strides [b_start + tid, b_end) by _K, locally + # tracking max_depth / max_in_plane_r2. Wasted work per warp is at most b_size/_K contacts. + # The upstream algo no longer checks per-contact normals (a contact with a diagonal normal at + # the corner of a patch still participates in the 2D hull because its position is a vertex), so + # we only do the depth coplanarity gate here. + max_depth_l = gs.qd_float(0.0) + max_r2_l = gs.qd_float(0.0) + jj = b_start + tid + while jj < b_end: + src_i = collider_state.contact_sort_idx[jj, i_b] + p_i = collider_state.contact_data.pos[src_i, i_b] + dx = p_i[0] - cx + dy = p_i[1] - cy + dz = p_i[2] - cz + depth = qd.abs(dx * mnx + dy * mny + dz * mnz) + if depth > max_depth_l: + max_depth_l = depth + r2 = dx * dx + dy * dy + dz * dz - depth * depth + if r2 > max_r2_l: + max_r2_l = r2 + jj += _K + + max_depth = qd.simt.subgroup.reduce_all_max_tiled(max_depth_l, 5) + max_in_plane_r2 = qd.simt.subgroup.reduce_all_max_tiled(max_r2_l, 5) + + if max_depth > tol * qd.sqrt(max_in_plane_r2): + coplanar = False + + if coplanar: + # Basis on all lanes (deterministic from mnx/mny/mnz which the reduce broadcast to every lane). + abs_mnx = qd.abs(mnx) + abs_mny = qd.abs(mny) + abs_mnz = qd.abs(mnz) + ax = gs.qd_float(1.0) ay = gs.qd_float(0.0) - az = gs.qd_float(1.0) - adn = ax * mnx + ay * mny + az * mnz - ux = ax - adn * mnx - uy = ay - adn * mny - uz = az - adn * mnz - unrm = qd.sqrt(ux * ux + uy * uy + uz * uz) - ux /= unrm - uy /= unrm - uz /= unrm - vx = mny * uz - mnz * uy - vy = mnz * ux - mnx * uz - vz = mnx * uy - mny * ux - - # Project bucket contacts to (u, v). Reuse contact_sort_key for u (phase 1 has consumed it); - # v goes into the dedicated scratch buffer. - for i in range(b_start, b_end): - p_i = collider_state.contact_data.pos[i, i_b] - collider_state.contact_sort_key[i, i_b] = p_i[0] * ux + p_i[1] * uy + p_i[2] * uz - collider_state.contact_proj_v[i, i_b] = p_i[0] * vx + p_i[1] * vy + p_i[2] * vz - - # Sort bucket indices lexicographically by (u, v), with a tolerance on u so that contacts whose u - # values differ only by float noise (or by sub-millimeter physics noise from MPR perturbations) sort - # by v. Without the tolerance, the wrong point pops from a 3-collinear triplet when the corner and - # the mid-edge have u values that differ by a few microns and the mid-edge happens to sort first. - sort_u_tol = gs.qd_float(1e-3) * qd.sqrt(max_in_plane_r2) - for i in range(b_start, b_end): - collider_state.contact_sort_idx[i, i_b] = i - for i in range(b_start + 1, b_end): - ci = collider_state.contact_sort_idx[i, i_b] - cu = collider_state.contact_sort_key[ci, i_b] - cv = collider_state.contact_proj_v[ci, i_b] - j = i - 1 - while j >= b_start: - pj = collider_state.contact_sort_idx[j, i_b] - pu = collider_state.contact_sort_key[pj, i_b] - pv = collider_state.contact_proj_v[pj, i_b] - if (pu < cu - sort_u_tol) or (qd.abs(pu - cu) <= sort_u_tol and pv <= cv): - break - collider_state.contact_sort_idx[j + 1, i_b] = pj - j -= 1 - collider_state.contact_sort_idx[j + 1, i_b] = ci - - # Mark all bucket entries as drop, then mark hull vertices as keep. - for i in range(b_start, b_end): - collider_state.contact_keep[i, i_b] = 0 - - # Collinearity threshold for hull pops, scaled to the bucket extent. A pure "cross <= 0" check fails - # on numerically-near-collinear edge points (cross is a tiny positive epsilon from float roundoff), - # so genuine midpoints would survive as spurious hull vertices. - hull_collinear_tol = prune_hull_collinear_tol * max_in_plane_r2 - - # Andrew's monotone chain. Stack lives in contact_hull_stack[b_start..b_start + k). - k = 0 - for i in range(b_start, b_end): - ci = collider_state.contact_sort_idx[i, i_b] - cu = collider_state.contact_sort_key[ci, i_b] - cv = collider_state.contact_proj_v[ci, i_b] - while k >= 2: - idx_a = collider_state.contact_hull_stack[b_start + k - 2, i_b] - idx_b = collider_state.contact_hull_stack[b_start + k - 1, i_b] - au = collider_state.contact_sort_key[idx_a, i_b] - av = collider_state.contact_proj_v[idx_a, i_b] - bu = collider_state.contact_sort_key[idx_b, i_b] - bv = collider_state.contact_proj_v[idx_b, i_b] - cross = (bu - au) * (cv - av) - (bv - av) * (cu - au) - if cross <= hull_collinear_tol: - k -= 1 - else: - break - collider_state.contact_hull_stack[b_start + k, i_b] = ci - k += 1 - - upper_start = k - # Memory-fence workaround for a Quadrants codegen issue on parallel envs (Metal backend, _B >= 2): - # without an explicit scratch store between the lower-hull and upper-hull passes, the upper-hull - # pop-loop's reads of contact_hull_stack don't observe the writes from the lower hull, so its - # cross-product / pop-check effectively runs on stale data and every candidate is kept. This shows - # up as the kernel never popping anything in the upper hull and producing a hull whose size equals - # the bucket size. Writing any value to a non-overlapping scratch slot here serializes the two - # passes' contact_hull_stack accesses and restores parallel correctness; the value is unused. - collider_state.contact_hull_stack[max_contact_pairs - 1, i_b] = 0 - # quadrants range supports 1 or 2 args only; iterate sorted indices backward over - # [b_start, b_end - 2] by reflecting the index. - for k_step in range(b_size - 1): - ii = b_end - 2 - k_step - ci = collider_state.contact_sort_idx[ii, i_b] - cu = collider_state.contact_sort_key[ci, i_b] - cv = collider_state.contact_proj_v[ci, i_b] - while k >= upper_start + 1: - idx_a = collider_state.contact_hull_stack[b_start + k - 2, i_b] - idx_b = collider_state.contact_hull_stack[b_start + k - 1, i_b] - au = collider_state.contact_sort_key[idx_a, i_b] - av = collider_state.contact_proj_v[idx_a, i_b] - bu = collider_state.contact_sort_key[idx_b, i_b] - bv = collider_state.contact_proj_v[idx_b, i_b] - cross = (bu - au) * (cv - av) - (bv - av) * (cu - au) - if cross <= hull_collinear_tol: - k -= 1 - else: - break - # The closing iteration of the upper hull visits the leftmost point, which already sits at - # stack[b_start] from the lower hull. Skipping that push, plus the k < b_size guard, bounds k to - # b_size and keeps the write index within max_contact_pairs even for buckets where the lower- - # hull pass already kept all b_size points (downward-convex layouts: every lex-sorted triple - # makes a left turn so nothing gets popped, then the upper-hull pass tries to push a duplicate - # of an already-kept lower-hull vertex). - if ci != collider_state.contact_hull_stack[b_start, i_b] and k < b_size: + az = gs.qd_float(0.0) + if abs_mny < abs_mnx and abs_mny < abs_mnz: + ax = gs.qd_float(0.0) + ay = gs.qd_float(1.0) + az = gs.qd_float(0.0) + elif abs_mnz < abs_mnx and abs_mnz <= abs_mny: + ax = gs.qd_float(0.0) + ay = gs.qd_float(0.0) + az = gs.qd_float(1.0) + adn = ax * mnx + ay * mny + az * mnz + ux = ax - adn * mnx + uy = ay - adn * mny + uz = az - adn * mnz + unrm = qd.sqrt(ux * ux + uy * uy + uz * uz) + ux /= unrm + uy /= unrm + uz /= unrm + vx = mny * uz - mnz * uy + vy = mnz * ux - mnx * uz + vz = mnx * uy - mny * ux + + # COOP projection: 32 lanes stride writes to contact_sort_key + contact_proj_v. + jj = b_start + tid + while jj < b_end: + src_i = collider_state.contact_sort_idx[jj, i_b] + p_i = collider_state.contact_data.pos[src_i, i_b] + collider_state.contact_sort_key[jj, i_b] = p_i[0] * ux + p_i[1] * uy + p_i[2] * uz + collider_state.contact_proj_v[jj, i_b] = p_i[0] * vx + p_i[1] * vy + p_i[2] * vz + jj += _K + + # COOP mark-drop: stride writes to contact_keep[orig]. + jj = b_start + tid + while jj < b_end: + orig = collider_state.contact_sort_idx[jj, i_b] + collider_state.contact_keep[orig, i_b] = 0 + jj += _K + + # COOP lex_idx init: stride writes. + jj = b_start + tid + while jj < b_end: + collider_state.contact_lex_idx[jj, i_b] = jj + jj += _K + + # SYNC between coop writes (sort_key, proj_v, lex_idx, contact_keep[orig]) and the lane-0 lex + # sort + hull build that reads them. + qd.simt.subgroup.sync() + + if tid == 0 and coplanar: + # SERIAL on lane 0: lex sort + Andrew monotone-chain hull build + mark survivors. These walk a + # stack and have data-dependent inner loops that don't decompose across warp lanes. + # + # The sort_u_tol on the u comparison is critical for correctness, not just a perf tweak: + # contacts whose u values differ only by sub-millimeter MPR noise need to sort by v, otherwise + # mid-edge points get sorted between the two corners they sit between and survive the lower- + # hull pass as spurious hull vertices (cf. upstream test_contact_pruning regression when this + # tolerance is missing). + sort_u_tol = gs.qd_float(1e-3) * qd.sqrt(max_in_plane_r2) + for i in range(b_start + 1, b_end): + ci = collider_state.contact_lex_idx[i, i_b] + cu = collider_state.contact_sort_key[ci, i_b] + cv = collider_state.contact_proj_v[ci, i_b] + j = i - 1 + while j >= b_start: + pj = collider_state.contact_lex_idx[j, i_b] + pu = collider_state.contact_sort_key[pj, i_b] + pv = collider_state.contact_proj_v[pj, i_b] + if (pu < cu - sort_u_tol) or (qd.abs(pu - cu) <= sort_u_tol and pv <= cv): + break + collider_state.contact_lex_idx[j + 1, i_b] = pj + j -= 1 + collider_state.contact_lex_idx[j + 1, i_b] = ci + + # Upstream PR #2831 split the hull collinearity tolerance out of `tol`: tol is the depth + # coplanarity gate, prune_hull_collinear_tol is the cross-product slop for the monotone-chain + # popper. They were the same constant before; keeping them separate lets users tune one without + # changing the other. + hull_collinear_tol = prune_hull_collinear_tol * max_in_plane_r2 + + k = 0 + for i in range(b_start, b_end): + ci = collider_state.contact_lex_idx[i, i_b] + cu = collider_state.contact_sort_key[ci, i_b] + cv = collider_state.contact_proj_v[ci, i_b] + while k >= 2: + idx_a = collider_state.contact_hull_stack[b_start + k - 2, i_b] + idx_b = collider_state.contact_hull_stack[b_start + k - 1, i_b] + au = collider_state.contact_sort_key[idx_a, i_b] + av = collider_state.contact_proj_v[idx_a, i_b] + bu = collider_state.contact_sort_key[idx_b, i_b] + bv = collider_state.contact_proj_v[idx_b, i_b] + cross = (bu - au) * (cv - av) - (bv - av) * (cu - au) + if cross <= hull_collinear_tol: + k -= 1 + else: + break collider_state.contact_hull_stack[b_start + k, i_b] = ci k += 1 - # Chain holds k unique hull vertices (no trailing duplicate). - for hk in range(k): - survivor = collider_state.contact_hull_stack[b_start + hk, i_b] - collider_state.contact_keep[survivor, i_b] = 1 - - # Restore non-hull contacts whose penetration is much deeper than the hull boundary's average. The - # support-polygon argument says interior contacts are wrench-redundant only when ALL contacts share - # the same normal and penetration; a contact with substantially higher penetration than the hull's - # average represents a distinct physical support (the body of a fork resting beyond its tines, the - # deep middle of a long body) and dropping it lets the body sink into the surface. The 3x factor is - # well above the typical ~1.x penetration spread on transient/rocking faces (so non-uniform- - # penetration buckets like irregular mesh contacts keep only the hull) but well below the deep - # interior penetrations seen when a non-flat body rests inside its convex envelope (so genuine deep - # supports are restored). - hull_pen_max = gs.qd_float(0.0) - for hk in range(k): - survivor = collider_state.contact_hull_stack[b_start + hk, i_b] - p = collider_state.contact_data.penetration[survivor, i_b] - if p > hull_pen_max: - hull_pen_max = p - deep_keep_threshold = prune_deep_penetration_ratio * hull_pen_max - for i in range(b_start, b_end): - if collider_state.contact_keep[i, i_b] == 0: - if collider_state.contact_data.penetration[i, i_b] > deep_keep_threshold: - collider_state.contact_keep[i, i_b] = 1 - - b_start = b_end - - # Phase 3: compact kept contacts to the front. - write = 0 - for read in range(n_con): - if collider_state.contact_keep[read, i_b] != 0: - if write != read: - collider_state.contact_data.geom_a[write, i_b] = collider_state.contact_data.geom_a[read, i_b] - collider_state.contact_data.geom_b[write, i_b] = collider_state.contact_data.geom_b[read, i_b] - collider_state.contact_data.penetration[write, i_b] = collider_state.contact_data.penetration[ - read, i_b - ] - collider_state.contact_data.normal[write, i_b] = collider_state.contact_data.normal[read, i_b] - collider_state.contact_data.pos[write, i_b] = collider_state.contact_data.pos[read, i_b] - collider_state.contact_data.friction[write, i_b] = collider_state.contact_data.friction[read, i_b] - collider_state.contact_data.sol_params[write, i_b] = collider_state.contact_data.sol_params[ - read, i_b - ] - collider_state.contact_data.force[write, i_b] = collider_state.contact_data.force[read, i_b] - collider_state.contact_data.link_a[write, i_b] = collider_state.contact_data.link_a[read, i_b] - collider_state.contact_data.link_b[write, i_b] = collider_state.contact_data.link_b[read, i_b] - collider_state.contact_data.pair_idx[write, i_b] = collider_state.contact_data.pair_idx[read, i_b] - write += 1 - collider_state.n_contacts[i_b] = write + upper_start = k + # Memory-fence workaround for a Quadrants codegen issue on Metal _B >= 2 (genesis PR #2831): + # without an explicit scratch store between the lower- and upper-hull lane-0 ``for`` passes, + # the upper-hull pop-loop's reads of contact_hull_stack don't observe the lower-hull writes, + # so the cross-product check effectively runs on stale data and every candidate is kept (hull + # size == bucket size, no pruning). Writing any value to a non-overlapping slot here forces + # write-then-read ordering on the shared buffer. The value is unused. See + # perso_hugh/prot/qd_metal_hull_chain_visibility_repro.py for a standalone reproduction. + collider_state.contact_hull_stack[max_contact_pairs - 1, i_b] = 0 + for k_step in range(b_size - 1): + ii_lex = b_end - 2 - k_step + ci = collider_state.contact_lex_idx[ii_lex, i_b] + cu = collider_state.contact_sort_key[ci, i_b] + cv = collider_state.contact_proj_v[ci, i_b] + while k >= upper_start + 1: + idx_a = collider_state.contact_hull_stack[b_start + k - 2, i_b] + idx_b = collider_state.contact_hull_stack[b_start + k - 1, i_b] + au = collider_state.contact_sort_key[idx_a, i_b] + av = collider_state.contact_proj_v[idx_a, i_b] + bu = collider_state.contact_sort_key[idx_b, i_b] + bv = collider_state.contact_proj_v[idx_b, i_b] + cross = (bu - au) * (cv - av) - (bv - av) * (cu - au) + if cross <= hull_collinear_tol: + k -= 1 + else: + break + if ci != collider_state.contact_hull_stack[b_start, i_b] and k < b_size: + collider_state.contact_hull_stack[b_start + k, i_b] = ci + k += 1 + + for hk in range(k): + survivor_sort = collider_state.contact_hull_stack[b_start + hk, i_b] + survivor_orig = collider_state.contact_sort_idx[survivor_sort, i_b] + collider_state.contact_keep[survivor_orig, i_b] = 1 + + # Restore non-hull contacts whose penetration is much deeper than the hull boundary's max + # (PR #2831 switched from avg to max). Rationale: a contact whose penetration substantially + # exceeds the hull's deepest vertex represents a distinct physical support (deep body of a + # fork beyond its tines, deep middle of a long body) that the support-polygon argument + # doesn't actually authorize dropping (the argument only holds when all contacts share + # normal AND penetration). The 3x factor over the hull max is well above the typical ~1.x + # penetration spread on transient/rocking faces but well below the deep interior penetrations + # seen when a non-flat body rests inside its convex envelope. Indices here live in orig-space + # (because the cycle-permute is fused into the phase 3 below in opt 10 -- contact_data is + # still in pre-sort order, so we translate sort-space hull/bucket indices through + # contact_sort_idx). + hull_pen_max = gs.qd_float(0.0) + for hk in range(k): + survivor_sort = collider_state.contact_hull_stack[b_start + hk, i_b] + survivor_orig = collider_state.contact_sort_idx[survivor_sort, i_b] + p = collider_state.contact_data.penetration[survivor_orig, i_b] + if p > hull_pen_max: + hull_pen_max = p + deep_keep_threshold = prune_deep_penetration_ratio * hull_pen_max + for jj_idx in range(b_start, b_end): + orig = collider_state.contact_sort_idx[jj_idx, i_b] + if collider_state.contact_keep[orig, i_b] == 0: + if collider_state.contact_data.penetration[orig, i_b] > deep_keep_threshold: + collider_state.contact_keep[orig, i_b] = 1 + + b_start = b_end + + if tid == 0: + # Phase 3 (deskai6 opt 10): FUSED compact + spatial sort. Replaces: + # - dedup phase 3 (compact based on contact_keep) + # - func_clamp_and_sort_contacts (assign group-x sort key + insertion sort + cycle-permute) + # with a single permutation pass: compute a sort key per slot that pushes dropped contacts past the end + # (sentinel +inf) and orders kept contacts by their geom-pair group's x-pos (anchored to the first kept + # contact in the group, preserving narrowphase intra-group order via stable sort). Then sort sort-keys + + # sort-indices, then cycle-permute the 9 contact_data fields exactly once. + # + # On dex_hand this saves ~25 displaced contacts worth of field copies that used to be done in dedup phase 3 + # before clamp_and_sort re-permuted everything. Same 9 fields per contact (force & pair_idx still skipped + # per opt 8a; clamp_and_sort still runs in the grad path so its own opt 8b gate stays in place). + SENTINEL_BIG = gs.qd_float(1e30) + group_key = gs.qd_float(0.0) + prev_ga = -1 + prev_gb = -1 + for i in range(n_con): + if collider_state.contact_keep[i, i_b] != 0: + ga = collider_state.contact_data.geom_a[i, i_b] + gb = collider_state.contact_data.geom_b[i, i_b] + if ga != prev_ga or gb != prev_gb: + group_key = collider_state.contact_data.pos[i, i_b][0] + prev_ga = ga + prev_gb = gb + collider_state.contact_sort_key[i, i_b] = group_key + else: + collider_state.contact_sort_key[i, i_b] = SENTINEL_BIG + collider_state.contact_sort_idx[i, i_b] = i + + for i in range(1, n_con): + ck = collider_state.contact_sort_key[i, i_b] + if collider_state.contact_sort_key[i - 1, i_b] <= ck: + continue + ci = collider_state.contact_sort_idx[i, i_b] + j = i - 1 + while j >= 0: + if collider_state.contact_sort_key[j, i_b] <= ck: + break + collider_state.contact_sort_key[j + 1, i_b] = collider_state.contact_sort_key[j, i_b] + collider_state.contact_sort_idx[j + 1, i_b] = collider_state.contact_sort_idx[j, i_b] + j = j - 1 + collider_state.contact_sort_key[j + 1, i_b] = ck + collider_state.contact_sort_idx[j + 1, i_b] = ci + + # n_contacts is the count of non-sentinel sort keys (the dropped contacts sit at the tail and are ignored + # by downstream consumers via the new n_contacts). + n_kept = 0 + for i in range(n_con): + if collider_state.contact_sort_key[i, i_b] < SENTINEL_BIG: + n_kept += 1 + else: + break + collider_state.n_contacts[i_b] = n_kept + + # Cycle-permute contact_data fields into their sorted positions. Only n_kept positions need correct values; + # the tail (dropped) slots are scratched but not read again this step. + for i in range(n_kept): + if collider_state.contact_sort_idx[i, i_b] != i: + tmp_geom_a = collider_state.contact_data.geom_a[i, i_b] + tmp_geom_b = collider_state.contact_data.geom_b[i, i_b] + tmp_penetration = collider_state.contact_data.penetration[i, i_b] + tmp_normal = collider_state.contact_data.normal[i, i_b] + tmp_pos = collider_state.contact_data.pos[i, i_b] + tmp_friction = collider_state.contact_data.friction[i, i_b] + tmp_sol_params = collider_state.contact_data.sol_params[i, i_b] + tmp_link_a = collider_state.contact_data.link_a[i, i_b] + tmp_link_b = collider_state.contact_data.link_b[i, i_b] + + j = i + while collider_state.contact_sort_idx[j, i_b] != i: + src = collider_state.contact_sort_idx[j, i_b] + collider_state.contact_data.geom_a[j, i_b] = collider_state.contact_data.geom_a[src, i_b] + collider_state.contact_data.geom_b[j, i_b] = collider_state.contact_data.geom_b[src, i_b] + collider_state.contact_data.penetration[j, i_b] = collider_state.contact_data.penetration[ + src, i_b + ] + collider_state.contact_data.normal[j, i_b] = collider_state.contact_data.normal[src, i_b] + collider_state.contact_data.pos[j, i_b] = collider_state.contact_data.pos[src, i_b] + collider_state.contact_data.friction[j, i_b] = collider_state.contact_data.friction[src, i_b] + collider_state.contact_data.sol_params[j, i_b] = collider_state.contact_data.sol_params[ + src, i_b + ] + collider_state.contact_data.link_a[j, i_b] = collider_state.contact_data.link_a[src, i_b] + collider_state.contact_data.link_b[j, i_b] = collider_state.contact_data.link_b[src, i_b] + collider_state.contact_sort_idx[j, i_b] = j + j = src + + collider_state.contact_data.geom_a[j, i_b] = tmp_geom_a + collider_state.contact_data.geom_b[j, i_b] = tmp_geom_b + collider_state.contact_data.penetration[j, i_b] = tmp_penetration + collider_state.contact_data.normal[j, i_b] = tmp_normal + collider_state.contact_data.pos[j, i_b] = tmp_pos + collider_state.contact_data.friction[j, i_b] = tmp_friction + collider_state.contact_data.sol_params[j, i_b] = tmp_sol_params + collider_state.contact_data.link_a[j, i_b] = tmp_link_a + collider_state.contact_data.link_b[j, i_b] = tmp_link_b + collider_state.contact_sort_idx[j, i_b] = j @qd.kernel diff --git a/genesis/utils/array_class.py b/genesis/utils/array_class.py index 286858bee..466efaa3e 100644 --- a/genesis/utils/array_class.py +++ b/genesis/utils/array_class.py @@ -698,6 +698,7 @@ class ColliderState: contact_proj_v: qd.Tensor contact_keep: qd.Tensor contact_hull_stack: qd.Tensor + contact_lex_idx: qd.Tensor def get_collider_state( @@ -760,6 +761,7 @@ def get_collider_state( contact_proj_v=V(dtype=gs.qd_float, shape=prune_shape), contact_keep=V(dtype=gs.qd_int, shape=prune_shape), contact_hull_stack=V(dtype=gs.qd_int, shape=prune_shape), + contact_lex_idx=V(dtype=gs.qd_int, shape=prune_shape), ) diff --git a/tests/test_rigid_physics.py b/tests/test_rigid_physics.py index 45cd1d1b9..e2d3620bb 100644 --- a/tests/test_rigid_physics.py +++ b/tests/test_rigid_physics.py @@ -1374,6 +1374,7 @@ def test_contact_dedup(surface_kind, show_viewer): @pytest.mark.required +@pytest.mark.parametrize("backend", [gs.gpu]) def test_contact_pruning(show_viewer): GEOM_HALF_SIZE = 0.1 MARGIN = 1e-4 From edec9907584040357c84dc075c44b513ef5c12c2 Mon Sep 17 00:00:00 2001 From: Hugh Perkins Date: Mon, 25 May 2026 09:11:31 -0700 Subject: [PATCH 3/7] chore: fix F541 f-string without placeholders in CPU dedup-gate log --- genesis/engine/solvers/rigid/collider/collider.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/genesis/engine/solvers/rigid/collider/collider.py b/genesis/engine/solvers/rigid/collider/collider.py index 2eae75bf8..843d7fbd7 100644 --- a/genesis/engine/solvers/rigid/collider/collider.py +++ b/genesis/engine/solvers/rigid/collider/collider.py @@ -192,8 +192,8 @@ def _init_static_config(self) -> None: # saturation). if link_pair_pruning_supported and gs.backend == gs.cpu: gs.logger.info( - f"Disabling link-pair contact dedup: cooperative kernel doesn't help on CPU (Quadrants serializes " - f"the per-env loop)." + "Disabling link-pair contact dedup: cooperative kernel doesn't help on CPU (Quadrants serializes " + "the per-env loop)." ) link_pair_pruning_supported = False if link_pair_pruning_supported and self._solver.n_envs > _DEDUP_MAX_N_ENVS: From c5a35798876c96a1eb1a7c4798584ad0621be2e7 Mon Sep 17 00:00:00 2001 From: Alexis Duburcq Date: Mon, 25 May 2026 19:35:51 +0200 Subject: [PATCH 4/7] Fuse pruning and clamp + sort. --- .../engine/solvers/rigid/collider/collider.py | 28 +- .../engine/solvers/rigid/collider/contact.py | 885 +++++++++--------- 2 files changed, 458 insertions(+), 455 deletions(-) diff --git a/genesis/engine/solvers/rigid/collider/collider.py b/genesis/engine/solvers/rigid/collider/collider.py index c6aa09b9d..aaba5d891 100644 --- a/genesis/engine/solvers/rigid/collider/collider.py +++ b/genesis/engine/solvers/rigid/collider/collider.py @@ -47,8 +47,7 @@ func_contact_orthogonals, func_rotate_frame, func_set_upstream_grad, - func_clamp_and_sort_contacts, - func_prune_contacts, + func_clamp_prune_and_sort_contacts, ) from . import narrowphase from .narrowphase import ( @@ -819,24 +818,13 @@ def detection(self) -> None: self._solver._errno, ) - if ( - self._collider_static_config.link_pair_pruning_supported - and self._solver._options.contact_pruning_tolerance is not None - and not self._solver._static_rigid_sim_config.requires_grad - ): - func_prune_contacts( - self._collider_state, - self._collider_info, - self._solver._rigid_global_info, - self._solver._static_rigid_sim_config, - ) - - if self._use_split_narrowphase: - func_clamp_and_sort_contacts( - self._collider_state, - self._collider_info, - self._solver._static_rigid_sim_config, - ) + func_clamp_prune_and_sort_contacts( + self._collider_state, + self._collider_info, + self._solver._rigid_global_info, + self._solver._static_rigid_sim_config, + self._collider_static_config, + ) def get_contacts(self, as_tensor: bool = True, to_torch: bool = True, keep_batch_dim: bool = False): # Early return if already pre-computed diff --git a/genesis/engine/solvers/rigid/collider/contact.py b/genesis/engine/solvers/rigid/collider/contact.py index 0fa8940f5..0f682df8f 100644 --- a/genesis/engine/solvers/rigid/collider/contact.py +++ b/genesis/engine/solvers/rigid/collider/contact.py @@ -536,119 +536,23 @@ def func_rotate_frame( @qd.kernel(fastcache=True) -def func_clamp_and_sort_contacts( +def func_clamp_prune_and_sort_contacts( collider_state: array_class.ColliderState, collider_info: array_class.ColliderInfo, + rigid_global_info: array_class.RigidGlobalInfo, static_rigid_sim_config: qd.template(), + collider_static_config: qd.template(), ): - """Sort contacts spatially by x-coordinate, moving entire geom-pair groups as units, while clamping number of - contacts to avoid unbounded memory access. - - When contacts are added with use_atomic=True from parallel narrowphase kernels, the counter may exceed the array - bounds even though only in-bounds entries are actually written. The clamp brings the counter back to the valid - range before sorting. - - Contacts from the same geom pair are contiguous after narrowphase. We assign every contact in a group the - x-position of the group's first contact. The stable insertion sort then reorders groups spatially while preserving - the narrowphase ordering within each group. + """Merged clamp + (optional) link-pair pruning + (optional) x-position sort, in one per-env loop pass. - Two-phase approach to minimise memory traffic: - 1. Insertion sort on a compact (key, index) pair — 8 bytes per swap instead of moving all 11 contact fields - 2. In-place cycle-following permutation that moves each contact record exactly once - """ - _B = collider_state.n_contacts.shape[0] - max_contact_pairs = collider_info.max_contact_pairs[None] + Phases per env (gated at compile time by ``collider_static_config``): + - Always: clamp ``n_contacts`` to ``max_contact_pairs`` to absorb the parallel-narrowphase overcount. + - If ``link_pair_pruning_supported and not requires_grad``: prune redundant contacts via 2D convex hull on the + contact-patch plane (skipped at runtime when ``contact_pruning_tolerance`` is 0). + - If ``has_non_box_plane_convex_convex and backend != cpu``: spatial sort by x-position with geom-pair groups + treated as units (provides spatial locality for the constraint solver). - qd.loop_config(serialize=static_rigid_sim_config.para_level < gs.PARA_LEVEL.ALL) - for i_b in range(_B): - n_con = qd.min(collider_state.n_contacts[i_b], max_contact_pairs) - collider_state.n_contacts[i_b] = n_con - - # Phase 1: initialise and insertion-sort the (key, idx) arrays. - group_key = gs.qd_float(0.0) - for i in range(n_con): - ga = collider_state.contact_data.geom_a[i, i_b] - gb = collider_state.contact_data.geom_b[i, i_b] - if ( - i == 0 - or ga != collider_state.contact_data.geom_a[i - 1, i_b] - or gb != collider_state.contact_data.geom_b[i - 1, i_b] - ): - group_key = collider_state.contact_data.pos[i, i_b][0] - collider_state.contact_sort_key[i, i_b] = group_key - collider_state.contact_sort_idx[i, i_b] = i - - for i in range(1, n_con): - curr_key = collider_state.contact_sort_key[i, i_b] - if collider_state.contact_sort_key[i - 1, i_b] <= curr_key: - continue - - curr_idx = collider_state.contact_sort_idx[i, i_b] - j = i - 1 - while j >= 0: - if collider_state.contact_sort_key[j, i_b] <= curr_key: - break - collider_state.contact_sort_key[j + 1, i_b] = collider_state.contact_sort_key[j, i_b] - collider_state.contact_sort_idx[j + 1, i_b] = collider_state.contact_sort_idx[j, i_b] - j = j - 1 - collider_state.contact_sort_key[j + 1, i_b] = curr_key - collider_state.contact_sort_idx[j + 1, i_b] = curr_idx - - # Phase 2: apply permutation in-place via cycle decomposition. - # Each contact is read and written exactly once. - for i in range(n_con): - if collider_state.contact_sort_idx[i, i_b] != i: - tmp_geom_a = collider_state.contact_data.geom_a[i, i_b] - tmp_geom_b = collider_state.contact_data.geom_b[i, i_b] - tmp_penetration = collider_state.contact_data.penetration[i, i_b] - tmp_normal = collider_state.contact_data.normal[i, i_b] - tmp_pos = collider_state.contact_data.pos[i, i_b] - tmp_friction = collider_state.contact_data.friction[i, i_b] - tmp_sol_params = collider_state.contact_data.sol_params[i, i_b] - tmp_force = collider_state.contact_data.force[i, i_b] - tmp_link_a = collider_state.contact_data.link_a[i, i_b] - tmp_link_b = collider_state.contact_data.link_b[i, i_b] - tmp_pair_idx = collider_state.contact_data.pair_idx[i, i_b] - - j = i - while collider_state.contact_sort_idx[j, i_b] != i: - src = collider_state.contact_sort_idx[j, i_b] - collider_state.contact_data.geom_a[j, i_b] = collider_state.contact_data.geom_a[src, i_b] - collider_state.contact_data.geom_b[j, i_b] = collider_state.contact_data.geom_b[src, i_b] - collider_state.contact_data.penetration[j, i_b] = collider_state.contact_data.penetration[src, i_b] - collider_state.contact_data.normal[j, i_b] = collider_state.contact_data.normal[src, i_b] - collider_state.contact_data.pos[j, i_b] = collider_state.contact_data.pos[src, i_b] - collider_state.contact_data.friction[j, i_b] = collider_state.contact_data.friction[src, i_b] - collider_state.contact_data.sol_params[j, i_b] = collider_state.contact_data.sol_params[src, i_b] - collider_state.contact_data.force[j, i_b] = collider_state.contact_data.force[src, i_b] - collider_state.contact_data.link_a[j, i_b] = collider_state.contact_data.link_a[src, i_b] - collider_state.contact_data.link_b[j, i_b] = collider_state.contact_data.link_b[src, i_b] - collider_state.contact_data.pair_idx[j, i_b] = collider_state.contact_data.pair_idx[src, i_b] - collider_state.contact_sort_idx[j, i_b] = j - j = src - - collider_state.contact_data.geom_a[j, i_b] = tmp_geom_a - collider_state.contact_data.geom_b[j, i_b] = tmp_geom_b - collider_state.contact_data.penetration[j, i_b] = tmp_penetration - collider_state.contact_data.normal[j, i_b] = tmp_normal - collider_state.contact_data.pos[j, i_b] = tmp_pos - collider_state.contact_data.friction[j, i_b] = tmp_friction - collider_state.contact_data.sol_params[j, i_b] = tmp_sol_params - collider_state.contact_data.force[j, i_b] = tmp_force - collider_state.contact_data.link_a[j, i_b] = tmp_link_a - collider_state.contact_data.link_b[j, i_b] = tmp_link_b - collider_state.contact_data.pair_idx[j, i_b] = tmp_pair_idx - collider_state.contact_sort_idx[j, i_b] = j - - -@qd.kernel(fastcache=True) -def func_prune_contacts( - collider_state: array_class.ColliderState, - collider_info: array_class.ColliderInfo, - rigid_global_info: array_class.RigidGlobalInfo, - static_rigid_sim_config: qd.template(), -): - """Prune redundant contacts per link-pair via support-polygon (2D convex hull on the contact-patch plane). + The pruning logic groups contacts by canonical (min(link_a, link_b), max(link_a, link_b)). Operates after func_clamp_and_sort_contacts. Groups contacts by canonical (min(link_a, link_b), max(link_a, link_b)) and, for each bucket of >= 5 contacts whose positions lie in a single plane (perpendicular to the @@ -679,335 +583,446 @@ def func_prune_contacts( for i_b in range(_B): n_con = qd.min(collider_state.n_contacts[i_b], max_contact_pairs) collider_state.n_contacts[i_b] = n_con - if n_con < 5: - continue - - # Phase 1a: insertion-sort indices by canonical (min_link, max_link) key. - for i in range(n_con): - la = collider_state.contact_data.link_a[i, i_b] - lb = collider_state.contact_data.link_b[i, i_b] - la_min = qd.min(la, lb) - la_max = qd.max(la, lb) - collider_state.contact_sort_key[i, i_b] = qd.cast(la_min, gs.qd_float) * LP_KEY_STRIDE + qd.cast( - la_max, gs.qd_float - ) - collider_state.contact_sort_idx[i, i_b] = i - - for i in range(1, n_con): - ck = collider_state.contact_sort_key[i, i_b] - if collider_state.contact_sort_key[i - 1, i_b] <= ck: - continue - ci = collider_state.contact_sort_idx[i, i_b] - j = i - 1 - while j >= 0: - if collider_state.contact_sort_key[j, i_b] <= ck: - break - collider_state.contact_sort_key[j + 1, i_b] = collider_state.contact_sort_key[j, i_b] - collider_state.contact_sort_idx[j + 1, i_b] = collider_state.contact_sort_idx[j, i_b] - j = j - 1 - collider_state.contact_sort_key[j + 1, i_b] = ck - collider_state.contact_sort_idx[j + 1, i_b] = ci - - # Phase 1b: apply permutation in-place via cycle decomposition (11 fields). - for i in range(n_con): - if collider_state.contact_sort_idx[i, i_b] != i: - tmp_geom_a = collider_state.contact_data.geom_a[i, i_b] - tmp_geom_b = collider_state.contact_data.geom_b[i, i_b] - tmp_penetration = collider_state.contact_data.penetration[i, i_b] - tmp_normal = collider_state.contact_data.normal[i, i_b] - tmp_pos = collider_state.contact_data.pos[i, i_b] - tmp_friction = collider_state.contact_data.friction[i, i_b] - tmp_sol_params = collider_state.contact_data.sol_params[i, i_b] - tmp_force = collider_state.contact_data.force[i, i_b] - tmp_link_a = collider_state.contact_data.link_a[i, i_b] - tmp_link_b = collider_state.contact_data.link_b[i, i_b] - tmp_pair_idx = collider_state.contact_data.pair_idx[i, i_b] - - j = i - while collider_state.contact_sort_idx[j, i_b] != i: - src = collider_state.contact_sort_idx[j, i_b] - collider_state.contact_data.geom_a[j, i_b] = collider_state.contact_data.geom_a[src, i_b] - collider_state.contact_data.geom_b[j, i_b] = collider_state.contact_data.geom_b[src, i_b] - collider_state.contact_data.penetration[j, i_b] = collider_state.contact_data.penetration[src, i_b] - collider_state.contact_data.normal[j, i_b] = collider_state.contact_data.normal[src, i_b] - collider_state.contact_data.pos[j, i_b] = collider_state.contact_data.pos[src, i_b] - collider_state.contact_data.friction[j, i_b] = collider_state.contact_data.friction[src, i_b] - collider_state.contact_data.sol_params[j, i_b] = collider_state.contact_data.sol_params[src, i_b] - collider_state.contact_data.force[j, i_b] = collider_state.contact_data.force[src, i_b] - collider_state.contact_data.link_a[j, i_b] = collider_state.contact_data.link_a[src, i_b] - collider_state.contact_data.link_b[j, i_b] = collider_state.contact_data.link_b[src, i_b] - collider_state.contact_data.pair_idx[j, i_b] = collider_state.contact_data.pair_idx[src, i_b] + + # === Pruning phase (link-pair support polygon). Gated by static config: only emitted when the + # scene has multi-geom links / nonconvex / terrain, and not in autodiff mode. Skipped at runtime + # when contact_pruning_tolerance is 0. + if qd.static(collider_static_config.link_pair_pruning_supported and not static_rigid_sim_config.requires_grad): + if n_con >= 5 and tol > gs.qd_float(0.0): + # Phase 1a: insertion-sort indices by canonical (min_link, max_link) key. + for i in range(n_con): + la = collider_state.contact_data.link_a[i, i_b] + lb = collider_state.contact_data.link_b[i, i_b] + la_min = qd.min(la, lb) + la_max = qd.max(la, lb) + collider_state.contact_sort_key[i, i_b] = qd.cast(la_min, gs.qd_float) * LP_KEY_STRIDE + qd.cast( + la_max, gs.qd_float + ) + collider_state.contact_sort_idx[i, i_b] = i + + for i in range(1, n_con): + ck = collider_state.contact_sort_key[i, i_b] + if collider_state.contact_sort_key[i - 1, i_b] <= ck: + continue + ci = collider_state.contact_sort_idx[i, i_b] + j = i - 1 + while j >= 0: + if collider_state.contact_sort_key[j, i_b] <= ck: + break + collider_state.contact_sort_key[j + 1, i_b] = collider_state.contact_sort_key[j, i_b] + collider_state.contact_sort_idx[j + 1, i_b] = collider_state.contact_sort_idx[j, i_b] + j = j - 1 + collider_state.contact_sort_key[j + 1, i_b] = ck + collider_state.contact_sort_idx[j + 1, i_b] = ci + + # Phase 1b: apply permutation in-place via cycle decomposition (11 fields). + for i in range(n_con): + if collider_state.contact_sort_idx[i, i_b] != i: + tmp_geom_a = collider_state.contact_data.geom_a[i, i_b] + tmp_geom_b = collider_state.contact_data.geom_b[i, i_b] + tmp_penetration = collider_state.contact_data.penetration[i, i_b] + tmp_normal = collider_state.contact_data.normal[i, i_b] + tmp_pos = collider_state.contact_data.pos[i, i_b] + tmp_friction = collider_state.contact_data.friction[i, i_b] + tmp_sol_params = collider_state.contact_data.sol_params[i, i_b] + tmp_force = collider_state.contact_data.force[i, i_b] + tmp_link_a = collider_state.contact_data.link_a[i, i_b] + tmp_link_b = collider_state.contact_data.link_b[i, i_b] + tmp_pair_idx = collider_state.contact_data.pair_idx[i, i_b] + + j = i + while collider_state.contact_sort_idx[j, i_b] != i: + src = collider_state.contact_sort_idx[j, i_b] + collider_state.contact_data.geom_a[j, i_b] = collider_state.contact_data.geom_a[src, i_b] + collider_state.contact_data.geom_b[j, i_b] = collider_state.contact_data.geom_b[src, i_b] + collider_state.contact_data.penetration[j, i_b] = collider_state.contact_data.penetration[ + src, i_b + ] + collider_state.contact_data.normal[j, i_b] = collider_state.contact_data.normal[src, i_b] + collider_state.contact_data.pos[j, i_b] = collider_state.contact_data.pos[src, i_b] + collider_state.contact_data.friction[j, i_b] = collider_state.contact_data.friction[ + src, i_b + ] + collider_state.contact_data.sol_params[j, i_b] = collider_state.contact_data.sol_params[ + src, i_b + ] + collider_state.contact_data.force[j, i_b] = collider_state.contact_data.force[src, i_b] + collider_state.contact_data.link_a[j, i_b] = collider_state.contact_data.link_a[src, i_b] + collider_state.contact_data.link_b[j, i_b] = collider_state.contact_data.link_b[src, i_b] + collider_state.contact_data.pair_idx[j, i_b] = collider_state.contact_data.pair_idx[ + src, i_b + ] + collider_state.contact_sort_idx[j, i_b] = j + j = src + + collider_state.contact_data.geom_a[j, i_b] = tmp_geom_a + collider_state.contact_data.geom_b[j, i_b] = tmp_geom_b + collider_state.contact_data.penetration[j, i_b] = tmp_penetration + collider_state.contact_data.normal[j, i_b] = tmp_normal + collider_state.contact_data.pos[j, i_b] = tmp_pos + collider_state.contact_data.friction[j, i_b] = tmp_friction + collider_state.contact_data.sol_params[j, i_b] = tmp_sol_params + collider_state.contact_data.force[j, i_b] = tmp_force + collider_state.contact_data.link_a[j, i_b] = tmp_link_a + collider_state.contact_data.link_b[j, i_b] = tmp_link_b + collider_state.contact_data.pair_idx[j, i_b] = tmp_pair_idx + collider_state.contact_sort_idx[j, i_b] = j + + # Default: keep everything. Buckets that pass the gates flip their entries to drop and then mark only + # hull-vertex contacts as keep again. + for i in range(n_con): + collider_state.contact_keep[i, i_b] = 1 + + # Phase 2: walk link-pair buckets. + b_start = 0 + while b_start < n_con: + la0 = collider_state.contact_data.link_a[b_start, i_b] + lb0 = collider_state.contact_data.link_b[b_start, i_b] + la0_min = qd.min(la0, lb0) + la0_max = qd.max(la0, lb0) + b_end = b_start + 1 + while b_end < n_con: + la = collider_state.contact_data.link_a[b_end, i_b] + lb = collider_state.contact_data.link_b[b_end, i_b] + if qd.min(la, lb) != la0_min or qd.max(la, lb) != la0_max: + break + b_end += 1 + b_size = b_end - b_start + + if b_size >= 5: + # Mean normal (folded to the hemisphere of contact b_start) and centroid. + ref_n = collider_state.contact_data.normal[b_start, i_b] + rnx = ref_n[0] + rny = ref_n[1] + rnz = ref_n[2] + mnx = gs.qd_float(0.0) + mny = gs.qd_float(0.0) + mnz = gs.qd_float(0.0) + cx = gs.qd_float(0.0) + cy = gs.qd_float(0.0) + cz = gs.qd_float(0.0) + for i in range(b_start, b_end): + n_i = collider_state.contact_data.normal[i, i_b] + s = gs.qd_float(1.0) + if rnx * n_i[0] + rny * n_i[1] + rnz * n_i[2] < gs.qd_float(0.0): + s = gs.qd_float(-1.0) + mnx += s * n_i[0] + mny += s * n_i[1] + mnz += s * n_i[2] + p_i = collider_state.contact_data.pos[i, i_b] + cx += p_i[0] + cy += p_i[1] + cz += p_i[2] + inv_n = gs.qd_float(1.0) / qd.cast(b_size, gs.qd_float) + cx *= inv_n + cy *= inv_n + cz *= inv_n + mnrm = qd.sqrt(mnx * mnx + mny * mny + mnz * mnz) + + # Hoisted out so the hull-build branch below can read it (quadrants scopes per if). + max_in_plane_r2 = gs.qd_float(0.0) + + coplanar = mnrm > EPS + if coplanar: + mnx /= mnrm + mny /= mnrm + mnz /= mnrm + + # Depth coplanarity: positions must lie in a single plane perpendicular to the mean normal. No + # per-contact normal check: a contact whose normal is diagonal (e.g. an edge-vs-edge contact at a + # corner of the contact patch) still participates in the 2D hull because its position is a vertex of + # the patch; dropping a collinear-edge contact in the same bucket is justified by the positional + # support polygon regardless of that contact's normal direction. + max_depth = gs.qd_float(0.0) + for i in range(b_start, b_end): + p_i = collider_state.contact_data.pos[i, i_b] + dx = p_i[0] - cx + dy = p_i[1] - cy + dz = p_i[2] - cz + depth = qd.abs(dx * mnx + dy * mny + dz * mnz) + if depth > max_depth: + max_depth = depth + r2 = dx * dx + dy * dy + dz * dz - depth * depth + if r2 > max_in_plane_r2: + max_in_plane_r2 = r2 + + if max_depth > tol * qd.sqrt(max_in_plane_r2): + coplanar = False + + if coplanar: + # In-plane basis (u, v): seed from the world axis least-aligned with mean normal. + abs_mnx = qd.abs(mnx) + abs_mny = qd.abs(mny) + abs_mnz = qd.abs(mnz) + ax = gs.qd_float(1.0) + ay = gs.qd_float(0.0) + az = gs.qd_float(0.0) + if abs_mny < abs_mnx and abs_mny < abs_mnz: + ax = gs.qd_float(0.0) + ay = gs.qd_float(1.0) + az = gs.qd_float(0.0) + elif abs_mnz < abs_mnx and abs_mnz <= abs_mny: + ax = gs.qd_float(0.0) + ay = gs.qd_float(0.0) + az = gs.qd_float(1.0) + adn = ax * mnx + ay * mny + az * mnz + ux = ax - adn * mnx + uy = ay - adn * mny + uz = az - adn * mnz + unrm = qd.sqrt(ux * ux + uy * uy + uz * uz) + ux /= unrm + uy /= unrm + uz /= unrm + vx = mny * uz - mnz * uy + vy = mnz * ux - mnx * uz + vz = mnx * uy - mny * ux + + # Project bucket contacts to (u, v). Reuse contact_sort_key for u (phase 1 has consumed it); + # v goes into the dedicated scratch buffer. + for i in range(b_start, b_end): + p_i = collider_state.contact_data.pos[i, i_b] + collider_state.contact_sort_key[i, i_b] = p_i[0] * ux + p_i[1] * uy + p_i[2] * uz + collider_state.contact_proj_v[i, i_b] = p_i[0] * vx + p_i[1] * vy + p_i[2] * vz + + # Sort bucket indices lexicographically by (u, v), with a tolerance on u so that contacts whose u + # values differ only by float noise (or by sub-millimeter physics noise from MPR perturbations) sort + # by v. Without the tolerance, the wrong point pops from a 3-collinear triplet when the corner and + # the mid-edge have u values that differ by a few microns and the mid-edge happens to sort first. + sort_u_tol = gs.qd_float(1e-3) * qd.sqrt(max_in_plane_r2) + for i in range(b_start, b_end): + collider_state.contact_sort_idx[i, i_b] = i + for i in range(b_start + 1, b_end): + ci = collider_state.contact_sort_idx[i, i_b] + cu = collider_state.contact_sort_key[ci, i_b] + cv = collider_state.contact_proj_v[ci, i_b] + j = i - 1 + while j >= b_start: + pj = collider_state.contact_sort_idx[j, i_b] + pu = collider_state.contact_sort_key[pj, i_b] + pv = collider_state.contact_proj_v[pj, i_b] + if (pu < cu - sort_u_tol) or (qd.abs(pu - cu) <= sort_u_tol and pv <= cv): + break + collider_state.contact_sort_idx[j + 1, i_b] = pj + j -= 1 + collider_state.contact_sort_idx[j + 1, i_b] = ci + + # Mark all bucket entries as drop, then mark hull vertices as keep. + for i in range(b_start, b_end): + collider_state.contact_keep[i, i_b] = 0 + + # Collinearity threshold for hull pops, scaled to the bucket extent. A pure "cross <= 0" check fails + # on numerically-near-collinear edge points (cross is a tiny positive epsilon from float roundoff), + # so genuine midpoints would survive as spurious hull vertices. + hull_collinear_tol = tol * max_in_plane_r2 + + # Andrew's monotone chain. Stack lives in contact_hull_stack[b_start..b_start + k). + k = 0 + for i in range(b_start, b_end): + ci = collider_state.contact_sort_idx[i, i_b] + cu = collider_state.contact_sort_key[ci, i_b] + cv = collider_state.contact_proj_v[ci, i_b] + while k >= 2: + idx_a = collider_state.contact_hull_stack[b_start + k - 2, i_b] + idx_b = collider_state.contact_hull_stack[b_start + k - 1, i_b] + au = collider_state.contact_sort_key[idx_a, i_b] + av = collider_state.contact_proj_v[idx_a, i_b] + bu = collider_state.contact_sort_key[idx_b, i_b] + bv = collider_state.contact_proj_v[idx_b, i_b] + cross = (bu - au) * (cv - av) - (bv - av) * (cu - au) + if cross <= hull_collinear_tol: + k -= 1 + else: + break + collider_state.contact_hull_stack[b_start + k, i_b] = ci + k += 1 + + upper_start = k + # Memory-fence for a Quadrants codegen issue on parallel envs (Metal backend, _B >= 2): without an + # explicit barrier between the lower-hull and upper-hull passes, the upper-hull pop-loop's reads + # of contact_hull_stack don't observe the writes from the lower hull, so its cross-product / + # pop-check effectively runs on stale data and every candidate is kept, producing a hull whose + # size equals the bucket size. + if qd.static(static_rigid_sim_config.backend == gs.metal): + qd.simt.block.sync() + # quadrants range supports 1 or 2 args only; iterate sorted indices backward over + # [b_start, b_end - 2] by reflecting the index. + for k_step in range(b_size - 1): + ii = b_end - 2 - k_step + ci = collider_state.contact_sort_idx[ii, i_b] + cu = collider_state.contact_sort_key[ci, i_b] + cv = collider_state.contact_proj_v[ci, i_b] + while k >= upper_start + 1: + idx_a = collider_state.contact_hull_stack[b_start + k - 2, i_b] + idx_b = collider_state.contact_hull_stack[b_start + k - 1, i_b] + au = collider_state.contact_sort_key[idx_a, i_b] + av = collider_state.contact_proj_v[idx_a, i_b] + bu = collider_state.contact_sort_key[idx_b, i_b] + bv = collider_state.contact_proj_v[idx_b, i_b] + cross = (bu - au) * (cv - av) - (bv - av) * (cu - au) + if cross <= hull_collinear_tol: + k -= 1 + else: + break + # The closing iteration of the upper hull visits the leftmost point, which already sits at + # stack[b_start] from the lower hull. Skipping that push, plus the k < b_size guard, bounds k to + # b_size and keeps the write index within max_contact_pairs even for buckets where the lower- + # hull pass already kept all b_size points (downward-convex layouts: every lex-sorted triple + # makes a left turn so nothing gets popped, then the upper-hull pass tries to push a duplicate + # of an already-kept lower-hull vertex). + if ci != collider_state.contact_hull_stack[b_start, i_b] and k < b_size: + collider_state.contact_hull_stack[b_start + k, i_b] = ci + k += 1 + + # Chain holds k unique hull vertices (no trailing duplicate). + for hk in range(k): + survivor = collider_state.contact_hull_stack[b_start + hk, i_b] + collider_state.contact_keep[survivor, i_b] = 1 + + # Restore non-hull contacts whose penetration is much deeper than the hull boundary's average. The + # support-polygon argument says interior contacts are wrench-redundant only when ALL contacts share + # the same normal and penetration; a contact with substantially higher penetration than the hull's + # average represents a distinct physical support (the body of a fork resting beyond its tines, the + # deep middle of a long body) and dropping it lets the body sink into the surface. The 3x factor is + # well above the typical ~1.x penetration spread on transient/rocking faces (so non-uniform- + # penetration buckets like irregular mesh contacts keep only the hull) but well below the deep + # interior penetrations seen when a non-flat body rests inside its convex envelope (so genuine deep + # supports are restored). + hull_pen_max = gs.qd_float(0.0) + for hk in range(k): + survivor = collider_state.contact_hull_stack[b_start + hk, i_b] + p = collider_state.contact_data.penetration[survivor, i_b] + if p > hull_pen_max: + hull_pen_max = p + deep_keep_threshold = prune_deep_penetration_ratio * hull_pen_max + for i in range(b_start, b_end): + if collider_state.contact_keep[i, i_b] == 0: + if collider_state.contact_data.penetration[i, i_b] > deep_keep_threshold: + collider_state.contact_keep[i, i_b] = 1 + + b_start = b_end + + # Phase 3: compact kept contacts to the front. + write = 0 + for read in range(n_con): + if collider_state.contact_keep[read, i_b] != 0: + if write != read: + collider_state.contact_data.geom_a[write, i_b] = collider_state.contact_data.geom_a[ + read, i_b + ] + collider_state.contact_data.geom_b[write, i_b] = collider_state.contact_data.geom_b[ + read, i_b + ] + collider_state.contact_data.penetration[write, i_b] = ( + collider_state.contact_data.penetration[read, i_b] + ) + collider_state.contact_data.normal[write, i_b] = collider_state.contact_data.normal[ + read, i_b + ] + collider_state.contact_data.pos[write, i_b] = collider_state.contact_data.pos[read, i_b] + collider_state.contact_data.friction[write, i_b] = collider_state.contact_data.friction[ + read, i_b + ] + collider_state.contact_data.sol_params[write, i_b] = collider_state.contact_data.sol_params[ + read, i_b + ] + collider_state.contact_data.force[write, i_b] = collider_state.contact_data.force[read, i_b] + collider_state.contact_data.link_a[write, i_b] = collider_state.contact_data.link_a[ + read, i_b + ] + collider_state.contact_data.link_b[write, i_b] = collider_state.contact_data.link_b[ + read, i_b + ] + collider_state.contact_data.pair_idx[write, i_b] = collider_state.contact_data.pair_idx[ + read, i_b + ] + write += 1 + collider_state.n_contacts[i_b] = write + + # === Spatial sort by x-position with geom-pair grouping. Gated on the same condition that decided + # whether to run the split narrowphase (Genesis Collider._use_split_narrowphase). + if qd.static( + collider_static_config.has_non_box_plane_convex_convex and static_rigid_sim_config.backend != gs.cpu + ): + n_con = collider_state.n_contacts[i_b] + # Phase 1: initialise and insertion-sort the (key, idx) arrays. + group_key = gs.qd_float(0.0) + for i in range(n_con): + ga = collider_state.contact_data.geom_a[i, i_b] + gb = collider_state.contact_data.geom_b[i, i_b] + if ( + i == 0 + or ga != collider_state.contact_data.geom_a[i - 1, i_b] + or gb != collider_state.contact_data.geom_b[i - 1, i_b] + ): + group_key = collider_state.contact_data.pos[i, i_b][0] + collider_state.contact_sort_key[i, i_b] = group_key + collider_state.contact_sort_idx[i, i_b] = i + + for i in range(1, n_con): + curr_key = collider_state.contact_sort_key[i, i_b] + if collider_state.contact_sort_key[i - 1, i_b] <= curr_key: + continue + + curr_idx = collider_state.contact_sort_idx[i, i_b] + j = i - 1 + while j >= 0: + if collider_state.contact_sort_key[j, i_b] <= curr_key: + break + collider_state.contact_sort_key[j + 1, i_b] = collider_state.contact_sort_key[j, i_b] + collider_state.contact_sort_idx[j + 1, i_b] = collider_state.contact_sort_idx[j, i_b] + j = j - 1 + collider_state.contact_sort_key[j + 1, i_b] = curr_key + collider_state.contact_sort_idx[j + 1, i_b] = curr_idx + + # Phase 2: apply permutation in-place via cycle decomposition. + # Each contact is read and written exactly once. + for i in range(n_con): + if collider_state.contact_sort_idx[i, i_b] != i: + tmp_geom_a = collider_state.contact_data.geom_a[i, i_b] + tmp_geom_b = collider_state.contact_data.geom_b[i, i_b] + tmp_penetration = collider_state.contact_data.penetration[i, i_b] + tmp_normal = collider_state.contact_data.normal[i, i_b] + tmp_pos = collider_state.contact_data.pos[i, i_b] + tmp_friction = collider_state.contact_data.friction[i, i_b] + tmp_sol_params = collider_state.contact_data.sol_params[i, i_b] + tmp_force = collider_state.contact_data.force[i, i_b] + tmp_link_a = collider_state.contact_data.link_a[i, i_b] + tmp_link_b = collider_state.contact_data.link_b[i, i_b] + tmp_pair_idx = collider_state.contact_data.pair_idx[i, i_b] + + j = i + while collider_state.contact_sort_idx[j, i_b] != i: + src = collider_state.contact_sort_idx[j, i_b] + collider_state.contact_data.geom_a[j, i_b] = collider_state.contact_data.geom_a[src, i_b] + collider_state.contact_data.geom_b[j, i_b] = collider_state.contact_data.geom_b[src, i_b] + collider_state.contact_data.penetration[j, i_b] = collider_state.contact_data.penetration[ + src, i_b + ] + collider_state.contact_data.normal[j, i_b] = collider_state.contact_data.normal[src, i_b] + collider_state.contact_data.pos[j, i_b] = collider_state.contact_data.pos[src, i_b] + collider_state.contact_data.friction[j, i_b] = collider_state.contact_data.friction[src, i_b] + collider_state.contact_data.sol_params[j, i_b] = collider_state.contact_data.sol_params[ + src, i_b + ] + collider_state.contact_data.force[j, i_b] = collider_state.contact_data.force[src, i_b] + collider_state.contact_data.link_a[j, i_b] = collider_state.contact_data.link_a[src, i_b] + collider_state.contact_data.link_b[j, i_b] = collider_state.contact_data.link_b[src, i_b] + collider_state.contact_data.pair_idx[j, i_b] = collider_state.contact_data.pair_idx[src, i_b] + collider_state.contact_sort_idx[j, i_b] = j + j = src + + collider_state.contact_data.geom_a[j, i_b] = tmp_geom_a + collider_state.contact_data.geom_b[j, i_b] = tmp_geom_b + collider_state.contact_data.penetration[j, i_b] = tmp_penetration + collider_state.contact_data.normal[j, i_b] = tmp_normal + collider_state.contact_data.pos[j, i_b] = tmp_pos + collider_state.contact_data.friction[j, i_b] = tmp_friction + collider_state.contact_data.sol_params[j, i_b] = tmp_sol_params + collider_state.contact_data.force[j, i_b] = tmp_force + collider_state.contact_data.link_a[j, i_b] = tmp_link_a + collider_state.contact_data.link_b[j, i_b] = tmp_link_b + collider_state.contact_data.pair_idx[j, i_b] = tmp_pair_idx collider_state.contact_sort_idx[j, i_b] = j - j = src - - collider_state.contact_data.geom_a[j, i_b] = tmp_geom_a - collider_state.contact_data.geom_b[j, i_b] = tmp_geom_b - collider_state.contact_data.penetration[j, i_b] = tmp_penetration - collider_state.contact_data.normal[j, i_b] = tmp_normal - collider_state.contact_data.pos[j, i_b] = tmp_pos - collider_state.contact_data.friction[j, i_b] = tmp_friction - collider_state.contact_data.sol_params[j, i_b] = tmp_sol_params - collider_state.contact_data.force[j, i_b] = tmp_force - collider_state.contact_data.link_a[j, i_b] = tmp_link_a - collider_state.contact_data.link_b[j, i_b] = tmp_link_b - collider_state.contact_data.pair_idx[j, i_b] = tmp_pair_idx - collider_state.contact_sort_idx[j, i_b] = j - - # Default: keep everything. Buckets that pass the gates flip their entries to drop and then mark only - # hull-vertex contacts as keep again. - for i in range(n_con): - collider_state.contact_keep[i, i_b] = 1 - - # Phase 2: walk link-pair buckets. - b_start = 0 - while b_start < n_con: - la0 = collider_state.contact_data.link_a[b_start, i_b] - lb0 = collider_state.contact_data.link_b[b_start, i_b] - la0_min = qd.min(la0, lb0) - la0_max = qd.max(la0, lb0) - b_end = b_start + 1 - while b_end < n_con: - la = collider_state.contact_data.link_a[b_end, i_b] - lb = collider_state.contact_data.link_b[b_end, i_b] - if qd.min(la, lb) != la0_min or qd.max(la, lb) != la0_max: - break - b_end += 1 - b_size = b_end - b_start - - if b_size >= 5: - # Mean normal (folded to the hemisphere of contact b_start) and centroid. - ref_n = collider_state.contact_data.normal[b_start, i_b] - rnx = ref_n[0] - rny = ref_n[1] - rnz = ref_n[2] - mnx = gs.qd_float(0.0) - mny = gs.qd_float(0.0) - mnz = gs.qd_float(0.0) - cx = gs.qd_float(0.0) - cy = gs.qd_float(0.0) - cz = gs.qd_float(0.0) - for i in range(b_start, b_end): - n_i = collider_state.contact_data.normal[i, i_b] - s = gs.qd_float(1.0) - if rnx * n_i[0] + rny * n_i[1] + rnz * n_i[2] < gs.qd_float(0.0): - s = gs.qd_float(-1.0) - mnx += s * n_i[0] - mny += s * n_i[1] - mnz += s * n_i[2] - p_i = collider_state.contact_data.pos[i, i_b] - cx += p_i[0] - cy += p_i[1] - cz += p_i[2] - inv_n = gs.qd_float(1.0) / qd.cast(b_size, gs.qd_float) - cx *= inv_n - cy *= inv_n - cz *= inv_n - mnrm = qd.sqrt(mnx * mnx + mny * mny + mnz * mnz) - - # Hoisted out so the hull-build branch below can read it (quadrants scopes per if). - max_in_plane_r2 = gs.qd_float(0.0) - - coplanar = mnrm > EPS - if coplanar: - mnx /= mnrm - mny /= mnrm - mnz /= mnrm - - # Depth coplanarity: positions must lie in a single plane perpendicular to the mean normal. No - # per-contact normal check: a contact whose normal is diagonal (e.g. an edge-vs-edge contact at a - # corner of the contact patch) still participates in the 2D hull because its position is a vertex of - # the patch; dropping a collinear-edge contact in the same bucket is justified by the positional - # support polygon regardless of that contact's normal direction. - max_depth = gs.qd_float(0.0) - for i in range(b_start, b_end): - p_i = collider_state.contact_data.pos[i, i_b] - dx = p_i[0] - cx - dy = p_i[1] - cy - dz = p_i[2] - cz - depth = qd.abs(dx * mnx + dy * mny + dz * mnz) - if depth > max_depth: - max_depth = depth - r2 = dx * dx + dy * dy + dz * dz - depth * depth - if r2 > max_in_plane_r2: - max_in_plane_r2 = r2 - - if max_depth > tol * qd.sqrt(max_in_plane_r2): - coplanar = False - - if coplanar: - # In-plane basis (u, v): seed from the world axis least-aligned with mean normal. - abs_mnx = qd.abs(mnx) - abs_mny = qd.abs(mny) - abs_mnz = qd.abs(mnz) - ax = gs.qd_float(1.0) - ay = gs.qd_float(0.0) - az = gs.qd_float(0.0) - if abs_mny < abs_mnx and abs_mny < abs_mnz: - ax = gs.qd_float(0.0) - ay = gs.qd_float(1.0) - az = gs.qd_float(0.0) - elif abs_mnz < abs_mnx and abs_mnz <= abs_mny: - ax = gs.qd_float(0.0) - ay = gs.qd_float(0.0) - az = gs.qd_float(1.0) - adn = ax * mnx + ay * mny + az * mnz - ux = ax - adn * mnx - uy = ay - adn * mny - uz = az - adn * mnz - unrm = qd.sqrt(ux * ux + uy * uy + uz * uz) - ux /= unrm - uy /= unrm - uz /= unrm - vx = mny * uz - mnz * uy - vy = mnz * ux - mnx * uz - vz = mnx * uy - mny * ux - - # Project bucket contacts to (u, v). Reuse contact_sort_key for u (phase 1 has consumed it); - # v goes into the dedicated scratch buffer. - for i in range(b_start, b_end): - p_i = collider_state.contact_data.pos[i, i_b] - collider_state.contact_sort_key[i, i_b] = p_i[0] * ux + p_i[1] * uy + p_i[2] * uz - collider_state.contact_proj_v[i, i_b] = p_i[0] * vx + p_i[1] * vy + p_i[2] * vz - - # Sort bucket indices lexicographically by (u, v), with a tolerance on u so that contacts whose u - # values differ only by float noise (or by sub-millimeter physics noise from MPR perturbations) sort - # by v. Without the tolerance, the wrong point pops from a 3-collinear triplet when the corner and - # the mid-edge have u values that differ by a few microns and the mid-edge happens to sort first. - sort_u_tol = gs.qd_float(1e-3) * qd.sqrt(max_in_plane_r2) - for i in range(b_start, b_end): - collider_state.contact_sort_idx[i, i_b] = i - for i in range(b_start + 1, b_end): - ci = collider_state.contact_sort_idx[i, i_b] - cu = collider_state.contact_sort_key[ci, i_b] - cv = collider_state.contact_proj_v[ci, i_b] - j = i - 1 - while j >= b_start: - pj = collider_state.contact_sort_idx[j, i_b] - pu = collider_state.contact_sort_key[pj, i_b] - pv = collider_state.contact_proj_v[pj, i_b] - if (pu < cu - sort_u_tol) or (qd.abs(pu - cu) <= sort_u_tol and pv <= cv): - break - collider_state.contact_sort_idx[j + 1, i_b] = pj - j -= 1 - collider_state.contact_sort_idx[j + 1, i_b] = ci - - # Mark all bucket entries as drop, then mark hull vertices as keep. - for i in range(b_start, b_end): - collider_state.contact_keep[i, i_b] = 0 - - # Collinearity threshold for hull pops, scaled to the bucket extent. A pure "cross <= 0" check fails - # on numerically-near-collinear edge points (cross is a tiny positive epsilon from float roundoff), - # so genuine midpoints would survive as spurious hull vertices. - hull_collinear_tol = tol * max_in_plane_r2 - - # Andrew's monotone chain. Stack lives in contact_hull_stack[b_start..b_start + k). - k = 0 - for i in range(b_start, b_end): - ci = collider_state.contact_sort_idx[i, i_b] - cu = collider_state.contact_sort_key[ci, i_b] - cv = collider_state.contact_proj_v[ci, i_b] - while k >= 2: - idx_a = collider_state.contact_hull_stack[b_start + k - 2, i_b] - idx_b = collider_state.contact_hull_stack[b_start + k - 1, i_b] - au = collider_state.contact_sort_key[idx_a, i_b] - av = collider_state.contact_proj_v[idx_a, i_b] - bu = collider_state.contact_sort_key[idx_b, i_b] - bv = collider_state.contact_proj_v[idx_b, i_b] - cross = (bu - au) * (cv - av) - (bv - av) * (cu - au) - if cross <= hull_collinear_tol: - k -= 1 - else: - break - collider_state.contact_hull_stack[b_start + k, i_b] = ci - k += 1 - - upper_start = k - # Memory-fence for a Quadrants codegen issue on parallel envs (Metal backend, _B >= 2): without an - # explicit barrier between the lower-hull and upper-hull passes, the upper-hull pop-loop's reads - # of contact_hull_stack don't observe the writes from the lower hull, so its cross-product / - # pop-check effectively runs on stale data and every candidate is kept, producing a hull whose - # size equals the bucket size. - if qd.static(static_rigid_sim_config.backend == gs.metal): - qd.simt.block.sync() - # quadrants range supports 1 or 2 args only; iterate sorted indices backward over - # [b_start, b_end - 2] by reflecting the index. - for k_step in range(b_size - 1): - ii = b_end - 2 - k_step - ci = collider_state.contact_sort_idx[ii, i_b] - cu = collider_state.contact_sort_key[ci, i_b] - cv = collider_state.contact_proj_v[ci, i_b] - while k >= upper_start + 1: - idx_a = collider_state.contact_hull_stack[b_start + k - 2, i_b] - idx_b = collider_state.contact_hull_stack[b_start + k - 1, i_b] - au = collider_state.contact_sort_key[idx_a, i_b] - av = collider_state.contact_proj_v[idx_a, i_b] - bu = collider_state.contact_sort_key[idx_b, i_b] - bv = collider_state.contact_proj_v[idx_b, i_b] - cross = (bu - au) * (cv - av) - (bv - av) * (cu - au) - if cross <= hull_collinear_tol: - k -= 1 - else: - break - # The closing iteration of the upper hull visits the leftmost point, which already sits at - # stack[b_start] from the lower hull. Skipping that push, plus the k < b_size guard, bounds k to - # b_size and keeps the write index within max_contact_pairs even for buckets where the lower- - # hull pass already kept all b_size points (downward-convex layouts: every lex-sorted triple - # makes a left turn so nothing gets popped, then the upper-hull pass tries to push a duplicate - # of an already-kept lower-hull vertex). - if ci != collider_state.contact_hull_stack[b_start, i_b] and k < b_size: - collider_state.contact_hull_stack[b_start + k, i_b] = ci - k += 1 - - # Chain holds k unique hull vertices (no trailing duplicate). - for hk in range(k): - survivor = collider_state.contact_hull_stack[b_start + hk, i_b] - collider_state.contact_keep[survivor, i_b] = 1 - - # Restore non-hull contacts whose penetration is much deeper than the hull boundary's average. The - # support-polygon argument says interior contacts are wrench-redundant only when ALL contacts share - # the same normal and penetration; a contact with substantially higher penetration than the hull's - # average represents a distinct physical support (the body of a fork resting beyond its tines, the - # deep middle of a long body) and dropping it lets the body sink into the surface. The 3x factor is - # well above the typical ~1.x penetration spread on transient/rocking faces (so non-uniform- - # penetration buckets like irregular mesh contacts keep only the hull) but well below the deep - # interior penetrations seen when a non-flat body rests inside its convex envelope (so genuine deep - # supports are restored). - hull_pen_max = gs.qd_float(0.0) - for hk in range(k): - survivor = collider_state.contact_hull_stack[b_start + hk, i_b] - p = collider_state.contact_data.penetration[survivor, i_b] - if p > hull_pen_max: - hull_pen_max = p - deep_keep_threshold = prune_deep_penetration_ratio * hull_pen_max - for i in range(b_start, b_end): - if collider_state.contact_keep[i, i_b] == 0: - if collider_state.contact_data.penetration[i, i_b] > deep_keep_threshold: - collider_state.contact_keep[i, i_b] = 1 - - b_start = b_end - - # Phase 3: compact kept contacts to the front. - write = 0 - for read in range(n_con): - if collider_state.contact_keep[read, i_b] != 0: - if write != read: - collider_state.contact_data.geom_a[write, i_b] = collider_state.contact_data.geom_a[read, i_b] - collider_state.contact_data.geom_b[write, i_b] = collider_state.contact_data.geom_b[read, i_b] - collider_state.contact_data.penetration[write, i_b] = collider_state.contact_data.penetration[ - read, i_b - ] - collider_state.contact_data.normal[write, i_b] = collider_state.contact_data.normal[read, i_b] - collider_state.contact_data.pos[write, i_b] = collider_state.contact_data.pos[read, i_b] - collider_state.contact_data.friction[write, i_b] = collider_state.contact_data.friction[read, i_b] - collider_state.contact_data.sol_params[write, i_b] = collider_state.contact_data.sol_params[ - read, i_b - ] - collider_state.contact_data.force[write, i_b] = collider_state.contact_data.force[read, i_b] - collider_state.contact_data.link_a[write, i_b] = collider_state.contact_data.link_a[read, i_b] - collider_state.contact_data.link_b[write, i_b] = collider_state.contact_data.link_b[read, i_b] - collider_state.contact_data.pair_idx[write, i_b] = collider_state.contact_data.pair_idx[read, i_b] - write += 1 - collider_state.n_contacts[i_b] = write @qd.kernel From ec50db09c830d9f93e74d52c3977f3170736e817 Mon Sep 17 00:00:00 2001 From: Hugh Perkins Date: Mon, 25 May 2026 11:24:55 -0700 Subject: [PATCH 5/7] Remove stale prune_hull_collinear_tol after fused-kernel merge (Alexis collapsed it into tol*max_in_plane_r2) --- genesis/engine/solvers/rigid/collider/collider.py | 2 -- genesis/utils/array_class.py | 2 -- 2 files changed, 4 deletions(-) diff --git a/genesis/engine/solvers/rigid/collider/collider.py b/genesis/engine/solvers/rigid/collider/collider.py index aa8c95294..aaba5d891 100644 --- a/genesis/engine/solvers/rigid/collider/collider.py +++ b/genesis/engine/solvers/rigid/collider/collider.py @@ -89,7 +89,6 @@ def __init__(self, rigid_solver: "RigidSolver"): self._diff_pos_tolerance = 1e-2 self._diff_normal_tolerance = 1e-2 self._prune_deep_penetration_ratio = 3.0 - self._prune_hull_collinear_tol = 1e-3 self._init_static_config() self._use_split_narrowphase = ( @@ -200,7 +199,6 @@ def _init_collision_fields(self) -> None: diff_normal_tolerance=self._diff_normal_tolerance, contact_pruning_tolerance=self._solver._options.contact_pruning_tolerance or 0.0, prune_deep_penetration_ratio=self._prune_deep_penetration_ratio, - prune_hull_collinear_tol=self._prune_hull_collinear_tol, ) self._init_collision_pair_idx(self._collision_pair_idx) self._init_valid_pairs() diff --git a/genesis/utils/array_class.py b/genesis/utils/array_class.py index 466efaa3e..8daa2bf3f 100644 --- a/genesis/utils/array_class.py +++ b/genesis/utils/array_class.py @@ -794,7 +794,6 @@ class ColliderInfo: # link-pair contact pruning contact_pruning_tolerance: qd.Tensor prune_deep_penetration_ratio: qd.Tensor - prune_hull_collinear_tol: qd.Tensor def get_collider_info(solver, n_vert_neighbors, n_valid_pairs, collider_static_config, **kwargs): @@ -827,7 +826,6 @@ def get_collider_info(solver, n_vert_neighbors, n_valid_pairs, collider_static_c diff_normal_tolerance=V_SCALAR_FROM(dtype=gs.qd_float, value=kwargs["diff_normal_tolerance"]), contact_pruning_tolerance=V_SCALAR_FROM(dtype=gs.qd_float, value=kwargs["contact_pruning_tolerance"]), prune_deep_penetration_ratio=V_SCALAR_FROM(dtype=gs.qd_float, value=kwargs["prune_deep_penetration_ratio"]), - prune_hull_collinear_tol=V_SCALAR_FROM(dtype=gs.qd_float, value=kwargs["prune_hull_collinear_tol"]), ) From 6f16ba75ec9a446f1b05d430165a4c463d88f996 Mon Sep 17 00:00:00 2001 From: Hugh Perkins Date: Mon, 25 May 2026 12:00:43 -0700 Subject: [PATCH 6/7] Restore cooperative warp-per-env dedup kernel for GPU; serial fused kernel for CPU Re-add `func_prune_contacts_coop`, the 32-lane warp-cooperative variant of the fused clamp+prune+sort path. Same pruning logic as Alexis's serial `func_clamp_prune_and_sort_contacts` (depth coplanarity gate, Andrew's monotone chain, hull-pen-max deep-penetration restore, Metal memory-fence workaround), but the per-env work is spread across 32 warp lanes: - PARALLEL: contact_keep init, phase-1a (link-pair) key+idx init, phase-2 mean-normal / centroid reduction via reduce_all_add_tiled, coplanarity check reduction via reduce_all_max_tiled, in-plane projection writes. - SERIAL on lane 0: insertion sorts (phase 1a + lex), bucket walk, monotone chain, hull mark + deep-penetration restore, and the final fused compact + spatial sort with sentinel +inf for dropped contacts. Dispatched only when gs.backend != gs.cpu AND scene is dedup-eligible AND not requires_grad AND contact_pruning_tolerance > 0. Everything else (CPU, autodiff, ineligible scenes) falls back to the serial fused kernel, which has internal qd.static gates that no-op the prune phases when they don't apply. So dedup still runs on every backend as before; the change just upgrades the GPU path to coop. --- .../engine/solvers/rigid/collider/collider.py | 33 +- .../engine/solvers/rigid/collider/contact.py | 447 ++++++++++++++++++ 2 files changed, 474 insertions(+), 6 deletions(-) diff --git a/genesis/engine/solvers/rigid/collider/collider.py b/genesis/engine/solvers/rigid/collider/collider.py index aaba5d891..86ed55d95 100644 --- a/genesis/engine/solvers/rigid/collider/collider.py +++ b/genesis/engine/solvers/rigid/collider/collider.py @@ -48,6 +48,7 @@ func_rotate_frame, func_set_upstream_grad, func_clamp_prune_and_sort_contacts, + func_prune_contacts_coop, ) from . import narrowphase from .narrowphase import ( @@ -818,13 +819,33 @@ def detection(self) -> None: self._solver._errno, ) - func_clamp_prune_and_sort_contacts( - self._collider_state, - self._collider_info, - self._solver._rigid_global_info, - self._solver._static_rigid_sim_config, - self._collider_static_config, + # On GPU backends, when the scene is dedup-eligible and we're not in autodiff mode, dispatch the cooperative + # warp-per-env kernel (32 lanes/block; parallel reductions + lex-stride writes; serial sorts + hull build on + # lane 0; fused compact+spatial-sort in the final phase). This beats the serial fused kernel on dex_hand / + # g1_fall by ~2-3% by spreading the per-env work across the warp instead of running one env per block thread. + # Everything else (CPU, autodiff, scenes where link_pair_pruning_supported=False) falls through to the serial + # fused kernel, which has internal qd.static gates that drop the prune phases when they're not eligible. + ran_fused_dedup_coop = ( + gs.backend != gs.cpu + and self._collider_static_config.link_pair_pruning_supported + and not self._solver._static_rigid_sim_config.requires_grad + and (self._solver._options.contact_pruning_tolerance or 0.0) > 0.0 ) + if ran_fused_dedup_coop: + func_prune_contacts_coop( + self._collider_state, + self._collider_info, + self._solver._rigid_global_info, + self._solver._static_rigid_sim_config, + ) + else: + func_clamp_prune_and_sort_contacts( + self._collider_state, + self._collider_info, + self._solver._rigid_global_info, + self._solver._static_rigid_sim_config, + self._collider_static_config, + ) def get_contacts(self, as_tensor: bool = True, to_torch: bool = True, keep_batch_dim: bool = False): # Early return if already pre-computed diff --git a/genesis/engine/solvers/rigid/collider/contact.py b/genesis/engine/solvers/rigid/collider/contact.py index 0f682df8f..169002bc3 100644 --- a/genesis/engine/solvers/rigid/collider/contact.py +++ b/genesis/engine/solvers/rigid/collider/contact.py @@ -1025,6 +1025,453 @@ def func_clamp_prune_and_sort_contacts( collider_state.contact_sort_idx[j, i_b] = j +@qd.kernel(fastcache=True) +def func_prune_contacts_coop( + collider_state: array_class.ColliderState, + collider_info: array_class.ColliderInfo, + rigid_global_info: array_class.RigidGlobalInfo, + static_rigid_sim_config: qd.template(), +): + """GPU-only cooperative warp-per-env variant of the fused clamp+prune+sort kernel. + + Pruning logic matches Alexis's serial `func_clamp_prune_and_sort_contacts` (same depth coplanarity gate, same + Andrew's monotone chain, same hull-pen-max deep-penetration restore, same Metal memory-fence workaround). The + difference is the per-env work is spread across 32 warp lanes: + - PARALLEL: contact_keep init, phase-1a (link-pair) key+idx init, phase-2 mean-normal / centroid reduction + (via qd.simt.subgroup.reduce_all_add_tiled), coplanarity check reduction (via reduce_all_max_tiled), in-plane + projection writes. + - SERIAL on lane 0: insertion sorts (phase 1a + lex), bucket walk control, Andrew's monotone chain, hull-mark, + deep-penetration restore, and the final fused compact + spatial sort. + + The fused final phase (lane 0) replaces both dedup phase-3 compact and the standalone func_clamp_and_sort_contacts + with a single permutation pass: dropped contacts get a sentinel +inf sort key (pushed past the end), kept contacts + get the geom-pair group's x-pos (preserving narrowphase intra-group order via stable insertion sort), then one + cycle-decomposition permute moves each kept record exactly once. Saves one full kernel launch + one full 11-field + permute pass vs the serial fused kernel on dex_hand / g1_fall workloads. + + This kernel is dispatched only when gs.backend is GPU AND link_pair_pruning_supported AND not requires_grad AND + contact_pruning_tolerance > 0. Everything else falls through to the serial `func_clamp_prune_and_sort_contacts`. + """ + _B = collider_state.n_contacts.shape[0] + max_contact_pairs = collider_info.max_contact_pairs[None] + tol = collider_info.contact_pruning_tolerance[None] + prune_deep_penetration_ratio = collider_info.prune_deep_penetration_ratio[None] + LP_KEY_STRIDE = gs.qd_float(1.0e7) + EPS = rigid_global_info.EPS[None] + + # deskai6 opt 9d (real warp-coop, stage 1): pull contact_keep init and phase-1a key+idx init OUT of `if tid == 0:` + # and parallelize them across the 32 lanes (stride-tid). Phase 1a sort, phase 2 bucket walk, and phase 3 (sort + + # cycle-permute) stay serial on lane 0. Subsequent stages add coop reductions (phase-2 mean / centroid) and parallel + # cycle-permute. + _K = qd.static(32) + qd.loop_config(name="prune_contacts_coop", block_dim=_K) + for i_flat in range(_B * _K): + tid = i_flat % _K + i_b = i_flat // _K + # All lanes compute n_con (cheap, no memory write on non-lane-0). + n_con = qd.min(collider_state.n_contacts[i_b], max_contact_pairs) + if tid == 0: + collider_state.n_contacts[i_b] = n_con + + # PARALLEL: contact_keep init. Default-keep marked here unconditionally so the fused phase 3 (compact + spatial + # sort) below produces a correct result for envs with n_con < 5 (no dedup buckets, but still need spatial sort). + # 32 lanes stride. + ii = tid + while ii < n_con: + collider_state.contact_keep[ii, i_b] = 1 + ii += _K + + if n_con >= 5: + # PARALLEL: phase 1a key + idx init, 32 lanes stride. + ii = tid + while ii < n_con: + la = collider_state.contact_data.link_a[ii, i_b] + lb = collider_state.contact_data.link_b[ii, i_b] + la_min = qd.min(la, lb) + la_max = qd.max(la, lb) + collider_state.contact_sort_key[ii, i_b] = qd.cast(la_min, gs.qd_float) * LP_KEY_STRIDE + qd.cast( + la_max, gs.qd_float + ) + collider_state.contact_sort_idx[ii, i_b] = ii + ii += _K + + if tid == 0: + # SERIAL on lane 0: phase 1a insertion sort + phase 2 bucket walk. + for i in range(1, n_con): + ck = collider_state.contact_sort_key[i, i_b] + if collider_state.contact_sort_key[i - 1, i_b] <= ck: + continue + ci = collider_state.contact_sort_idx[i, i_b] + j = i - 1 + while j >= 0: + if collider_state.contact_sort_key[j, i_b] <= ck: + break + collider_state.contact_sort_key[j + 1, i_b] = collider_state.contact_sort_key[j, i_b] + collider_state.contact_sort_idx[j + 1, i_b] = collider_state.contact_sort_idx[j, i_b] + j = j - 1 + collider_state.contact_sort_key[j + 1, i_b] = ck + collider_state.contact_sort_idx[j + 1, i_b] = ci + + qd.simt.subgroup.sync() + + # Phase 2 (deskai6 opt 9d stage 2): bucket walk runs on ALL 32 lanes. The outer control flow (find b_end, + # iterate buckets) is duplicated across lanes since the inputs are all in DRAM (cache-friendly). Inside a + # bucket, the mean-normal / centroid sum is done coop via 6 reduce_all_add_tiled calls. The rest of the + # bucket processing (coplanarity check with early-exit, in-plane basis, projection, lex sort, hull build, + # mark survivors) stays serial on lane 0. + b_start = 0 + while b_start < n_con: + key0 = collider_state.contact_sort_key[b_start, i_b] + b_end = b_start + 1 + while b_end < n_con: + if collider_state.contact_sort_key[b_end, i_b] != key0: + break + b_end += 1 + b_size = b_end - b_start + + if b_size >= 5: + ref_src = collider_state.contact_sort_idx[b_start, i_b] + ref_n = collider_state.contact_data.normal[ref_src, i_b] + rnx = ref_n[0] + rny = ref_n[1] + rnz = ref_n[2] + mnx_l = gs.qd_float(0.0) + mny_l = gs.qd_float(0.0) + mnz_l = gs.qd_float(0.0) + cx_l = gs.qd_float(0.0) + cy_l = gs.qd_float(0.0) + cz_l = gs.qd_float(0.0) + jj = b_start + tid + while jj < b_end: + src_i = collider_state.contact_sort_idx[jj, i_b] + n_i = collider_state.contact_data.normal[src_i, i_b] + s = gs.qd_float(1.0) + if rnx * n_i[0] + rny * n_i[1] + rnz * n_i[2] < gs.qd_float(0.0): + s = gs.qd_float(-1.0) + mnx_l += s * n_i[0] + mny_l += s * n_i[1] + mnz_l += s * n_i[2] + p_i = collider_state.contact_data.pos[src_i, i_b] + cx_l += p_i[0] + cy_l += p_i[1] + cz_l += p_i[2] + jj += _K + + mnx = qd.simt.subgroup.reduce_all_add_tiled(mnx_l, 5) + mny = qd.simt.subgroup.reduce_all_add_tiled(mny_l, 5) + mnz = qd.simt.subgroup.reduce_all_add_tiled(mnz_l, 5) + cx = qd.simt.subgroup.reduce_all_add_tiled(cx_l, 5) + cy = qd.simt.subgroup.reduce_all_add_tiled(cy_l, 5) + cz = qd.simt.subgroup.reduce_all_add_tiled(cz_l, 5) + + # POST-REDUCE math runs on all 32 lanes (deterministic, cheap; redundant arithmetic is free vs. + # broadcasting the reduce results). + inv_n = gs.qd_float(1.0) / qd.cast(b_size, gs.qd_float) + cx *= inv_n + cy *= inv_n + cz *= inv_n + mnrm = qd.sqrt(mnx * mnx + mny * mny + mnz * mnz) + + max_in_plane_r2 = gs.qd_float(0.0) + coplanar = mnrm > EPS + if coplanar: + mnx /= mnrm + mny /= mnrm + mnz /= mnrm + + # COOP coplanarity check (stage 3). Each lane strides [b_start + tid, b_end) by _K, locally + # tracking max_depth / max_in_plane_r2. Wasted work per warp is at most b_size/_K contacts. + # The upstream algo no longer checks per-contact normals (a contact with a diagonal normal at + # the corner of a patch still participates in the 2D hull because its position is a vertex), so + # we only do the depth coplanarity gate here. + max_depth_l = gs.qd_float(0.0) + max_r2_l = gs.qd_float(0.0) + jj = b_start + tid + while jj < b_end: + src_i = collider_state.contact_sort_idx[jj, i_b] + p_i = collider_state.contact_data.pos[src_i, i_b] + dx = p_i[0] - cx + dy = p_i[1] - cy + dz = p_i[2] - cz + depth = qd.abs(dx * mnx + dy * mny + dz * mnz) + if depth > max_depth_l: + max_depth_l = depth + r2 = dx * dx + dy * dy + dz * dz - depth * depth + if r2 > max_r2_l: + max_r2_l = r2 + jj += _K + + max_depth = qd.simt.subgroup.reduce_all_max_tiled(max_depth_l, 5) + max_in_plane_r2 = qd.simt.subgroup.reduce_all_max_tiled(max_r2_l, 5) + + if max_depth > tol * qd.sqrt(max_in_plane_r2): + coplanar = False + + if coplanar: + # Basis on all lanes (deterministic from mnx/mny/mnz which the reduce broadcast to every lane). + abs_mnx = qd.abs(mnx) + abs_mny = qd.abs(mny) + abs_mnz = qd.abs(mnz) + ax = gs.qd_float(1.0) + ay = gs.qd_float(0.0) + az = gs.qd_float(0.0) + if abs_mny < abs_mnx and abs_mny < abs_mnz: + ax = gs.qd_float(0.0) + ay = gs.qd_float(1.0) + az = gs.qd_float(0.0) + elif abs_mnz < abs_mnx and abs_mnz <= abs_mny: + ax = gs.qd_float(0.0) + ay = gs.qd_float(0.0) + az = gs.qd_float(1.0) + adn = ax * mnx + ay * mny + az * mnz + ux = ax - adn * mnx + uy = ay - adn * mny + uz = az - adn * mnz + unrm = qd.sqrt(ux * ux + uy * uy + uz * uz) + ux /= unrm + uy /= unrm + uz /= unrm + vx = mny * uz - mnz * uy + vy = mnz * ux - mnx * uz + vz = mnx * uy - mny * ux + + # COOP projection: 32 lanes stride writes to contact_sort_key + contact_proj_v. + jj = b_start + tid + while jj < b_end: + src_i = collider_state.contact_sort_idx[jj, i_b] + p_i = collider_state.contact_data.pos[src_i, i_b] + collider_state.contact_sort_key[jj, i_b] = p_i[0] * ux + p_i[1] * uy + p_i[2] * uz + collider_state.contact_proj_v[jj, i_b] = p_i[0] * vx + p_i[1] * vy + p_i[2] * vz + jj += _K + + # COOP mark-drop: stride writes to contact_keep[orig]. + jj = b_start + tid + while jj < b_end: + orig = collider_state.contact_sort_idx[jj, i_b] + collider_state.contact_keep[orig, i_b] = 0 + jj += _K + + # COOP lex_idx init: stride writes. + jj = b_start + tid + while jj < b_end: + collider_state.contact_lex_idx[jj, i_b] = jj + jj += _K + + # SYNC between coop writes (sort_key, proj_v, lex_idx, contact_keep[orig]) and the lane-0 lex + # sort + hull build that reads them. + qd.simt.subgroup.sync() + + if tid == 0 and coplanar: + # SERIAL on lane 0: lex sort + Andrew monotone-chain hull build + mark survivors. These walk a + # stack and have data-dependent inner loops that don't decompose across warp lanes. + # + # The sort_u_tol on the u comparison is critical for correctness, not just a perf tweak: + # contacts whose u values differ only by sub-millimeter MPR noise need to sort by v, otherwise + # mid-edge points get sorted between the two corners they sit between and survive the lower- + # hull pass as spurious hull vertices (cf. upstream test_contact_pruning regression when this + # tolerance is missing). + sort_u_tol = gs.qd_float(1e-3) * qd.sqrt(max_in_plane_r2) + for i in range(b_start + 1, b_end): + ci = collider_state.contact_lex_idx[i, i_b] + cu = collider_state.contact_sort_key[ci, i_b] + cv = collider_state.contact_proj_v[ci, i_b] + j = i - 1 + while j >= b_start: + pj = collider_state.contact_lex_idx[j, i_b] + pu = collider_state.contact_sort_key[pj, i_b] + pv = collider_state.contact_proj_v[pj, i_b] + if (pu < cu - sort_u_tol) or (qd.abs(pu - cu) <= sort_u_tol and pv <= cv): + break + collider_state.contact_lex_idx[j + 1, i_b] = pj + j -= 1 + collider_state.contact_lex_idx[j + 1, i_b] = ci + + # Collinearity threshold for hull pops, scaled to the bucket extent. A pure "cross <= 0" check + # fails on numerically-near-collinear edge points (cross is a tiny positive epsilon from float + # roundoff), so genuine midpoints would survive as spurious hull vertices. + hull_collinear_tol = tol * max_in_plane_r2 + + k = 0 + for i in range(b_start, b_end): + ci = collider_state.contact_lex_idx[i, i_b] + cu = collider_state.contact_sort_key[ci, i_b] + cv = collider_state.contact_proj_v[ci, i_b] + while k >= 2: + idx_a = collider_state.contact_hull_stack[b_start + k - 2, i_b] + idx_b = collider_state.contact_hull_stack[b_start + k - 1, i_b] + au = collider_state.contact_sort_key[idx_a, i_b] + av = collider_state.contact_proj_v[idx_a, i_b] + bu = collider_state.contact_sort_key[idx_b, i_b] + bv = collider_state.contact_proj_v[idx_b, i_b] + cross = (bu - au) * (cv - av) - (bv - av) * (cu - au) + if cross <= hull_collinear_tol: + k -= 1 + else: + break + collider_state.contact_hull_stack[b_start + k, i_b] = ci + k += 1 + + upper_start = k + # Memory-fence workaround for a Quadrants codegen issue on Metal _B >= 2 (genesis PR #2831): + # without an explicit scratch store between the lower- and upper-hull lane-0 ``for`` passes, + # the upper-hull pop-loop's reads of contact_hull_stack don't observe the lower-hull writes, + # so the cross-product check effectively runs on stale data and every candidate is kept (hull + # size == bucket size, no pruning). Writing any value to a non-overlapping slot here forces + # write-then-read ordering on the shared buffer. The value is unused. See + # perso_hugh/prot/qd_metal_hull_chain_visibility_repro.py for a standalone reproduction. + collider_state.contact_hull_stack[max_contact_pairs - 1, i_b] = 0 + for k_step in range(b_size - 1): + ii_lex = b_end - 2 - k_step + ci = collider_state.contact_lex_idx[ii_lex, i_b] + cu = collider_state.contact_sort_key[ci, i_b] + cv = collider_state.contact_proj_v[ci, i_b] + while k >= upper_start + 1: + idx_a = collider_state.contact_hull_stack[b_start + k - 2, i_b] + idx_b = collider_state.contact_hull_stack[b_start + k - 1, i_b] + au = collider_state.contact_sort_key[idx_a, i_b] + av = collider_state.contact_proj_v[idx_a, i_b] + bu = collider_state.contact_sort_key[idx_b, i_b] + bv = collider_state.contact_proj_v[idx_b, i_b] + cross = (bu - au) * (cv - av) - (bv - av) * (cu - au) + if cross <= hull_collinear_tol: + k -= 1 + else: + break + if ci != collider_state.contact_hull_stack[b_start, i_b] and k < b_size: + collider_state.contact_hull_stack[b_start + k, i_b] = ci + k += 1 + + for hk in range(k): + survivor_sort = collider_state.contact_hull_stack[b_start + hk, i_b] + survivor_orig = collider_state.contact_sort_idx[survivor_sort, i_b] + collider_state.contact_keep[survivor_orig, i_b] = 1 + + # Restore non-hull contacts whose penetration is much deeper than the hull boundary's max + # (PR #2831 switched from avg to max). Rationale: a contact whose penetration substantially + # exceeds the hull's deepest vertex represents a distinct physical support (deep body of a + # fork beyond its tines, deep middle of a long body) that the support-polygon argument + # doesn't actually authorize dropping (the argument only holds when all contacts share + # normal AND penetration). The 3x factor over the hull max is well above the typical ~1.x + # penetration spread on transient/rocking faces but well below the deep interior penetrations + # seen when a non-flat body rests inside its convex envelope. Indices here live in orig-space + # (because the cycle-permute is fused into the phase 3 below in opt 10 -- contact_data is + # still in pre-sort order, so we translate sort-space hull/bucket indices through + # contact_sort_idx). + hull_pen_max = gs.qd_float(0.0) + for hk in range(k): + survivor_sort = collider_state.contact_hull_stack[b_start + hk, i_b] + survivor_orig = collider_state.contact_sort_idx[survivor_sort, i_b] + p = collider_state.contact_data.penetration[survivor_orig, i_b] + if p > hull_pen_max: + hull_pen_max = p + deep_keep_threshold = prune_deep_penetration_ratio * hull_pen_max + for jj_idx in range(b_start, b_end): + orig = collider_state.contact_sort_idx[jj_idx, i_b] + if collider_state.contact_keep[orig, i_b] == 0: + if collider_state.contact_data.penetration[orig, i_b] > deep_keep_threshold: + collider_state.contact_keep[orig, i_b] = 1 + + b_start = b_end + + if tid == 0: + # Phase 3 (deskai6 opt 10): FUSED compact + spatial sort. Replaces: + # - dedup phase 3 (compact based on contact_keep) + # - func_clamp_and_sort_contacts (assign group-x sort key + insertion sort + cycle-permute) + # with a single permutation pass: compute a sort key per slot that pushes dropped contacts past the end + # (sentinel +inf) and orders kept contacts by their geom-pair group's x-pos (anchored to the first kept + # contact in the group, preserving narrowphase intra-group order via stable sort). Then sort sort-keys + + # sort-indices, then cycle-permute the 9 contact_data fields exactly once. + # + # On dex_hand this saves ~25 displaced contacts worth of field copies that used to be done in dedup phase 3 + # before clamp_and_sort re-permuted everything. Same 9 fields per contact (force & pair_idx still skipped + # per opt 8a; clamp_and_sort still runs in the grad path so its own opt 8b gate stays in place). + SENTINEL_BIG = gs.qd_float(1e30) + group_key = gs.qd_float(0.0) + prev_ga = -1 + prev_gb = -1 + for i in range(n_con): + if collider_state.contact_keep[i, i_b] != 0: + ga = collider_state.contact_data.geom_a[i, i_b] + gb = collider_state.contact_data.geom_b[i, i_b] + if ga != prev_ga or gb != prev_gb: + group_key = collider_state.contact_data.pos[i, i_b][0] + prev_ga = ga + prev_gb = gb + collider_state.contact_sort_key[i, i_b] = group_key + else: + collider_state.contact_sort_key[i, i_b] = SENTINEL_BIG + collider_state.contact_sort_idx[i, i_b] = i + + for i in range(1, n_con): + ck = collider_state.contact_sort_key[i, i_b] + if collider_state.contact_sort_key[i - 1, i_b] <= ck: + continue + ci = collider_state.contact_sort_idx[i, i_b] + j = i - 1 + while j >= 0: + if collider_state.contact_sort_key[j, i_b] <= ck: + break + collider_state.contact_sort_key[j + 1, i_b] = collider_state.contact_sort_key[j, i_b] + collider_state.contact_sort_idx[j + 1, i_b] = collider_state.contact_sort_idx[j, i_b] + j = j - 1 + collider_state.contact_sort_key[j + 1, i_b] = ck + collider_state.contact_sort_idx[j + 1, i_b] = ci + + # n_contacts is the count of non-sentinel sort keys (the dropped contacts sit at the tail and are ignored + # by downstream consumers via the new n_contacts). + n_kept = 0 + for i in range(n_con): + if collider_state.contact_sort_key[i, i_b] < SENTINEL_BIG: + n_kept += 1 + else: + break + collider_state.n_contacts[i_b] = n_kept + + # Cycle-permute contact_data fields into their sorted positions. Only n_kept positions need correct values; + # the tail (dropped) slots are scratched but not read again this step. + for i in range(n_kept): + if collider_state.contact_sort_idx[i, i_b] != i: + tmp_geom_a = collider_state.contact_data.geom_a[i, i_b] + tmp_geom_b = collider_state.contact_data.geom_b[i, i_b] + tmp_penetration = collider_state.contact_data.penetration[i, i_b] + tmp_normal = collider_state.contact_data.normal[i, i_b] + tmp_pos = collider_state.contact_data.pos[i, i_b] + tmp_friction = collider_state.contact_data.friction[i, i_b] + tmp_sol_params = collider_state.contact_data.sol_params[i, i_b] + tmp_link_a = collider_state.contact_data.link_a[i, i_b] + tmp_link_b = collider_state.contact_data.link_b[i, i_b] + + j = i + while collider_state.contact_sort_idx[j, i_b] != i: + src = collider_state.contact_sort_idx[j, i_b] + collider_state.contact_data.geom_a[j, i_b] = collider_state.contact_data.geom_a[src, i_b] + collider_state.contact_data.geom_b[j, i_b] = collider_state.contact_data.geom_b[src, i_b] + collider_state.contact_data.penetration[j, i_b] = collider_state.contact_data.penetration[ + src, i_b + ] + collider_state.contact_data.normal[j, i_b] = collider_state.contact_data.normal[src, i_b] + collider_state.contact_data.pos[j, i_b] = collider_state.contact_data.pos[src, i_b] + collider_state.contact_data.friction[j, i_b] = collider_state.contact_data.friction[src, i_b] + collider_state.contact_data.sol_params[j, i_b] = collider_state.contact_data.sol_params[ + src, i_b + ] + collider_state.contact_data.link_a[j, i_b] = collider_state.contact_data.link_a[src, i_b] + collider_state.contact_data.link_b[j, i_b] = collider_state.contact_data.link_b[src, i_b] + collider_state.contact_sort_idx[j, i_b] = j + j = src + + collider_state.contact_data.geom_a[j, i_b] = tmp_geom_a + collider_state.contact_data.geom_b[j, i_b] = tmp_geom_b + collider_state.contact_data.penetration[j, i_b] = tmp_penetration + collider_state.contact_data.normal[j, i_b] = tmp_normal + collider_state.contact_data.pos[j, i_b] = tmp_pos + collider_state.contact_data.friction[j, i_b] = tmp_friction + collider_state.contact_data.sol_params[j, i_b] = tmp_sol_params + collider_state.contact_data.link_a[j, i_b] = tmp_link_a + collider_state.contact_data.link_b[j, i_b] = tmp_link_b + collider_state.contact_sort_idx[j, i_b] = j + + @qd.kernel def func_set_upstream_grad( dL_dposition: qd.types.ndarray(), From bf38466a1717deda1158cc4cdd9dc9744a19ee46 Mon Sep 17 00:00:00 2001 From: Hugh Perkins Date: Mon, 25 May 2026 12:30:32 -0700 Subject: [PATCH 7/7] Gate cooperative dedup kernel on n_envs <= gpu_cores / 2 The cooperative warp-per-env dedup kernel (func_prune_contacts_coop) only beats the serial fused kernel while the GPU has spare occupancy. Once _B exceeds half the GPU's CUDA-core count the coop launch fills the device, the warp-cooperation overhead stops paying off, and Alexis's serial fused kernel (one thread per env) is faster. The half-core threshold also leaves headroom for the other kernels already sharing the SMs (narrowphase MPR/GJK, constraint solver) so we don't push them into second-wave scheduling on tight scenes like dex_hand and g1_fall. Concrete thresholds (gpu_cores / 2): - RTX 5090 (170 SMs x 128): _B <= 10880 - RTX 6000 Blackwell (188 x 128): _B <= 12032 - MI300X (304 CUs x 64): _B <= 9728 - Fallback (16384): _B <= 8192 Factored the per-backend gpu_cores computation out of the _use_split_narrowphase branch into a single Collider._gpu_cores field so both the contact0 chunking heuristic and the new dispatch gate share the same number. No behavior change for split-narrowphase chunking. --- .../engine/solvers/rigid/collider/collider.py | 46 +++++++++++-------- 1 file changed, 27 insertions(+), 19 deletions(-) diff --git a/genesis/engine/solvers/rigid/collider/collider.py b/genesis/engine/solvers/rigid/collider/collider.py index 86ed55d95..02eb074f7 100644 --- a/genesis/engine/solvers/rigid/collider/collider.py +++ b/genesis/engine/solvers/rigid/collider/collider.py @@ -221,27 +221,30 @@ def _init_collision_fields(self) -> None: # 'contact_data_cache' is not used in Quadrants kernels, so keep it outside of the collider state / info self._contact_data_cache: dict[tuple[bool, bool], dict[str, torch.Tensor | tuple[torch.Tensor]]] = {} - # Contact0 & multicontact scratch states only needed when split narrowphase is active. + # GPU core count (used by split-narrowphase chunking + the cooperative dedup dispatch gate). # FIXME: Quadrants should expose a unified API to query GPU core count across all backends. # Falling back to upper bound for backends where torch.cuda is unavailable (e.g., CPU-only torch). Benchmarks # on RTX 6000 Blackwell (Genesis-Embodied-AI/Genesis#2616) showed that switching from hardcoded 40000 threads # to hardware-derived 21760 had marginal performance impact, so it should be fine. + if torch.cuda.is_available(): + gpu_props = torch.cuda.get_device_properties(torch.cuda.current_device()) + # NVIDIA: 128 CUDA cores per SM. AMD/ROCm: 64 stream processors per CU. + cores_per_unit = 64 if torch.version.hip else 128 + gpu_cores = gpu_props.multi_processor_count * cores_per_unit + elif gs.backend == gs.metal: + # Upper-bound estimate for Apple Silicon: 40 GPU cores, each GPU core having 128 ALUs + cores_per_unit = 128 + gpu_cores = 5120 + else: + # Using AMD GPU as a baseline. AMD MI350X has 256 SM (so-called Compute Units) with 64 cores each. + # See: https://www.amd.com/en/products/accelerators/instinct/mi350/mi350x.html + # For comparison, RTX6000 Blackwell boasts 188 SMs, compared to 170 SMs for RTX5090 with 128 cores each. + cores_per_unit = 64 + gpu_cores = 16384 + self._gpu_cores = gpu_cores + + # Contact0 & multicontact scratch states only needed when split narrowphase is active. if self._use_split_narrowphase: - if torch.cuda.is_available(): - gpu_props = torch.cuda.get_device_properties(torch.cuda.current_device()) - # NVIDIA: 128 CUDA cores per SM. AMD/ROCm: 64 stream processors per CU. - cores_per_unit = 64 if torch.version.hip else 128 - gpu_cores = gpu_props.multi_processor_count * cores_per_unit - elif gs.backend == gs.metal: - # Upper-bound estimate for Apple Silicon: 40 GPU cores, each GPU core having 128 ALUs - cores_per_unit = 128 - gpu_cores = 5120 - else: - # Using AMD GPU as a baseline. AMD MI350X has 256 SM (so-called Compute Units) with 64 cores each. - # See: https://www.amd.com/en/products/accelerators/instinct/mi350/mi350x.html - # For comparison, RTX6000 Blackwell boasts 188 SMs, compared to 170 SMs for RTX5090 with 128 cores each. - cores_per_unit = 64 - gpu_cores = 16384 self._contact0_n_chunks = max(1, math.ceil(gpu_cores / self._solver._B)) self._contact0_grid_size = self._solver._B * self._contact0_n_chunks self._contact0_mpr_state = array_class.get_mpr_state(self._contact0_grid_size) @@ -822,14 +825,19 @@ def detection(self) -> None: # On GPU backends, when the scene is dedup-eligible and we're not in autodiff mode, dispatch the cooperative # warp-per-env kernel (32 lanes/block; parallel reductions + lex-stride writes; serial sorts + hull build on # lane 0; fused compact+spatial-sort in the final phase). This beats the serial fused kernel on dex_hand / - # g1_fall by ~2-3% by spreading the per-env work across the warp instead of running one env per block thread. - # Everything else (CPU, autodiff, scenes where link_pair_pruning_supported=False) falls through to the serial - # fused kernel, which has internal qd.static gates that drop the prune phases when they're not eligible. + # g1_fall by spreading the per-env work across the warp instead of running one env per block thread, but only + # while the GPU has spare occupancy. Once n_envs exceeds half the GPU's CUDA core count, the coop launch fills + # the device and the warp-cooperation overhead stops paying off — the serial fused kernel (one thread per env) + # wins. The half-core threshold leaves room for the existing kernels already sharing the SMs (narrowphase, + # constraint solver) so we don't push them to second-wave scheduling. Everything else (CPU, autodiff, scenes + # where link_pair_pruning_supported=False, oversubscribed n_envs) falls through to the serial fused kernel, + # which has internal qd.static gates that drop the prune phases when they're not eligible. ran_fused_dedup_coop = ( gs.backend != gs.cpu and self._collider_static_config.link_pair_pruning_supported and not self._solver._static_rigid_sim_config.requires_grad and (self._solver._options.contact_pruning_tolerance or 0.0) > 0.0 + and self._solver._B * 2 <= self._gpu_cores ) if ran_fused_dedup_coop: func_prune_contacts_coop(