AMD-AGI · chao-xu-spec · Jun 2, 2026 · Jun 2, 2026 · Jun 2, 2026
diff --git a/src/minisweagent/kernel_languages/contract.py b/src/minisweagent/kernel_languages/contract.py
diff --git a/src/minisweagent/kernel_languages/hip/builder_hints.md b/src/minisweagent/kernel_languages/hip/builder_hints.md
@@ -41,3 +41,52 @@ the user's test file:
 
 - Warmup 5 iterations; measure 100 and take the median (not mean).
 - `hipDeviceSynchronize()` before/after each measurement.
+
+## aiter kernels — MANDATORY worktree routing (do NOT use sys.path alone)
+
+If the kernel lives in the **aiter** repo (`csrc/kernels/*.cu` invoked via a
+Python-callable like `aiter.add_rmsnorm_quant(...)`, built by aiter's JIT
+`@compile_ops` mechanism), the generic "prepend the worktree to `sys.path`"
+rule is NOT enough and will silently evaluate the BASELINE kernel:
+
+- `aiter` is installed **editable** via a `sys.meta_path` finder
+  (`__editable__...amd_aiter...finder.py`). That finder is consulted BEFORE the
+  `sys.path`-based finder, so `import aiter` resolves to the ORIGINAL repo
+  (`/sgl-workspace/aiter`) no matter what you insert at `sys.path[0]`.
+- aiter's JIT derives its source dir from `AITER_META_DIR` →
+  `AITER_CSRC_DIR = f"{AITER_META_DIR}/csrc"` (see `aiter/jit/core.py`). If you
+  don't override it, it points at the baseline tree, so patches to
+  `$GEAK_WORK_DIR/csrc/...cu` are never compiled. Result: correctness always
+  PASSes and every speedup is ~1.00x (the worktree-bypass bug).
+
+You MUST route aiter's JIT to the worktree by setting these env vars at the very
+top of the harness, BEFORE `import aiter` (this recipe is verified to make
+sentinel-injected worktree corruption fail correctness, i.e. the worktree IS
+evaluated):
+
+```python
+import os
+WORK_DIR = os.path.abspath(os.environ.get("GEAK_WORK_DIR", "/sgl-workspace/aiter"))
+# Fail loud if the kernel under test is not actually in the worktree.
+_kernel_rel = "csrc/kernels/<this_kernel>.cu"
+assert os.path.exists(os.path.join(WORK_DIR, _kernel_rel)), \
+    f"kernel not found under WORK_DIR={WORK_DIR}"
+# Route aiter's JIT source dir (AITER_CSRC_DIR = $AITER_META_DIR/csrc) to the worktree.
+os.environ["AITER_META_DIR"] = WORK_DIR
+# Per-slot JIT build cache so worktrees don't share artifacts.
+_gpu = os.environ.get("CUDA_VISIBLE_DEVICES", "0").split(",")[0]
+os.environ["AITER_JIT_DIR"] = os.path.join(WORK_DIR, f"_geak_jit_gpu{_gpu}")
+# Force recompile ONLY when the worktree kernel source is newer than the cached
+# .so (incremental); a patched candidate has a newer mtime so it still rebuilds.
+_so = os.path.join(os.environ["AITER_JIT_DIR"], "<module>.so")
+_src = os.path.join(WORK_DIR, _kernel_rel)
+if (not os.path.exists(_so)) or os.path.getmtime(_src) > os.path.getmtime(_so):
+    os.environ["AITER_REBUILD"] = "2"
+else:
+    os.environ.pop("AITER_REBUILD", None)
+import aiter  # now resolves source/build under $GEAK_WORK_DIR
+```
+
+Do NOT rely on deleting the `.so` plus `sys.path.insert` — that does not change
+where aiter's editable finder resolves `import aiter`, nor where `AITER_CSRC_DIR`
+points. The `AITER_META_DIR` override is the only mechanism that works.
diff --git a/src/minisweagent/kernel_languages/hip/commandment.j2 b/src/minisweagent/kernel_languages/hip/commandment.j2
@@ -32,25 +32,25 @@ export PYTHONPATH="${GEAK_WORK_DIR}:${GEAK_REPO_ROOT:-{{ repo_root }}}:${PYTHONP
 ## Correctness
 
 ```bash
-cd "${GEAK_WORK_DIR}" && {{ correctness_command | default("python3 " + harness_path + " --correctness") }}
+cd "${GEAK_WORK_DIR}" && {{ correctness_command | default("python3 " + harness_path + " --correctness", true) }}
 ```
 
 ## Benchmark
 
 ```bash
-cd "${GEAK_WORK_DIR}" && {{ performance_command | default("python3 " + harness_path + " --benchmark") }}
+cd "${GEAK_WORK_DIR}" && {{ performance_command | default("python3 " + harness_path + " --benchmark", true) }}
 ```
 
 ## Full Benchmark
 
 ```bash
-cd "${GEAK_WORK_DIR}" && {{ performance_command | default("python3 " + harness_path + " --full-benchmark") }}
+cd "${GEAK_WORK_DIR}" && {{ performance_command | default("python3 " + harness_path + " --full-benchmark", true) }}
 ```
 
 ## Profile
 
 ```bash
-kernel-profile "cd ${GEAK_WORK_DIR} && {{ performance_command | default("python3 " + harness_path + " --profile") }}" \
+kernel-profile "cd ${GEAK_WORK_DIR} && {{ performance_command | default("python3 " + harness_path + " --profile", true) }}" \
   --gpu-devices ${GEAK_GPU_DEVICE:-0} \
   --replays {{ profile_replays | default(3) }} \
   --json -o ${GEAK_WORK_DIR}/profile.json

diff --git a/src/minisweagent/models/litellm_model.py b/src/minisweagent/models/litellm_model.py
@@ -328,10 +328,21 @@ def _query(self, messages: list[dict[str, Any]], **kwargs: Any) -> Any:
             # as a request header. Preserve any other caller-supplied
             # ``extra_headers`` verbatim.
             filtered["extra_headers"] = dict(existing_headers)
+        # Bound every request with a wall-clock timeout. Without it, a stalled
+        # gateway socket (server accepted the connection but never sends bytes)
+        # blocks the read forever, the tenacity ``@retry`` never fires, and the
+        # whole preprocess/agent loop hangs indefinitely. ``litellm.Timeout`` is
+        # NOT in the no-retry exclusion list above (it does not subclass
+        # ``litellm.exceptions.APIError``), so a timed-out call is retried with
+        # exponential backoff. Override via ``GEAK_LLM_REQUEST_TIMEOUT`` (seconds).
+        request_timeout = filtered.pop("timeout", None)
+        if request_timeout is None:
+            request_timeout = float(os.getenv("GEAK_LLM_REQUEST_TIMEOUT", "600"))
         try:
             return litellm.completion(
                 model=self.config.model_name,
                 messages=messages,
+                timeout=request_timeout,
                 **filtered,
             )
         except litellm.exceptions.AuthenticationError as e:

diff --git a/src/minisweagent/run/postprocess/evaluation.py b/src/minisweagent/run/postprocess/evaluation.py
@@ -356,15 +356,54 @@ def preflight_commandment_contract(
     from minisweagent.run.preprocess.harness_utils import harness_supports_iterations
 
     repo_root_path = Path(repo_root).resolve()
-    is_git = (repo_root_path / ".git").exists()
 
-    # For git repos, run in a temporary worktree so side effects (run.sh,
-    # JIT caches, profile artifacts) never dirty the original repo.
-    if is_git and output_dir is not None:
+    # Resolve the git TOPLEVEL via ``git rev-parse`` rather than a naive
+    # ``.git``-child check. For monorepo SUBDIRECTORIES (e.g.
+    # ``sglang/sgl-kernel``) there is no ``.git`` directly under ``repo_root``,
+    # so the old check evaluated to False and the smoke test fell back to
+    # running against the ORIGINAL source tree. That is a real isolation hole:
+    # it (a) lets a harness that depends on preprocess-seeded build state
+    # (e.g. a pre-existing ``_geak_build/``) pass spuriously, and (b) lets a
+    # harness that evaluates the baseline (instead of the worktree) pass — both
+    # surface later as the "always ~1.00x speedup" bug. Always run the preflight
+    # in a FRESH worktree of the toplevel so the harness is forced to be
+    # self-contained and fully worktree-resolved.
+    toplevel: Path | None = None
+    try:
+        _tl = subprocess.run(
+            ["git", "rev-parse", "--show-toplevel"],
+            cwd=str(repo_root_path),
+            capture_output=True,
+            text=True,
+            env=get_git_safe_env(repo_root_path),
+        )
+        if _tl.returncode == 0 and _tl.stdout.strip():
+            toplevel = Path(_tl.stdout.strip()).resolve()
+    except Exception:  # noqa: BLE001 — non-git or git missing: fall back below
+        toplevel = None
+
+    # Cleanup bookkeeping: the registered git worktree is always the TOPLEVEL
+    # checkout (``worktree_root``); for a subdir kernel ``preflight_dir`` is a
+    # path INSIDE it, so we must remove ``worktree_root`` via the toplevel repo,
+    # not ``preflight_dir`` (otherwise the monorepo checkout + a stale
+    # ``git worktree`` entry leak on disk).
+    cleanup_root: Path | None = None
+    cleanup_repo: str | None = None
+
+    if toplevel is not None and output_dir is not None:
         from minisweagent.run.task_file import create_worktree
 
-        preflight_dir = (Path(output_dir) / "_preflight_worktree").resolve()
-        create_worktree(repo_root_path, preflight_dir)
+        try:
+            rel = repo_root_path.relative_to(toplevel)
+        except ValueError:
+            rel = Path(".")
+        worktree_root = (Path(output_dir) / "_preflight_worktree").resolve()
+        create_worktree(toplevel, worktree_root)
+        # For a subdir kernel, GEAK_WORK_DIR must point at the subdir INSIDE the
+        # worktree (where ``csrc/`` etc. live), not the monorepo root.
+        preflight_dir = (worktree_root / rel).resolve()
+        cleanup_root = worktree_root
+        cleanup_repo = str(toplevel)
     else:
         preflight_dir = repo_root_path
 
@@ -430,8 +469,8 @@ def preflight_commandment_contract(
         )
         raise CommandmentExecutionError("PREFLIGHT", last_result.returncode, detail)
     finally:
-        if preflight_dir != repo_root_path:
-            cleanup_eval_worktree(repo_root, preflight_dir)
+        if cleanup_root is not None and cleanup_repo is not None:
+            cleanup_eval_worktree(cleanup_repo, cleanup_root)
 
 
 def recapture_commandment_baseline(

diff --git a/src/minisweagent/run/preprocess/harness_utils.py b/src/minisweagent/run/preprocess/harness_utils.py
@@ -1279,6 +1279,27 @@ def validate_harness(harness_path: str) -> tuple[bool, list[str]]:
         if flag not in code_only_source:
             errors.append(f"Harness source does not define '{flag}' flag")
 
+    # Worktree-bypass gate: a harness that hardcodes an absolute path into the
+    # source repo (REPO_ROOT="/...", hipcc -I /..., sys.path.insert(0, "/..."))
+    # compiles/imports the UNPATCHED baseline instead of the $GEAK_WORK_DIR
+    # worktree, so every speedup silently reads ~1.00x. Mechanism-agnostic:
+    # keys on "absolute literal into the repo root, not env-derived". Opt out
+    # with GEAK_ALLOW_HARDCODED_PATHS=1.
+    if not os.environ.get("GEAK_ALLOW_HARDCODED_PATHS"):
+        try:
+            from minisweagent.kernel_languages.contract import find_source_repo_path_leaks
+
+            leaks = find_source_repo_path_leaks(code_only_source)
+        except Exception:  # noqa: BLE001 — never let the detector break validation
+            leaks = []
+        for lineno, snippet, reason in leaks:
+            errors.append(
+                f"Line {lineno}: {reason}. This bypasses the GEAK worktree so the "
+                f"harness evaluates the BASELINE (every speedup ~1.00x). Derive the "
+                f"path from os.environ['GEAK_WORK_DIR'] (build artifacts under it too). "
+                f"Offending line: {snippet}"
+            )
+
     for flag in RECOMMENDED_HARNESS_FLAGS:
         if flag not in code_only_source:
             msg = (

diff --git a/src/minisweagent/run/preprocess/run_harness.py b/src/minisweagent/run/preprocess/run_harness.py
@@ -38,6 +38,10 @@
     "full-benchmark": "--full-benchmark",
 }
 
+#: Per-mode harness subprocess timeouts (seconds). All env-overridable so that
+#: compiled kernels (HIP/CUDA), whose first benchmark run must build the whole
+#: extension from source, are not killed mid-compile (which triggers a costly
+#: recompile-on-retry loop). Env names match ``preprocess_v3/baseline.py``.
 MODE_TIMEOUTS: dict[str, int] = {
     "correctness": int(os.environ.get("GEAK_BENCH_TIMEOUT", os.environ.get("GEAK_CORRECTNESS_TIMEOUT", "900"))),
     "profile": int(os.environ.get("GEAK_PROFILE_TIMEOUT", "120")),
@@ -64,12 +68,28 @@ def _build_env(
     gpu_id: int,
     env_overrides: dict[str, str] | None,
 ) -> dict[str, str]:
-    """Build subprocess environment matching COMMANDMENT SETUP conventions."""
+    """Build subprocess environment matching COMMANDMENT SETUP conventions.
+
+    ``repo_root`` here is the directory the harness must evaluate — at
+    optimization time this is the per-slot worktree, so we (a) put it first on
+    ``PYTHONPATH`` and (b) export it as ``GEAK_WORK_DIR``. A contract-compliant
+    harness resolves every repo path from ``GEAK_WORK_DIR``, which makes each
+    parallel slot self-contained: imports and build artifacts stay inside that
+    slot's worktree. We deliberately do NOT mutate global site-packages (no
+    ``pip install -e``), so the original sglang/vllm/aiter install is untouched
+    and parallel slots can never clobber a shared editable install.
+    """
     env = os.environ.copy()
 
     if repo_root:
         existing = env.get("PYTHONPATH", "")
         env["PYTHONPATH"] = f"{repo_root}:{existing}" if existing else repo_root
+        # Worktree-awareness contract: harnesses MUST read GEAK_WORK_DIR to
+        # locate the patched source. Setting it unconditionally also turns any
+        # ``os.environ.get("GEAK_WORK_DIR", "<baseline>")`` fallback into a
+        # no-op (the env already holds the right per-slot worktree).
+        env["GEAK_WORK_DIR"] = repo_root
+        env.setdefault("GEAK_REPO_ROOT", repo_root)
 
     env["HIP_VISIBLE_DEVICES"] = str(gpu_id)
     env["PYTHONUNBUFFERED"] = "1"

diff --git a/src/minisweagent/run/preprocess/unit_test_agent.py b/src/minisweagent/run/preprocess/unit_test_agent.py
@@ -31,6 +31,52 @@ def __init__(self, model: Model, env: Environment, **kwargs):
 _GEAK_PREPROCESS_INSTRUCTIONS_RELATIVE_PATH = "src/minisweagent/run/preprocess/INSTRUCTIONS.md"
 
 
+# Language-agnostic worktree-path rule injected into EVERY harness-generation
+# context (not per-kernel-language). This is the single most important harness
+# invariant: resolve all repo paths from $GEAK_WORK_DIR so the patched worktree
+# (not the baseline) is evaluated, keep parallel slots isolated, and never touch
+# the global environment. A static validator enforces it and triggers
+# regeneration on violation.
+_WORKTREE_PATH_DISCIPLINE = (
+    "## Worktree Path Discipline (MANDATORY — all kernel languages)\n"
+    "GEAK evaluates each candidate inside a per-slot worktree exported as "
+    "`$GEAK_WORK_DIR` (placed first on PYTHONPATH; SETUP `cd`s into it). The "
+    "harness MUST resolve EVERY repository path from `$GEAK_WORK_DIR`, never a "
+    "hardcoded absolute source path — otherwise it compiles/imports the UNPATCHED "
+    "baseline and every speedup silently reads ~1.00x.\n"
+    "- Derive once: "
+    "`WORK_DIR = os.environ.get('GEAK_WORK_DIR', os.path.dirname(os.path.abspath(__file__)))`.\n"
+    "- C/C++/HIP/CUDA/CK: build include flags as `f'-I{WORK_DIR}/<subdir>'`; put "
+    "compiled artifacts in a DETERMINISTIC build dir UNDER `WORK_DIR` (e.g. "
+    "`f'{WORK_DIR}/_geak_build'`). It is per-worktree-isolated because `WORK_DIR` "
+    "already differs per slot, so use a FIXED name (NOT a random/unique-per-run "
+    "dir) so repeated invocations in the same worktree reuse compiled objects. "
+    "Do an INCREMENTAL rebuild keyed on source mtime/hash: rebuild only when the "
+    "kernel source is newer than the built artifact (so a patched kernel is always "
+    "recompiled), otherwise reuse the cached build. NEVER do an unconditional cold "
+    "rebuild every run — for big compiled repos that turns harness validation into "
+    "a multi-hour recompile loop.\n"
+    "- Python: rely on PYTHONPATH; do NOT `sys.path.insert(0, '/abs')`; if you "
+    "must touch sys.path, derive it from `WORK_DIR` (e.g. `f'{WORK_DIR}/python'`). "
+    "NEVER add a hardcoded source-repo path as a sys.path fallback/candidate — not "
+    "even as an element of a candidate list/tuple. The worktree-derived entry must "
+    "be the ONLY one you insert; the sole permitted absolute literal anywhere is the "
+    "default arg of `os.environ.get('GEAK_WORK_DIR', <default>)`. A second hardcoded "
+    "candidate (e.g. `['/repo/python']`) will, via `sys.path.insert(0, ...)` in a "
+    "loop, end up AHEAD of the worktree and silently import the unpatched baseline.\n"
+    "- Do NOT `pip install -e` and do NOT mutate the global environment; GEAK "
+    "manages import resolution per-worktree so the original install is untouched "
+    "and parallel slots never collide.\n"
+    "- Shape budget (MANDATORY, keeps validation bounded): read "
+    "`_MAX_SHAPES = int(os.environ.get('GEAK_MAX_BENCHMARK_SHAPES', '0') or 0)` once. "
+    "When `_MAX_SHAPES > 0`, EVERY mode — including `--full-benchmark` — must cap its "
+    "selected configs to at most `_MAX_SHAPES` via the same positional `_pick` "
+    "(`--full-benchmark -> _pick(all_configs, _MAX_SHAPES)`). When unset/0, keep full "
+    "coverage. A `--full-benchmark` that ignores this budget times out during "
+    "validation and is rejected."
+)
+
+
 def _extract_test_command(text: str) -> str:
     match = re.search(r"TEST_COMMAND:\s*(.+)\s*$", text.strip(), re.MULTILINE)
     if not match:
@@ -62,6 +108,12 @@ def _extract_test_command(text: str) -> str:
         "This is a HIP kernel (C++ compiled with hipcc).\n"
         "- A build step is REQUIRED before running tests.\n"
         "- Use the project's build system (CMake/Makefile) or compile with `hipcc` directly.\n"
+        "- CACHE the build: compile into a deterministic dir under `$GEAK_WORK_DIR` "
+        "(e.g. `$GEAK_WORK_DIR/_geak_build`) and rebuild ONLY when the kernel source "
+        "is newer than the built artifact (mtime/hash check). The harness is invoked "
+        "many times during validation (correctness + N benchmark repeats); an "
+        "unconditional cold rebuild each invocation makes a large repo take hours. A "
+        "patched kernel has a newer mtime, so the incremental check still recompiles it.\n"
         "- Use host-side validation (compare GPU output against CPU reference).\n"
         "- Use `hipEventElapsedTime` or `torch.cuda.Event` for benchmarking.\n"
         "- NEVER use `sys.path.insert(0, '/absolute/path/...')`. "
@@ -71,6 +123,11 @@ def _extract_test_command(text: str) -> str:
         "This is a CUDA kernel (C++ compiled with nvcc).\n"
         "- A build step is REQUIRED before running tests.\n"
         "- Use the project's build system (CMake/Makefile) or compile with `nvcc` directly.\n"
+        "- CACHE the build: compile into a deterministic dir under `$GEAK_WORK_DIR` "
+        "and rebuild ONLY when the kernel source is newer than the built artifact "
+        "(mtime/hash). An unconditional cold rebuild on each of the many validation "
+        "invocations makes a large repo take hours; a patched kernel still recompiles "
+        "because its mtime is newer.\n"
         "- Use host-side validation (compare GPU output against CPU reference).\n"
         "- Use `cudaEventElapsedTime` or `torch.cuda.Event` for benchmarking."
     ),
@@ -153,6 +210,10 @@ def format_discovery_for_agent(result) -> str:
             lines.append(guidance)
             lines.append("")
 
+        # Universal worktree-path rule (language-agnostic; always injected).
+        lines.append(_WORKTREE_PATH_DISCIPLINE)
+        lines.append("")
+
     # --- FILES YOU MUST READ ---
     must_read: list[tuple[str, str]] = []
     for b in result.benchmarks[:3]: