From 4a83394332d2da93dcb14c3bac3f4ab2c2c5f5b4 Mon Sep 17 00:00:00 2001
From: Ed Savage <ed.savage@elastic.co>
Date: Mon, 23 Mar 2026 09:18:02 +1300
Subject: [PATCH 1/9] Reapply "[ML] Run allowlist validation in PyTorch edge
 pipeline (#2989)" (#3005)

This reverts commit 9cc49ff7a6f96089c2cd98d3d13d715418eb8216.
---
 .buildkite/scripts/steps/run_tests.sh | 26 +-------------------------
 1 file changed, 1 insertion(+), 25 deletions(-)

diff --git a/.buildkite/scripts/steps/run_tests.sh b/.buildkite/scripts/steps/run_tests.sh
index 0c5c08125..12b88c1bb 100755
--- a/.buildkite/scripts/steps/run_tests.sh
+++ b/.buildkite/scripts/steps/run_tests.sh
@@ -105,28 +105,6 @@ else
         -P cmake/run-all-tests-parallel.cmake || TEST_OUTCOME=$?
 fi
 
-# --- PyTorch allowlist validation ---
-# When triggered from the PyTorch edge pipeline, run the Python-based
-# allowlist validation which traces live HuggingFace models with the
-# new PyTorch version and verifies every op is in ALLOWED_OPERATIONS.
-VALIDATION_OUTCOME=0
-if [[ "${GITHUB_PR_COMMENT_VAR_ACTION:-}" == "run_pytorch_tests" ]] && [ -f cmake/run-validation.cmake ]; then
-    echo "--- Validating PyTorch allowlist against HuggingFace models"
-    cmake \
-        -DSOURCE_DIR="$(pwd)" \
-        -DVALIDATE_CONFIG="$(pwd)/dev-tools/extract_model_ops/validation_models.json" \
-        -DVALIDATE_PT_DIR="$(pwd)/dev-tools/extract_model_ops/es_it_models" \
-        -DVALIDATE_VERBOSE=TRUE \
-        -DOPTIONAL=TRUE \
-        -P cmake/run-validation.cmake || VALIDATION_OUTCOME=$?
-
-    if [[ $VALIDATION_OUTCOME -ne 0 ]]; then
-        echo "^^^ +++"
-        echo "Allowlist validation failed — the new PyTorch version may introduce ops not in ALLOWED_OPERATIONS."
-        echo "See dev-tools/extract_model_ops/README.md for how to update the allowlist."
-    fi
-fi
-
 # Upload test results
 echo "--- Uploading test results"
 TEST_RESULTS_ARCHIVE=${OS}-${HARDWARE_ARCH}-unit_test_results.tgz
@@ -139,6 +117,4 @@ else
     echo "No test results archive created"
 fi
 
-if [[ $TEST_OUTCOME -ne 0 || $VALIDATION_OUTCOME -ne 0 ]]; then
-    exit 1
-fi
+exit $TEST_OUTCOME

From 1580b8f2623cb635e9f7cb1464e2711a9df5e341 Mon Sep 17 00:00:00 2001
From: Ed Savage <ed.savage@elastic.co>
Date: Wed, 25 Mar 2026 13:09:23 +1300
Subject: [PATCH 2/9] [ML] Run PyTorch allowlist validation as a dedicated
 Buildkite step

The Linux build/test Docker images don't include Python 3 (it's only
used during image builds to compile PyTorch, then dropped in the
multi-stage final image). Move the validation to a dedicated pipeline
step using a python:3 agent image, triggered only for
run_pytorch_tests builds.

Made-with: Cursor
---
 .buildkite/pipeline.json.py                   |  8 +++++
 .../validate_pytorch_allowlist.yml.sh         | 36 +++++++++++++++++++
 2 files changed, 44 insertions(+)
 create mode 100755 .buildkite/pipelines/validate_pytorch_allowlist.yml.sh

diff --git a/.buildkite/pipeline.json.py b/.buildkite/pipeline.json.py
index 9b6d6616e..0ae577685 100755
--- a/.buildkite/pipeline.json.py
+++ b/.buildkite/pipeline.json.py
@@ -84,6 +84,14 @@ def main():
                                                        ".buildkite/pipelines/check_build_regression.yml.sh",
                                                        soft_fail=True))
 
+    # Validate the PyTorch allowlist against HuggingFace models when
+    # triggered from the PyTorch edge pipeline.  Runs in a python:3
+    # container since the build/test images don't include Python.
+    if config.run_pytorch_tests:
+        pipeline_steps.append(pipeline_steps.generate_step("Upload PyTorch allowlist validation",
+                                                           ".buildkite/pipelines/validate_pytorch_allowlist.yml.sh",
+                                                           soft_fail=True))
+
     pipeline["env"] = env
     pipeline["steps"] = pipeline_steps
     print(json.dumps(pipeline, indent=2))
diff --git a/.buildkite/pipelines/validate_pytorch_allowlist.yml.sh b/.buildkite/pipelines/validate_pytorch_allowlist.yml.sh
new file mode 100755
index 000000000..5f9d50a6c
--- /dev/null
+++ b/.buildkite/pipelines/validate_pytorch_allowlist.yml.sh
@@ -0,0 +1,36 @@
+#!/bin/bash
+# Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
+# or more contributor license agreements. Licensed under the Elastic License
+# 2.0 and the following additional limitation. Functionality enabled by the
+# files subject to the Elastic License 2.0 may only be used in production when
+# invoked by an Elasticsearch process with a license key installed that permits
+# use of machine learning features. You may not use this file except in
+# compliance with the Elastic License 2.0 and the foregoing additional
+# limitation.
+
+cat <<'EOL'
+steps:
+  - label: "Validate PyTorch allowlist :torch:"
+    key: "validate_pytorch_allowlist"
+    command:
+        - "if [ ! -f dev-tools/extract_model_ops/validate_allowlist.py ]; then echo 'validate_allowlist.py not found, skipping'; exit 0; fi"
+        - "pip install -q -r dev-tools/extract_model_ops/requirements.txt"
+        - "python3 dev-tools/extract_model_ops/validate_allowlist.py --config dev-tools/extract_model_ops/validation_models.json --pt-dir dev-tools/extract_model_ops/es_it_models --verbose"
+EOL
+
+# Depend on the build steps so validation doesn't start before the
+# pipeline is fully generated.
+if [ -n "${ML_BUILD_STEP_KEYS:-}" ]; then
+    echo '    depends_on:'
+    IFS=',' read -ra STEP_KEYS <<< "$ML_BUILD_STEP_KEYS"
+    for key in "${STEP_KEYS[@]}"; do
+        echo "        - \"${key}\""
+    done
+fi
+
+cat <<'EOL'
+    allow_dependency_failure: true
+    soft_fail: true
+    agents:
+      image: "python:3"
+EOL

From 821afc7ae752730767a38fc165c33d75fa682e82 Mon Sep 17 00:00:00 2001
From: Ed Savage <ed.savage@elastic.co>
Date: Wed, 25 Mar 2026 15:38:44 +1300
Subject: [PATCH 3/9] [ML] Pin validation step to python:3.12 for torch 2.7.1
 compatibility

The python:3 tag now resolves to Python 3.14, which doesn't have
torch==2.7.1 wheels. Pin to python:3.12 to match the PyTorch
version we build and ship against.

Made-with: Cursor
---
 .buildkite/pipelines/validate_pytorch_allowlist.yml.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.buildkite/pipelines/validate_pytorch_allowlist.yml.sh b/.buildkite/pipelines/validate_pytorch_allowlist.yml.sh
index 5f9d50a6c..62277c322 100755
--- a/.buildkite/pipelines/validate_pytorch_allowlist.yml.sh
+++ b/.buildkite/pipelines/validate_pytorch_allowlist.yml.sh
@@ -32,5 +32,5 @@ cat <<'EOL'
     allow_dependency_failure: true
     soft_fail: true
     agents:
-      image: "python:3"
+      image: "python:3.12"
 EOL

From a0a0d5656bb6650bf414cd9769a3b9d91380f864 Mon Sep 17 00:00:00 2001
From: Ed Savage <ed.savage@elastic.co>
Date: Wed, 25 Mar 2026 16:19:12 +1300
Subject: [PATCH 4/9] [ML] Add resources and timeout to PyTorch validation step
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The step was being killed (exit -1) with no output — likely OOM or
disk exhaustion from installing torch (800MB+) and tracing 27+ models.

Add memory (16G), ephemeral storage (20G), and a 60-minute timeout.
Remove -q from pip install so progress is visible in logs.

Made-with: Cursor
---
 .buildkite/pipelines/validate_pytorch_allowlist.yml.sh | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/.buildkite/pipelines/validate_pytorch_allowlist.yml.sh b/.buildkite/pipelines/validate_pytorch_allowlist.yml.sh
index 62277c322..df6b9b724 100755
--- a/.buildkite/pipelines/validate_pytorch_allowlist.yml.sh
+++ b/.buildkite/pipelines/validate_pytorch_allowlist.yml.sh
@@ -12,9 +12,10 @@ cat <<'EOL'
 steps:
   - label: "Validate PyTorch allowlist :torch:"
     key: "validate_pytorch_allowlist"
+    timeout_in_minutes: 60
     command:
         - "if [ ! -f dev-tools/extract_model_ops/validate_allowlist.py ]; then echo 'validate_allowlist.py not found, skipping'; exit 0; fi"
-        - "pip install -q -r dev-tools/extract_model_ops/requirements.txt"
+        - "pip install -r dev-tools/extract_model_ops/requirements.txt"
         - "python3 dev-tools/extract_model_ops/validate_allowlist.py --config dev-tools/extract_model_ops/validation_models.json --pt-dir dev-tools/extract_model_ops/es_it_models --verbose"
 EOL
 
@@ -33,4 +34,6 @@ cat <<'EOL'
     soft_fail: true
     agents:
       image: "python:3.12"
+      memory: "16G"
+      ephemeralStorage: "20G"
 EOL

From 26715e1c506d056d1bbf8eb543492ad96981dfbb Mon Sep 17 00:00:00 2001
From: Ed Savage <ed.savage@elastic.co>
Date: Wed, 25 Mar 2026 16:31:18 +1300
Subject: [PATCH 5/9] [ML] Make PyTorch allowlist validation a hard failure
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The validation step should fail the build if it detects allowlist
errors — that's the whole point of running it. The upload step
retains soft_fail in case of pipeline upload issues.

Made-with: Cursor
---
 .buildkite/pipelines/validate_pytorch_allowlist.yml.sh | 1 -
 1 file changed, 1 deletion(-)

diff --git a/.buildkite/pipelines/validate_pytorch_allowlist.yml.sh b/.buildkite/pipelines/validate_pytorch_allowlist.yml.sh
index df6b9b724..42f67f0be 100755
--- a/.buildkite/pipelines/validate_pytorch_allowlist.yml.sh
+++ b/.buildkite/pipelines/validate_pytorch_allowlist.yml.sh
@@ -31,7 +31,6 @@ fi
 
 cat <<'EOL'
     allow_dependency_failure: true
-    soft_fail: true
     agents:
       image: "python:3.12"
       memory: "16G"

From 905341407589fdf0feb911e37b1964214c79190c Mon Sep 17 00:00:00 2001
From: Ed Savage <ed.savage@elastic.co>
Date: Wed, 25 Mar 2026 16:41:31 +1300
Subject: [PATCH 6/9] [ML] Add GitHub commit status for PyTorch validation step

Without a notify/github_commit_status block, the step doesn't
appear as a check on the GitHub PR. Add it so the validation
result is visible alongside the other build/test checks.

Made-with: Cursor
---
 .buildkite/pipelines/validate_pytorch_allowlist.yml.sh | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/.buildkite/pipelines/validate_pytorch_allowlist.yml.sh b/.buildkite/pipelines/validate_pytorch_allowlist.yml.sh
index 42f67f0be..25be46ae7 100755
--- a/.buildkite/pipelines/validate_pytorch_allowlist.yml.sh
+++ b/.buildkite/pipelines/validate_pytorch_allowlist.yml.sh
@@ -35,4 +35,7 @@ cat <<'EOL'
       image: "python:3.12"
       memory: "16G"
       ephemeralStorage: "20G"
+    notify:
+      - github_commit_status:
+          context: "Validate PyTorch allowlist"
 EOL

From e1ff7dad8562667bcdc3be90e1a2a040f74387bf Mon Sep 17 00:00:00 2001
From: Ed Savage <ed.savage@elastic.co>
Date: Thu, 26 Mar 2026 10:56:43 +1300
Subject: [PATCH 7/9] [ML] Treat model load failures as skips, not validation
 failures
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Private Elastic models on HuggingFace (elastic/elser-v2, etc.) can't
be downloaded without a HF_TOKEN, causing the validation step to fail
in CI even though the ops are correct.

Change validate_model() and validate_pt_file() to return
"pass"/"fail"/"skip" — load/trace failures are reported as skips
(warnings) while op validation failures remain hard failures. Only
"fail" results make the script exit non-zero. Also pass auto_class
and config_overrides through to support BART and QA models.

Made-with: Cursor
---
 .../extract_model_ops/validate_allowlist.py   | 67 ++++++++++++-------
 1 file changed, 43 insertions(+), 24 deletions(-)

diff --git a/dev-tools/extract_model_ops/validate_allowlist.py b/dev-tools/extract_model_ops/validate_allowlist.py
index dfb39021a..53e6ff72c 100644
--- a/dev-tools/extract_model_ops/validate_allowlist.py
+++ b/dev-tools/extract_model_ops/validate_allowlist.py
@@ -104,30 +104,41 @@ def validate_model(model_name: str,
                    allowed: set[str],
                    forbidden: set[str],
                    verbose: bool,
-                   quantize: bool = False) -> bool:
-    """Validate one HuggingFace model. Returns True if all ops pass."""
+                   quantize: bool = False,
+                   auto_class: str | None = None,
+                   config_overrides: dict | None = None) -> str:
+    """Validate one HuggingFace model.
+
+    Returns "pass", "fail" (op validation failed), or "skip" (could not
+    load/trace — e.g. private model without HF_TOKEN).
+    """
     label = f"{model_name} (quantized)" if quantize else model_name
     print(f"  {label}...", file=sys.stderr)
-    traced = load_and_trace_hf_model(model_name, quantize=quantize)
+    traced = load_and_trace_hf_model(model_name, quantize=quantize,
+                                     auto_class=auto_class,
+                                     config_overrides=config_overrides)
     if traced is None:
-        print(f"    FAILED (could not load/trace)", file=sys.stderr)
-        return False
+        print(f"    SKIPPED (could not load/trace)", file=sys.stderr)
+        return "skip"
     ops = collect_inlined_ops(traced)
-    return check_ops(ops, allowed, forbidden, verbose)
+    return "pass" if check_ops(ops, allowed, forbidden, verbose) else "fail"
 
 
 def validate_pt_file(name: str,
                      pt_path: str,
                      allowed: set[str],
                      forbidden: set[str],
-                     verbose: bool) -> bool:
-    """Validate a local TorchScript .pt file. Returns True if all ops pass."""
+                     verbose: bool) -> str:
+    """Validate a local TorchScript .pt file.
+
+    Returns "pass", "fail", or "skip".
+    """
     print(f"  {name} ({pt_path})...", file=sys.stderr)
     ops = load_pt_and_collect_ops(pt_path)
     if ops is None:
-        print(f"    FAILED (could not load)", file=sys.stderr)
-        return False
-    return check_ops(ops, allowed, forbidden, verbose)
+        print(f"    SKIPPED (could not load)", file=sys.stderr)
+        return "skip"
+    return "pass" if check_ops(ops, allowed, forbidden, verbose) else "fail"
 
 
 def main():
@@ -151,7 +162,7 @@ def main():
     print(f"Parsed {len(allowed)} allowed ops and {len(forbidden)} "
           f"forbidden ops from {SUPPORTED_OPS_CC.name}", file=sys.stderr)
 
-    results: dict[str, bool] = {}
+    results: dict[str, str] = {}
 
     models = load_model_config(args.config)
 
@@ -161,7 +172,9 @@ def main():
     for arch, spec in models.items():
         results[arch] = validate_model(
             spec["model_id"], allowed, forbidden, args.verbose,
-            quantize=spec["quantized"])
+            quantize=spec["quantized"],
+            auto_class=spec.get("auto_class"),
+            config_overrides=spec.get("config_overrides"))
 
     if args.pt_dir and args.pt_dir.is_dir():
         pt_files = sorted(args.pt_dir.glob("*.pt"))
@@ -175,26 +188,32 @@ def main():
 
     print(file=sys.stderr)
     print("=" * 60, file=sys.stderr)
-    all_pass = all(results.values())
-    for key, passed in results.items():
-        status = "PASS" if passed else "FAIL"
+    for key, status in results.items():
+        display = status.upper()
         if key.startswith("pt:"):
-            print(f"  {key}: {status}", file=sys.stderr)
+            print(f"  {key}: {display}", file=sys.stderr)
         else:
             spec = models[key]
             label = spec["model_id"]
             if spec["quantized"]:
                 label += " (quantized)"
-            print(f"  {key} ({label}): {status}", file=sys.stderr)
+            print(f"  {key} ({label}): {display}", file=sys.stderr)
+
+    failed = [a for a, s in results.items() if s == "fail"]
+    skipped = [a for a, s in results.items() if s == "skip"]
+    passed = [a for a, s in results.items() if s == "pass"]
 
     print("=" * 60, file=sys.stderr)
-    if all_pass:
-        print("All models PASS - no false positives.", file=sys.stderr)
-    else:
-        failed = [a for a, p in results.items() if not p]
-        print(f"FAILED models: {', '.join(failed)}", file=sys.stderr)
+    print(f"{len(passed)} passed, {len(failed)} failed, "
+          f"{len(skipped)} skipped", file=sys.stderr)
+
+    if skipped:
+        print(f"Skipped (could not load/trace — may need HF_TOKEN "
+              f"for private models): {', '.join(skipped)}", file=sys.stderr)
+    if failed:
+        print(f"FAILED (op validation): {', '.join(failed)}", file=sys.stderr)
 
-    sys.exit(0 if all_pass else 1)
+    sys.exit(0 if not failed else 1)
 
 
 if __name__ == "__main__":

From cbc8201685dd992d60cac21a9b5ca3ddf4f4bd50 Mon Sep 17 00:00:00 2001
From: Ed Savage <ed.savage@elastic.co>
Date: Thu, 26 Mar 2026 12:47:16 +1300
Subject: [PATCH 8/9] [ML] Increase validation step resources for large models
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

facebook/bart-large-mnli is 1.63GB — loading it into memory for
tracing after torch and 30 other models exhausted the 16GB limit.
Bump to 32GB memory and 30GB ephemeral storage.

Made-with: Cursor
---
 .buildkite/pipelines/validate_pytorch_allowlist.yml.sh | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/.buildkite/pipelines/validate_pytorch_allowlist.yml.sh b/.buildkite/pipelines/validate_pytorch_allowlist.yml.sh
index 25be46ae7..ef2829976 100755
--- a/.buildkite/pipelines/validate_pytorch_allowlist.yml.sh
+++ b/.buildkite/pipelines/validate_pytorch_allowlist.yml.sh
@@ -33,8 +33,8 @@ cat <<'EOL'
     allow_dependency_failure: true
     agents:
       image: "python:3.12"
-      memory: "16G"
-      ephemeralStorage: "20G"
+      memory: "32G"
+      ephemeralStorage: "30G"
     notify:
       - github_commit_status:
           context: "Validate PyTorch allowlist"

From 419c3a5a166ee0be64f6d1b84e0b0cb4ee103ccc Mon Sep 17 00:00:00 2001
From: Ed Savage <ed.savage@elastic.co>
Date: Thu, 26 Mar 2026 12:50:53 +1300
Subject: [PATCH 9/9] [ML] Free model memory after tracing to reduce peak usage

When validating 30+ models sequentially, the HF model weights
accumulate in memory. Explicitly delete the original model,
tokenizer, and inputs after tracing, and call gc.collect() after
each validation to release memory promptly. This should allow the
validation step to complete within 32GB for the full model set
including facebook/bart-large-mnli (1.63GB).

Made-with: Cursor
---
 dev-tools/extract_model_ops/torchscript_utils.py  | 9 +++++++--
 dev-tools/extract_model_ops/validate_allowlist.py | 6 +++++-
 2 files changed, 12 insertions(+), 3 deletions(-)

diff --git a/dev-tools/extract_model_ops/torchscript_utils.py b/dev-tools/extract_model_ops/torchscript_utils.py
index af2b30f68..da8fb481b 100644
--- a/dev-tools/extract_model_ops/torchscript_utils.py
+++ b/dev-tools/extract_model_ops/torchscript_utils.py
@@ -145,13 +145,18 @@ def load_and_trace_hf_model(model_name: str, quantize: bool = False,
     attention_mask = inputs["attention_mask"]
 
     try:
-        return torch.jit.trace(
+        traced = torch.jit.trace(
             model, (input_ids, attention_mask), strict=False)
     except Exception as exc:
         print(f"    TRACE WARNING: {exc}", file=sys.stderr)
         print("    Falling back to torch.jit.script...", file=sys.stderr)
         try:
-            return torch.jit.script(model)
+            traced = torch.jit.script(model)
         except Exception as exc2:
             print(f"    SCRIPT ERROR: {exc2}", file=sys.stderr)
             return None
+
+    # Free the original HF model to reduce peak memory when validating
+    # many models sequentially.
+    del model, tokenizer, inputs
+    return traced
diff --git a/dev-tools/extract_model_ops/validate_allowlist.py b/dev-tools/extract_model_ops/validate_allowlist.py
index 53e6ff72c..d7a1ba99c 100644
--- a/dev-tools/extract_model_ops/validate_allowlist.py
+++ b/dev-tools/extract_model_ops/validate_allowlist.py
@@ -29,6 +29,7 @@
 """
 
 import argparse
+import gc
 import re
 import sys
 from pathlib import Path
@@ -121,7 +122,10 @@ def validate_model(model_name: str,
         print(f"    SKIPPED (could not load/trace)", file=sys.stderr)
         return "skip"
     ops = collect_inlined_ops(traced)
-    return "pass" if check_ops(ops, allowed, forbidden, verbose) else "fail"
+    result = "pass" if check_ops(ops, allowed, forbidden, verbose) else "fail"
+    del traced
+    gc.collect()
+    return result
 
 
 def validate_pt_file(name: str,