From 4a83394332d2da93dcb14c3bac3f4ab2c2c5f5b4 Mon Sep 17 00:00:00 2001 From: Ed Savage Date: Mon, 23 Mar 2026 09:18:02 +1300 Subject: [PATCH 1/9] Reapply "[ML] Run allowlist validation in PyTorch edge pipeline (#2989)" (#3005) This reverts commit 9cc49ff7a6f96089c2cd98d3d13d715418eb8216. --- .buildkite/scripts/steps/run_tests.sh | 26 +------------------------- 1 file changed, 1 insertion(+), 25 deletions(-) diff --git a/.buildkite/scripts/steps/run_tests.sh b/.buildkite/scripts/steps/run_tests.sh index 0c5c08125..12b88c1bb 100755 --- a/.buildkite/scripts/steps/run_tests.sh +++ b/.buildkite/scripts/steps/run_tests.sh @@ -105,28 +105,6 @@ else -P cmake/run-all-tests-parallel.cmake || TEST_OUTCOME=$? fi -# --- PyTorch allowlist validation --- -# When triggered from the PyTorch edge pipeline, run the Python-based -# allowlist validation which traces live HuggingFace models with the -# new PyTorch version and verifies every op is in ALLOWED_OPERATIONS. -VALIDATION_OUTCOME=0 -if [[ "${GITHUB_PR_COMMENT_VAR_ACTION:-}" == "run_pytorch_tests" ]] && [ -f cmake/run-validation.cmake ]; then - echo "--- Validating PyTorch allowlist against HuggingFace models" - cmake \ - -DSOURCE_DIR="$(pwd)" \ - -DVALIDATE_CONFIG="$(pwd)/dev-tools/extract_model_ops/validation_models.json" \ - -DVALIDATE_PT_DIR="$(pwd)/dev-tools/extract_model_ops/es_it_models" \ - -DVALIDATE_VERBOSE=TRUE \ - -DOPTIONAL=TRUE \ - -P cmake/run-validation.cmake || VALIDATION_OUTCOME=$? - - if [[ $VALIDATION_OUTCOME -ne 0 ]]; then - echo "^^^ +++" - echo "Allowlist validation failed — the new PyTorch version may introduce ops not in ALLOWED_OPERATIONS." - echo "See dev-tools/extract_model_ops/README.md for how to update the allowlist." - fi -fi - # Upload test results echo "--- Uploading test results" TEST_RESULTS_ARCHIVE=${OS}-${HARDWARE_ARCH}-unit_test_results.tgz @@ -139,6 +117,4 @@ else echo "No test results archive created" fi -if [[ $TEST_OUTCOME -ne 0 || $VALIDATION_OUTCOME -ne 0 ]]; then - exit 1 -fi +exit $TEST_OUTCOME From 1580b8f2623cb635e9f7cb1464e2711a9df5e341 Mon Sep 17 00:00:00 2001 From: Ed Savage Date: Wed, 25 Mar 2026 13:09:23 +1300 Subject: [PATCH 2/9] [ML] Run PyTorch allowlist validation as a dedicated Buildkite step The Linux build/test Docker images don't include Python 3 (it's only used during image builds to compile PyTorch, then dropped in the multi-stage final image). Move the validation to a dedicated pipeline step using a python:3 agent image, triggered only for run_pytorch_tests builds. Made-with: Cursor --- .buildkite/pipeline.json.py | 8 +++++ .../validate_pytorch_allowlist.yml.sh | 36 +++++++++++++++++++ 2 files changed, 44 insertions(+) create mode 100755 .buildkite/pipelines/validate_pytorch_allowlist.yml.sh diff --git a/.buildkite/pipeline.json.py b/.buildkite/pipeline.json.py index 9b6d6616e..0ae577685 100755 --- a/.buildkite/pipeline.json.py +++ b/.buildkite/pipeline.json.py @@ -84,6 +84,14 @@ def main(): ".buildkite/pipelines/check_build_regression.yml.sh", soft_fail=True)) + # Validate the PyTorch allowlist against HuggingFace models when + # triggered from the PyTorch edge pipeline. Runs in a python:3 + # container since the build/test images don't include Python. + if config.run_pytorch_tests: + pipeline_steps.append(pipeline_steps.generate_step("Upload PyTorch allowlist validation", + ".buildkite/pipelines/validate_pytorch_allowlist.yml.sh", + soft_fail=True)) + pipeline["env"] = env pipeline["steps"] = pipeline_steps print(json.dumps(pipeline, indent=2)) diff --git a/.buildkite/pipelines/validate_pytorch_allowlist.yml.sh b/.buildkite/pipelines/validate_pytorch_allowlist.yml.sh new file mode 100755 index 000000000..5f9d50a6c --- /dev/null +++ b/.buildkite/pipelines/validate_pytorch_allowlist.yml.sh @@ -0,0 +1,36 @@ +#!/bin/bash +# Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one +# or more contributor license agreements. Licensed under the Elastic License +# 2.0 and the following additional limitation. Functionality enabled by the +# files subject to the Elastic License 2.0 may only be used in production when +# invoked by an Elasticsearch process with a license key installed that permits +# use of machine learning features. You may not use this file except in +# compliance with the Elastic License 2.0 and the foregoing additional +# limitation. + +cat <<'EOL' +steps: + - label: "Validate PyTorch allowlist :torch:" + key: "validate_pytorch_allowlist" + command: + - "if [ ! -f dev-tools/extract_model_ops/validate_allowlist.py ]; then echo 'validate_allowlist.py not found, skipping'; exit 0; fi" + - "pip install -q -r dev-tools/extract_model_ops/requirements.txt" + - "python3 dev-tools/extract_model_ops/validate_allowlist.py --config dev-tools/extract_model_ops/validation_models.json --pt-dir dev-tools/extract_model_ops/es_it_models --verbose" +EOL + +# Depend on the build steps so validation doesn't start before the +# pipeline is fully generated. +if [ -n "${ML_BUILD_STEP_KEYS:-}" ]; then + echo ' depends_on:' + IFS=',' read -ra STEP_KEYS <<< "$ML_BUILD_STEP_KEYS" + for key in "${STEP_KEYS[@]}"; do + echo " - \"${key}\"" + done +fi + +cat <<'EOL' + allow_dependency_failure: true + soft_fail: true + agents: + image: "python:3" +EOL From 821afc7ae752730767a38fc165c33d75fa682e82 Mon Sep 17 00:00:00 2001 From: Ed Savage Date: Wed, 25 Mar 2026 15:38:44 +1300 Subject: [PATCH 3/9] [ML] Pin validation step to python:3.12 for torch 2.7.1 compatibility The python:3 tag now resolves to Python 3.14, which doesn't have torch==2.7.1 wheels. Pin to python:3.12 to match the PyTorch version we build and ship against. Made-with: Cursor --- .buildkite/pipelines/validate_pytorch_allowlist.yml.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.buildkite/pipelines/validate_pytorch_allowlist.yml.sh b/.buildkite/pipelines/validate_pytorch_allowlist.yml.sh index 5f9d50a6c..62277c322 100755 --- a/.buildkite/pipelines/validate_pytorch_allowlist.yml.sh +++ b/.buildkite/pipelines/validate_pytorch_allowlist.yml.sh @@ -32,5 +32,5 @@ cat <<'EOL' allow_dependency_failure: true soft_fail: true agents: - image: "python:3" + image: "python:3.12" EOL From a0a0d5656bb6650bf414cd9769a3b9d91380f864 Mon Sep 17 00:00:00 2001 From: Ed Savage Date: Wed, 25 Mar 2026 16:19:12 +1300 Subject: [PATCH 4/9] [ML] Add resources and timeout to PyTorch validation step MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The step was being killed (exit -1) with no output — likely OOM or disk exhaustion from installing torch (800MB+) and tracing 27+ models. Add memory (16G), ephemeral storage (20G), and a 60-minute timeout. Remove -q from pip install so progress is visible in logs. Made-with: Cursor --- .buildkite/pipelines/validate_pytorch_allowlist.yml.sh | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/.buildkite/pipelines/validate_pytorch_allowlist.yml.sh b/.buildkite/pipelines/validate_pytorch_allowlist.yml.sh index 62277c322..df6b9b724 100755 --- a/.buildkite/pipelines/validate_pytorch_allowlist.yml.sh +++ b/.buildkite/pipelines/validate_pytorch_allowlist.yml.sh @@ -12,9 +12,10 @@ cat <<'EOL' steps: - label: "Validate PyTorch allowlist :torch:" key: "validate_pytorch_allowlist" + timeout_in_minutes: 60 command: - "if [ ! -f dev-tools/extract_model_ops/validate_allowlist.py ]; then echo 'validate_allowlist.py not found, skipping'; exit 0; fi" - - "pip install -q -r dev-tools/extract_model_ops/requirements.txt" + - "pip install -r dev-tools/extract_model_ops/requirements.txt" - "python3 dev-tools/extract_model_ops/validate_allowlist.py --config dev-tools/extract_model_ops/validation_models.json --pt-dir dev-tools/extract_model_ops/es_it_models --verbose" EOL @@ -33,4 +34,6 @@ cat <<'EOL' soft_fail: true agents: image: "python:3.12" + memory: "16G" + ephemeralStorage: "20G" EOL From 26715e1c506d056d1bbf8eb543492ad96981dfbb Mon Sep 17 00:00:00 2001 From: Ed Savage Date: Wed, 25 Mar 2026 16:31:18 +1300 Subject: [PATCH 5/9] [ML] Make PyTorch allowlist validation a hard failure MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The validation step should fail the build if it detects allowlist errors — that's the whole point of running it. The upload step retains soft_fail in case of pipeline upload issues. Made-with: Cursor --- .buildkite/pipelines/validate_pytorch_allowlist.yml.sh | 1 - 1 file changed, 1 deletion(-) diff --git a/.buildkite/pipelines/validate_pytorch_allowlist.yml.sh b/.buildkite/pipelines/validate_pytorch_allowlist.yml.sh index df6b9b724..42f67f0be 100755 --- a/.buildkite/pipelines/validate_pytorch_allowlist.yml.sh +++ b/.buildkite/pipelines/validate_pytorch_allowlist.yml.sh @@ -31,7 +31,6 @@ fi cat <<'EOL' allow_dependency_failure: true - soft_fail: true agents: image: "python:3.12" memory: "16G" From 905341407589fdf0feb911e37b1964214c79190c Mon Sep 17 00:00:00 2001 From: Ed Savage Date: Wed, 25 Mar 2026 16:41:31 +1300 Subject: [PATCH 6/9] [ML] Add GitHub commit status for PyTorch validation step Without a notify/github_commit_status block, the step doesn't appear as a check on the GitHub PR. Add it so the validation result is visible alongside the other build/test checks. Made-with: Cursor --- .buildkite/pipelines/validate_pytorch_allowlist.yml.sh | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.buildkite/pipelines/validate_pytorch_allowlist.yml.sh b/.buildkite/pipelines/validate_pytorch_allowlist.yml.sh index 42f67f0be..25be46ae7 100755 --- a/.buildkite/pipelines/validate_pytorch_allowlist.yml.sh +++ b/.buildkite/pipelines/validate_pytorch_allowlist.yml.sh @@ -35,4 +35,7 @@ cat <<'EOL' image: "python:3.12" memory: "16G" ephemeralStorage: "20G" + notify: + - github_commit_status: + context: "Validate PyTorch allowlist" EOL From e1ff7dad8562667bcdc3be90e1a2a040f74387bf Mon Sep 17 00:00:00 2001 From: Ed Savage Date: Thu, 26 Mar 2026 10:56:43 +1300 Subject: [PATCH 7/9] [ML] Treat model load failures as skips, not validation failures MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Private Elastic models on HuggingFace (elastic/elser-v2, etc.) can't be downloaded without a HF_TOKEN, causing the validation step to fail in CI even though the ops are correct. Change validate_model() to return "pass"/"fail"/"skip" — load/trace failures are reported as skips (warnings) while op validation failures remain hard failures. Also pass auto_class and config_overrides through to support BART and QA models. Made-with: Cursor --- .../extract_model_ops/validate_allowlist.py | 67 ++++++++++++------- 1 file changed, 43 insertions(+), 24 deletions(-) diff --git a/dev-tools/extract_model_ops/validate_allowlist.py b/dev-tools/extract_model_ops/validate_allowlist.py index dfb39021a..53e6ff72c 100644 --- a/dev-tools/extract_model_ops/validate_allowlist.py +++ b/dev-tools/extract_model_ops/validate_allowlist.py @@ -104,30 +104,41 @@ def validate_model(model_name: str, allowed: set[str], forbidden: set[str], verbose: bool, - quantize: bool = False) -> bool: - """Validate one HuggingFace model. Returns True if all ops pass.""" + quantize: bool = False, + auto_class: str | None = None, + config_overrides: dict | None = None) -> str: + """Validate one HuggingFace model. + + Returns "pass", "fail" (op validation failed), or "skip" (could not + load/trace — e.g. private model without HF_TOKEN). + """ label = f"{model_name} (quantized)" if quantize else model_name print(f" {label}...", file=sys.stderr) - traced = load_and_trace_hf_model(model_name, quantize=quantize) + traced = load_and_trace_hf_model(model_name, quantize=quantize, + auto_class=auto_class, + config_overrides=config_overrides) if traced is None: - print(f" FAILED (could not load/trace)", file=sys.stderr) - return False + print(f" SKIPPED (could not load/trace)", file=sys.stderr) + return "skip" ops = collect_inlined_ops(traced) - return check_ops(ops, allowed, forbidden, verbose) + return "pass" if check_ops(ops, allowed, forbidden, verbose) else "fail" def validate_pt_file(name: str, pt_path: str, allowed: set[str], forbidden: set[str], - verbose: bool) -> bool: - """Validate a local TorchScript .pt file. Returns True if all ops pass.""" + verbose: bool) -> str: + """Validate a local TorchScript .pt file. + + Returns "pass", "fail", or "skip". + """ print(f" {name} ({pt_path})...", file=sys.stderr) ops = load_pt_and_collect_ops(pt_path) if ops is None: - print(f" FAILED (could not load)", file=sys.stderr) - return False - return check_ops(ops, allowed, forbidden, verbose) + print(f" SKIPPED (could not load)", file=sys.stderr) + return "skip" + return "pass" if check_ops(ops, allowed, forbidden, verbose) else "fail" def main(): @@ -151,7 +162,7 @@ def main(): print(f"Parsed {len(allowed)} allowed ops and {len(forbidden)} " f"forbidden ops from {SUPPORTED_OPS_CC.name}", file=sys.stderr) - results: dict[str, bool] = {} + results: dict[str, str] = {} models = load_model_config(args.config) @@ -161,7 +172,9 @@ def main(): for arch, spec in models.items(): results[arch] = validate_model( spec["model_id"], allowed, forbidden, args.verbose, - quantize=spec["quantized"]) + quantize=spec["quantized"], + auto_class=spec.get("auto_class"), + config_overrides=spec.get("config_overrides")) if args.pt_dir and args.pt_dir.is_dir(): pt_files = sorted(args.pt_dir.glob("*.pt")) @@ -175,26 +188,32 @@ def main(): print(file=sys.stderr) print("=" * 60, file=sys.stderr) - all_pass = all(results.values()) - for key, passed in results.items(): - status = "PASS" if passed else "FAIL" + for key, status in results.items(): + display = status.upper() if key.startswith("pt:"): - print(f" {key}: {status}", file=sys.stderr) + print(f" {key}: {display}", file=sys.stderr) else: spec = models[key] label = spec["model_id"] if spec["quantized"]: label += " (quantized)" - print(f" {key} ({label}): {status}", file=sys.stderr) + print(f" {key} ({label}): {display}", file=sys.stderr) + + failed = [a for a, s in results.items() if s == "fail"] + skipped = [a for a, s in results.items() if s == "skip"] + passed = [a for a, s in results.items() if s == "pass"] print("=" * 60, file=sys.stderr) - if all_pass: - print("All models PASS - no false positives.", file=sys.stderr) - else: - failed = [a for a, p in results.items() if not p] - print(f"FAILED models: {', '.join(failed)}", file=sys.stderr) + print(f"{len(passed)} passed, {len(failed)} failed, " + f"{len(skipped)} skipped", file=sys.stderr) + + if skipped: + print(f"Skipped (could not load/trace — may need HF_TOKEN " + f"for private models): {', '.join(skipped)}", file=sys.stderr) + if failed: + print(f"FAILED (op validation): {', '.join(failed)}", file=sys.stderr) - sys.exit(0 if all_pass else 1) + sys.exit(0 if not failed else 1) if __name__ == "__main__": From cbc8201685dd992d60cac21a9b5ca3ddf4f4bd50 Mon Sep 17 00:00:00 2001 From: Ed Savage Date: Thu, 26 Mar 2026 12:47:16 +1300 Subject: [PATCH 8/9] [ML] Increase validation step resources for large models MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit facebook/bart-large-mnli is 1.63GB — loading it into memory for tracing after torch and 30 other models exhausted the 16GB limit. Bump to 32GB memory and 30GB ephemeral storage. Made-with: Cursor --- .buildkite/pipelines/validate_pytorch_allowlist.yml.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.buildkite/pipelines/validate_pytorch_allowlist.yml.sh b/.buildkite/pipelines/validate_pytorch_allowlist.yml.sh index 25be46ae7..ef2829976 100755 --- a/.buildkite/pipelines/validate_pytorch_allowlist.yml.sh +++ b/.buildkite/pipelines/validate_pytorch_allowlist.yml.sh @@ -33,8 +33,8 @@ cat <<'EOL' allow_dependency_failure: true agents: image: "python:3.12" - memory: "16G" - ephemeralStorage: "20G" + memory: "32G" + ephemeralStorage: "30G" notify: - github_commit_status: context: "Validate PyTorch allowlist" From 419c3a5a166ee0be64f6d1b84e0b0cb4ee103ccc Mon Sep 17 00:00:00 2001 From: Ed Savage Date: Thu, 26 Mar 2026 12:50:53 +1300 Subject: [PATCH 9/9] [ML] Free model memory after tracing to reduce peak usage When validating 30+ models sequentially, the HF model weights accumulate in memory. Explicitly delete the original model, tokenizer, and inputs after tracing, and gc.collect() after each validation to release memory promptly. This should allow the validation step to complete within 32GB for the full model set including facebook/bart-large-mnli (1.63GB). Made-with: Cursor --- dev-tools/extract_model_ops/torchscript_utils.py | 9 +++++++-- dev-tools/extract_model_ops/validate_allowlist.py | 6 +++++- 2 files changed, 12 insertions(+), 3 deletions(-) diff --git a/dev-tools/extract_model_ops/torchscript_utils.py b/dev-tools/extract_model_ops/torchscript_utils.py index af2b30f68..da8fb481b 100644 --- a/dev-tools/extract_model_ops/torchscript_utils.py +++ b/dev-tools/extract_model_ops/torchscript_utils.py @@ -145,13 +145,18 @@ def load_and_trace_hf_model(model_name: str, quantize: bool = False, attention_mask = inputs["attention_mask"] try: - return torch.jit.trace( + traced = torch.jit.trace( model, (input_ids, attention_mask), strict=False) except Exception as exc: print(f" TRACE WARNING: {exc}", file=sys.stderr) print(" Falling back to torch.jit.script...", file=sys.stderr) try: - return torch.jit.script(model) + traced = torch.jit.script(model) except Exception as exc2: print(f" SCRIPT ERROR: {exc2}", file=sys.stderr) return None + + # Free the original HF model to reduce peak memory when validating + # many models sequentially. + del model, tokenizer, inputs + return traced diff --git a/dev-tools/extract_model_ops/validate_allowlist.py b/dev-tools/extract_model_ops/validate_allowlist.py index 53e6ff72c..d7a1ba99c 100644 --- a/dev-tools/extract_model_ops/validate_allowlist.py +++ b/dev-tools/extract_model_ops/validate_allowlist.py @@ -29,6 +29,7 @@ """ import argparse +import gc import re import sys from pathlib import Path @@ -121,7 +122,10 @@ def validate_model(model_name: str, print(f" SKIPPED (could not load/trace)", file=sys.stderr) return "skip" ops = collect_inlined_ops(traced) - return "pass" if check_ops(ops, allowed, forbidden, verbose) else "fail" + result = "pass" if check_ops(ops, allowed, forbidden, verbose) else "fail" + del traced + gc.collect() + return result def validate_pt_file(name: str,