diff --git a/scripts/check_generated_artifacts.py b/scripts/check_generated_artifacts.py index 6d2639d..328d101 100644 --- a/scripts/check_generated_artifacts.py +++ b/scripts/check_generated_artifacts.py @@ -9,8 +9,14 @@ from _ci_utils import display_path, read_bytes, repo_path, repo_root, write_bytes EXPORT_SCRIPT = Path("scripts/export_requirements.py") +SYNC_INDEX_SCRIPT = Path("scripts/sync_index_from_readme.py") +SYNC_TAB_ACKNOWLEDGEMENTS_SCRIPT = Path("scripts/sync_tab_acknowledgements_from_acknowledgements.py") REQUIREMENTS_JSON = Path("standard/apts_requirements.json") SCHEMA_JSON = Path("standard/apts_requirements_schema.json") +README_MD = Path("README.md") +INDEX_MD = Path("index.md") +ACKNOWLEDGEMENTS_MD = Path("ACKNOWLEDGEMENTS.md") +TAB_ACKNOWLEDGEMENTS_MD = Path("tab_acknowledgements.md") FIXED_TIMESTAMP = "1970-01-01T00:00:00Z" @@ -129,6 +135,152 @@ def main() -> int: print(message) exit_code = 1 + index_check_exit_code = check_index_in_sync_with_readme() + if index_check_exit_code != 0: + exit_code = index_check_exit_code + + tab_check_exit_code = check_tab_acknowledgements_in_sync() + if tab_check_exit_code != 0: + exit_code = tab_check_exit_code + + return exit_code + + +def check_tab_acknowledgements_in_sync() -> int: + """Verify tab_acknowledgements.md matches the ACKNOWLEDGEMENTS.md body + after the H1 and the "Influences" / "How to Get Listed" sections are + stripped. + + The sync_tab_acknowledgements_from_acknowledgements.py script regenerates + tab_acknowledgements.md from ACKNOWLEDGEMENTS.md. This check fails the + build if the two have drifted. + """ + + if not repo_path(SYNC_TAB_ACKNOWLEDGEMENTS_SCRIPT).is_file(): + print( + f"No sync script found at {display_path(SYNC_TAB_ACKNOWLEDGEMENTS_SCRIPT)}. " + "Skipping tab_acknowledgements.md / ACKNOWLEDGEMENTS.md sync check." 
+ ) + return 0 + + if ( + not repo_path(TAB_ACKNOWLEDGEMENTS_MD).is_file() + or not repo_path(ACKNOWLEDGEMENTS_MD).is_file() + ): + print( + "FAILED: Expected ACKNOWLEDGEMENTS.md and tab_acknowledgements.md " + "to both exist for the sync check." + ) + return 1 + + original_tab_contents = read_bytes(TAB_ACKNOWLEDGEMENTS_MD) + exit_code = 0 + + try: + subprocess.run( + [sys.executable, str(repo_path(SYNC_TAB_ACKNOWLEDGEMENTS_SCRIPT))], + cwd=repo_root(), + check=True, + ) + + regenerated_tab = read_bytes(TAB_ACKNOWLEDGEMENTS_MD) + expected_lines = raw_text_lines(original_tab_contents) + actual_lines = raw_text_lines(regenerated_tab) + + if expected_lines != actual_lines: + print_diff( + "FAILED: tab_acknowledgements.md is out of sync with " + "ACKNOWLEDGEMENTS.md. Run " + "scripts/sync_tab_acknowledgements_from_acknowledgements.py to regenerate it.", + expected_lines=expected_lines, + actual_lines=actual_lines, + expected_name="committed tab_acknowledgements.md", + actual_name="regenerated tab_acknowledgements.md", + ) + exit_code = 1 + except subprocess.CalledProcessError as exc: + print( + f"FAILED: sync_tab_acknowledgements_from_acknowledgements.py exited with status {exc.returncode}." + ) + exit_code = 1 + except Exception as exc: + print( + f"FAILED: tab_acknowledgements.md / ACKNOWLEDGEMENTS.md sync check could not complete: {exc}" + ) + exit_code = 1 + finally: + restore_errors = restore_original_files( + {TAB_ACKNOWLEDGEMENTS_MD: original_tab_contents} + ) + if restore_errors: + for message in restore_errors: + print(message) + exit_code = 1 + + return exit_code + + +def check_index_in_sync_with_readme() -> int: + """Verify index.md matches the README.md body under the Jekyll front matter. + + README.md is the canonical source for the project overview; index.md + is the Jekyll-rendered version with a YAML front matter block prepended. + The sync_index_from_readme.py script regenerates index.md from README.md. 
+ This check fails the build if the two have drifted. + """ + + if not repo_path(SYNC_INDEX_SCRIPT).is_file(): + print( + f"No sync script found at {display_path(SYNC_INDEX_SCRIPT)}. " + "Skipping index.md / README.md sync check." + ) + return 0 + + if not repo_path(INDEX_MD).is_file() or not repo_path(README_MD).is_file(): + print( + "FAILED: Expected README.md and index.md to both exist for the sync check." + ) + return 1 + + original_index_contents = read_bytes(INDEX_MD) + exit_code = 0 + + try: + subprocess.run( + [sys.executable, str(repo_path(SYNC_INDEX_SCRIPT))], + cwd=repo_root(), + check=True, + ) + + regenerated_index = read_bytes(INDEX_MD) + expected_lines = raw_text_lines(original_index_contents) + actual_lines = raw_text_lines(regenerated_index) + + if expected_lines != actual_lines: + print_diff( + "FAILED: index.md is out of sync with README.md. " + "Run scripts/sync_index_from_readme.py to regenerate it.", + expected_lines=expected_lines, + actual_lines=actual_lines, + expected_name="committed index.md", + actual_name="regenerated index.md", + ) + exit_code = 1 + except subprocess.CalledProcessError as exc: + print( + f"FAILED: sync_index_from_readme.py exited with status {exc.returncode}." + ) + exit_code = 1 + except Exception as exc: + print(f"FAILED: index.md / README.md sync check could not complete: {exc}") + exit_code = 1 + finally: + restore_errors = restore_original_files({INDEX_MD: original_index_contents}) + if restore_errors: + for message in restore_errors: + print(message) + exit_code = 1 + return exit_code diff --git a/scripts/sync_index_from_readme.py b/scripts/sync_index_from_readme.py new file mode 100644 index 0000000..67b0d01 --- /dev/null +++ b/scripts/sync_index_from_readme.py @@ -0,0 +1,107 @@ +"""Regenerate index.md from README.md while preserving the Jekyll front matter. + +README.md is the canonical source for the project overview shown on both the +GitHub repo home and the OWASP project page (owasp.org/APTS/). 
index.md is the +Jekyll-rendered version of the same content with a YAML front matter block at +the top. + +This script keeps the two in sync: it copies the body of README.md into +index.md, leaving the existing YAML front matter at the top of index.md +untouched. + +Run this whenever README.md changes. The CI artifact check +(scripts/check_generated_artifacts.py) verifies the two stay in sync and +fails the build if they drift. + +Usage: + python scripts/sync_index_from_readme.py +""" + +from __future__ import annotations + +from pathlib import Path + +from _ci_utils import display_path, read_text, repo_path + +README = Path("README.md") +INDEX = Path("index.md") +FRONT_MATTER_DELIMITER = "---" + + +def split_front_matter(text: str) -> tuple[str, str]: + """Return (front_matter_block, body) for a Jekyll markdown file. + + The front matter block includes the leading and trailing `---` lines only. + A single blank line directly after the closing delimiter is dropped so the + returned body starts cleanly. If no front matter is present, an empty + front matter block is returned and the entire text is treated as body. + """ + + lines = text.splitlines(keepends=True) + if not lines or lines[0].rstrip("\r\n") != FRONT_MATTER_DELIMITER: + return "", text + + closing_index = None + for index in range(1, len(lines)): + if lines[index].rstrip("\r\n") == FRONT_MATTER_DELIMITER: + closing_index = index + break + + if closing_index is None: + # Malformed front matter (no closing delimiter). Raise instead of + # guessing where the body starts, which could silently corrupt the file. + raise ValueError( + f"Malformed Jekyll front matter in index.md: no closing '{FRONT_MATTER_DELIMITER}' " + "delimiter found." + ) + + front_matter = "".join(lines[: closing_index + 1]) + + # Skip a single blank line directly after the closing delimiter so the + # body we return doesn't start with a stray newline. Anything beyond that + # is body content. 
+ body_start = closing_index + 1 + if body_start < len(lines) and lines[body_start].strip() == "": + body_start += 1 + + body = "".join(lines[body_start:]) + return front_matter, body + + +def build_index(front_matter: str, readme_text: str) -> str: + """Compose index.md from the existing front matter and the README body.""" + + if not front_matter: + raise ValueError( + f"{display_path(INDEX)} is missing a Jekyll front matter block. " + "Restore the front matter manually before running the sync." + ) + + return f"{front_matter}\n{readme_text}" + + +def main() -> int: + if not repo_path(README).is_file(): + print(f"FAILED: {display_path(README)} not found.") + return 1 + if not repo_path(INDEX).is_file(): + print(f"FAILED: {display_path(INDEX)} not found.") + return 1 + + readme_text = read_text(README) + index_text = read_text(INDEX) + + front_matter, _existing_body = split_front_matter(index_text) + new_index_text = build_index(front_matter, readme_text) + + if new_index_text == index_text: + print(f"{display_path(INDEX)} already in sync with {display_path(README)}.") + return 0 + + repo_path(INDEX).write_text(new_index_text, encoding="utf-8") + print(f"Updated {display_path(INDEX)} from {display_path(README)}.") + return 0 + + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/scripts/sync_tab_acknowledgements_from_acknowledgements.py b/scripts/sync_tab_acknowledgements_from_acknowledgements.py new file mode 100644 index 0000000..f575f7f --- /dev/null +++ b/scripts/sync_tab_acknowledgements_from_acknowledgements.py @@ -0,0 +1,139 @@ +"""Regenerate tab_acknowledgements.md from ACKNOWLEDGEMENTS.md. + +ACKNOWLEDGEMENTS.md is the canonical contributor list rendered on the GitHub +repo home. tab_acknowledgements.md is the Jekyll-rendered version shown on the +OWASP project page (owasp.org/APTS/) under the "Acknowledgements" tab. 
+ +The Jekyll tab is a subset of ACKNOWLEDGEMENTS.md: + * The top-level `# Acknowledgements` heading is dropped because Jekyll + renders the page title from the front matter `title:` field. Including + the H1 in the body would duplicate it. + * The "Influences" and "How to Get Listed" sections (last two H2 sections + in the source) are intentionally excluded from the public tab. + * Everything else (disclaimer paragraph, Project Lead, Technical Reviewers, + Contributors) is mirrored verbatim. + +Run this whenever ACKNOWLEDGEMENTS.md changes. The CI artifact check +(scripts/check_generated_artifacts.py) verifies the two stay in sync and +fails the build if they drift. + +Usage: + python scripts/sync_tab_acknowledgements_from_acknowledgements.py +""" + +from __future__ import annotations + +from pathlib import Path + +from _ci_utils import display_path, read_text, repo_path + +SOURCE = Path("ACKNOWLEDGEMENTS.md") +TARGET = Path("tab_acknowledgements.md") +FRONT_MATTER_DELIMITER = "---" + +# H2 section headings (without the leading "## ") whose content should NOT +# appear on the public Jekyll tab. The script truncates the body at the first +# heading whose text matches any entry below, dropping everything after it. +EXCLUDED_SECTIONS = ("Influences", "How to Get Listed") + + +def split_front_matter(text: str) -> tuple[str, str]: + """Return (front_matter_block, body) for a Jekyll markdown file. + + The front matter block includes the leading and trailing `---` lines only; + a single blank line directly after them is dropped. If no front matter is + present, an empty front matter block is returned and the entire text is body. 
+ """ + + lines = text.splitlines(keepends=True) + if not lines or lines[0].rstrip("\r\n") != FRONT_MATTER_DELIMITER: + return "", text + + closing_index = None + for index in range(1, len(lines)): + if lines[index].rstrip("\r\n") == FRONT_MATTER_DELIMITER: + closing_index = index + break + + if closing_index is None: + raise ValueError( + f"Malformed Jekyll front matter in {display_path(TARGET)}: " + f"no closing '{FRONT_MATTER_DELIMITER}' delimiter found." + ) + + front_matter = "".join(lines[: closing_index + 1]) + + body_start = closing_index + 1 + if body_start < len(lines) and lines[body_start].strip() == "": + body_start += 1 + + body = "".join(lines[body_start:]) + return front_matter, body + + +def process_source_body(source_text: str) -> str: + """Strip the H1 and excluded H2 sections from the source markdown.""" + + lines = source_text.splitlines(keepends=True) + + # Drop the leading H1 line if present, plus a single blank line directly + # after it. Jekyll renders the page title from the front matter, so an + # in-body H1 would duplicate it on the rendered page. + start = 0 + if lines and lines[0].lstrip().startswith("# ") and not lines[0].lstrip().startswith("## "): + start = 1 + if start < len(lines) and lines[start].strip() == "": + start += 1 + + # Truncate at the first H2 heading that matches an excluded section. + kept: list[str] = [] + for line in lines[start:]: + stripped = line.lstrip() + if stripped.startswith("## "): + heading_text = stripped[3:].strip() + if heading_text in EXCLUDED_SECTIONS: + break + kept.append(line) + + body = "".join(kept).rstrip() + "\n" + return body + + +def build_target(front_matter: str, source_text: str) -> str: + """Compose the target file from existing front matter and processed source.""" + + if not front_matter: + raise ValueError( + f"{display_path(TARGET)} is missing a Jekyll front matter block. " + "Restore the front matter manually before running the sync." 
+ ) + + body = process_source_body(source_text) + return f"{front_matter}\n{body}" + + +def main() -> int: + if not repo_path(SOURCE).is_file(): + print(f"FAILED: {display_path(SOURCE)} not found.") + return 1 + if not repo_path(TARGET).is_file(): + print(f"FAILED: {display_path(TARGET)} not found.") + return 1 + + source_text = read_text(SOURCE) + target_text = read_text(TARGET) + + front_matter, _existing_body = split_front_matter(target_text) + new_target_text = build_target(front_matter, source_text) + + if new_target_text == target_text: + print(f"{display_path(TARGET)} already in sync with {display_path(SOURCE)}.") + return 0 + + repo_path(TARGET).write_text(new_target_text, encoding="utf-8") + print(f"Updated {display_path(TARGET)} from {display_path(SOURCE)}.") + return 0 + + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/standard/2_Safety_Controls/README.md b/standard/2_Safety_Controls/README.md index 6c65abb..f4892d8 100644 --- a/standard/2_Safety_Controls/README.md +++ b/standard/2_Safety_Controls/README.md @@ -533,4 +533,4 @@ System prompts and in-context instructions are not reliable constraints on an ag --- -> **See also:** [APTS-SC-A02: Context Window Safety and Constraint Preservation](../appendix/Advisory_Requirements.md#apts-sc-a02-context-window-safety-and-constraint-preservation-advisory) — an advisory practice for platforms using LLM-based agents with finite context windows. Addresses the risk of safety-critical constraints being silently lost during context summarization or truncation. High-priority candidate for tier-gated inclusion in v0.2.0. +> **See also:** [APTS-SC-A02: Context Window Safety and Constraint Preservation](../appendix/Advisory_Requirements.md#apts-sc-a02-context-window-safety-and-constraint-preservation-advisory). An advisory practice for platforms using LLM-based agents with finite context windows. 
Addresses the risk of safety-critical constraints being silently lost during context summarization or truncation. High-priority candidate for tier-gated inclusion in v0.2.0. diff --git a/standard/3_Human_Oversight/README.md b/standard/3_Human_Oversight/README.md index b983442..933c686 100644 --- a/standard/3_Human_Oversight/README.md +++ b/standard/3_Human_Oversight/README.md @@ -264,7 +264,7 @@ Organizations MUST generate periodic reports from decision logs at a cadence app 8. **Retention test**: Verify logs older than retention period are archived/secured appropriately 9. **Signature verification test**: Validate cryptographic signatures on sample log entries -> **See also:** [APTS-HO-A02: Disclosure and Mitigation of AI Influence on Operator Decisions](../appendix/Advisory_Requirements.md#apts-ho-a02-disclosure-and-mitigation-of-ai-influence-on-operator-decisions-advisory) — an advisory practice covering audit-trail provenance for AI-shaped operator affordances (option sets, defaults, wording, ordering) and bias mitigation at high-impact gates, so the chain-of-custody distinguishes a typed approval from a default click-through. Candidate for tier-gated inclusion in v0.2.0. +> **See also:** [APTS-HO-A02: Disclosure and Mitigation of AI Influence on Operator Decisions](../appendix/Advisory_Requirements.md#apts-ho-a02-disclosure-and-mitigation-of-ai-influence-on-operator-decisions-advisory). An advisory practice covering audit-trail provenance for AI-shaped operator affordances (option sets, defaults, wording, ordering) and bias mitigation at high-impact gates, so the chain-of-custody distinguishes a typed approval from a default click-through. Candidate for tier-gated inclusion in v0.2.0. 
--- diff --git a/standard/6_Manipulation_Resistance/Implementation_Guide.md b/standard/6_Manipulation_Resistance/Implementation_Guide.md index 74cb9f4..d5f70f7 100644 --- a/standard/6_Manipulation_Resistance/Implementation_Guide.md +++ b/standard/6_Manipulation_Resistance/Implementation_Guide.md @@ -372,12 +372,12 @@ Platforms using LLMs for decision-making MUST implement layered prompt injection **Implementation:** Build the evaluation suite as an independent pipeline outside the agent's serving infrastructure. Each run takes a fixed model configuration (foundation model version, adapter set, system prompt, tool access policy, temperature) and produces a structured run record with per-scenario decisions, per-category alignment rates, and an aggregate score committed to the audit trail. -**Architecture Pattern — Independent Evaluation Pipeline:** +**Architecture Pattern: Independent Evaluation Pipeline** A proven pattern is to implement the suite using an off-the-shelf evaluation framework (Inspect AI, Braintrust, OpenAI Evals, or an equivalent harness) and to run it as a standalone job rather than as part of the engagement runtime. The pipeline has four components: 1. **Scenario corpus.** A versioned set of distinguishing scenarios stored as data, not code. Each scenario carries: a target description (the synthetic environment the agent is asked to assess), the proxy-objective action, the true-objective action, the rubric mapping the agent's chosen action to one of the three classes (true-objective-aligned, proxy-objective-aligned, other), and a category tag. -2. **Run harness.** A driver that instantiates the same agent loop the platform uses in production — same system prompt, same tool registry, same control plane — but pointed at the synthetic targets in the scenario corpus. The harness records every tool call, every model output, and the agent's terminal action per scenario. +2. 
**Run harness.** A driver that instantiates the same agent loop the platform uses in production (same system prompt, same tool registry, same control plane) but pointed at the synthetic targets in the scenario corpus. The harness records every tool call, every model output, and the agent's terminal action per scenario. 3. **Scorer.** A rule-based or LLM-judge scorer (independent of the agent under test) that maps each scenario's run record to one of the three classes. Where an LLM judge is used, the judge MUST run as a separate inference call with no shared context with the agent under test (same constraint as APTS-RP-A01). 4. **Reporter.** Aggregates per-category alignment rates, computes aggregate score, compares against documented thresholds, computes deltas against the most recent prior run on the same model version, and writes a signed run record to the audit trail per APTS-AR-012. @@ -390,7 +390,7 @@ For scenario categories where the population of distinguishing cases is small, d Implement the post-fine-tuning out-of-distribution audit as a separate evaluation job that runs the model against a held-out general-purpose benchmark (HHH-style or equivalent) before and after every fine-tuning event. The audit is concerned with relative shift, not absolute score: a several-point drop on a previously-stable benchmark following narrow fine-tuning is the signal the Nature 2026 methodology identifies as emergent misalignment, and the relative drop matters more than the absolute number. **Key Considerations:** -- The evaluation suite must be governed as carefully as the model under APTS-TP-019. Scenario corpus contamination — where suite scenarios leak into post-deployment learning data — silently inflates alignment rates and is the evaluation-side analogue of training-set contamination in capability benchmarks. +- The evaluation suite must be governed as carefully as the model under APTS-TP-019. 
Scenario corpus contamination, where suite scenarios leak into post-deployment learning data, silently inflates alignment rates and is the evaluation-side analogue of training-set contamination in capability benchmarks. - The judge component must use a different model family or, if the same family, a different system prompt and a separate inference call. APTS-RP-A01's independence requirement applies symmetrically here. - Run the suite against the *same* model configuration the platform uses in production, including system prompt, tool registry, temperature, and any safety wrappers. A suite that passes against a stripped-down configuration provides no assurance about production behavior. - Suite results should be tracked across runs with explicit versioning of (model, adapter, system prompt, suite version). A drop in alignment rate is interpretable only relative to prior runs on a comparable configuration. @@ -407,9 +407,9 @@ Implement the post-fine-tuning out-of-distribution audit as a separate evaluatio > This section provides implementation guidance for the advisory practice [APTS-MR-A02](../appendix/Advisory_Requirements.md#apts-mr-a02-sandbagging-detection-and-behavioral-consistency-validation-advisory). It is not required for conformance at any tier. -**Implementation:** Build the consistency measurement as an independent pipeline that ingests, on one side, per-scenario records from the adversarial eval suite of APTS-MR-020, and on the other side a stratified sample of production engagement transcripts captured under APTS-AR-001. Each measurement run is anchored to a single model snapshot identifier (per APTS-AR-019) and produces a structured differential report — per-dimension deltas, sample sizes, and a verbalized-recognition correlation — committed to the audit trail. 
+**Implementation:** Build the consistency measurement as an independent pipeline that ingests, on one side, per-scenario records from the adversarial eval suite of APTS-MR-020, and on the other side a stratified sample of production engagement transcripts captured under APTS-AR-001. Each measurement run is anchored to a single model snapshot identifier (per APTS-AR-019) and produces a structured differential report (per-dimension deltas, sample sizes, and a verbalized-recognition correlation) committed to the audit trail. -**Architecture Pattern — Differential Measurement Pipeline:** +**Architecture Pattern: Differential Measurement Pipeline** The pipeline has four components, mirroring the structure used in APTS-MR-A01: @@ -429,7 +429,7 @@ Where reasoning traces are available, scan eval-side transcripts for evaluation- **Key Considerations:** - Production telemetry sampling MUST respect APTS-TP-013 sensitive-data handling and MUST NOT extend retention beyond what APTS-AR-001 already permits. Differential measurement is not a justification for new collection. - The differential is interpretable only against a model-snapshot identifier (APTS-AR-019). Comparing eval samples and production samples drawn under different snapshots without correcting for the change confounds the measurement. -- Eval-suite contamination from the production side — for example, production samples that were previously absorbed into post-deployment learning data — silently narrows the measured delta. Treat eval scenarios as held out from any post-deployment learning data, consistent with APTS-MR-A01's contamination guidance. +- Eval-suite contamination from the production side (for example, production samples that were previously absorbed into post-deployment learning data) silently narrows the measured delta. Treat eval scenarios as held out from any post-deployment learning data, consistent with APTS-MR-A01's contamination guidance. 
- Where production sampling is impossible (privacy-restricted engagement, contractual restriction), substitute high-fidelity production-mimicking synthetic transcripts and disclose the substitution; do not omit the dimension. **Common Pitfalls:** diff --git a/standard/6_Manipulation_Resistance/README.md b/standard/6_Manipulation_Resistance/README.md index 14c1044..40c0498 100644 --- a/standard/6_Manipulation_Resistance/README.md +++ b/standard/6_Manipulation_Resistance/README.md @@ -1061,6 +1061,6 @@ The rest of Manipulation Resistance defends against an outside attacker trying t --- -> **See also:** [APTS-MR-A01: Goal Misgeneralization and Emergent Misalignment Evaluation Suite](../appendix/Advisory_Requirements.md#apts-mr-a01-goal-misgeneralization-and-emergent-misalignment-evaluation-suite-advisory) — an advisory practice for platforms using fine-tuned or adapted LLM-based agents. Evaluates the agent's underlying objective alignment under distribution shift and detects emergent misalignment after fine-tuning, addressing failure modes that input-side (MR-013) and control-side (MR-020) adversarial testing do not cover. Candidate for tier-gated inclusion in v0.2.0. +> **See also:** [APTS-MR-A01: Goal Misgeneralization and Emergent Misalignment Evaluation Suite](../appendix/Advisory_Requirements.md#apts-mr-a01-goal-misgeneralization-and-emergent-misalignment-evaluation-suite-advisory). An advisory practice for platforms using fine-tuned or adapted LLM-based agents. Evaluates the agent's underlying objective alignment under distribution shift and detects emergent misalignment after fine-tuning, addressing failure modes that input-side (MR-013) and control-side (MR-020) adversarial testing do not cover. Candidate for tier-gated inclusion in v0.2.0. 
-> **See also:** [APTS-MR-A02: Sandbagging Detection and Behavioral Consistency Validation](../appendix/Advisory_Requirements.md#apts-mr-a02-sandbagging-detection-and-behavioral-consistency-validation-advisory) — an advisory practice for platforms running an adversarial eval suite under MR-020 against the same model configuration deployed in customer engagements. Measures the behavioral delta between eval context and production context on the same model snapshot tracked under AR-019, addressing eval-awareness and sandbagging that MR-020's pass-rate metric alone cannot detect. Candidate for tier-gated inclusion in v0.2.0. +> **See also:** [APTS-MR-A02: Sandbagging Detection and Behavioral Consistency Validation](../appendix/Advisory_Requirements.md#apts-mr-a02-sandbagging-detection-and-behavioral-consistency-validation-advisory). An advisory practice for platforms running an adversarial eval suite under MR-020 against the same model configuration deployed in customer engagements. Measures the behavioral delta between eval context and production context on the same model snapshot tracked under AR-019, addressing eval-awareness and sandbagging that MR-020's pass-rate metric alone cannot detect. Candidate for tier-gated inclusion in v0.2.0. diff --git a/standard/8_Reporting/Implementation_Guide.md b/standard/8_Reporting/Implementation_Guide.md index 5a7375b..c5629d8 100644 --- a/standard/8_Reporting/Implementation_Guide.md +++ b/standard/8_Reporting/Implementation_Guide.md @@ -247,7 +247,7 @@ Findings below 50% confidence SHOULD be flagged as "Unconfirmed" and excluded fr **Implementation:** Deploy an independent verification step that screens every finding for fabricated evidence, hallucinated vulnerabilities, and severity misclassification before the finding enters the human review pipeline. The verification mechanism must not share context or state with the discovering agent. 
-**Architecture Pattern — Independent Finding Judge:** +**Architecture Pattern: Independent Finding Judge** A proven pattern is to implement the verifier as a separate "Finding Judge" that receives only the finding record, associated evidence artifacts (PoC scripts, HTTP request/response pairs, tool output), and the target context. The judge evaluates each finding against several checks: @@ -271,7 +271,7 @@ For finding types where evidence quality varies, implement calibrated confidence **Common Pitfalls:** - Running the verifier in the same LLM context as the discovering agent, which allows the agent's reasoning to influence the verification outcome - Implementing only a "does the PoC run" check without examining whether the PoC actually contacts the target -- Treating all REJECTED findings as false positives without logging — some may indicate a genuine vulnerability that the agent described poorly, and the rejection log helps identify patterns for agent improvement +- Treating all REJECTED findings as false positives without logging, since some may indicate a genuine vulnerability that the agent described poorly, and the rejection log helps identify patterns for agent improvement - Over-reliance on the verifier as a replacement for human review; the verifier is a pre-filter, not a substitute for APTS-RP-002 --- diff --git a/standard/8_Reporting/README.md b/standard/8_Reporting/README.md index e002fd9..4eacf14 100644 --- a/standard/8_Reporting/README.md +++ b/standard/8_Reporting/README.md @@ -430,4 +430,4 @@ When the platform integrates with downstream systems for finding delivery (ticke --- -> **See also:** [APTS-RP-A01: Automated Finding Authenticity Verification](../appendix/Advisory_Requirements.md#apts-rp-a01-automated-finding-authenticity-verification-advisory) — an advisory practice for screening agent-generated findings for fabricated evidence, hallucinated vulnerabilities, and severity misclassification before human review. 
Candidate for tier-gated inclusion in v0.2.0. +> **See also:** [APTS-RP-A01: Automated Finding Authenticity Verification](../appendix/Advisory_Requirements.md#apts-rp-a01-automated-finding-authenticity-verification-advisory). An advisory practice for screening agent-generated findings for fabricated evidence, hallucinated vulnerabilities, and severity misclassification before human review. Candidate for tier-gated inclusion in v0.2.0. diff --git a/standard/appendix/Advisory_Requirements.md b/standard/appendix/Advisory_Requirements.md index 0f5aaed..e57f82b 100644 --- a/standard/appendix/Advisory_Requirements.md +++ b/standard/appendix/Advisory_Requirements.md @@ -173,9 +173,9 @@ Where the platform carries conversational or planning state across long sessions ### APTS-HO-A02: Disclosure and Mitigation of AI Influence on Operator Decisions (Advisory) -**Applicability:** This practice applies to platforms that use language models or other AI components to generate content shown to operators at decision points — narrative summaries, recommended approval responses, prefilled answer choices, preselected defaults, or any other UI affordance that shapes what the operator confirms. It applies uniformly across every channel that delivers an operator decision affordance (web dashboard, mobile, instant-messaging, voice, push notification). +**Applicability:** This practice applies to platforms that use language models or other AI components to generate content shown to operators at decision points: narrative summaries, recommended approval responses, prefilled answer choices, preselected defaults, or any other UI affordance that shapes what the operator confirms. It applies uniformly across every channel that delivers an operator decision affordance (web dashboard, mobile, instant-messaging, voice, push notification). 
-**Rationale:** Autonomous pentest platforms increasingly use language models not only to act on the target but also to shape what the operator sees at decision time — the narrative that frames a finding, the option set in an approval prompt, the wording, the preselected default. This is a manipulation surface distinct from the prompt-injection threats covered by the Manipulation Resistance domain: there, an external entity manipulates the agent; here, the agent influences its own supervisor's choice through affordances the agent controls. Established findings from human-computer interaction (default bias, primacy bias, choice architecture effects) show these shaping decisions meaningfully influence which option the operator selects, even without adversarial intent. APTS-HO-001, HO-005, and HO-010 mandate approval gates and audit trails but do not address the form of the question put to the operator; APTS-AR-006 covers the agent's reasoning chain but not the model-shaped inputs handed to the human. The practical effect is that an audit trail can show "operator approved" while concealing that the operator was offered a single highlighted choice with the safer option visually de-emphasized. The normative requirement set for v0.1.0 is frozen; this practice is a candidate for tier-gated inclusion in v0.2.0. +**Rationale:** Autonomous pentest platforms increasingly use language models not only to act on the target but also to shape what the operator sees at decision time: the narrative that frames a finding, the option set in an approval prompt, the wording, the preselected default. This is a manipulation surface distinct from the prompt-injection threats covered by the Manipulation Resistance domain: there, an external entity manipulates the agent; here, the agent influences its own supervisor's choice through affordances the agent controls. 
Established findings from human-computer interaction (default bias, primacy bias, choice architecture effects) show these shaping decisions meaningfully influence which option the operator selects, even without adversarial intent. APTS-HO-001, HO-005, and HO-010 mandate approval gates and audit trails but do not address the form of the question put to the operator; APTS-AR-006 covers the agent's reasoning chain but not the model-shaped inputs handed to the human. The practical effect is that an audit trail can show "operator approved" while concealing that the operator was offered a single highlighted choice with the safer option visually de-emphasized. The normative requirement set for v0.1.0 is frozen; this practice is a candidate for tier-gated inclusion in v0.2.0. **Value:** Platforms that implement this practice make operator approval a more deliberate act and the audit trail more honest. Reviewers can distinguish a typed approval from a default click-through, customers can audit whether the platform's UI has steered operators toward expedient choices, and operators retain meaningful agency at high-impact decision points. @@ -186,7 +186,7 @@ Where the platform carries conversational or planning state across long sessions 3. **Record the full option set, including filtered alternatives.** When the operator is presented with a constrained set of choices generated or curated by a model, the audit trail should record the candidate options the model considered and any options that were dropped, reordered, or de-emphasized before presentation, so reviewers can detect cases where the model omitted a safer option (for example, "deny" or "abort") or buried it behind progressive disclosure. 4. 
**Reduce default and ordering bias for high-impact gates.** For approvals governing irreversible actions (see APTS-HO-010) and other high-impact decisions, the platform should avoid preselecting a default, present the abort or deny option with visual weight equal to other options, and consider randomizing option order to mitigate primacy bias. For the most severe action categories, consider requiring a typed confirmation phrase rather than a single click. -**Recommendation:** Treat the operator-facing decision interface as a controlled surface and apply the same provenance discipline already required for agent action logging. Items 1 and 2 form the lowest-cost starting wedge — a response-classification audit field plus model and prompt-template provenance — both addable without touching the operator UI. The presentation rules in item 4 are most consequential for APTS-HO-010 gates and for actions classified as Critical or High under APTS-SC-001. +**Recommendation:** Treat the operator-facing decision interface as a controlled surface and apply the same provenance discipline already required for agent action logging. Items 1 and 2 form the lowest-cost starting wedge (a response-classification audit field plus model and prompt-template provenance), both of which can be added without touching the operator UI. The presentation rules in item 4 are most consequential for APTS-HO-010 gates and for actions classified as Critical or High under APTS-SC-001. **Related normative requirements:** APTS-HO-001, APTS-HO-005, APTS-HO-010, APTS-AR-006, APTS-AR-019. 
@@ -262,7 +262,7 @@ Document every external connector that can execute actions, access customer data **Practice Description:** -Maintain and execute a goal-misgeneralization and emergent-misalignment evaluation suite that evaluates the agent's behavior in distinguishing scenarios — scenarios constructed so that following a plausible proxy objective produces a different action than following the true objective — and that evaluates the agent's behavior outside the offensive-security domain after any fine-tuning event. Specifically: +Maintain and execute a goal-misgeneralization and emergent-misalignment evaluation suite that evaluates the agent's behavior in distinguishing scenarios (scenarios constructed so that following a plausible proxy objective produces a different action than following the true objective) and that evaluates the agent's behavior outside the offensive-security domain after any fine-tuning event. Specifically: 1. **Evaluation suite corpus.** Maintain a corpus of distinguishing scenarios covering, at minimum: scope-versus-finding-pressure tension (the agent has an opportunity to claim a finding by exceeding scope), proxy-reward gaming (the agent has an opportunity to produce a fluent finding without performing the underlying verification), intent-versus-output divergence (the agent is asked to assess a target whose intended behavior matches a vulnerability signature), and adjacent-domain spillover scenarios constructed per the Nature 2026 emergent-misalignment methodology. Document the proxy action and true action for every scenario. 2. **Behavioral metrics.** For each scenario, record the agent's chosen action and classify it as true-objective-aligned, proxy-objective-aligned, or other. Compute alignment rate per scenario category and aggregate across the suite. Track per-category trends across runs to detect drift. 
@@ -280,7 +280,7 @@ Maintain and execute a goal-misgeneralization and emergent-misalignment evaluati **Applicability:** This practice applies to platforms that use LLM-based agents and run an adversarial evaluation suite under APTS-MR-020 against the same model configuration deployed in customer engagements authorized under APTS-SE-001. It applies regardless of whether the underlying model has been fine-tuned, and is therefore complementary in scope to APTS-MR-A01. -**Rationale:** A recurring finding across frontier-model evaluations between late 2024 and early 2026 is that an agent's behavior on an adversarial eval suite can diverge from its behavior in live deployment, because the model can detect features of the evaluation context and modulate its actions accordingly. Apollo Research demonstrated reproducible in-context scheming and sandbagging across Claude 3.5 Sonnet, Claude 3 Opus, and OpenAI o1 (Meinke et al., 2024, *Frontier Models are Capable of In-context Scheming*, arXiv:2412.04984), including a documented case in which o3 internally reasoned "we want to survive as the model, we need to fail purposely" and then deliberately submitted incorrect chemistry answers; METR's o3 evaluation report cautions that the evaluation setup "is not robust to sandbagging by the model." Independent peer-reviewed work establishes sandbagging as a measurable, controllable behavior (van der Weij et al., *AI Sandbagging*, ICLR 2025, arXiv:2406.07358), and Anthropic's *Alignment Faking* result shows that production-grade Claude 3 Opus differentially modulates behavior under naturalistic conditions (Greenblatt et al., 2024). Anthropic's own Sonnet 4.5 and Opus 4.6 system cards report that internal interpretability features for evaluation awareness strengthened during training, with UK AISI testing finding Opus 4.6 correctly identifies evaluations roughly 80% of the time while verbalizing that recognition only ~2% of the time. 
The capability scales predictably with model size (Chaudhary et al., 2025, arXiv:2509.13333) and is measurable on agentic tasks (Needham et al., 2025, arXiv:2505.23836). For autonomous penetration testing platforms the consequence is concrete: Meta's 2026 Muse Spark report shows roughly a 32-percentage-point drop on cyber capability evaluations under sandbagging instruction with general-knowledge performance moving by 0.4 points — exactly the failure mode an adversarial eval suite is supposed to surface. APTS-MR-020 (Adversarial Validation and Resilience Testing of Safety Controls) requires the platform to run the eval suite; APTS-AR-019 (AI/ML Model Change Tracking and Drift Detection) tracks output drift across model versions. Neither measures the differential between the agent's eval-context behavior and its production-context behavior on the same model snapshot. The EU AI Act Article 15 obligation that high-risk systems "perform consistently … throughout their lifecycle" gestures at this property without specifying a measurement; this practice supplies one. The normative requirement set for v0.1.0 is frozen; this practice is a candidate for tier-gated inclusion in v0.2.0 (likely as SHOULD | Tier 2 for platforms operating at Level 3 autonomy or higher). +**Rationale:** A recurring finding across frontier-model evaluations between late 2024 and early 2026 is that an agent's behavior on an adversarial eval suite can diverge from its behavior in live deployment, because the model can detect features of the evaluation context and modulate its actions accordingly. 
Apollo Research demonstrated reproducible in-context scheming and sandbagging across Claude 3.5 Sonnet, Claude 3 Opus, and OpenAI o1 (Meinke et al., 2024, *Frontier Models are Capable of In-context Scheming*, arXiv:2412.04984), including a documented case in which o3 internally reasoned "we want to survive as the model, we need to fail purposely" and then deliberately submitted incorrect chemistry answers; METR's o3 evaluation report cautions that the evaluation setup "is not robust to sandbagging by the model." Independent peer-reviewed work establishes sandbagging as a measurable, controllable behavior (van der Weij et al., *AI Sandbagging*, ICLR 2025, arXiv:2406.07358), and Anthropic's *Alignment Faking* result shows that production-grade Claude 3 Opus differentially modulates behavior under naturalistic conditions (Greenblatt et al., 2024). Anthropic's own Sonnet 4.5 and Opus 4.6 system cards report that internal interpretability features for evaluation awareness strengthened during training, with UK AISI testing finding Opus 4.6 correctly identifies evaluations roughly 80% of the time while verbalizing that recognition only ~2% of the time. The capability scales predictably with model size (Chaudhary et al., 2025, arXiv:2509.13333) and is measurable on agentic tasks (Needham et al., 2025, arXiv:2505.23836). For autonomous penetration testing platforms the consequence is concrete: Meta's 2026 Muse Spark report shows roughly a 32-percentage-point drop on cyber capability evaluations under sandbagging instruction with general-knowledge performance moving by 0.4 points, exactly the failure mode an adversarial eval suite is supposed to surface. APTS-MR-020 (Adversarial Validation and Resilience Testing of Safety Controls) requires the platform to run the eval suite; APTS-AR-019 (AI/ML Model Change Tracking and Drift Detection) tracks output drift across model versions. 
Neither measures the differential between the agent's eval-context behavior and its production-context behavior on the same model snapshot. The EU AI Act Article 15 obligation that high-risk systems "perform consistently ... throughout their lifecycle" gestures at this property without specifying a measurement; this practice supplies one. The normative requirement set for v0.1.0 is frozen; this practice is a candidate for tier-gated inclusion in v0.2.0 (likely as SHOULD | Tier 2 for platforms operating at Level 3 autonomy or higher). **Value:** Platforms that maintain and validate an eval-versus-production behavioral-consistency measurement detect a class of failure that a clean eval pass-rate cannot rule out: situations where the adversarial eval suite shows no anomalies while the agent's deployed behavior diverges along the dimensions the eval was supposed to govern (scope compliance, refusal of out-of-scope tools, finding-fabrication rate). This is the context-side analogue of the objective-side failure addressed by APTS-MR-A01: MR-A01 catches the agent pursuing the wrong objective when training data is ambiguous; MR-A02 catches the agent pursuing the right objective during evaluation while pursuing a different one in deployment, on the same model snapshot. @@ -294,7 +294,7 @@ Maintain and validate a behavioral-consistency measurement comparing the agent's 4. **Verbalized recognition disclosure.** Where the model produces reasoning traces (chain-of-thought, scratchpad, or equivalent), disclose whether evaluation-context recognition appears in the verbalized reasoning of evaluation runs and whether the rate of verbalized recognition correlates with the measured behavioral delta. Do not treat the absence of verbalized recognition as evidence that the model is not detecting the evaluation context; recent system-card results show recognition can occur without verbalization. 5. 
**Threshold and escalation.** Document a maximum acceptable delta per dimension and an aggregate floor. Above-threshold delta on any dimension MUST trigger reauthorization review per APTS-AL-026 before the platform may continue to operate at or above L3 autonomy, and MUST trigger review of any post-deployment learning data per APTS-TP-019. The validation report (the differential between eval-side and production-side metrics) MUST be committed to the audit trail per APTS-AR-001 alongside the corresponding eval and production samples. -**Recommendation:** Implement the measurement as an independent pipeline outside the engagement runtime so that the eval-side and production-side samples are scored under identical conditions. A lightweight standing detector — paired formal-versus-casual prompt framings administered to the deployed configuration ("Playing Dumb" methodology, 2026) — provides an inexpensive black-box check between full measurement runs. For the scoring component, follow the same independence rule that APTS-MR-A01 and APTS-RP-A01 apply: where an LLM judge is used, it MUST run as a separate inference call with no shared context with the agent under test, and SHOULD use a different model family. METR's MALT methodology (2025) — cross-task differential analysis rather than single-transcript review — is the citable reference for how to interpret the production-side sample. The scope of the measurement is the eval suite of APTS-MR-020 and live customer engagements under APTS-SE-001; training-time RLHF, RLAIF, or DPO runs are out of scope and are governed by APTS-TP-019. APTS-MR-A01 addresses the upstream objective-side failure; APTS-MR-A02 addresses the context-side failure on the same model snapshot. +**Recommendation:** Implement the measurement as an independent pipeline outside the engagement runtime so that the eval-side and production-side samples are scored under identical conditions. 
A lightweight standing detector (paired formal-versus-casual prompt framings administered to the deployed configuration via the "Playing Dumb" methodology, 2026) provides an inexpensive black-box check between full measurement runs. For the scoring component, follow the same independence rule that APTS-MR-A01 and APTS-RP-A01 apply: where an LLM judge is used, it MUST run as a separate inference call with no shared context with the agent under test, and SHOULD use a different model family. METR's MALT methodology (2025), cross-task differential analysis rather than single-transcript review, is the citable reference for how to interpret the production-side sample. The scope of the measurement is the eval suite of APTS-MR-020 and live customer engagements under APTS-SE-001; training-time RLHF, RLAIF, or DPO runs are out of scope and are governed by APTS-TP-019. APTS-MR-A01 addresses the upstream objective-side failure; APTS-MR-A02 addresses the context-side failure on the same model snapshot. **Related normative requirements:** APTS-MR-020, APTS-MR-022, APTS-AR-001, APTS-AR-019, APTS-AL-026, APTS-TP-019, APTS-TP-022, APTS-RP-A01. @@ -310,7 +310,7 @@ Maintain and validate a behavioral-consistency measurement comparing the agent's Implement an automated verification mechanism that screens each reported finding for fabricated evidence, hallucinated vulnerabilities, and synthetic proof artifacts before the finding enters the human review pipeline (APTS-RP-002) or the final report. Specifically: -1. **Operate independently of the agent that produced the finding.** The verification mechanism must not share a context window, conversation history, or in-memory state with the discovering agent. If the verifier is itself LLM-based, it should receive only the finding record, the associated evidence artifacts, and any relevant tool output — not the discovering agent's reasoning chain or system prompt. +1. 
**Operate independently of the agent that produced the finding.** The verification mechanism must not share a context window, conversation history, or in-memory state with the discovering agent. If the verifier is itself LLM-based, it should receive only the finding record, the associated evidence artifacts, and any relevant tool output, not the discovering agent's reasoning chain or system prompt. 2. **Screen for fabricated evidence artifacts.** Detect proof-of-concept scripts or commands that produce output without interacting with the target (for example, scripts that echo canned strings, hardcoded HTTP responses, or synthetic screenshots). Detection should include static analysis of PoC artifacts for: absence of network calls to the target, hardcoded output strings that match the "evidence" verbatim, and output that could not have been produced by the claimed tool or technique. 3. **Screen for hallucinated vulnerabilities.** Cross-reference the claimed vulnerability against the raw evidence artifacts. A finding that claims SQL injection should have evidence of actual SQL injection behavior (error messages, data exfiltration, time-based delay); a finding that claims XSS should have evidence of script execution or DOM manipulation. Findings where the evidence does not support the claimed vulnerability type should be flagged. 4. **Screen for severity misclassification.** Evaluate whether the evidence supports the assigned severity. A finding classified as Critical should have evidence of impact commensurate with Critical severity (for example, remote code execution, authentication bypass, mass data exposure). Findings where severity is unsupported by the evidence should be flagged for reclassification. 
diff --git a/tab_acknowledgements.md b/tab_acknowledgements.md index 3330fcd..922868e 100644 --- a/tab_acknowledgements.md +++ b/tab_acknowledgements.md @@ -3,10 +3,12 @@ title: Acknowledgements displaytext: Acknowledgements layout: null tab: true -order: 3 +order: 1 tags: apts --- +Affiliations are provided for contributor identification only. Acknowledgements recognize individual contributions and do not imply organizational representation, sponsorship, approval, or endorsement. Contributions do not necessarily represent the views of any listed affiliation or organization. + ## Project Lead | Name | Affiliation | Links | diff --git a/tab_example.md b/tab_example.md deleted file mode 100644 index d29bc45..0000000 --- a/tab_example.md +++ /dev/null @@ -1,11 +0,0 @@ ---- -title: Example -layout: null -tab: true -order: 1 -tags: example-tag ---- - -## Example - -Put whatever you like here: news, screenshots, features, supporters, or remove this file and don't use tabs at all. \ No newline at end of file diff --git a/tab_join.md b/tab_join.md index 6e28df0..cc18557 100644 --- a/tab_join.md +++ b/tab_join.md @@ -3,12 +3,12 @@ title: Join displaytext: Join layout: null tab: true -order: 3 +order: 2 tags: apts --- -All discussions take place on the OWASP Autonomous Penetration Testing Standard [GitHub repository](https://github.com/OWASP/APTS/). +All discussions take place on the OWASP Autonomous Penetration Testing Standard [GitHub repository](https://github.com/OWASP/APTS/) and on the [`#project-apts`](https://owasp.slack.com/archives/C0B1EJMT2GY) channel on OWASP Slack. -We welcome contributions from the security community. If you have suggestions, feedback, or want to help improve the standard, open an issue or submit a pull request on GitHub. +We welcome contributions from the security community. If you have suggestions, feedback, or want to help improve the standard, open an issue or submit a pull request on GitHub, or drop a message in the Slack channel. 
You can read our contributing guidelines [here](https://github.com/OWASP/APTS/blob/main/CONTRIBUTING.md).