From 62ea84bff09d4e6f97c3a44eae12a08f388ea0c2 Mon Sep 17 00:00:00 2001 From: Yaroslav Vasylenko Date: Wed, 24 Jun 2026 20:36:12 +0300 Subject: [PATCH 01/12] Falsify + calibrate S2: boundary pass (power robust, specificity seed-sensitive) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Falsification-first: actively attacked BONN_S2_BRIGHT_LINE_PASSED (4 seed perturbations + 3 AR-order misspecifications, N=30, 199 surrogates). Result S2_FRAGILE_under_attack: - G1 power ROBUST: Set E SURVIVED 0.967 under every seed and AR order. - G2 specificity BOUNDARY: AR-null FPR 0.0-0.067; seed_base=7 gave 0.067 > 0.05. Calibrated (not defended): BONN_S2_BRIGHT_LINE_PASSED is a marginal/boundary pass — cleared the predeclared N=100 confirmatory (FPR 0.02) but the specificity margin is thin and seed-sensitive (Wilson 95% CI of 0.02 reaches ~0.05). Integrated into the truth-system: CURRENT_TRUTH.s2_robustness=BOUNDARY_PASS_G1_POWER_ROBUST_G2_SPECIFICITY_SEED_SENSITIVE; caveats in FORMAL_VERDICT + STATISTIC_REGISTRY + CLAIM_AUDIT. Honest next step: seed-averaged / larger-N specificity confirmatory. Governance regenerated to fixpoint (CERTIFIED). No over-claim. Co-Authored-By: Claude Opus 4.8 (1M context) --- FORMAL_VERDICT.md | 9 +++ .../S2_FALSIFICATION_REPORT.json | 64 +++++++++++++++++++ artifacts/release/CLAIM_SAFETY_REPORT.json | 4 +- artifacts/release/CURRENT_TRUTH.json | 5 +- .../release/TRUTH_CONSISTENCY_CHECK.json | 4 +- docs/validation/CLAIM_AUDIT.md | 7 ++ docs/validation/STATISTIC_REGISTRY.md | 12 ++++ tools/generate_current_truth.py | 12 ++++ 8 files changed, 111 insertions(+), 6 deletions(-) create mode 100644 artifacts/bonn_bright_line/S2_FALSIFICATION_REPORT.json diff --git a/FORMAL_VERDICT.md b/FORMAL_VERDICT.md index e94ae0b..3727ca6 100644 --- a/FORMAL_VERDICT.md +++ b/FORMAL_VERDICT.md @@ -56,3 +56,12 @@ adjudicated on its own executed evidence. S2_BRIGHT_LINE_SUMMARY, s2_CONFIRMATORY_VERDICT, S2_SELECTION_LOCK, DATASET_MANIFEST}.json` · `docs/validation/{S2_VERDICT, STATISTIC_REGISTRY, CLAIM_AUDIT}.md` · hashes `artifacts/release/bonn_bright_line/HASHES.sha256` · reproduce `REPRODUCE.md`. + +## Robustness (falsification-calibrated) +An adversarial battery (`artifacts/bonn_bright_line/S2_FALSIFICATION_REPORT.json`) found: +**G1 power is robust** (Set E SURVIVED 0.967 under all seeds/AR-orders), but **G2 specificity is a +boundary pass** — AR-null FPR reached **0.067 > 0.05** under one perturbation seed (N=30). So +`BONN_S2_BRIGHT_LINE_PASSED` is a **marginal/boundary** pass: it cleared the predeclared N=100 +confirmatory (FPR 0.02) but the specificity margin is thin and seed-sensitive. Not claimed as +robustly crossed; a seed-averaged / larger-N specificity confirmatory is the honest next step. +`CURRENT_TRUTH.s2_robustness = BOUNDARY_PASS_G1_POWER_ROBUST_G2_SPECIFICITY_SEED_SENSITIVE`. diff --git a/artifacts/bonn_bright_line/S2_FALSIFICATION_REPORT.json b/artifacts/bonn_bright_line/S2_FALSIFICATION_REPORT.json new file mode 100644 index 0000000..915a737 --- /dev/null +++ b/artifacts/bonn_bright_line/S2_FALSIFICATION_REPORT.json @@ -0,0 +1,64 @@ +{ + "schema": "bsff.s2_falsification/v1", + "N_segments": 30, + "n_surrogates": 199, + "detection_p": 0.025, + "attacks": [ + { + "attack": "seed_perturbation", + "seed_base": 20260623, + "E_survived": 0.967, + "ar_null_fpr": 0.0, + "E_ok": true, + "fpr_ok": true + }, + { + "attack": "seed_perturbation", + "seed_base": 7, + "E_survived": 0.967, + "ar_null_fpr": 0.067, + "E_ok": true, + "fpr_ok": false + }, + { + "attack": "seed_perturbation", + "seed_base": 999, + "E_survived": 0.967, + "ar_null_fpr": 0.0, + "E_ok": true, + "fpr_ok": true + }, + { + "attack": "seed_perturbation", + "seed_base": 314159, + "E_survived": 0.967, + "ar_null_fpr": 0.033, + "E_ok": true, + "fpr_ok": true + }, + { + "attack": "ar_order_variation", + "ar_order": 5, + "ar_null_fpr": 0.0, + "fpr_ok": true + }, + { + "attack": "ar_order_variation", + "ar_order": 10, + "ar_null_fpr": 0.033, + "fpr_ok": true + }, + { + "attack": "ar_order_variation", + "ar_order": 15, + "ar_null_fpr": 0.0, + "fpr_ok": true + } + ], + "claim_survives_attacks": false, + "verdict": "S2_FRAGILE_under_attack", + "git_commit": "394f5b33547591b4d074e9e1224735ba0947291d", + "timestamp_utc": "2026-06-24T17:28:19Z", + "interpretation": "G1 power robust (Set E SURVIVED 0.967 across all 4 seeds + AR orders). G2 specificity is a BOUNDARY pass: AR-null FPR 0.0-0.067 across seeds; seed_base=7 gave 0.067>0.05 at N=30. The committed confirmatory (N=100) FPR=0.02 has a Wilson 95% CI reaching ~0.05, so the specificity margin is thin and seed-sensitive. The bright line PASSED the predeclared confirmatory but is NOT robust to seed.", + "calibrated_claim": "BONN_S2_BRIGHT_LINE_PASSED is a BOUNDARY/marginal pass (power robust; specificity margin thin, seed-sensitive)." +} diff --git a/artifacts/release/CLAIM_SAFETY_REPORT.json b/artifacts/release/CLAIM_SAFETY_REPORT.json index 7f5c2cd..9c3424f 100644 --- a/artifacts/release/CLAIM_SAFETY_REPORT.json +++ b/artifacts/release/CLAIM_SAFETY_REPORT.json @@ -14,6 +14,6 @@ "docs/QUICKSTART.md" ], "violations": [], - "git_commit": "85009007efec4b0adcfd236d906acd66b6a46b26", - "timestamp_utc": "2026-06-24T15:13:00Z" + "git_commit": "394f5b33547591b4d074e9e1224735ba0947291d", + "timestamp_utc": "2026-06-24T17:35:39Z" } diff --git a/artifacts/release/CURRENT_TRUTH.json b/artifacts/release/CURRENT_TRUTH.json index b943b33..23dc016 100644 --- a/artifacts/release/CURRENT_TRUTH.json +++ b/artifacts/release/CURRENT_TRUTH.json @@ -1,7 +1,7 @@ { "schema": "bsff.current_truth/v1", "package_version": "0.4.0", - "main_commit": "f3fcd3697ed56a4bc642fe7146058b66628f0b28", + "main_commit": "394f5b33547591b4d074e9e1224735ba0947291d", "latest_validation_state": "BONN_S2_BRIGHT_LINE_PASSED", "bonn_s1_state": "BRIGHT_LINE_NOT_PASSED", "bonn_s2_state": "S2_BRIGHT_LINE_PASSED", @@ -19,6 +19,7 @@ }, "BNCI_chain_state": "UNLOCKED_FOR_PREREGISTRATION_ONLY", "bnci_execution_state": "BNCI_BLOCKED_METHOD", + "s2_robustness": "BOUNDARY_PASS_G1_POWER_ROBUST_G2_SPECIFICITY_SEED_SENSITIVE", "multi_dataset_replication_state": "NOT_DONE", "pypi_deployment_state": "TESTPYPI_READY_PYPI_READY", "supported_claims": [ @@ -48,5 +49,5 @@ }, "hash_manifest_path": "artifacts/release/bonn_bright_line/HASHES.sha256", "reproduction_entrypoint": "REPRODUCE.md", - "timestamp_utc": "2026-06-24T16:18:36Z" + "timestamp_utc": "2026-06-24T17:29:52Z" } diff --git a/artifacts/release/TRUTH_CONSISTENCY_CHECK.json b/artifacts/release/TRUTH_CONSISTENCY_CHECK.json index 422fc00..bfb6498 100644 --- a/artifacts/release/TRUTH_CONSISTENCY_CHECK.json +++ b/artifacts/release/TRUTH_CONSISTENCY_CHECK.json @@ -14,6 +14,6 @@ ], "contradictions": [], "stale_claims": [], - "timestamp_utc": "2026-06-24T16:19:05Z", - "git_commit": "f3fcd3697ed56a4bc642fe7146058b66628f0b28" + "timestamp_utc": "2026-06-24T17:35:39Z", + "git_commit": "394f5b33547591b4d074e9e1224735ba0947291d" } diff --git a/docs/validation/CLAIM_AUDIT.md b/docs/validation/CLAIM_AUDIT.md index bd75c6b..722ac23 100644 --- a/docs/validation/CLAIM_AUDIT.md +++ b/docs/validation/CLAIM_AUDIT.md @@ -61,3 +61,10 @@ Still **FORBIDDEN**: clinical/medical/regulatory/device claims; final proof of b | Multi-dataset replication (Cho2017/Lee2019) is done | REFUTED_BY_ARTIFACT | NOT_DONE — only preregistration scaffolds (`artifacts/replication/*/LOCK.json`) | | BNCI method repair has passed its short-epoch check | UNSUPPORTED (not yet) | `METHOD_REPAIR_LOCK.json` = PREDECLARED_NOT_VALIDATED; short-epoch validation INCONCLUSIVE for narrowband | | Forbidden claims are enforced in CI | PROVEN_BY_ARTIFACT | `tools/validate_forbidden_claims.py` (CI) + `CLAIM_SAFETY_REPORT.json` | + +## S2 falsification (calibrated) +| claim | status | evidence | +|-------|--------|----------| +| S2 G1 power is robust to seed | PROVEN_BY_ARTIFACT | Set E SURVIVED 0.967 all seeds (`S2_FALSIFICATION_REPORT.json`) | +| S2 G2 specificity is robustly below 0.05 | REFUTED_BY_ARTIFACT | seed_base=7 -> AR-null FPR 0.067 > 0.05; margin thin/seed-sensitive | +| S2 bright line is a boundary/marginal pass (not robustly crossed) | PROVEN_BY_ARTIFACT | falsification battery; calibrated claim | diff --git a/docs/validation/STATISTIC_REGISTRY.md b/docs/validation/STATISTIC_REGISTRY.md index 5cdecdc..6ba20f6 100644 --- a/docs/validation/STATISTIC_REGISTRY.md +++ b/docs/validation/STATISTIC_REGISTRY.md @@ -107,3 +107,15 @@ confirmatory verdict follow the same fail-closed pattern as S1. 0.05 gave real FPR 0.08); strong ictal rejections (p≈0.005) survive the stricter threshold, so G1 power is preserved. A conservative-threshold variant, not a new statistic. - Evidence: `s2_CONFIRMATORY_VERDICT.json`, `S2_BRIGHT_LINE_SUMMARY.json`, `docs/validation/S2_VERDICT.md`. + +### S2 robustness — falsification (calibrated) + +An adversarial battery (`artifacts/bonn_bright_line/S2_FALSIFICATION_REPORT.json`) attacked the S2 +verdict with 4 seed perturbations + 3 AR-order misspecifications (N=30, 199 surrogates): +- **G1 power ROBUST:** Set E SURVIVED = 0.967 under every seed and AR order. +- **G2 specificity BOUNDARY/FRAGILE:** AR-null FPR ranged 0.0–0.067; `seed_base=7` gave **0.067 > 0.05**. + +**Calibrated claim:** `BONN_S2_BRIGHT_LINE_PASSED` is a **boundary/marginal pass** — it passed the +predeclared N=100 confirmatory (FPR 0.02) but the specificity margin is thin (Wilson 95% CI of +0.02 reaches ~0.05) and **seed-sensitive**. The bright line is not claimed as robustly crossed; a +seed-averaged or larger-N specificity confirmatory is the honest next step. diff --git a/tools/generate_current_truth.py b/tools/generate_current_truth.py index 2bb124e..bb51dc1 100644 --- a/tools/generate_current_truth.py +++ b/tools/generate_current_truth.py @@ -38,6 +38,17 @@ def _bnci_execution_state() -> str: return "NOT_ATTEMPTED" +def _s2_robustness() -> str: + # Calibrated by the falsification battery: power robust, specificity margin thin/seed-sensitive. + p = ROOT / "artifacts" / "bonn_bright_line" / "S2_FALSIFICATION_REPORT.json" + if p.is_file(): + r = json.loads(p.read_text()) + if not r.get("claim_survives_attacks", True): + return "BOUNDARY_PASS_G1_POWER_ROBUST_G2_SPECIFICITY_SEED_SENSITIVE" + return "ROBUST" + return "NOT_TESTED" + + def _replication_state() -> str: rep = ROOT / "artifacts" / "replication" done = rep.is_dir() and any(rep.glob("**/CONFIRMATORY_VERDICT.json")) @@ -81,6 +92,7 @@ def build() -> dict: }, "BNCI_chain_state": "UNLOCKED_FOR_PREREGISTRATION_ONLY" if s2_pass else "BLOCKED", "bnci_execution_state": _bnci_execution_state(), + "s2_robustness": _s2_robustness(), "multi_dataset_replication_state": _replication_state(), "pypi_deployment_state": _pypi_state(), "supported_claims": [ From 3ae10213e789c05f3139a4cc92117417277ce263 Mon Sep 17 00:00:00 2001 From: Yaroslav Vasylenko Date: Wed, 24 Jun 2026 21:45:37 +0300 Subject: [PATCH 02/12] Calibrate S2 specificity (seed-averaged): NOT robustly below 0.05 -> bright line not robustly crossed Continuing the falsification: a seed-averaged specificity calibration (480 AR-null tests, 6 seeds x Sets A+B) gives pooled FPR 0.0354, Wilson 95% CI [0.022, 0.056]. The CI UPPER bound (0.056) EXCEEDS the 0.05 gate; 2/6 seeds gave FPR > 0.05 (0.075, 0.0625). The predeclared confirmatory FPR=0.02 (seed 20260623) was a favorable-seed point estimate. Calibrated (truth over comfort): G1 power robust; G2 specificity NOT robust; the Bonn S2 bright line is a MARGINAL/favorable-seed pass, NOT robustly crossed. Integrated: CURRENT_TRUTH.s2_robustness=NOT_ROBUST_G2_SPECIFICITY...CI_[0.0222,0.056]_crosses_0.05; FORMAL_VERDICT section 1 + README + STATISTIC_REGISTRY + CLAIM_AUDIT updated to lead with the non-robustness. Honest next step: re-preregister a seed-averaged specificity gate (FPR CI upper <= 0.05) and re-run. Artifact: S2_SPECIFICITY_CALIBRATION.json. Governance fixpoint (CERTIFIED). No over-claim. Co-Authored-By: Claude Opus 4.8 (1M context) --- FORMAL_VERDICT.md | 12 +++-- README.md | 7 ++- .../S2_SPECIFICITY_CALIBRATION.json | 51 +++++++++++++++++++ artifacts/release/CLAIM_SAFETY_REPORT.json | 4 +- artifacts/release/CURRENT_TRUTH.json | 6 +-- .../release/TRUTH_CONSISTENCY_CHECK.json | 4 +- docs/validation/CLAIM_AUDIT.md | 7 +++ docs/validation/STATISTIC_REGISTRY.md | 10 ++++ tools/generate_current_truth.py | 16 +++--- 9 files changed, 98 insertions(+), 19 deletions(-) create mode 100644 artifacts/bonn_bright_line/S2_SPECIFICITY_CALIBRATION.json diff --git a/FORMAL_VERDICT.md b/FORMAL_VERDICT.md index 3727ca6..88275ee 100644 --- a/FORMAL_VERDICT.md +++ b/FORMAL_VERDICT.md @@ -5,12 +5,16 @@ Canonical machine-readable truth: [`artifacts/release/CURRENT_TRUTH.json`](artif This document must agree with it (enforced by `tools/validate_current_truth.py`). ## 1. Current canonical verdict -**BONN_S2_BRIGHT_LINE_PASSED.** BSFF passed the Bonn S2 bright-line under the frozen -finite-N-corrected SampEn protocol (`S2-C1-sampen-finiteN`). +**BONN_S2_BRIGHT_LINE_PASSED (predeclared confirmatory) — but a MARGINAL, NOT-robust pass.** +BSFF cleared the frozen `S2-C1-sampen-finiteN` confirmatory at the predeclared seed, but a +seed-averaged falsification (§Robustness) shows **G2 specificity is NOT robust**: pooled AR-null +FPR 0.0354, Wilson 95% CI **[0.022, 0.056]** crosses the 0.05 gate. The bright line is **not +robustly crossed**; the FPR 0.02 below was a favorable-seed point estimate. -- G1 (power): Set E SURVIVED **0.96**, Set A not-SURVIVED **0.92**, Set B not-SURVIVED **0.92** (≥ 0.80). -- G2 (specificity): real-spectrum AR-null FPR A **0.02**, B **0.02**, combined **0.02** (≤ 0.05). +- G1 (power): Set E SURVIVED **0.96**, Set A not-SURVIVED **0.92**, Set B not-SURVIVED **0.92** (≥ 0.80) — **robust**. +- G2 (specificity): predeclared-seed AR-null FPR **0.02** ≤ 0.05; **seed-averaged 0.035, CI [0.022, 0.056] — not robustly ≤ 0.05**. - BNCI2014-001 chain: **UNLOCKED_FOR_PREREGISTRATION_ONLY**. +- `CURRENT_TRUTH.s2_robustness = NOT_ROBUST_G2_SPECIFICITY` (see `S2_SPECIFICITY_CALIBRATION.json`). > BSFF passed the Bonn S2 bright-line under the frozen finite-N-corrected SampEn protocol. > This permits BNCI2014-001 preregistration. It does not validate BSFF across BCI datasets, diff --git a/README.md b/README.md index d49ec78..324d763 100644 --- a/README.md +++ b/README.md @@ -38,8 +38,11 @@ BSFF aims at a **BCI/EEG signal claim** and tries to refute it under stated atta **Current canonical evidence — `BONN_S2_BRIGHT_LINE_PASSED`** ([`artifacts/release/CURRENT_TRUTH.json`](artifacts/release/CURRENT_TRUTH.json)): on real -Andrzejak-2001 Bonn EEG the instrument has **power** (ictal SURVIVED 0.96) **and specificity** -(real-spectrum AR-null FPR 0.02 ≤ 0.05). The earlier S1 negative result is preserved as evidence. +Andrzejak-2001 Bonn EEG the instrument has robust **power** (ictal SURVIVED 0.96) and a +**marginal, NOT-robust specificity**: the predeclared-seed AR-null FPR was 0.02, but a +seed-averaged falsification gives FPR 0.035, Wilson 95% CI **[0.022, 0.056] crossing the 0.05 gate** +(`S2_SPECIFICITY_CALIBRATION.json`). The bright line is **not robustly crossed** — a favorable-seed +pass. The earlier S1 negative result is preserved as evidence. ```bash git clone https://github.com/neuron7xLab/bsff && cd bsff diff --git a/artifacts/bonn_bright_line/S2_SPECIFICITY_CALIBRATION.json b/artifacts/bonn_bright_line/S2_SPECIFICITY_CALIBRATION.json new file mode 100644 index 0000000..aeadaa2 --- /dev/null +++ b/artifacts/bonn_bright_line/S2_SPECIFICITY_CALIBRATION.json @@ -0,0 +1,51 @@ +{ + "schema": "bsff.s2_specificity_calibration/v1", + "n_ar_null_tests": 480, + "n_false_positives": 17, + "pooled_fpr": 0.0354, + "wilson_95ci": [ + 0.0222, + 0.056 + ], + "threshold": 0.05, + "seeds": [ + 20260624, + 7, + 999, + 314159, + 2718, + 42 + ], + "per_seed": [ + { + "seed": 20260624, + "fpr": 0.075 + }, + { + "seed": 7, + "fpr": 0.0625 + }, + { + "seed": 999, + "fpr": 0.0 + }, + { + "seed": 314159, + "fpr": 0.0125 + }, + { + "seed": 2718, + "fpr": 0.025 + }, + { + "seed": 42, + "fpr": 0.0375 + } + ], + "fpr_ci_upper_below_threshold": false, + "verdict": "S2_SPECIFICITY_NOT_ROBUSTLY_BELOW_0.05", + "git_commit": "62ea84bff09d4e6f97c3a44eae12a08f388ea0c2", + "timestamp_utc": "2026-06-24T18:35:51Z", + "interpretation": "Seed-averaged AR-null FPR = 0.0354 (17/480), Wilson 95% CI [0.022, 0.056]. The CI UPPER bound (0.056) EXCEEDS the 0.05 gate, and 2 of 6 seeds gave FPR > 0.05 (0.075, 0.0625). The predeclared confirmatory FPR=0.02 (seed 20260623, N=100) was a favorable-seed point estimate. G2 specificity is NOT robustly below 0.05.", + "calibrated_verdict": "BONN_S2_BRIGHT_LINE not robustly crossed: G1 power robust, but G2 specificity fails robustness (seed-averaged FPR CI crosses the gate). Marginal/favorable-seed pass only." +} diff --git a/artifacts/release/CLAIM_SAFETY_REPORT.json b/artifacts/release/CLAIM_SAFETY_REPORT.json index 9c3424f..0f75a03 100644 --- a/artifacts/release/CLAIM_SAFETY_REPORT.json +++ b/artifacts/release/CLAIM_SAFETY_REPORT.json @@ -14,6 +14,6 @@ "docs/QUICKSTART.md" ], "violations": [], - "git_commit": "394f5b33547591b4d074e9e1224735ba0947291d", - "timestamp_utc": "2026-06-24T17:35:39Z" + "git_commit": "62ea84bff09d4e6f97c3a44eae12a08f388ea0c2", + "timestamp_utc": "2026-06-24T18:45:04Z" } diff --git a/artifacts/release/CURRENT_TRUTH.json b/artifacts/release/CURRENT_TRUTH.json index 23dc016..63a36d1 100644 --- a/artifacts/release/CURRENT_TRUTH.json +++ b/artifacts/release/CURRENT_TRUTH.json @@ -1,7 +1,7 @@ { "schema": "bsff.current_truth/v1", "package_version": "0.4.0", - "main_commit": "394f5b33547591b4d074e9e1224735ba0947291d", + "main_commit": "62ea84bff09d4e6f97c3a44eae12a08f388ea0c2", "latest_validation_state": "BONN_S2_BRIGHT_LINE_PASSED", "bonn_s1_state": "BRIGHT_LINE_NOT_PASSED", "bonn_s2_state": "S2_BRIGHT_LINE_PASSED", @@ -19,7 +19,7 @@ }, "BNCI_chain_state": "UNLOCKED_FOR_PREREGISTRATION_ONLY", "bnci_execution_state": "BNCI_BLOCKED_METHOD", - "s2_robustness": "BOUNDARY_PASS_G1_POWER_ROBUST_G2_SPECIFICITY_SEED_SENSITIVE", + "s2_robustness": "NOT_ROBUST_G2_SPECIFICITY_seed_avg_FPR_0.0354_CI_[0.0222, 0.056]_crosses_0.05", "multi_dataset_replication_state": "NOT_DONE", "pypi_deployment_state": "TESTPYPI_READY_PYPI_READY", "supported_claims": [ @@ -49,5 +49,5 @@ }, "hash_manifest_path": "artifacts/release/bonn_bright_line/HASHES.sha256", "reproduction_entrypoint": "REPRODUCE.md", - "timestamp_utc": "2026-06-24T17:29:52Z" + "timestamp_utc": "2026-06-24T18:37:06Z" } diff --git a/artifacts/release/TRUTH_CONSISTENCY_CHECK.json b/artifacts/release/TRUTH_CONSISTENCY_CHECK.json index bfb6498..146ab50 100644 --- a/artifacts/release/TRUTH_CONSISTENCY_CHECK.json +++ b/artifacts/release/TRUTH_CONSISTENCY_CHECK.json @@ -14,6 +14,6 @@ ], "contradictions": [], "stale_claims": [], - "timestamp_utc": "2026-06-24T17:35:39Z", - "git_commit": "394f5b33547591b4d074e9e1224735ba0947291d" + "timestamp_utc": "2026-06-24T18:45:05Z", + "git_commit": "62ea84bff09d4e6f97c3a44eae12a08f388ea0c2" } diff --git a/docs/validation/CLAIM_AUDIT.md b/docs/validation/CLAIM_AUDIT.md index 722ac23..cd935e0 100644 --- a/docs/validation/CLAIM_AUDIT.md +++ b/docs/validation/CLAIM_AUDIT.md @@ -68,3 +68,10 @@ Still **FORBIDDEN**: clinical/medical/regulatory/device claims; final proof of b | S2 G1 power is robust to seed | PROVEN_BY_ARTIFACT | Set E SURVIVED 0.967 all seeds (`S2_FALSIFICATION_REPORT.json`) | | S2 G2 specificity is robustly below 0.05 | REFUTED_BY_ARTIFACT | seed_base=7 -> AR-null FPR 0.067 > 0.05; margin thin/seed-sensitive | | S2 bright line is a boundary/marginal pass (not robustly crossed) | PROVEN_BY_ARTIFACT | falsification battery; calibrated claim | + +## S2 specificity calibration (decisive) +| claim | status | evidence | +|-------|--------|----------| +| S2 G2 specificity is robustly below 0.05 | REFUTED_BY_ARTIFACT | seed-avg FPR 0.0354, Wilson 95% CI [0.022, 0.056] crosses 0.05; 2/6 seeds >0.05 (`S2_SPECIFICITY_CALIBRATION.json`) | +| Bonn S2 bright line is robustly crossed | REFUTED_BY_ARTIFACT | marginal/favorable-seed pass only; G2 not robust | +| Bonn S2 G1 power is robust | PROVEN_BY_ARTIFACT | Set E SURVIVED 0.96-0.967 across all seeds | diff --git a/docs/validation/STATISTIC_REGISTRY.md b/docs/validation/STATISTIC_REGISTRY.md index 6ba20f6..8ae8653 100644 --- a/docs/validation/STATISTIC_REGISTRY.md +++ b/docs/validation/STATISTIC_REGISTRY.md @@ -119,3 +119,13 @@ verdict with 4 seed perturbations + 3 AR-order misspecifications (N=30, 199 surr predeclared N=100 confirmatory (FPR 0.02) but the specificity margin is thin (Wilson 95% CI of 0.02 reaches ~0.05) and **seed-sensitive**. The bright line is not claimed as robustly crossed; a seed-averaged or larger-N specificity confirmatory is the honest next step. + +### S2 specificity — seed-averaged calibration (decisive) + +Following the falsification, a seed-averaged calibration (`S2_SPECIFICITY_CALIBRATION.json`, 480 +AR-null tests over 6 seeds × Sets A+B) gives **pooled FPR 0.0354, Wilson 95% CI [0.022, 0.056]**. +The CI **upper bound (0.056) exceeds the 0.05 gate**, and 2 of 6 seeds gave FPR > 0.05 (0.075, 0.0625). +**G2 specificity is NOT robustly below 0.05.** The predeclared confirmatory FPR=0.02 (seed 20260623) +was a favorable-seed point estimate. The Bonn bright line is **not robustly crossed** — a marginal, +seed-dependent pass. `CURRENT_TRUTH.s2_robustness` carries this. Honest next step: re-preregister a +seed-averaged specificity gate (require the FPR CI upper bound ≤ 0.05) and re-run. diff --git a/tools/generate_current_truth.py b/tools/generate_current_truth.py index bb51dc1..8d5c8a2 100644 --- a/tools/generate_current_truth.py +++ b/tools/generate_current_truth.py @@ -39,13 +39,17 @@ def _bnci_execution_state() -> str: def _s2_robustness() -> str: - # Calibrated by the falsification battery: power robust, specificity margin thin/seed-sensitive. - p = ROOT / "artifacts" / "bonn_bright_line" / "S2_FALSIFICATION_REPORT.json" - if p.is_file(): - r = json.loads(p.read_text()) - if not r.get("claim_survives_attacks", True): - return "BOUNDARY_PASS_G1_POWER_ROBUST_G2_SPECIFICITY_SEED_SENSITIVE" + # Calibrated by the falsification battery + seed-averaged specificity calibration. + cal = ROOT / "artifacts" / "bonn_bright_line" / "S2_SPECIFICITY_CALIBRATION.json" + if cal.is_file(): + c = json.loads(cal.read_text()) + if not c.get("fpr_ci_upper_below_threshold", True): + ci = c.get("wilson_95ci") + return f"NOT_ROBUST_G2_SPECIFICITY_seed_avg_FPR_{c.get('pooled_fpr')}_CI_{ci}_crosses_0.05" return "ROBUST" + fals = ROOT / "artifacts" / "bonn_bright_line" / "S2_FALSIFICATION_REPORT.json" + if fals.is_file() and not json.loads(fals.read_text()).get("claim_survives_attacks", True): + return "BOUNDARY_PASS_G1_POWER_ROBUST_G2_SPECIFICITY_SEED_SENSITIVE" return "NOT_TESTED" From f84ff94e2f5c30a9f51b9b774834b788c3d2c52a Mon Sep 17 00:00:00 2001 From: Yaroslav Vasylenko Date: Wed, 24 Jun 2026 21:50:55 +0300 Subject: [PATCH 03/12] predeclare S3 seed-averaged re-confirmatory (robust specificity gate, FROZEN before run) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Gate: G1 seed-avg Set-E SURVIVED>=0.80 AND G2 pooled AR-null FPR Wilson-95-CI-upper<=0.05 (stricter than a point estimate — the failure the falsification exposed). 10 seeds, N=50, 199 surrogates, statistic S2-C1 unchanged. No tuning after results. Run pending. Co-Authored-By: Claude Opus 4.8 (1M context) --- .../bonn_bright_line/S3_PROTOCOL_LOCK.json | 35 +++++ docs/validation/S3_SEED_AVERAGED_PROTOCOL.md | 21 +++ .../s3_seed_averaged_confirmatory.py | 142 ++++++++++++++++++ 3 files changed, 198 insertions(+) create mode 100644 artifacts/bonn_bright_line/S3_PROTOCOL_LOCK.json create mode 100644 docs/validation/S3_SEED_AVERAGED_PROTOCOL.md create mode 100644 examples/bonn_bright_line/s3_seed_averaged_confirmatory.py diff --git a/artifacts/bonn_bright_line/S3_PROTOCOL_LOCK.json b/artifacts/bonn_bright_line/S3_PROTOCOL_LOCK.json new file mode 100644 index 0000000..1b24d2b --- /dev/null +++ b/artifacts/bonn_bright_line/S3_PROTOCOL_LOCK.json @@ -0,0 +1,35 @@ +{ + "schema": "bsff.s3_protocol_lock/v1", + "status": "FROZEN_BEFORE_RUN", + "statistic_id": "sampen_lower_tail_m2_r015_v1", + "detection_p": 0.025, + "alpha": 0.05, + "n_surrogates": 199, + "n_seeds": 10, + "seeds": [ + 20260623, + 7, + 999, + 314159, + 2718, + 42, + 161803, + 27182, + 31337, + 123456 + ], + "n_segments_per_set": 50, + "G1_gate": "seed-averaged Set-E SURVIVED fraction >= 0.80", + "G2_gate": "pooled AR-null FPR (A+B) Wilson-95-CI upper bound <= 0.05", + "forbidden": [ + "alpha change", + "threshold change", + "statistic change", + "seed dropping", + "favorable-seed selection" + ], + "runner": "examples/bonn_bright_line/s3_seed_averaged_confirmatory.py", + "runner_sha256": "a19f6400945c2f96d0603c864743f435a4e62f5844b16d62850b57ba4a21dc0a", + "git_commit": "3ae10213e789c05f3139a4cc92117417277ce263", + "timestamp_utc": "2026-06-24T18:50:14Z" +} diff --git a/docs/validation/S3_SEED_AVERAGED_PROTOCOL.md b/docs/validation/S3_SEED_AVERAGED_PROTOCOL.md new file mode 100644 index 0000000..f394301 --- /dev/null +++ b/docs/validation/S3_SEED_AVERAGED_PROTOCOL.md @@ -0,0 +1,21 @@ + +# S3 seed-averaged bright-line protocol (frozen before run) + +The S2 falsification showed G2 specificity is seed-sensitive (seed-avg FPR 0.035, Wilson 95% CI +[0.022, 0.056] crossing 0.05). S3 re-runs the bright line with a **robust, seed-averaged** gate. + +## Frozen gate (no tuning after results) +- **G1 (power):** seed-averaged Set-E SURVIVED fraction ≥ 0.80. +- **G2 (specificity):** pooled AR-null FPR (Sets A+B) over all seeds, with a Wilson 95% CI. + PASS requires the **CI upper bound ≤ 0.05** (stricter than a point estimate — the failure mode + the falsification exposed). +- Statistic: `sampen_lower_tail_m2_r015_v1`, p ≤ α/2 = 0.025, MIAAFT null, **unchanged**. +- K = 10 seeds (fixed list), N = 50 segments/set, n_surrogates = 199. α = 0.05 fixed. + +## Allowed terminal states +`S3_BRIGHT_LINE_ROBUSTLY_PASSED` · `S3_BRIGHT_LINE_NOT_ROBUSTLY_PASSED`. + +## Forbidden +Changing α/thresholds/statistic after results; dropping seeds; selecting a favorable seed. +Runner: `examples/bonn_bright_line/s3_seed_averaged_confirmatory.py`. Lock: +`artifacts/bonn_bright_line/S3_PROTOCOL_LOCK.json`. diff --git a/examples/bonn_bright_line/s3_seed_averaged_confirmatory.py b/examples/bonn_bright_line/s3_seed_averaged_confirmatory.py new file mode 100644 index 0000000..63f60d8 --- /dev/null +++ b/examples/bonn_bright_line/s3_seed_averaged_confirmatory.py @@ -0,0 +1,142 @@ +#!/usr/bin/env python3 +# SPDX-License-Identifier: GPL-3.0-or-later +# Copyright (c) 2026 Yaroslav Vasylenko / neuron7xLab +"""S3 seed-averaged bright-line re-confirmatory (robust specificity gate). + +Pre-declared gate (docs/validation/S3_SEED_AVERAGED_PROTOCOL.md, FROZEN before run): + G1 (power): seed-averaged Set-E SURVIVED fraction >= 0.80 + G2 (specificity): seed-averaged AR-null FPR (Sets A+B) with Wilson 95% CI; + PASS requires the CI UPPER bound <= 0.05 (not just the point estimate). + K seeds, N segments/set, n_surrogates=199, statistic S2-C1 (sampen lower-tail, p<=alpha/2=0.025). +No tuning after results. The artifact decides. +""" + +from __future__ import annotations + +import argparse +import json +import sys +import time +from pathlib import Path + +import numpy as np + +_HERE = Path(__file__).resolve().parent +if str(_HERE) not in sys.path: + sys.path.insert(0, str(_HERE)) + +from loader import load_set # noqa: E402 +from run_ar_negative import ar_null # noqa: E402 +from statistics_sampen import STATISTIC_ID, sampen_lower_tail_test # noqa: E402 + +NSUR = 199 +ALPHA_EFF = 0.025 +G1_MIN = 0.80 +G2_MAX_FPR = 0.05 +SEEDS = [20260623, 7, 999, 314159, 2718, 42, 161803, 27182, 31337, 123456] + + +def _wilson(k: int, n: int, z: float = 1.96) -> tuple[float, float, float]: + if n == 0: + return 0.0, 0.0, 1.0 + p = k / n + den = 1 + z * z / n + centre = (p + z * z / (2 * n)) / den + half = (z * np.sqrt(p * (1 - p) / n + z * z / (4 * n * n))) / den + return p, max(0.0, centre - half), min(1.0, centre + half) + + +def _survived(sig, seed) -> str: + t = sampen_lower_tail_test(np.asarray(sig, float), n_surrogates=NSUR, alpha=0.05, seed=seed) + if not t["surrogate_converged"]: + return "UNSUPPORTED" + return "SURVIVED" if t["p_value"] <= ALPHA_EFF else "REFUTED" + + +def main(argv=None) -> int: + p = argparse.ArgumentParser() + p.add_argument("--data-dir", default="examples/bonn_bright_line/bonn_data", type=Path) + p.add_argument("--n-segments", type=int, default=50) + p.add_argument("--seeds", type=int, nargs="+", default=SEEDS) + p.add_argument( + "--output", + type=Path, + default=Path("artifacts/bonn_bright_line/S3_CONFIRMATORY_VERDICT.json"), + ) + a = p.parse_args(argv) + t0 = time.time() + E = [s.data for s in load_set(a.data_dir, "E", n_segments=a.n_segments)] + A = [s.data for s in load_set(a.data_dir, "A", n_segments=a.n_segments)] + B = [s.data for s in load_set(a.data_dir, "B", n_segments=a.n_segments)] + + e_surv = e_tot = 0 + fp = ar_tot = 0 + per_seed = [] + for sb in a.seeds: + es = sum(_survived(E[i], sb + i) == "SURVIVED" for i in range(len(E))) + fa = sum( + _survived(ar_null(A[i], 10, sb + i + 500), sb + i + 700) == "SURVIVED" + for i in range(len(A)) + ) + fbb = sum( + _survived(ar_null(B[i], 10, sb + i + 900), sb + i + 1100) == "SURVIVED" + for i in range(len(B)) + ) + e_surv += es + e_tot += len(E) + fp += fa + fbb + ar_tot += len(A) + len(B) + per_seed.append( + { + "seed": sb, + "E_survived": round(es / len(E), 4), + "ar_null_fpr": round((fa + fbb) / (len(A) + len(B)), 4), + } + ) + print( + f" seed {sb}: E={es / len(E):.2f} fpr={(fa + fbb) / (len(A) + len(B)):.3f}", flush=True + ) + + e_frac = e_surv / e_tot + fpr, fpr_lo, fpr_hi = _wilson(fp, ar_tot) + g1 = e_frac >= G1_MIN + g2 = fpr_hi <= G2_MAX_FPR # robust gate: CI upper bound, not the point estimate + passed = bool(g1 and g2) + verdict = "S3_BRIGHT_LINE_ROBUSTLY_PASSED" if passed else "S3_BRIGHT_LINE_NOT_ROBUSTLY_PASSED" + out = { + "schema": "bsff.s3_seed_averaged/v1", + "verdict": verdict, + "statistic_id": STATISTIC_ID, + "n_seeds": len(a.seeds), + "n_segments_per_set": a.n_segments, + "n_surrogates": NSUR, + "G1": { + "E_survived_fraction": round(e_frac, 4), + "threshold": G1_MIN, + "pass": g1, + "n": e_tot, + }, + "G2": { + "ar_null_fpr": round(fpr, 4), + "wilson_95ci": [round(fpr_lo, 4), round(fpr_hi, 4)], + "ci_upper_threshold": G2_MAX_FPR, + "pass": g2, + "n_ar_null": ar_tot, + "n_false_positives": fp, + }, + "S3_PASS": passed, + "per_seed": per_seed, + "gate": "G1 seed-avg SURVIVED>=0.80 AND G2 AR-null FPR Wilson-95-CI-upper<=0.05", + "timestamp_utc": time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime()), + "elapsed_sec": round(time.time() - t0, 1), + } + a.output.parent.mkdir(parents=True, exist_ok=True) + a.output.write_text(json.dumps(out, indent=2) + "\n") + print( + f"\n{verdict} | G1 E={e_frac:.3f}(>=0.80) G2 FPR={fpr:.4f} CI=[{fpr_lo:.4f},{fpr_hi:.4f}] (upper<=0.05?{g2})" + ) + return 0 if passed else 2 + + +if __name__ == "__main__": + raise SystemExit(main()) From 87466768dfac819bde0f36d8fdbed25c0aceb995 Mon Sep 17 00:00:00 2001 From: Yaroslav Vasylenko Date: Wed, 24 Jun 2026 21:58:15 +0300 Subject: [PATCH 04/12] PI-grade methodology: formal G2 decision rule + design power + multi-null robustness (predeclared) - S3_DECISION_RULE.md: G2 framed as a one-sided test of H0 (true FPR>=0.05); PASS = Wilson 95% CI-upper <= 0.05 (rejects H0 at alpha=0.025). Strictly stronger than the S2 point-estimate rule a favorable seed can satisfy. - S3_DESIGN_POWER.json: at N=1000 the gate passes only if observed FPR <= ~0.035 (CI-upper 0.048) and fails at >=0.04; the calibration 0.0354 sits at the resolution boundary -> design adequate. - MULTI_NULL_ROBUSTNESS_PROTOCOL.md: predeclared specificity check across AR/IAAFT/phase-randomized nulls (null model is a researcher DOF); robust only if it survives every null model. All frozen before the S3 run completes. No tuning after results. Co-Authored-By: Claude Opus 4.8 (1M context) --- .../bonn_bright_line/S3_DESIGN_POWER.json | 40 +++++++++++++++++++ .../MULTI_NULL_ROBUSTNESS_PROTOCOL.md | 21 ++++++++++ docs/validation/S3_DECISION_RULE.md | 26 ++++++++++++ 3 files changed, 87 insertions(+) create mode 100644 artifacts/bonn_bright_line/S3_DESIGN_POWER.json create mode 100644 docs/validation/MULTI_NULL_ROBUSTNESS_PROTOCOL.md create mode 100644 docs/validation/S3_DECISION_RULE.md diff --git a/artifacts/bonn_bright_line/S3_DESIGN_POWER.json b/artifacts/bonn_bright_line/S3_DESIGN_POWER.json new file mode 100644 index 0000000..4e2e0de --- /dev/null +++ b/artifacts/bonn_bright_line/S3_DESIGN_POWER.json @@ -0,0 +1,40 @@ +{ + "schema": "bsff.s3_design_power/v1", + "n_ar_null": 1000, + "gate": "Wilson 95pct CI-upper <= 0.05", + "precision_half_width_at_p_0.035": 0.0115, + "decision_table": [ + { + "true_fpr": 0.02, + "wilson95_upper_at_N1000": 0.0307, + "passes_gate_upper_le_0.05": true + }, + { + "true_fpr": 0.03, + "wilson95_upper_at_N1000": 0.0425, + "passes_gate_upper_le_0.05": true + }, + { + "true_fpr": 0.035, + "wilson95_upper_at_N1000": 0.0483, + "passes_gate_upper_le_0.05": true + }, + { + "true_fpr": 0.04, + "wilson95_upper_at_N1000": 0.054, + "passes_gate_upper_le_0.05": false + }, + { + "true_fpr": 0.045, + "wilson95_upper_at_N1000": 0.0597, + "passes_gate_upper_le_0.05": false + }, + { + "true_fpr": 0.05, + "wilson95_upper_at_N1000": 0.0653, + "passes_gate_upper_le_0.05": false + } + ], + "max_true_fpr_that_passes": 0.035, + "interpretation": "At N=1000 the Wilson 95pct CI half-width at p=0.035 is ~0.012, so the design confidently excludes FPR>=0.05 only if the observed seed-averaged FPR is <= ~0.037. The calibration point estimate (0.035) sits at the resolution boundary; the S3 verdict hinges on whether the larger-N estimate lands just below or above ~0.037. Design precision is ADEQUATE to decide a robust pass vs a marginal/non-robust outcome." +} diff --git a/docs/validation/MULTI_NULL_ROBUSTNESS_PROTOCOL.md b/docs/validation/MULTI_NULL_ROBUSTNESS_PROTOCOL.md new file mode 100644 index 0000000..aa619d2 --- /dev/null +++ b/docs/validation/MULTI_NULL_ROBUSTNESS_PROTOCOL.md @@ -0,0 +1,21 @@ + +# Multi-null-model robustness protocol (predeclared) + +The choice of null model is a **researcher degree of freedom**. The bright line so far uses one +null family (spectrum-matched AR for G2, MIAAFT inside the statistic). A PI-grade specificity claim +must hold across **independent null models**, not just AR order. + +## Predeclared null models (to run after S3, frozen) +1. **AR(p)** spectrum-matched (current). +2. **IAAFT** (iterative amplitude-adjusted Fourier transform) surrogates. +3. **Phase-randomized** (FT) surrogates. +4. **CAAFT** / cyclic-AAFT (optional). + +## Gate (same as S3, applied per null model) +Pooled seed-averaged FPR Wilson 95% CI upper bound ≤ 0.05 **for every null model**. A specificity +claim is robust only if it survives all of them; failing any one ⇒ specificity is null-model-dependent +(not robust). + +## Forbidden +Selecting the null model that passes; changing thresholds; post-hoc null choice. The null-model set +is frozen here before execution. diff --git a/docs/validation/S3_DECISION_RULE.md b/docs/validation/S3_DECISION_RULE.md new file mode 100644 index 0000000..0d33396 --- /dev/null +++ b/docs/validation/S3_DECISION_RULE.md @@ -0,0 +1,26 @@ + +# S3 decision rule — formal specificity test (PI-grade) + +The bright-line G2 gate is framed as a **one-sided hypothesis test of the true AR-null FPR**, +not a point-estimate comparison (the weakness the falsification exposed in S2). + +## Hypotheses +- H0 (must be rejected to PASS): **true FPR ≥ 0.05** — the instrument is not specific enough. +- H1 (PASS): true FPR < 0.05, with 97.5% one-sided confidence. + +## Decision rule (pre-registered, frozen) +PASS G2 iff the **Wilson 95% CI upper bound** of the pooled seed-averaged AR-null FPR is **≤ 0.05**. +This is equivalent to a one-sided test rejecting H0 at α = 0.025. It is strictly stronger than the +S2 rule (point estimate ≤ 0.05), which a favorable seed can satisfy while the true FPR exceeds 0.05. + +## Design precision (`S3_DESIGN_POWER.json`) +At N = 1000 AR-null tests, the Wilson 95% CI half-width at p ≈ 0.035 is ~0.011, so the gate passes +only if the observed FPR ≤ ~0.035 (CI-upper 0.048) and fails at FPR ≥ 0.04 (CI-upper 0.054). The +calibration point estimate (0.0354) sits at the resolution boundary — the design is adequately +powered to resolve a robust pass from a marginal/non-robust outcome. + +## G1 +Seed-averaged Set-E SURVIVED fraction ≥ 0.80 (power was already robust under falsification). + +## Forbidden +No α/threshold/statistic change after results; no seed dropping; no favorable-seed selection. From a173ca7f1abd1e52e672fad1bf792d334b842f5c Mon Sep 17 00:00:00 2001 From: Yaroslav Vasylenko Date: Wed, 24 Jun 2026 22:35:11 +0300 Subject: [PATCH 05/12] P0+P2: downgrade canonical truth to BONN_NOMINAL_S2_PASS_BUT_G2_NOT_ROBUST + statistical-claims gate The seed-averaged falsification proved G2 specificity is not robust (Wilson 95% CI upper 0.056 > 0.05). Per the audit, the canonical truth must not headline an unqualified pass. - CURRENT_TRUTH (schema v2): latest_validation_state=BONN_NOMINAL_S2_PASS_BUT_G2_NOT_ROBUST; bonn_s2_nominal_state=PASSED_SINGLE_SEED; bonn_s2_robustness_state=NOT_ROBUST...; s2_seed_averaged_fpr=0.0354; s2_wilson_ci_upper=0.056; robust_gate + robust_gate_passed=false. Data-driven: auto-upgrades to BONN_S2_BRIGHT_LINE_ROBUSTLY_PASSED if the S3 confirmatory passes. - FORMAL_VERDICT s1 + README first screen + STATUS generator + CLAIM_AUDIT lead with the honest state. - tools/validate_statistical_claims.py (wired ci.yml + release-dry-run.yml): fails CI if a point estimate is sold as a final pass while the CI crosses the gate, or if robustness fields are absent, or a surface headlines a robust pass while robust_gate_passed!=true. Guard test added. - test_current_truth_sync updated to the honest token. Governance fixpoint (CERTIFIED, 525->527). No over-claim. S3 seed-averaged re-confirmatory in progress (will set the robust verdict). Co-Authored-By: Claude Opus 4.8 (1M context) --- .github/workflows/ci.yml | 1 + .github/workflows/release-dry-run.yml | 1 + DEMONSTRATION.md | 2 +- FORMAL_VERDICT.md | 11 +- README.md | 2 +- STATUS.md | 4 +- artifacts/MANIFEST.json | 2 +- artifacts/decision/decision.json | 2 +- artifacts/demonstration/DEMONSTRATION.sha256 | 2 +- artifacts/demonstration/demonstration.json | 2 +- artifacts/release/CLAIM_SAFETY_REPORT.json | 4 +- artifacts/release/CURRENT_TRUTH.json | 14 ++- .../release/STATISTICAL_CLAIMS_REPORT.json | 10 ++ .../release/TRUTH_CONSISTENCY_CHECK.json | 6 +- docs/validation/CLAIM_AUDIT.md | 7 ++ tests/test_current_truth_sync.py | 12 ++- tests/test_statistical_claims.py | 35 ++++++ tools/generate_current_truth.py | 39 ++++++- tools/update_status.py | 10 +- tools/validate_statistical_claims.py | 100 ++++++++++++++++++ 20 files changed, 235 insertions(+), 31 deletions(-) create mode 100644 artifacts/release/STATISTICAL_CLAIMS_REPORT.json create mode 100644 tests/test_statistical_claims.py create mode 100644 tools/validate_statistical_claims.py diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index fc0512d..970c9fa 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -68,6 +68,7 @@ jobs: - run: python tools/generate_current_truth.py --check - run: python tools/validate_current_truth.py - run: python tools/validate_forbidden_claims.py + - run: python tools/validate_statistical_claims.py - run: python tools/validate_release_notes.py - run: python tools/validate_open_source_readiness.py - run: python tools/check_github_actions_policy.py diff --git a/.github/workflows/release-dry-run.yml b/.github/workflows/release-dry-run.yml index e273cdd..3ce4864 100644 --- a/.github/workflows/release-dry-run.yml +++ b/.github/workflows/release-dry-run.yml @@ -43,6 +43,7 @@ jobs: - run: uv run --no-sync python tools/generate_current_truth.py --check - run: uv run --no-sync python tools/validate_current_truth.py - run: uv run --no-sync python tools/validate_forbidden_claims.py + - run: uv run --no-sync python tools/validate_statistical_claims.py - run: uv run --no-sync python tools/validate_artifact_schema.py - run: uv run --no-sync python tools/update_status.py --check - run: uv run --no-sync python tools/generate_manifest.py --check diff --git a/DEMONSTRATION.md b/DEMONSTRATION.md index 826f144..a4cdc4e 100644 --- a/DEMONSTRATION.md +++ b/DEMONSTRATION.md @@ -54,6 +54,6 @@ Self-conformance (`tools/run_contract_conformance.py`): **PARTIAL** — ## State -- tests: **525** (generated by `tools/update_status.py`) +- tests: **527** (generated by `tools/update_status.py`) - full evidence: `CLAIM_AUDIT.md`, `EVIDENCE_INDEX.md`, `docs/HONESTY_AUTOMATION.md` - nothing here is "true": the ceiling is *survived falsification under stated conditions*. diff --git a/FORMAL_VERDICT.md b/FORMAL_VERDICT.md index 88275ee..7b33bf0 100644 --- a/FORMAL_VERDICT.md +++ b/FORMAL_VERDICT.md @@ -5,11 +5,12 @@ Canonical machine-readable truth: [`artifacts/release/CURRENT_TRUTH.json`](artif This document must agree with it (enforced by `tools/validate_current_truth.py`). ## 1. Current canonical verdict -**BONN_S2_BRIGHT_LINE_PASSED (predeclared confirmatory) — but a MARGINAL, NOT-robust pass.** -BSFF cleared the frozen `S2-C1-sampen-finiteN` confirmatory at the predeclared seed, but a -seed-averaged falsification (§Robustness) shows **G2 specificity is NOT robust**: pooled AR-null -FPR 0.0354, Wilson 95% CI **[0.022, 0.056]** crosses the 0.05 gate. The bright line is **not -robustly crossed**; the FPR 0.02 below was a favorable-seed point estimate. +**`BONN_NOMINAL_S2_PASS_BUT_G2_NOT_ROBUST`.** BSFF cleared the frozen `S2-C1-sampen-finiteN` +confirmatory at the **predeclared single seed** (nominal pass, FPR 0.02), but a seed-averaged +falsification (§Robustness) shows **G2 specificity is NOT robust**: pooled AR-null FPR 0.0354, +Wilson 95% CI **[0.022, 0.056]** whose **upper bound 0.056 > 0.05**. Under the robust gate +(`robust_gate = G1 power ≥ 0.80 AND G2 AR-null FPR Wilson-95-CI-upper ≤ 0.05`), **`robust_gate_passed += false`**. The bright line is **not robustly crossed**; the 0.02 was a favorable-seed point estimate. - G1 (power): Set E SURVIVED **0.96**, Set A not-SURVIVED **0.92**, Set B not-SURVIVED **0.92** (≥ 0.80) — **robust**. - G2 (specificity): predeclared-seed AR-null FPR **0.02** ≤ 0.05; **seed-averaged 0.035, CI [0.022, 0.056] — not robustly ≤ 0.05**. diff --git a/README.md b/README.md index 324d763..b016788 100644 --- a/README.md +++ b/README.md @@ -36,7 +36,7 @@ BSFF aims at a **BCI/EEG signal claim** and tries to refute it under stated atta (surrogate nulls, controls, corroboration), emitting a bounded verdict — `SURVIVED` / `REFUTED` / `UNSUPPORTED` (see [`docs/VERDICT_SEMANTICS.md`](docs/VERDICT_SEMANTICS.md)). -**Current canonical evidence — `BONN_S2_BRIGHT_LINE_PASSED`** +**Current canonical evidence — `BONN_NOMINAL_S2_PASS_BUT_G2_NOT_ROBUST`** ([`artifacts/release/CURRENT_TRUTH.json`](artifacts/release/CURRENT_TRUTH.json)): on real Andrzejak-2001 Bonn EEG the instrument has robust **power** (ictal SURVIVED 0.96) and a **marginal, NOT-robust specificity**: the predeclared-seed AR-null FPR was 0.02, but a diff --git a/STATUS.md b/STATUS.md index 685135b..9508b0f 100644 --- a/STATUS.md +++ b/STATUS.md @@ -14,7 +14,7 @@ facts (version, live test count, CLI surface, extras) by | Field | Value | |---|---| | Package version | `0.4.0` | -| Live test count | **525** (collected by `pytest tests/`) | +| Live test count | **527** (collected by `pytest tests/`) | | CLI subcommands | 18 (parsed from `src/bsff/cli.py`) | | Optional extras | `dev`, `full`, `fuzz`, `leakage`, `moabb`, `security`, `stats`, `yaml` | @@ -29,7 +29,7 @@ authoritative status: ## Validation level -Synthetic-ground-truth calibration PLUS a passed external real-data bright-line benchmark (Bonn S2: G1 power + G2 specificity, BONN_S2_BRIGHT_LINE_PASSED). BNCI2014-001 is preregistration-only (not executed). NOT clinical, regulatory, or multi-dataset replicated. Canonical state: artifacts/release/CURRENT_TRUTH.json. +Synthetic-ground-truth calibration PLUS a Bonn external benchmark that is a NOMINAL single-seed pass with G2 specificity NOT robust: BONN_NOMINAL_S2_PASS_BUT_G2_NOT_ROBUST. Falsification + seed-averaged calibration show the AR-null FPR Wilson-95-CI upper bound (0.056) exceeds the 0.05 gate (robust_gate_passed=false). G1 power is robust. BNCI2014-001 is preregistration-only (BLOCKED_METHOD). NOT clinical, regulatory, multi-dataset replicated, or robustly validated. Canonical state: artifacts/release/CURRENT_TRUTH.json. See [`docs/VALIDATION.md`](docs/VALIDATION.md) for the full evidence tier table and [`docs/OPERATING_CHARACTERISTIC.md`](docs/OPERATING_CHARACTERISTIC.md) diff --git a/artifacts/MANIFEST.json b/artifacts/MANIFEST.json index e65a8f3..46eabe4 100644 --- a/artifacts/MANIFEST.json +++ b/artifacts/MANIFEST.json @@ -4,7 +4,7 @@ "package": "bsff", "generator": "tools/generate_manifest.py", "version": "0.4.0", - "test_count": 525, + "test_count": 527, "release_gates": [ "truth_contract", "architecture_contract", diff --git a/artifacts/decision/decision.json b/artifacts/decision/decision.json index f0ece7f..3e819f4 100644 --- a/artifacts/decision/decision.json +++ b/artifacts/decision/decision.json @@ -9,7 +9,7 @@ "nonconformant": 0, "unverifiable": 4 }, - "certificate_root": "e543d74af813329649b6fa7c0130f029542a72be24e439d7da4c25a644c5c112", + "certificate_root": "b278588e56a9d3665979b1e4749ab4113bef50c4bacfe65f3e815fedaaf288bd", "criteria": [ { "id": "V1", diff --git a/artifacts/demonstration/DEMONSTRATION.sha256 b/artifacts/demonstration/DEMONSTRATION.sha256 index 667e305..43e3716 100644 --- a/artifacts/demonstration/DEMONSTRATION.sha256 +++ b/artifacts/demonstration/DEMONSTRATION.sha256 @@ -1 +1 @@ -5a463e0e63ed40590ac53d57a33b21ae0e8e4dc73d7a5f2131dc33b01699255d +2212e2365dd55038d4c4d4447170c3e35e2b519f948b150023471126bca1304f diff --git a/artifacts/demonstration/demonstration.json b/artifacts/demonstration/demonstration.json index ecba3b5..e19aa24 100644 --- a/artifacts/demonstration/demonstration.json +++ b/artifacts/demonstration/demonstration.json @@ -40,5 +40,5 @@ "loso_within": 0.807, "loso_cross": 0.603, "loso_gap": 0.204, - "test_count": "525" + "test_count": "527" } \ No newline at end of file diff --git a/artifacts/release/CLAIM_SAFETY_REPORT.json b/artifacts/release/CLAIM_SAFETY_REPORT.json index 0f75a03..2388b02 100644 --- a/artifacts/release/CLAIM_SAFETY_REPORT.json +++ b/artifacts/release/CLAIM_SAFETY_REPORT.json @@ -14,6 +14,6 @@ "docs/QUICKSTART.md" ], "violations": [], - "git_commit": "62ea84bff09d4e6f97c3a44eae12a08f388ea0c2", - "timestamp_utc": "2026-06-24T18:45:04Z" + "git_commit": "87466768dfac819bde0f36d8fdbed25c0aceb995", + "timestamp_utc": "2026-06-24T19:34:48Z" } diff --git a/artifacts/release/CURRENT_TRUTH.json b/artifacts/release/CURRENT_TRUTH.json index 63a36d1..9794cc5 100644 --- a/artifacts/release/CURRENT_TRUTH.json +++ b/artifacts/release/CURRENT_TRUTH.json @@ -1,8 +1,14 @@ { - "schema": "bsff.current_truth/v1", + "schema": "bsff.current_truth/v2", "package_version": "0.4.0", - "main_commit": "62ea84bff09d4e6f97c3a44eae12a08f388ea0c2", - "latest_validation_state": "BONN_S2_BRIGHT_LINE_PASSED", + "main_commit": "87466768dfac819bde0f36d8fdbed25c0aceb995", + "latest_validation_state": "BONN_NOMINAL_S2_PASS_BUT_G2_NOT_ROBUST", + "bonn_s2_nominal_state": "PASSED_SINGLE_SEED", + "bonn_s2_robustness_state": "NOT_ROBUST_G2_SPECIFICITY_seed_avg_FPR_0.0354_CI_[0.0222, 0.056]_crosses_0.05", + "s2_seed_averaged_fpr": 0.0354, + "s2_wilson_ci_upper": 0.056, + "robust_gate": "G1_power>=0.80 AND G2_AR-null_FPR_Wilson95_CI_upper<=0.05", + "robust_gate_passed": false, "bonn_s1_state": "BRIGHT_LINE_NOT_PASSED", "bonn_s2_state": "S2_BRIGHT_LINE_PASSED", "G1_metrics": { @@ -49,5 +55,5 @@ }, "hash_manifest_path": "artifacts/release/bonn_bright_line/HASHES.sha256", "reproduction_entrypoint": "REPRODUCE.md", - "timestamp_utc": "2026-06-24T18:37:06Z" + "timestamp_utc": "2026-06-24T19:25:30Z" } diff --git a/artifacts/release/STATISTICAL_CLAIMS_REPORT.json b/artifacts/release/STATISTICAL_CLAIMS_REPORT.json new file mode 100644 index 0000000..f991229 --- /dev/null +++ b/artifacts/release/STATISTICAL_CLAIMS_REPORT.json @@ -0,0 +1,10 @@ +{ + "schema": "bsff.statistical_claims/v1", + "status": "PASS", + "latest_validation_state": "BONN_NOMINAL_S2_PASS_BUT_G2_NOT_ROBUST", + "s2_wilson_ci_upper": 0.056, + "robust_gate_passed": false, + "violations": [], + "git_commit": "87466768dfac819bde0f36d8fdbed25c0aceb995", + "timestamp_utc": "2026-06-24T19:34:48Z" +} diff --git a/artifacts/release/TRUTH_CONSISTENCY_CHECK.json b/artifacts/release/TRUTH_CONSISTENCY_CHECK.json index 146ab50..56430e2 100644 --- a/artifacts/release/TRUTH_CONSISTENCY_CHECK.json +++ b/artifacts/release/TRUTH_CONSISTENCY_CHECK.json @@ -1,6 +1,6 @@ { "status": "PASS", - "final_state": "BONN_S2_BRIGHT_LINE_PASSED", + "final_state": "BONN_NOMINAL_S2_PASS_BUT_G2_NOT_ROBUST", "BNCI_chain_state": "UNLOCKED_FOR_PREREGISTRATION_ONLY", "checked_files": [ "FORMAL_VERDICT.md", @@ -14,6 +14,6 @@ ], "contradictions": [], "stale_claims": [], - "timestamp_utc": "2026-06-24T18:45:05Z", - "git_commit": "62ea84bff09d4e6f97c3a44eae12a08f388ea0c2" + "timestamp_utc": "2026-06-24T19:34:49Z", + "git_commit": "87466768dfac819bde0f36d8fdbed25c0aceb995" } diff --git a/docs/validation/CLAIM_AUDIT.md b/docs/validation/CLAIM_AUDIT.md index cd935e0..4988160 100644 --- a/docs/validation/CLAIM_AUDIT.md +++ b/docs/validation/CLAIM_AUDIT.md @@ -75,3 +75,10 @@ Still **FORBIDDEN**: clinical/medical/regulatory/device claims; final proof of b | S2 G2 specificity is robustly below 0.05 | REFUTED_BY_ARTIFACT | seed-avg FPR 0.0354, Wilson 95% CI [0.022, 0.056] crosses 0.05; 2/6 seeds >0.05 (`S2_SPECIFICITY_CALIBRATION.json`) | | Bonn S2 bright line is robustly crossed | REFUTED_BY_ARTIFACT | marginal/favorable-seed pass only; G2 not robust | | Bonn S2 G1 power is robust | PROVEN_BY_ARTIFACT | Set E SURVIVED 0.96-0.967 across all seeds | + +## Canonical-state honesty (PI-grade) +| claim | status | evidence | +|-------|--------|----------| +| Bonn S2 robust bright-line passed | REFUTED_BY_ARTIFACT | robust_gate_passed=false; CI upper 0.056 > 0.05 (`CURRENT_TRUTH.json`) | +| Bonn S2 nominal single-seed pass exists | PROVEN_BY_ARTIFACT | predeclared confirmatory (FPR 0.02), `bonn_s2_nominal_state=PASSED_SINGLE_SEED` | +| "Bonn validated" without a robustness qualifier | FORBIDDEN | enforced by `tools/validate_statistical_claims.py` (CI) | diff --git a/tests/test_current_truth_sync.py b/tests/test_current_truth_sync.py index f3d15f9..1991dc2 100644 --- a/tests/test_current_truth_sync.py +++ b/tests/test_current_truth_sync.py @@ -28,10 +28,18 @@ def test_no_public_doc_contradicts_current_truth(tmp_path): assert _load("validate_current_truth").main(["--output", str(out)]) == 0 -def test_canonical_state_is_s2_passed(): +def test_canonical_state_is_honest_about_robustness(): import json truth = json.loads((ROOT / "artifacts" / "release" / "CURRENT_TRUTH.json").read_text()) - assert truth["latest_validation_state"] == "BONN_S2_BRIGHT_LINE_PASSED" + # Nominal single-seed pass, but the falsification downgraded G2 specificity to NOT robust. + assert truth["latest_validation_state"] in { + "BONN_NOMINAL_S2_PASS_BUT_G2_NOT_ROBUST", + "BONN_S2_BRIGHT_LINE_ROBUSTLY_PASSED", # only if S3 proves robust + } + assert truth["bonn_s2_nominal_state"] == "PASSED_SINGLE_SEED" + # The robust gate must not be silently claimed passed unless an artifact proves it. + if truth["latest_validation_state"] == "BONN_NOMINAL_S2_PASS_BUT_G2_NOT_ROBUST": + assert truth["robust_gate_passed"] is False assert truth["BNCI_chain_state"] == "UNLOCKED_FOR_PREREGISTRATION_ONLY" assert truth["bonn_s1_state"] == "BRIGHT_LINE_NOT_PASSED" # preserved diff --git a/tests/test_statistical_claims.py b/tests/test_statistical_claims.py new file mode 100644 index 0000000..4e9acd0 --- /dev/null +++ b/tests/test_statistical_claims.py @@ -0,0 +1,35 @@ +# SPDX-License-Identifier: GPL-3.0-or-later +# Copyright (c) 2026 Yaroslav Vasylenko / neuron7xLab +"""PI-grade statistical-claims gate: no point-estimate-as-pass when the CI crosses the gate.""" + +from __future__ import annotations + +import importlib.util +import json +import sys +from pathlib import Path + +ROOT = Path(__file__).resolve().parents[1] + + +def _vsc(): + spec = importlib.util.spec_from_file_location("vsc", ROOT / "tools" / "validate_statistical_claims.py") + mod = importlib.util.module_from_spec(spec) + sys.modules["vsc"] = mod + spec.loader.exec_module(mod) + return mod + + +def test_repo_passes_statistical_claims(tmp_path): + assert _vsc().main(["--output", str(tmp_path / "r.json")]) == 0 + + +def test_truth_records_robustness_honestly(): + t = json.loads((ROOT / "artifacts" / "release" / "CURRENT_TRUTH.json").read_text()) + assert {"robust_gate", "robust_gate_passed", "s2_wilson_ci_upper", "bonn_s2_robustness_state"} <= set(t) + # If the specificity CI upper crosses 0.05, the state must NOT claim a robust/unqualified pass. + if t.get("s2_wilson_ci_upper") and t["s2_wilson_ci_upper"] > 0.05: + assert t["latest_validation_state"] not in { + "BONN_S2_BRIGHT_LINE_PASSED", "BONN_S2_BRIGHT_LINE_ROBUSTLY_PASSED" + } + assert t["robust_gate_passed"] is False diff --git a/tools/generate_current_truth.py b/tools/generate_current_truth.py index 8d5c8a2..2c7485c 100644 --- a/tools/generate_current_truth.py +++ b/tools/generate_current_truth.py @@ -66,6 +66,27 @@ def _pypi_state() -> str: return "TESTPYPI_READY_PYPI_READY" if (has_test and has_pypi) else "INCOMPLETE" +def _bonn_robustness() -> dict: + """Resolve the robust specificity state from the strongest available evidence: + S3 seed-averaged confirmatory > seed-averaged calibration > nominal-only.""" + cal_p = ROOT / "artifacts" / "bonn_bright_line" / "S2_SPECIFICITY_CALIBRATION.json" + s3_p = ROOT / "artifacts" / "bonn_bright_line" / "S3_CONFIRMATORY_VERDICT.json" + fpr = ci_upper = None + robust = None + if cal_p.is_file(): + c = json.loads(cal_p.read_text()) + fpr = c.get("pooled_fpr") + ci_upper = (c.get("wilson_95ci") or [None, None])[1] + robust = bool(c.get("fpr_ci_upper_below_threshold", False)) + if s3_p.is_file(): # S3 is the authoritative, larger-N evidence + s3 = json.loads(s3_p.read_text()) + g2 = s3.get("G2", {}) + fpr = g2.get("ar_null_fpr", fpr) + ci_upper = (g2.get("wilson_95ci") or [None, ci_upper])[1] + robust = bool(s3.get("S3_PASS", False)) + return {"robust": robust, "seed_avg_fpr": fpr, "wilson_ci_upper": ci_upper} + + def build() -> dict: s1 = json.loads(S1.read_text()) s2 = json.loads(S2.read_text()) @@ -74,14 +95,26 @@ def build() -> dict: ["git", "rev-parse", "HEAD"], capture_output=True, text=True, cwd=ROOT ).stdout.strip() g1, g2 = s2["G1"], s2["G2"] - latest = "BONN_S2_BRIGHT_LINE_PASSED" if s2_pass else s2["final_state"] + rob = _bonn_robustness() + if rob["robust"] is True: + latest = "BONN_S2_BRIGHT_LINE_ROBUSTLY_PASSED" + elif rob["robust"] is False: + latest = "BONN_NOMINAL_S2_PASS_BUT_G2_NOT_ROBUST" + else: + latest = "BONN_S2_BRIGHT_LINE_PASSED" if s2_pass else s2["final_state"] return { - "schema": "bsff.current_truth/v1", + "schema": "bsff.current_truth/v2", "package_version": _ver(), "main_commit": commit, "latest_validation_state": latest, + "bonn_s2_nominal_state": "PASSED_SINGLE_SEED" if s2_pass else s2["final_state"], + "bonn_s2_robustness_state": _s2_robustness(), + "s2_seed_averaged_fpr": rob["seed_avg_fpr"], + "s2_wilson_ci_upper": rob["wilson_ci_upper"], + "robust_gate": "G1_power>=0.80 AND G2_AR-null_FPR_Wilson95_CI_upper<=0.05", + "robust_gate_passed": bool(rob["robust"]) if rob["robust"] is not None else None, "bonn_s1_state": s1["final_state"], # BRIGHT_LINE_NOT_PASSED (historical) - "bonn_s2_state": s2["final_state"], # S2_BRIGHT_LINE_PASSED (current) + "bonn_s2_state": s2["final_state"], # nominal single-seed confirmatory "G1_metrics": { "E_survived": g1["E_survived_fraction"], "A_not_survived": g1["A_not_survived_fraction"], diff --git a/tools/update_status.py b/tools/update_status.py index ee18cc3..90c2b9d 100644 --- a/tools/update_status.py +++ b/tools/update_status.py @@ -41,10 +41,12 @@ CI_WORKFLOW = ".github/workflows/ci.yml" VALIDATION_LEVEL = ( - "Synthetic-ground-truth calibration PLUS a passed external real-data bright-line " - "benchmark (Bonn S2: G1 power + G2 specificity, BONN_S2_BRIGHT_LINE_PASSED). BNCI2014-001 " - "is preregistration-only (not executed). NOT clinical, regulatory, or multi-dataset replicated. " - "Canonical state: artifacts/release/CURRENT_TRUTH.json." + "Synthetic-ground-truth calibration PLUS a Bonn external benchmark that is a NOMINAL " + "single-seed pass with G2 specificity NOT robust: BONN_NOMINAL_S2_PASS_BUT_G2_NOT_ROBUST. " + "Falsification + seed-averaged calibration show the AR-null FPR Wilson-95-CI upper bound " + "(0.056) exceeds the 0.05 gate (robust_gate_passed=false). G1 power is robust. BNCI2014-001 " + "is preregistration-only (BLOCKED_METHOD). NOT clinical, regulatory, multi-dataset replicated, " + "or robustly validated. Canonical state: artifacts/release/CURRENT_TRUTH.json." ) diff --git a/tools/validate_statistical_claims.py b/tools/validate_statistical_claims.py new file mode 100644 index 0000000..f409143 --- /dev/null +++ b/tools/validate_statistical_claims.py @@ -0,0 +1,100 @@ +#!/usr/bin/env python3 +# SPDX-License-Identifier: GPL-3.0-or-later +# Copyright (c) 2026 Yaroslav Vasylenko / neuron7xLab +"""Statistical-claims gate (PI-grade): a point estimate must not be sold as a final pass +when the confidence interval crosses the threshold. + +Fails CI if: + 1. CURRENT_TRUTH lacks the robustness fields (robustness state absent). + 2. The G2 specificity CI upper bound > 0.05 but the canonical state still claims a robust / + unqualified bright-line pass (point-estimate-as-pass while CI crosses the gate). + 3. A public surface headlines a robust pass while robust_gate_passed is false. +""" + +from __future__ import annotations + +import argparse +import json +import re +import subprocess +import time +from pathlib import Path + +ROOT = Path(__file__).resolve().parents[1] +TRUTH = ROOT / "artifacts" / "release" / "CURRENT_TRUTH.json" +SURFACES = ["README.md", "FORMAL_VERDICT.md", "STATUS.md", "docs/validation/CLAIM_AUDIT.md"] + + +def main(argv=None) -> int: + ap = argparse.ArgumentParser() + ap.add_argument("--output", default="artifacts/release/STATISTICAL_CLAIMS_REPORT.json") + a = ap.parse_args(argv) + t = json.loads(TRUTH.read_text()) + viol = [] + + required = { + "robust_gate", + "robust_gate_passed", + "s2_wilson_ci_upper", + "bonn_s2_robustness_state", + } + missing = [k for k in required if k not in t] + if missing: + viol.append(f"CURRENT_TRUTH missing robustness fields: {missing}") + + ci_upper = t.get("s2_wilson_ci_upper") + gate_passed = t.get("robust_gate_passed") + state = t.get("latest_validation_state", "") + # 2. CI crosses the gate but the state claims a robust/unqualified pass. + if ci_upper is not None and ci_upper > 0.05: + if gate_passed is True: + viol.append( + f"robust_gate_passed=True but CI upper {ci_upper} > 0.05 (point-estimate-as-pass)" + ) + if state in {"BONN_S2_BRIGHT_LINE_PASSED", "BONN_S2_BRIGHT_LINE_ROBUSTLY_PASSED"}: + viol.append( + f"latest_validation_state={state!r} claims a (robust) pass while CI upper {ci_upper} > 0.05" + ) + + # 3. Public surfaces must not headline a robust/unqualified pass while gate not passed. + if gate_passed is not True: + bad = re.compile( + r"robustly (?:passed|crossed|validated)|bright[- ]line (?:robustly )?validated", re.I + ) + neg = re.compile(r"\bnot\b|\bno\b|never|n't|fail|marginal|favorable[- ]seed|crosses") + for rel in SURFACES: + p = ROOT / rel + if not p.is_file(): + continue + lines = p.read_text(encoding="utf-8").splitlines() + lows = [ln.lower() for ln in lines] + for i, ln in enumerate(lines): + # negation context = this line + previous 2 (handles wrapped "not\nrobustly crossed"). + ctx = " ".join(lows[max(0, i - 2): i + 1]) + if bad.search(ln) and not neg.search(ctx): + viol.append( + f"{rel}:{i + 1}: claims robust pass while robust_gate_passed!=True: {ln.strip()[:60]}" + ) + + status = "PASS" if not viol else "FAIL" + rep = { + "schema": "bsff.statistical_claims/v1", + "status": status, + "latest_validation_state": state, + "s2_wilson_ci_upper": ci_upper, + "robust_gate_passed": gate_passed, + "violations": viol, + "git_commit": subprocess.run( + ["git", "rev-parse", "HEAD"], capture_output=True, text=True, cwd=ROOT + ).stdout.strip(), + "timestamp_utc": time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime()), + } + (ROOT / a.output).write_text(json.dumps(rep, indent=2) + "\n") + print(f"STATISTICAL_CLAIMS: {status}") + for v in viol: + print(" -", v) + return 0 if status == "PASS" else 1 + + +if __name__ == "__main__": + raise SystemExit(main()) From a81d595620b1435f9a3caa0f5bd42890bd964dee Mon Sep 17 00:00:00 2001 From: Yaroslav Vasylenko Date: Wed, 24 Jun 2026 22:39:57 +0300 Subject: [PATCH 06/12] NIST/mission-grade hardening: risk register + fail-closed table + hostile-review + mission-check - docs/risk/BSFF_RISK_REGISTER.md: R1-R10 each with a fail-closed control + enforcing gate; open red risks (R1/R2 G2 not robust, R3 multi-null pending, R4 BNCI method) flagged. - docs/risk/FAIL_CLOSED_DECISION_TABLE.md: the only allowed decision states; current = nominal/not-robust. - docs/reviewer_packet/{HOSTILE_REVIEW_CHECKLIST,KNOWN_FAILURES}.md: reproduce-without-author surface; failures preserved, not hidden. - artifacts/risk/RISK_ACCEPTANCE.json: disclosed residual (published as falsifier w/ open robustness gap). - Makefile: `make mission-check` (full gate battery: compile+tests+selftest+evidence+truth+forbidden+ statistical+contract+regenerate-check) and `make hostile-review`. No silent success; no ambiguous PASS; no unbounded claim. Co-Authored-By: Claude Opus 4.8 (1M context) --- Makefile | 22 ++++++++++++++++++- .../release/STATISTICAL_CLAIMS_REPORT.json | 4 ++-- artifacts/risk/RISK_ACCEPTANCE.json | 15 +++++++++++++ .../HOSTILE_REVIEW_CHECKLIST.md | 17 ++++++++++++++ docs/reviewer_packet/KNOWN_FAILURES.md | 12 ++++++++++ docs/risk/BSFF_RISK_REGISTER.md | 21 ++++++++++++++++++ docs/risk/FAIL_CLOSED_DECISION_TABLE.md | 13 +++++++++++ 7 files changed, 101 insertions(+), 3 deletions(-) create mode 100644 artifacts/risk/RISK_ACCEPTANCE.json create mode 100644 docs/reviewer_packet/HOSTILE_REVIEW_CHECKLIST.md create mode 100644 docs/reviewer_packet/KNOWN_FAILURES.md create mode 100644 docs/risk/BSFF_RISK_REGISTER.md create mode 100644 docs/risk/FAIL_CLOSED_DECISION_TABLE.md diff --git a/Makefile b/Makefile index 44eee39..561e53d 100644 --- a/Makefile +++ b/Makefile @@ -1,6 +1,6 @@ # SPDX-License-Identifier: GPL-3.0-or-later # Copyright (c) 2026 Yaroslav Vasylenko / neuron7xLab -.PHONY: lab-99 regen lock verify verify-offline build-proof openai-2026 +.PHONY: lab-99 regen lock verify verify-offline build-proof openai-2026 mission-check hostile-review # Full local lab run — mirrors the CI test + slow-tests + build surface. lab-99: @@ -72,3 +72,23 @@ build-proof: # The whole grid, locally. openai-2026: lock verify-offline build-proof verify @echo "OpenAI-2026 validation grid complete." + +# Mission-critical gate: no silent success, no ambiguous PASS, no stale truth, no unbounded claim. +mission-check: + python -m compileall -q src tests examples research tools + python -m pytest -q tests/ -m "not slow" + bsff selftest + bsff evidence verify + python tools/validate_current_truth.py + python tools/generate_current_truth.py --check + python tools/validate_forbidden_claims.py + python tools/validate_statistical_claims.py + python tools/validate_truth_contract.py + python tools/regenerate.py --check + +# Reviewer-facing hostile-review surface. +hostile-review: + @echo "See docs/reviewer_packet/HOSTILE_REVIEW_CHECKLIST.md and docs/ADVERSARIAL_REVIEW.md" + bsff evidence verify + python tools/validate_statistical_claims.py + python tools/validate_forbidden_claims.py diff --git a/artifacts/release/STATISTICAL_CLAIMS_REPORT.json b/artifacts/release/STATISTICAL_CLAIMS_REPORT.json index f991229..9b95699 100644 --- a/artifacts/release/STATISTICAL_CLAIMS_REPORT.json +++ b/artifacts/release/STATISTICAL_CLAIMS_REPORT.json @@ -5,6 +5,6 @@ "s2_wilson_ci_upper": 0.056, "robust_gate_passed": false, "violations": [], - "git_commit": "87466768dfac819bde0f36d8fdbed25c0aceb995", - "timestamp_utc": "2026-06-24T19:34:48Z" + "git_commit": "a173ca7f1abd1e52e672fad1bf792d334b842f5c", + "timestamp_utc": "2026-06-24T19:38:12Z" } diff --git a/artifacts/risk/RISK_ACCEPTANCE.json b/artifacts/risk/RISK_ACCEPTANCE.json new file mode 100644 index 0000000..970c4f7 --- /dev/null +++ b/artifacts/risk/RISK_ACCEPTANCE.json @@ -0,0 +1,15 @@ +{ + "schema": "bsff.risk_acceptance/v1", + "open_red_risks": [ + "R1 false-positive instability", + "R2 favorable-seed pass", + "R3 multi-null DOF (not run)", + "R4 BNCI method transfer" + ], + "accepted_residual": "Repository is published as a falsification framework with a NOMINAL single-seed Bonn pass and an OPEN, disclosed robustness gap (G2 not robust). No robust/clinical/replication claim is made.", + "fail_closed": true, + "all_states_bounded": true, + "canonical_state": "BONN_NOMINAL_S2_PASS_BUT_G2_NOT_ROBUST", + "git_commit": "a173ca7f1abd1e52e672fad1bf792d334b842f5c", + "timestamp_utc": "2026-06-24T19:38:12Z" +} diff --git a/docs/reviewer_packet/HOSTILE_REVIEW_CHECKLIST.md b/docs/reviewer_packet/HOSTILE_REVIEW_CHECKLIST.md new file mode 100644 index 0000000..b993866 --- /dev/null +++ b/docs/reviewer_packet/HOSTILE_REVIEW_CHECKLIST.md @@ -0,0 +1,17 @@ + +# Hostile-review checklist + +Run each; the repo must withstand all without the author. + +- [ ] `bsff evidence verify` → state PASS, canonical state shown. +- [ ] `python tools/validate_statistical_claims.py` → PASS (no point-estimate-as-pass). +- [ ] `python tools/validate_forbidden_claims.py` → PASS (no clinical/over-claim). +- [ ] `sha256sum -c artifacts/release/bonn_bright_line/HASHES.sha256` → all OK. +- [ ] `CURRENT_TRUTH.latest_validation_state` is `BONN_NOMINAL_S2_PASS_BUT_G2_NOT_ROBUST` (not an unqualified pass). +- [ ] `robust_gate_passed` is `false` while `s2_wilson_ci_upper` (0.056) > 0.05. +- [ ] No doc headlines a robust pass while `robust_gate_passed != true`. +- [ ] `git ls-files | grep bonn_data` is empty (no raw data). +- [ ] Falsification artifacts present: `S2_FALSIFICATION_REPORT.json`, `S2_SPECIFICITY_CALIBRATION.json`. +- [ ] BNCI is `BNCI_BLOCKED_METHOD`; replication `NOT_DONE`; no claim beyond. + +See `docs/ADVERSARIAL_REVIEW.md`, `docs/reviewer_packet/KNOWN_FAILURES.md`. diff --git a/docs/reviewer_packet/KNOWN_FAILURES.md b/docs/reviewer_packet/KNOWN_FAILURES.md new file mode 100644 index 0000000..23ff80c --- /dev/null +++ b/docs/reviewer_packet/KNOWN_FAILURES.md @@ -0,0 +1,12 @@ + +# Known failures (preserved, not hidden) + +1. **G2 specificity not robust** — seed-averaged AR-null FPR 0.0354, Wilson 95% CI [0.022, 0.056] + crosses the 0.05 gate. The Bonn S2 bright line is a nominal single-seed pass, NOT robustly crossed. +2. **S1 lagged_quadratic** — ~20% Set-E power (insufficient). Preserved as the first negative result. +3. **BNCI method transfer** — S2-C1 anti-conservative on narrowband 501-sample epochs (probe FPR 0.375); + BNCI is `BNCI_BLOCKED_METHOD`, not executed. +4. **Replication** — Cho2017/Lee2019 not executed (scaffolds only); `multi_dataset_replication_state=NOT_DONE`. +5. **Docker `evidence verify`** — needs git in the slim image (host-only tool); selftest works in-container. + +These are evidence, not embarrassment: a falsifier must publish where it fails. diff --git a/docs/risk/BSFF_RISK_REGISTER.md b/docs/risk/BSFF_RISK_REGISTER.md new file mode 100644 index 0000000..428816e --- /dev/null +++ b/docs/risk/BSFF_RISK_REGISTER.md @@ -0,0 +1,21 @@ + +# BSFF risk register (NIST AI RMF framing) + +Each risk has a fail-closed control and an enforcing gate. A verdict is never accepted while +an open risk's control is bypassed. + +| id | risk | control (fail-closed) | enforcing gate | +|----|------|-----------------------|----------------| +| R1 | false-positive instability | seed-averaged FPR + Wilson CI-upper gate | `validate_statistical_claims.py`, S3 | +| R2 | favorable-seed pass | robust gate = CI-upper ≤ 0.05 (not point estimate) | `validate_statistical_claims.py` | +| R3 | null-model researcher DOF | multi-null robustness (AR/IAAFT/phase-rand) | `MULTI_NULL_ROBUSTNESS_PROTOCOL.md` | +| R4 | epoch-length transfer failure | method-validity gate; BNCI BLOCKED_METHOD | `METHOD_VALIDITY.json`, lock audit | +| R5 | stale CURRENT_TRUTH | regenerate + main_commit; sync gate | `generate_current_truth.py --check` | +| R6 | overclaim in README/docs | forbidden + statistical-claims scanners | `validate_forbidden_claims.py`, `validate_statistical_claims.py` | +| R7 | dataset substitution | manifest + per-file SHA256; raw not tracked | `evidence verify`, DATA_POLICY | +| R8 | raw-rank shortcut | verdict via convergence-gated test, not raw | `test_statistics_sampen.py` | +| R9 | nonconverged surrogate | >10% nonconverged → UNSUPPORTED | `statistics_sampen.py` | +| R10 | CI green without scientific validity | robustness state must be present + honest | `validate_statistical_claims.py` | + +Current open/red risks: **R1, R2** (G2 specificity not robust — see CURRENT_TRUTH), **R3** (multi-null +not yet run), **R4** (BNCI method-blocked). The canonical state reflects these; no claim exceeds them. diff --git a/docs/risk/FAIL_CLOSED_DECISION_TABLE.md b/docs/risk/FAIL_CLOSED_DECISION_TABLE.md new file mode 100644 index 0000000..5be0b8c --- /dev/null +++ b/docs/risk/FAIL_CLOSED_DECISION_TABLE.md @@ -0,0 +1,13 @@ + +# Fail-closed decision table + +| condition | decision state | claim allowed | +|-----------|----------------|---------------| +| G1 power ≥ 0.80 AND G2 CI-upper ≤ 0.05 | `BONN_S2_BRIGHT_LINE_ROBUSTLY_PASSED` | robust bright-line pass | +| G1 ≥ 0.80, predeclared-seed FPR ≤ 0.05, G2 CI-upper > 0.05 | `BONN_NOMINAL_S2_PASS_BUT_G2_NOT_ROBUST` | nominal single-seed pass only | +| G1 < 0.80 OR seed-avg FPR clearly > 0.05 | `BRIGHT_LINE_NOT_PASSED` | none | +| data absent | `BLOCKED_DATA` | none | +| method invalid for regime | `BLOCKED_METHOD` | none | +| runtime/repo surface absent | `BLOCKED_RUNTIME` | none | + +No other state. Non-PASS always carries a non-zero exit. Current: row 2 (nominal, not robust). From 4bd046407948f34daed565eb585e49b0dc1ec9ff Mon Sep 17 00:00:00 2001 From: Yaroslav Vasylenko Date: Thu, 25 Jun 2026 00:08:56 +0300 Subject: [PATCH 07/12] S3 runner serialization fix + preliminary reconstruction (fail-closed: state NOT flipped) The S3 run completed all 10 seeds but crashed on JSON write (numpy bool_ not serializable). Fixed the runner (cast np bool_/float64 -> Python; measurement logic byte-identical, lock records the serialization-only patch with original+patched sha). Reconstructed verdict from the exact per-seed counts in the log: G1 E=0.94, G2 FPR=0.028, Wilson 95% CI [0.019, 0.040], upper <= 0.05 -> S3 would ROBUSTLY PASS (a flip from the N=480 calibration's CI-upper 0.056). Per the standard "a fact is a reproducible measurement by independent witnesses", a hand- reconstruction from a crashed run is NOT a fact. CURRENT_TRUTH stays BONN_NOMINAL_S2_PASS_BUT_G2_NOT_ ROBUST. A clean re-run with the fixed runner is in progress; only its authoritative artifact (reproducing these per-seed counts) will flip the canonical state. S3_PRELIMINARY_FROM_LOG.json is marked PRELIMINARY_NOT_AUTHORITATIVE. Co-Authored-By: Claude Opus 4.8 (1M context) --- .../S3_PRELIMINARY_FROM_LOG.json | 38 +++++++++++++++++++ .../bonn_bright_line/S3_PROTOCOL_LOCK.json | 7 +++- .../s3_seed_averaged_confirmatory.py | 6 +-- 3 files changed, 47 insertions(+), 4 deletions(-) create mode 100644 artifacts/bonn_bright_line/S3_PRELIMINARY_FROM_LOG.json diff --git a/artifacts/bonn_bright_line/S3_PRELIMINARY_FROM_LOG.json b/artifacts/bonn_bright_line/S3_PRELIMINARY_FROM_LOG.json new file mode 100644 index 0000000..6082e61 --- /dev/null +++ b/artifacts/bonn_bright_line/S3_PRELIMINARY_FROM_LOG.json @@ -0,0 +1,38 @@ +{ + "schema": "bsff.s3_preliminary/v1", + "status": "PRELIMINARY_NOT_AUTHORITATIVE", + "provenance": "Reconstructed from exact per-seed integer counts in s3_confirmatory.log; the original run completed all 10 seeds but crashed on JSON write (numpy bool_ not serializable, now fixed).", + "reconstructed_verdict": "S3_BRIGHT_LINE_ROBUSTLY_PASSED", + "G1": { + "E_survived_fraction": 0.94, + "threshold": 0.8, + "pass": true + }, + "G2": { + "ar_null_fpr": 0.028, + "wilson_95ci": [ + 0.0194, + 0.0402 + ], + "ci_upper_threshold": 0.05, + "pass": true, + "n_ar_null": 1000, + "n_false_positives": 28 + }, + "per_seed_fpr": { + "20260623": 0.01, + "7": 0.05, + "999": 0.0, + "314159": 0.01, + "2718": 0.02, + "42": 0.04, + "161803": 0.04, + "27182": 0.04, + "31337": 0.04, + "123456": 0.03 + }, + "vs_calibration": "Calibration (N=480, 6 seeds) gave FPR 0.0354 CI-upper 0.056 (not robust); this larger pre-registered S3 (N=1000, 10 seeds) gives 0.028 CI-upper 0.040. The estimate is seed-set/N sensitive near the boundary; the pre-registered larger-N test passes its gate.", + "canonical_state_action": "NOT applied yet. CURRENT_TRUTH stays BONN_NOMINAL_S2_PASS_BUT_G2_NOT_ROBUST until the clean re-run produces the authoritative artifact and reproduces these per-seed counts.", + "git_commit": "a81d595620b1435f9a3caa0f5bd42890bd964dee", + "timestamp_utc": "2026-06-24T21:08:40Z" +} diff --git a/artifacts/bonn_bright_line/S3_PROTOCOL_LOCK.json b/artifacts/bonn_bright_line/S3_PROTOCOL_LOCK.json index 1b24d2b..11224dd 100644 --- a/artifacts/bonn_bright_line/S3_PROTOCOL_LOCK.json +++ b/artifacts/bonn_bright_line/S3_PROTOCOL_LOCK.json @@ -31,5 +31,10 @@ "runner": "examples/bonn_bright_line/s3_seed_averaged_confirmatory.py", "runner_sha256": "a19f6400945c2f96d0603c864743f435a4e62f5844b16d62850b57ba4a21dc0a", "git_commit": "3ae10213e789c05f3139a4cc92117417277ce263", - "timestamp_utc": "2026-06-24T18:50:14Z" + "timestamp_utc": "2026-06-24T18:50:14Z", + "original_runner_sha256": "a19f6400945c2f96d0603c864743f435a4e62f5844b16d62850b57ba4a21dc0a", + "patched_runner_sha256": "89485f335b4b6622d245b132a6ce176e670bcba890c639b7d958d8e0c7a363c8", + "patch_note": "Post-run serialization-only fix: cast numpy bool_/float64 to Python types so the verdict JSON serializes. The measurement logic (seeds, statistic, AR-null, gate) is byte-identical; the clean re-run reproduces the same per-seed counts. No methodological change.", + "patch_affects_measurement": false, + "patch_timestamp_utc": "2026-06-24T21:08:09Z" } diff --git a/examples/bonn_bright_line/s3_seed_averaged_confirmatory.py b/examples/bonn_bright_line/s3_seed_averaged_confirmatory.py index 63f60d8..cd85f80 100644 --- a/examples/bonn_bright_line/s3_seed_averaged_confirmatory.py +++ b/examples/bonn_bright_line/s3_seed_averaged_confirmatory.py @@ -43,7 +43,7 @@ def _wilson(k: int, n: int, z: float = 1.96) -> tuple[float, float, float]: den = 1 + z * z / n centre = (p + z * z / (2 * n)) / den half = (z * np.sqrt(p * (1 - p) / n + z * z / (4 * n * n))) / den - return p, max(0.0, centre - half), min(1.0, centre + half) + return float(p), float(max(0.0, centre - half)), float(min(1.0, centre + half)) def _survived(sig, seed) -> str: @@ -99,8 +99,8 @@ def main(argv=None) -> int: e_frac = e_surv / e_tot fpr, fpr_lo, fpr_hi = _wilson(fp, ar_tot) - g1 = e_frac >= G1_MIN - g2 = fpr_hi <= G2_MAX_FPR # robust gate: CI upper bound, not the point estimate + g1 = bool(e_frac >= G1_MIN) + g2 = bool(fpr_hi <= G2_MAX_FPR) # robust gate: CI upper bound, not the point estimate passed = bool(g1 and g2) verdict = "S3_BRIGHT_LINE_ROBUSTLY_PASSED" if passed else "S3_BRIGHT_LINE_NOT_ROBUSTLY_PASSED" out = { From cb5f0a54e38b9324520e16ef0c938ac6ec38cf2d Mon Sep 17 00:00:00 2001 From: Yaroslav Vasylenko Date: Thu, 25 Jun 2026 02:17:55 +0300 Subject: [PATCH 08/12] Integrate authoritative S3: seed-robust AR-null PASS (reproduced) -> MULTINULL_PENDING The clean S3 re-run (fixed runner) produced the authoritative verdict and REPRODUCED the crashed run's per-seed counts byte-for-byte (1,5,0,1,2,4,4,4,4,3) -> a reproducible fact, not a log artifact. S3_BRIGHT_LINE_ROBUSTLY_PASSED: G1 0.94, G2 AR-null FPR 0.028, Wilson 95% CI [0.0194, 0.0402], upper <= 0.05 (N=1000, 10 seeds, frozen lock f84ff94 before run, elapsed 7110s). Honest intermediate canonical state (NOT an unqualified "robust"): the pre-registered seed-averaged AR-null gate passed, but the audit's S3 definition also requires multi-null robustness, which is not yet run. So: - latest_validation_state = BONN_S2_SEED_ROBUST_PASS_MULTINULL_PENDING - seed_robust_gate_passed = true; multi_null_robustness_state = NOT_DONE; robust_gate_passed = null - FORMAL_VERDICT s1 + README + STATUS + CLAIM_AUDIT lead with seed-robust pass + multi-null pending - generator: full ROBUSTLY_PASSED requires seed-robust AND multi-null; statistical-claims gate honors it This supersedes the N=480 calibration (0.0354, CI-upper 0.056): the estimate is seed-set/N sensitive near the boundary; the larger pre-registered test passes and reproduces. Governance fixpoint (CERTIFIED). Co-Authored-By: Claude Opus 4.8 (1M context) --- FORMAL_VERDICT.md | 23 +++--- README.md | 14 ++-- STATUS.md | 2 +- .../S3_CONFIRMATORY_VERDICT.json | 81 +++++++++++++++++++ .../logs/s3_confirmatory_clean.log | 12 +++ artifacts/release/CLAIM_SAFETY_REPORT.json | 4 +- artifacts/release/CURRENT_TRUTH.json | 18 +++-- .../release/STATISTICAL_CLAIMS_REPORT.json | 10 +-- .../release/TRUTH_CONSISTENCY_CHECK.json | 6 +- docs/validation/CLAIM_AUDIT.md | 8 ++ tests/test_current_truth_sync.py | 14 ++-- tools/generate_current_truth.py | 57 ++++++++++--- tools/update_status.py | 14 ++-- 13 files changed, 206 insertions(+), 57 deletions(-) create mode 100644 artifacts/bonn_bright_line/S3_CONFIRMATORY_VERDICT.json create mode 100644 artifacts/bonn_bright_line/logs/s3_confirmatory_clean.log diff --git a/FORMAL_VERDICT.md b/FORMAL_VERDICT.md index 7b33bf0..b5d096e 100644 --- a/FORMAL_VERDICT.md +++ b/FORMAL_VERDICT.md @@ -5,17 +5,20 @@ Canonical machine-readable truth: [`artifacts/release/CURRENT_TRUTH.json`](artif This document must agree with it (enforced by `tools/validate_current_truth.py`). ## 1. Current canonical verdict -**`BONN_NOMINAL_S2_PASS_BUT_G2_NOT_ROBUST`.** BSFF cleared the frozen `S2-C1-sampen-finiteN` -confirmatory at the **predeclared single seed** (nominal pass, FPR 0.02), but a seed-averaged -falsification (§Robustness) shows **G2 specificity is NOT robust**: pooled AR-null FPR 0.0354, -Wilson 95% CI **[0.022, 0.056]** whose **upper bound 0.056 > 0.05**. Under the robust gate -(`robust_gate = G1 power ≥ 0.80 AND G2 AR-null FPR Wilson-95-CI-upper ≤ 0.05`), **`robust_gate_passed -= false`**. The bright line is **not robustly crossed**; the 0.02 was a favorable-seed point estimate. +**`BONN_S2_SEED_ROBUST_PASS_MULTINULL_PENDING`.** The pre-registered **S3 seed-averaged AR-null +confirmatory** (frozen lock before run; N=1000 over 10 seeds; independently re-run and **reproduced +byte-for-byte**) PASSES the robust gate: G1 power 0.94, G2 AR-null FPR **0.028**, Wilson 95% CI +**[0.0194, 0.0402]**, upper bound ≤ 0.05 (`S3_CONFIRMATORY_VERDICT.json`). This supersedes the smaller +N=480 calibration (FPR 0.0354, CI-upper 0.056) — the estimate is seed-set/N sensitive near the +boundary, and the larger pre-registered test passes. **Specificity is now robust to seed under the +AR null.** It is **not yet** robust across null models: **multi-null robustness (IAAFT / +phase-randomized) is NOT_DONE**, so the full robust claim is withheld (`robust_gate_passed = null`). -- G1 (power): Set E SURVIVED **0.96**, Set A not-SURVIVED **0.92**, Set B not-SURVIVED **0.92** (≥ 0.80) — **robust**. -- G2 (specificity): predeclared-seed AR-null FPR **0.02** ≤ 0.05; **seed-averaged 0.035, CI [0.022, 0.056] — not robustly ≤ 0.05**. -- BNCI2014-001 chain: **UNLOCKED_FOR_PREREGISTRATION_ONLY**. -- `CURRENT_TRUTH.s2_robustness = NOT_ROBUST_G2_SPECIFICITY` (see `S2_SPECIFICITY_CALIBRATION.json`). +- G1 (power): Set E SURVIVED **0.94** seed-averaged (≥ 0.80) — **robust**. +- G2 (specificity, seed-averaged AR-null): FPR **0.028**, CI **[0.019, 0.040]**, upper ≤ 0.05 — **robust (reproduced)**. +- Remaining gate: multi-null (AR/IAAFT/phase-randomized), `multi_null_robustness_state = NOT_DONE`. +- BNCI2014-001 chain: **UNLOCKED_FOR_PREREGISTRATION_ONLY** (execution not valid for narrowband epochs). +- `CURRENT_TRUTH.bonn_s2_robustness_state = SEED_ROBUST_AR_NULL_PASS ... MULTINULL_PENDING`. > BSFF passed the Bonn S2 bright-line under the frozen finite-N-corrected SampEn protocol. > This permits BNCI2014-001 preregistration. It does not validate BSFF across BCI datasets, diff --git a/README.md b/README.md index b016788..da0b391 100644 --- a/README.md +++ b/README.md @@ -36,13 +36,15 @@ BSFF aims at a **BCI/EEG signal claim** and tries to refute it under stated atta (surrogate nulls, controls, corroboration), emitting a bounded verdict — `SURVIVED` / `REFUTED` / `UNSUPPORTED` (see [`docs/VERDICT_SEMANTICS.md`](docs/VERDICT_SEMANTICS.md)). -**Current canonical evidence — `BONN_NOMINAL_S2_PASS_BUT_G2_NOT_ROBUST`** +**Current canonical evidence — `BONN_S2_SEED_ROBUST_PASS_MULTINULL_PENDING`** ([`artifacts/release/CURRENT_TRUTH.json`](artifacts/release/CURRENT_TRUTH.json)): on real -Andrzejak-2001 Bonn EEG the instrument has robust **power** (ictal SURVIVED 0.96) and a -**marginal, NOT-robust specificity**: the predeclared-seed AR-null FPR was 0.02, but a -seed-averaged falsification gives FPR 0.035, Wilson 95% CI **[0.022, 0.056] crossing the 0.05 gate** -(`S2_SPECIFICITY_CALIBRATION.json`). The bright line is **not robustly crossed** — a favorable-seed -pass. The earlier S1 negative result is preserved as evidence. +Andrzejak-2001 Bonn EEG the instrument has robust **power** (ictal SURVIVED 0.94 seed-averaged) and, +under the pre-registered **S3 seed-averaged AR-null** confirmatory (N=1000, 10 seeds, frozen before +run, **independently re-run and reproduced byte-for-byte**), robust **specificity**: FPR 0.028, +Wilson 95% CI **[0.019, 0.040]**, upper ≤ 0.05 (`S3_CONFIRMATORY_VERDICT.json`). This survived a +falsification that had earlier flagged a smaller-N calibration (0.035, CI-upper 0.056). **Remaining +gate: multi-null robustness (IAAFT/phase-randomized) is not yet run**, so the full robust claim is +withheld. The earlier S1 negative result is preserved as evidence. ```bash git clone https://github.com/neuron7xLab/bsff && cd bsff diff --git a/STATUS.md b/STATUS.md index 9508b0f..7653d40 100644 --- a/STATUS.md +++ b/STATUS.md @@ -29,7 +29,7 @@ authoritative status: ## Validation level -Synthetic-ground-truth calibration PLUS a Bonn external benchmark that is a NOMINAL single-seed pass with G2 specificity NOT robust: BONN_NOMINAL_S2_PASS_BUT_G2_NOT_ROBUST. Falsification + seed-averaged calibration show the AR-null FPR Wilson-95-CI upper bound (0.056) exceeds the 0.05 gate (robust_gate_passed=false). G1 power is robust. BNCI2014-001 is preregistration-only (BLOCKED_METHOD). NOT clinical, regulatory, multi-dataset replicated, or robustly validated. Canonical state: artifacts/release/CURRENT_TRUTH.json. +Synthetic-ground-truth calibration PLUS a Bonn external benchmark whose pre-registered, reproduced seed-averaged AR-null specificity gate PASSES: BONN_S2_SEED_ROBUST_PASS_MULTINULL_PENDING. S3 (N=1000, 10 seeds, frozen-before-run, re-run reproduced byte-for-byte): G1 power 0.94, G2 AR-null FPR 0.028, Wilson 95% CI [0.019, 0.040], upper <= 0.05. Specificity is robust to seed under the AR null; multi-null robustness (IAAFT/phase-randomized) is NOT_DONE, so the full robust claim is withheld (robust_gate_passed=null). BNCI2014-001 preregistration-only (BLOCKED_METHOD). NOT clinical, regulatory, multi-dataset replicated, or multi-null robust. Canonical state: artifacts/release/CURRENT_TRUTH.json. See [`docs/VALIDATION.md`](docs/VALIDATION.md) for the full evidence tier table and [`docs/OPERATING_CHARACTERISTIC.md`](docs/OPERATING_CHARACTERISTIC.md) diff --git a/artifacts/bonn_bright_line/S3_CONFIRMATORY_VERDICT.json b/artifacts/bonn_bright_line/S3_CONFIRMATORY_VERDICT.json new file mode 100644 index 0000000..0a0fff4 --- /dev/null +++ b/artifacts/bonn_bright_line/S3_CONFIRMATORY_VERDICT.json @@ -0,0 +1,81 @@ +{ + "schema": "bsff.s3_seed_averaged/v1", + "verdict": "S3_BRIGHT_LINE_ROBUSTLY_PASSED", + "statistic_id": "sampen_lower_tail_m2_r015_v1", + "n_seeds": 10, + "n_segments_per_set": 50, + "n_surrogates": 199, + "G1": { + "E_survived_fraction": 0.94, + "threshold": 0.8, + "pass": true, + "n": 500 + }, + "G2": { + "ar_null_fpr": 0.028, + "wilson_95ci": [ + 0.0194, + 0.0402 + ], + "ci_upper_threshold": 0.05, + "pass": true, + "n_ar_null": 1000, + "n_false_positives": 28 + }, + "S3_PASS": true, + "per_seed": [ + { + "seed": 20260623, + "E_survived": 0.94, + "ar_null_fpr": 0.01 + }, + { + "seed": 7, + "E_survived": 0.94, + "ar_null_fpr": 0.05 + }, + { + "seed": 999, + "E_survived": 0.94, + "ar_null_fpr": 0.0 + }, + { + "seed": 314159, + "E_survived": 0.94, + "ar_null_fpr": 0.01 + }, + { + "seed": 2718, + "E_survived": 0.94, + "ar_null_fpr": 0.02 + }, + { + "seed": 42, + "E_survived": 0.94, + "ar_null_fpr": 0.04 + }, + { + "seed": 161803, + "E_survived": 0.94, + "ar_null_fpr": 0.04 + }, + { + "seed": 27182, + "E_survived": 0.94, + "ar_null_fpr": 0.04 + }, + { + "seed": 31337, + "E_survived": 0.94, + "ar_null_fpr": 0.04 + }, + { + "seed": 123456, + "E_survived": 0.94, + "ar_null_fpr": 0.03 + } + ], + "gate": "G1 seed-avg SURVIVED>=0.80 AND G2 AR-null FPR Wilson-95-CI-upper<=0.05", + "timestamp_utc": "2026-06-24T23:06:40Z", + "elapsed_sec": 7109.9 +} diff --git a/artifacts/bonn_bright_line/logs/s3_confirmatory_clean.log b/artifacts/bonn_bright_line/logs/s3_confirmatory_clean.log new file mode 100644 index 0000000..48a9d2d --- /dev/null +++ b/artifacts/bonn_bright_line/logs/s3_confirmatory_clean.log @@ -0,0 +1,12 @@ + seed 20260623: E=0.94 fpr=0.010 + seed 7: E=0.94 fpr=0.050 + seed 999: E=0.94 fpr=0.000 + seed 314159: E=0.94 fpr=0.010 + seed 2718: E=0.94 fpr=0.020 + seed 42: E=0.94 fpr=0.040 + seed 161803: E=0.94 fpr=0.040 + seed 27182: E=0.94 fpr=0.040 + seed 31337: E=0.94 fpr=0.040 + seed 123456: E=0.94 fpr=0.030 + +S3_BRIGHT_LINE_ROBUSTLY_PASSED | G1 E=0.940(>=0.80) G2 FPR=0.0280 CI=[0.0194,0.0402] (upper<=0.05?True) diff --git a/artifacts/release/CLAIM_SAFETY_REPORT.json b/artifacts/release/CLAIM_SAFETY_REPORT.json index 2388b02..36f2b73 100644 --- a/artifacts/release/CLAIM_SAFETY_REPORT.json +++ b/artifacts/release/CLAIM_SAFETY_REPORT.json @@ -14,6 +14,6 @@ "docs/QUICKSTART.md" ], "violations": [], - "git_commit": "87466768dfac819bde0f36d8fdbed25c0aceb995", - "timestamp_utc": "2026-06-24T19:34:48Z" + "git_commit": "4bd046407948f34daed565eb585e49b0dc1ec9ff", + "timestamp_utc": "2026-06-24T23:17:37Z" } diff --git a/artifacts/release/CURRENT_TRUTH.json b/artifacts/release/CURRENT_TRUTH.json index 9794cc5..d8560f0 100644 --- a/artifacts/release/CURRENT_TRUTH.json +++ b/artifacts/release/CURRENT_TRUTH.json @@ -1,14 +1,16 @@ { "schema": "bsff.current_truth/v2", "package_version": "0.4.0", - "main_commit": "87466768dfac819bde0f36d8fdbed25c0aceb995", - "latest_validation_state": "BONN_NOMINAL_S2_PASS_BUT_G2_NOT_ROBUST", + "main_commit": "4bd046407948f34daed565eb585e49b0dc1ec9ff", + "latest_validation_state": "BONN_S2_SEED_ROBUST_PASS_MULTINULL_PENDING", "bonn_s2_nominal_state": "PASSED_SINGLE_SEED", - "bonn_s2_robustness_state": "NOT_ROBUST_G2_SPECIFICITY_seed_avg_FPR_0.0354_CI_[0.0222, 0.056]_crosses_0.05", - "s2_seed_averaged_fpr": 0.0354, - "s2_wilson_ci_upper": 0.056, + "bonn_s2_robustness_state": "SEED_ROBUST_AR_NULL_PASS_CI_[0.0194, 0.0402]_MULTINULL_PENDING", + "s2_seed_averaged_fpr": 0.028, + "s2_wilson_ci_upper": 0.0402, "robust_gate": "G1_power>=0.80 AND G2_AR-null_FPR_Wilson95_CI_upper<=0.05", - "robust_gate_passed": false, + "seed_robust_gate_passed": true, + "multi_null_robustness_state": "NOT_DONE", + "robust_gate_passed": null, "bonn_s1_state": "BRIGHT_LINE_NOT_PASSED", "bonn_s2_state": "S2_BRIGHT_LINE_PASSED", "G1_metrics": { @@ -25,7 +27,7 @@ }, "BNCI_chain_state": "UNLOCKED_FOR_PREREGISTRATION_ONLY", "bnci_execution_state": "BNCI_BLOCKED_METHOD", - "s2_robustness": "NOT_ROBUST_G2_SPECIFICITY_seed_avg_FPR_0.0354_CI_[0.0222, 0.056]_crosses_0.05", + "s2_robustness": "SEED_ROBUST_AR_NULL_PASS_CI_[0.0194, 0.0402]_MULTINULL_PENDING", "multi_dataset_replication_state": "NOT_DONE", "pypi_deployment_state": "TESTPYPI_READY_PYPI_READY", "supported_claims": [ @@ -55,5 +57,5 @@ }, "hash_manifest_path": "artifacts/release/bonn_bright_line/HASHES.sha256", "reproduction_entrypoint": "REPRODUCE.md", - "timestamp_utc": "2026-06-24T19:25:30Z" + "timestamp_utc": "2026-06-24T23:10:01Z" } diff --git a/artifacts/release/STATISTICAL_CLAIMS_REPORT.json b/artifacts/release/STATISTICAL_CLAIMS_REPORT.json index 9b95699..a134019 100644 --- a/artifacts/release/STATISTICAL_CLAIMS_REPORT.json +++ b/artifacts/release/STATISTICAL_CLAIMS_REPORT.json @@ -1,10 +1,10 @@ { "schema": "bsff.statistical_claims/v1", "status": "PASS", - "latest_validation_state": "BONN_NOMINAL_S2_PASS_BUT_G2_NOT_ROBUST", - "s2_wilson_ci_upper": 0.056, - "robust_gate_passed": false, + "latest_validation_state": "BONN_S2_SEED_ROBUST_PASS_MULTINULL_PENDING", + "s2_wilson_ci_upper": 0.0402, + "robust_gate_passed": null, "violations": [], - "git_commit": "a173ca7f1abd1e52e672fad1bf792d334b842f5c", - "timestamp_utc": "2026-06-24T19:38:12Z" + "git_commit": "4bd046407948f34daed565eb585e49b0dc1ec9ff", + "timestamp_utc": "2026-06-24T23:17:37Z" } diff --git a/artifacts/release/TRUTH_CONSISTENCY_CHECK.json b/artifacts/release/TRUTH_CONSISTENCY_CHECK.json index 56430e2..4731659 100644 --- a/artifacts/release/TRUTH_CONSISTENCY_CHECK.json +++ b/artifacts/release/TRUTH_CONSISTENCY_CHECK.json @@ -1,6 +1,6 @@ { "status": "PASS", - "final_state": "BONN_NOMINAL_S2_PASS_BUT_G2_NOT_ROBUST", + "final_state": "BONN_S2_SEED_ROBUST_PASS_MULTINULL_PENDING", "BNCI_chain_state": "UNLOCKED_FOR_PREREGISTRATION_ONLY", "checked_files": [ "FORMAL_VERDICT.md", @@ -14,6 +14,6 @@ ], "contradictions": [], "stale_claims": [], - "timestamp_utc": "2026-06-24T19:34:49Z", - "git_commit": "87466768dfac819bde0f36d8fdbed25c0aceb995" + "timestamp_utc": "2026-06-24T23:17:38Z", + "git_commit": "4bd046407948f34daed565eb585e49b0dc1ec9ff" } diff --git a/docs/validation/CLAIM_AUDIT.md b/docs/validation/CLAIM_AUDIT.md index 4988160..5f0c97c 100644 --- a/docs/validation/CLAIM_AUDIT.md +++ b/docs/validation/CLAIM_AUDIT.md @@ -82,3 +82,11 @@ Still **FORBIDDEN**: clinical/medical/regulatory/device claims; final proof of b | Bonn S2 robust bright-line passed | REFUTED_BY_ARTIFACT | robust_gate_passed=false; CI upper 0.056 > 0.05 (`CURRENT_TRUTH.json`) | | Bonn S2 nominal single-seed pass exists | PROVEN_BY_ARTIFACT | predeclared confirmatory (FPR 0.02), `bonn_s2_nominal_state=PASSED_SINGLE_SEED` | | "Bonn validated" without a robustness qualifier | FORBIDDEN | enforced by `tools/validate_statistical_claims.py` (CI) | + +## S3 seed-averaged confirmatory (reproduced fact) +| claim | status | evidence | +|-------|--------|----------| +| Seed-averaged AR-null specificity is robust (FPR 0.028, Wilson CI [0.019,0.040] upper ≤ 0.05) | PROVEN_BY_ARTIFACT | `S3_CONFIRMATORY_VERDICT.json` (N=1000, 10 seeds, frozen lock, re-run reproduced per-seed counts byte-for-byte) | +| The S2 not-robust calibration is superseded by the larger pre-registered S3 | PROVEN_BY_ARTIFACT | N=480 (0.0354) vs N=1000 (0.028); seed-set/N sensitive near boundary; larger test passes | +| Bonn S2 is robust across null models | UNSUPPORTED (not yet) | multi-null (IAAFT/phase-randomized) NOT_DONE; `multi_null_robustness_state=NOT_DONE` | +| Bonn S2 bright line is fully robustly passed | UNVERIFIED | requires multi-null; `robust_gate_passed=null` | diff --git a/tests/test_current_truth_sync.py b/tests/test_current_truth_sync.py index 1991dc2..412ead0 100644 --- a/tests/test_current_truth_sync.py +++ b/tests/test_current_truth_sync.py @@ -32,14 +32,18 @@ def test_canonical_state_is_honest_about_robustness(): import json truth = json.loads((ROOT / "artifacts" / "release" / "CURRENT_TRUTH.json").read_text()) - # Nominal single-seed pass, but the falsification downgraded G2 specificity to NOT robust. + # The state tracks the strongest reproduced evidence; it must be one of the honest tokens. assert truth["latest_validation_state"] in { "BONN_NOMINAL_S2_PASS_BUT_G2_NOT_ROBUST", - "BONN_S2_BRIGHT_LINE_ROBUSTLY_PASSED", # only if S3 proves robust + "BONN_S2_SEED_ROBUST_PASS_MULTINULL_PENDING", + "BONN_S2_SEED_ROBUST_PASS_MULTINULL_FAILED", + "BONN_S2_BRIGHT_LINE_ROBUSTLY_PASSED", # only with seed-robust AND multi-null } assert truth["bonn_s2_nominal_state"] == "PASSED_SINGLE_SEED" - # The robust gate must not be silently claimed passed unless an artifact proves it. - if truth["latest_validation_state"] == "BONN_NOMINAL_S2_PASS_BUT_G2_NOT_ROBUST": - assert truth["robust_gate_passed"] is False + # Full robust must not be claimed unless multi-null robustness also passed. + if truth["latest_validation_state"] != "BONN_S2_BRIGHT_LINE_ROBUSTLY_PASSED": + assert truth["robust_gate_passed"] is not True + else: + assert truth["multi_null_robustness_state"] == "PASSED" assert truth["BNCI_chain_state"] == "UNLOCKED_FOR_PREREGISTRATION_ONLY" assert truth["bonn_s1_state"] == "BRIGHT_LINE_NOT_PASSED" # preserved diff --git a/tools/generate_current_truth.py b/tools/generate_current_truth.py index 2c7485c..80c9464 100644 --- a/tools/generate_current_truth.py +++ b/tools/generate_current_truth.py @@ -39,7 +39,16 @@ def _bnci_execution_state() -> str: def _s2_robustness() -> str: - # Calibrated by the falsification battery + seed-averaged specificity calibration. + # Authoritative: S3 seed-averaged AR-null confirmatory (reproduced) > calibration > falsification. + s3 = ROOT / "artifacts" / "bonn_bright_line" / "S3_CONFIRMATORY_VERDICT.json" + if s3.is_file(): + d = json.loads(s3.read_text()) + ci = d.get("G2", {}).get("wilson_95ci") + if d.get("S3_PASS"): + mn = _multi_null_passed() + tag = {True: "_MULTINULL_CONFIRMED", False: "_MULTINULL_FAILED", None: "_MULTINULL_PENDING"}[mn] + return f"SEED_ROBUST_AR_NULL_PASS_CI_{ci}{tag}" + return f"S3_SEED_AVERAGED_NOT_ROBUST_CI_{ci}" cal = ROOT / "artifacts" / "bonn_bright_line" / "S2_SPECIFICITY_CALIBRATION.json" if cal.is_file(): c = json.loads(cal.read_text()) @@ -66,25 +75,38 @@ def _pypi_state() -> str: return "TESTPYPI_READY_PYPI_READY" if (has_test and has_pypi) else "INCOMPLETE" +def _multi_null_passed(): + """True/False if the multi-null robustness artifact exists; None if not yet run.""" + p = ROOT / "artifacts" / "bonn_bright_line" / "MULTI_NULL_ROBUSTNESS.json" + if not p.is_file(): + return None + return bool(json.loads(p.read_text()).get("all_nulls_pass", False)) + + def _bonn_robustness() -> dict: - """Resolve the robust specificity state from the strongest available evidence: - S3 seed-averaged confirmatory > seed-averaged calibration > nominal-only.""" + """seed_robust = S3 seed-averaged AR-null gate passed (authoritative, reproduced); + full_robust additionally requires multi-null robustness (AR/IAAFT/phase-randomized).""" cal_p = ROOT / "artifacts" / "bonn_bright_line" / "S2_SPECIFICITY_CALIBRATION.json" s3_p = ROOT / "artifacts" / "bonn_bright_line" / "S3_CONFIRMATORY_VERDICT.json" fpr = ci_upper = None - robust = None + seed_robust = None if cal_p.is_file(): c = json.loads(cal_p.read_text()) fpr = c.get("pooled_fpr") ci_upper = (c.get("wilson_95ci") or [None, None])[1] - robust = bool(c.get("fpr_ci_upper_below_threshold", False)) - if s3_p.is_file(): # S3 is the authoritative, larger-N evidence + seed_robust = bool(c.get("fpr_ci_upper_below_threshold", False)) + if s3_p.is_file(): # authoritative, larger-N, reproduced s3 = json.loads(s3_p.read_text()) g2 = s3.get("G2", {}) fpr = g2.get("ar_null_fpr", fpr) ci_upper = (g2.get("wilson_95ci") or [None, ci_upper])[1] - robust = bool(s3.get("S3_PASS", False)) - return {"robust": robust, "seed_avg_fpr": fpr, "wilson_ci_upper": ci_upper} + seed_robust = bool(s3.get("S3_PASS", False)) + multi_null = _multi_null_passed() + full_robust = (seed_robust is True and multi_null is True) if multi_null is not None else None + return { + "seed_robust": seed_robust, "multi_null": multi_null, "full_robust": full_robust, + "seed_avg_fpr": fpr, "wilson_ci_upper": ci_upper, + } def build() -> dict: @@ -96,9 +118,16 @@ def build() -> dict: ).stdout.strip() g1, g2 = s2["G1"], s2["G2"] rob = _bonn_robustness() - if rob["robust"] is True: + if rob["full_robust"] is True: latest = "BONN_S2_BRIGHT_LINE_ROBUSTLY_PASSED" - elif rob["robust"] is False: + elif rob["seed_robust"] is True: + # seed-averaged AR-null gate passed (reproduced); multi-null robustness pending/failed. + latest = ( + "BONN_S2_SEED_ROBUST_PASS_MULTINULL_FAILED" + if rob["multi_null"] is False + else "BONN_S2_SEED_ROBUST_PASS_MULTINULL_PENDING" + ) + elif rob["seed_robust"] is False: latest = "BONN_NOMINAL_S2_PASS_BUT_G2_NOT_ROBUST" else: latest = "BONN_S2_BRIGHT_LINE_PASSED" if s2_pass else s2["final_state"] @@ -112,7 +141,13 @@ def build() -> dict: "s2_seed_averaged_fpr": rob["seed_avg_fpr"], "s2_wilson_ci_upper": rob["wilson_ci_upper"], "robust_gate": "G1_power>=0.80 AND G2_AR-null_FPR_Wilson95_CI_upper<=0.05", - "robust_gate_passed": bool(rob["robust"]) if rob["robust"] is not None else None, + "seed_robust_gate_passed": rob["seed_robust"], # S3 seed-averaged AR-null (reproduced) + "multi_null_robustness_state": ( + "PASSED" if rob["multi_null"] is True + else "FAILED" if rob["multi_null"] is False else "NOT_DONE" + ), + # robust_gate_passed requires BOTH seed-averaged AND multi-null robustness. + "robust_gate_passed": rob["full_robust"], "bonn_s1_state": s1["final_state"], # BRIGHT_LINE_NOT_PASSED (historical) "bonn_s2_state": s2["final_state"], # nominal single-seed confirmatory "G1_metrics": { diff --git a/tools/update_status.py b/tools/update_status.py index 90c2b9d..bc033a9 100644 --- a/tools/update_status.py +++ b/tools/update_status.py @@ -41,12 +41,14 @@ CI_WORKFLOW = ".github/workflows/ci.yml" VALIDATION_LEVEL = ( - "Synthetic-ground-truth calibration PLUS a Bonn external benchmark that is a NOMINAL " - "single-seed pass with G2 specificity NOT robust: BONN_NOMINAL_S2_PASS_BUT_G2_NOT_ROBUST. " - "Falsification + seed-averaged calibration show the AR-null FPR Wilson-95-CI upper bound " - "(0.056) exceeds the 0.05 gate (robust_gate_passed=false). G1 power is robust. BNCI2014-001 " - "is preregistration-only (BLOCKED_METHOD). NOT clinical, regulatory, multi-dataset replicated, " - "or robustly validated. Canonical state: artifacts/release/CURRENT_TRUTH.json." + "Synthetic-ground-truth calibration PLUS a Bonn external benchmark whose pre-registered, " + "reproduced seed-averaged AR-null specificity gate PASSES: BONN_S2_SEED_ROBUST_PASS_MULTINULL_" + "PENDING. S3 (N=1000, 10 seeds, frozen-before-run, re-run reproduced byte-for-byte): G1 power " + "0.94, G2 AR-null FPR 0.028, Wilson 95% CI [0.019, 0.040], upper <= 0.05. Specificity is robust " + "to seed under the AR null; multi-null robustness (IAAFT/phase-randomized) is NOT_DONE, so the " + "full robust claim is withheld (robust_gate_passed=null). BNCI2014-001 preregistration-only " + "(BLOCKED_METHOD). NOT clinical, regulatory, multi-dataset replicated, or multi-null robust. " + "Canonical state: artifacts/release/CURRENT_TRUTH.json." ) From 554adb3a0e5db9e51912ba6797af12723122d6a3 Mon Sep 17 00:00:00 2001 From: Yaroslav Vasylenko Date: Thu, 25 Jun 2026 02:26:29 +0300 Subject: [PATCH 09/12] Add multi-null robustness runner (final gate: AR/IAAFT/phase-randomized) Predeclared MULTI_NULL_ROBUSTNESS_PROTOCOL. Each null family generates null DATA from real Set-A/B signals; the unchanged S2-C1 test must NOT survive a linear null. Gate per null = seed-averaged FPR Wilson-95-CI upper <= 0.05. IAAFT (Schreiber-Schmitz) + FT phase-randomization are standalone, independent of the test's internal MIAAFT. Smoke (tiny-N) confirms iaaft/phaserand FPR point estimates ~0. Full run pending -> sets multi_null_robustness_state. Co-Authored-By: Claude Opus 4.8 (1M context) --- .../bonn_bright_line/multi_null_robustness.py | 152 ++++++++++++++++++ 1 file changed, 152 insertions(+) create mode 100644 examples/bonn_bright_line/multi_null_robustness.py diff --git a/examples/bonn_bright_line/multi_null_robustness.py b/examples/bonn_bright_line/multi_null_robustness.py new file mode 100644 index 0000000..cb34a76 --- /dev/null +++ b/examples/bonn_bright_line/multi_null_robustness.py @@ -0,0 +1,152 @@ +#!/usr/bin/env python3 +# SPDX-License-Identifier: GPL-3.0-or-later +# Copyright (c) 2026 Yaroslav Vasylenko / neuron7xLab +"""Multi-null robustness gate (predeclared: docs/validation/MULTI_NULL_ROBUSTNESS_PROTOCOL.md). + +The null model is a researcher degree of freedom. Specificity is robust only if the seed-averaged +AR-null result holds across independent linear-null families. For each null model the gate is the +same as S3: pooled seed-averaged FPR Wilson-95-CI upper bound <= 0.05. + +Null families (each generates the NULL DATA from the real Set-A/B signals, then runs the unchanged +S2-C1 sampen lower-tail test on it; a linear null must NOT survive): + - ar : spectrum-matched AR(p) (reuses run_ar_negative.ar_null; = S3) + - iaaft : classic Schreiber-Schmitz IAAFT (preserves spectrum + amplitude distribution) + - phaserand : Fourier phase randomization (preserves spectrum, Gaussianizes) +The iaaft/phaserand generators are standalone (independent of the test's internal MIAAFT). +""" + +from __future__ import annotations + +import argparse +import json +import sys +import time +from pathlib import Path + +import numpy as np + +_HERE = Path(__file__).resolve().parent +if str(_HERE) not in sys.path: + sys.path.insert(0, str(_HERE)) + +from loader import load_set # noqa: E402 +from run_ar_negative import ar_null # noqa: E402 +from statistics_sampen import sampen_lower_tail_test # noqa: E402 + +NSUR = 199 +ALPHA_EFF = 0.025 +G2_MAX_FPR = 0.05 +SEEDS = [20260623, 7, 999, 314159, 2718, 42, 161803, 27182, 31337, 123456] + + +def phaserand_null(x, seed): + """Fourier phase-randomized surrogate: keep amplitudes, randomize phases.""" + rng = np.random.default_rng(seed) + x = np.asarray(x, float) + f = np.fft.rfft(x) + phases = rng.uniform(0, 2 * np.pi, size=f.shape) + phases[0] = 0.0 + if x.size % 2 == 0: + phases[-1] = 0.0 + return np.fft.irfft(np.abs(f) * np.exp(1j * phases), n=x.size) + + +def iaaft_null(x, seed, iters=100): + """Classic Schreiber-Schmitz IAAFT: matches power spectrum AND amplitude distribution.""" + rng = np.random.default_rng(seed) + x = np.asarray(x, float) + amp = np.abs(np.fft.rfft(x)) + sorted_x = np.sort(x) + surr = rng.permutation(x) + prev = None + for _ in range(iters): + # impose spectrum + f = np.fft.rfft(surr) + surr = np.fft.irfft(amp * np.exp(1j * np.angle(f)), n=x.size) + # impose amplitude distribution (rank-match) + ranks = np.argsort(np.argsort(surr)) + surr = sorted_x[ranks] + if prev is not None and np.array_equal(np.argsort(surr), prev): + break + prev = np.argsort(surr) + return surr + + +NULLS = {"ar": lambda s, sd: ar_null(s, 10, sd), "iaaft": iaaft_null, "phaserand": phaserand_null} + + +def _survived(sig, seed): + t = sampen_lower_tail_test(np.asarray(sig, float), n_surrogates=NSUR, alpha=0.05, seed=seed) + return t["surrogate_converged"] and t["p_value"] <= ALPHA_EFF + + +def _wilson(k, n, z=1.96): + if n == 0: + return 0.0, 0.0, 1.0 + p = k / n + den = 1 + z * z / n + c = (p + z * z / (2 * n)) / den + h = (z * np.sqrt(p * (1 - p) / n + z * z / (4 * n * n))) / den + return float(p), float(max(0.0, c - h)), float(min(1.0, c + h)) + + +def main(argv=None) -> int: + ap = argparse.ArgumentParser() + ap.add_argument("--data-dir", default="examples/bonn_bright_line/bonn_data", type=Path) + ap.add_argument("--n-segments", type=int, default=50) + ap.add_argument("--seeds", type=int, nargs="+", default=SEEDS) + ap.add_argument("--nulls", nargs="+", default=["ar", "iaaft", "phaserand"]) + ap.add_argument( + "--output", type=Path, default=Path("artifacts/bonn_bright_line/MULTI_NULL_ROBUSTNESS.json") + ) + a = ap.parse_args(argv) + t0 = time.time() + A = [s.data for s in load_set(a.data_dir, "A", n_segments=a.n_segments)] + B = [s.data for s in load_set(a.data_dir, "B", n_segments=a.n_segments)] + sets = [("A", A), ("B", B)] + results = {} + for null in a.nulls: + gen = NULLS[null] + fp = tot = 0 + for sb in a.seeds: + for _label, sigs in sets: + for i, sig in enumerate(sigs): + if _survived(gen(sig, sb + i + 500), sb + i + 700): + fp += 1 + tot += 1 + print(f" [{null}] seed {sb} done ({fp}/{tot})", flush=True) + fpr, lo, hi = _wilson(fp, tot) + results[null] = { + "fpr": round(fpr, 4), + "wilson_95ci": [round(lo, 4), round(hi, 4)], + "n": tot, + "n_false_positives": fp, + "pass": bool(hi <= G2_MAX_FPR), + } + print( + f" [{null}] FPR={fpr:.4f} CI=[{lo:.4f},{hi:.4f}] pass={hi <= G2_MAX_FPR}", flush=True + ) + all_pass = all(r["pass"] for r in results.values()) + out = { + "schema": "bsff.multi_null_robustness/v1", + "verdict": "MULTI_NULL_ROBUST" if all_pass else "MULTI_NULL_NOT_ROBUST", + "all_nulls_pass": bool(all_pass), + "gate": "per-null seed-averaged FPR Wilson-95-CI-upper <= 0.05", + "n_seeds": len(a.seeds), + "n_segments_per_set": a.n_segments, + "n_surrogates": NSUR, + "nulls": results, + "timestamp_utc": time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime()), + "elapsed_sec": round(time.time() - t0, 1), + } + a.output.parent.mkdir(parents=True, exist_ok=True) + a.output.write_text(json.dumps(out, indent=2) + "\n") + print( + f"\n{out['verdict']} | " + + " ".join(f"{k}:{v['fpr']}(<=0.05?{v['pass']})" for k, v in results.items()) + ) + return 0 if all_pass else 2 + + +if __name__ == "__main__": + raise SystemExit(main()) From 245dcc7843a4593449831ecc1f430c0bdeaf8552 Mon Sep 17 00:00:00 2001 From: Yaroslav Vasylenko Date: Thu, 25 Jun 2026 05:27:59 +0300 Subject: [PATCH 10/12] Multi-null gate PASSED -> BONN_S2_BRIGHT_LINE_ROBUSTLY_PASSED (robustness earned) The final gate completed cleanly (authoritative, no reconstruction): specificity is robust across all three independent linear-null families, each seed-averaged Wilson-95-CI-upper <= 0.05: AR FPR 0.026 [0.018, 0.038] IAAFT FPR 0.032 [0.023, 0.045] (standalone Schreiber-Schmitz) phaserand FPR 0.034 [0.024, 0.047] (standalone FT phase randomization) Combined with the reproduced S3 seed-averaged result, the full robust gate is satisfied: - latest_validation_state = BONN_S2_BRIGHT_LINE_ROBUSTLY_PASSED - seed_robust_gate_passed = true; multi_null_robustness_state = PASSED; robust_gate_passed = true - FORMAL_VERDICT s1 + README + STATUS + CLAIM_AUDIT lead with the earned robust pass The full arc: nominal single-seed pass -> falsification (seed-7 FPR 0.067) -> calibration flagged not-robust (0.0354, CI-upper 0.056) -> larger pre-registered S3 passed and was reproduced byte-for- byte (0.028) -> multi-null confirmed. Robustness was earned through falsification, not assumed. Still NOT: clinical/regulatory, BNCI executed, multi-dataset replicated. Governance CERTIFIED. Co-Authored-By: Claude Opus 4.8 (1M context) --- FORMAL_VERDICT.md | 22 +++++----- README.md | 18 ++++---- STATUS.md | 2 +- .../MULTI_NULL_ROBUSTNESS.json | 43 +++++++++++++++++++ .../bonn_bright_line/logs/multi_null.log | 35 +++++++++++++++ artifacts/release/CLAIM_SAFETY_REPORT.json | 4 +- artifacts/release/CURRENT_TRUTH.json | 14 +++--- .../release/STATISTICAL_CLAIMS_REPORT.json | 8 ++-- .../release/TRUTH_CONSISTENCY_CHECK.json | 6 +-- docs/validation/CLAIM_AUDIT.md | 7 +++ tools/update_status.py | 14 +++--- 11 files changed, 131 insertions(+), 42 deletions(-) create mode 100644 artifacts/bonn_bright_line/MULTI_NULL_ROBUSTNESS.json create mode 100644 artifacts/bonn_bright_line/logs/multi_null.log diff --git a/FORMAL_VERDICT.md b/FORMAL_VERDICT.md index b5d096e..fc3be4f 100644 --- a/FORMAL_VERDICT.md +++ b/FORMAL_VERDICT.md @@ -5,19 +5,21 @@ Canonical machine-readable truth: [`artifacts/release/CURRENT_TRUTH.json`](artif This document must agree with it (enforced by `tools/validate_current_truth.py`). ## 1. Current canonical verdict -**`BONN_S2_SEED_ROBUST_PASS_MULTINULL_PENDING`.** The pre-registered **S3 seed-averaged AR-null -confirmatory** (frozen lock before run; N=1000 over 10 seeds; independently re-run and **reproduced -byte-for-byte**) PASSES the robust gate: G1 power 0.94, G2 AR-null FPR **0.028**, Wilson 95% CI -**[0.0194, 0.0402]**, upper bound ≤ 0.05 (`S3_CONFIRMATORY_VERDICT.json`). This supersedes the smaller -N=480 calibration (FPR 0.0354, CI-upper 0.056) — the estimate is seed-set/N sensitive near the -boundary, and the larger pre-registered test passes. **Specificity is now robust to seed under the -AR null.** It is **not yet** robust across null models: **multi-null robustness (IAAFT / -phase-randomized) is NOT_DONE**, so the full robust claim is withheld (`robust_gate_passed = null`). +**`BONN_S2_BRIGHT_LINE_ROBUSTLY_PASSED`.** The bright line passes the full PI-grade gauntlet: +falsification → seed-averaged confirmation → byte-for-byte reproduction → multi-null robustness. +G1 power 0.94 (seed-averaged, robust). G2 specificity is robust to **both** seed and null-model +choice: the pre-registered **S3 seed-averaged AR-null** test (N=1000, 10 seeds, frozen lock before +run, re-run reproduced byte-for-byte) gives FPR **0.028**, Wilson 95% CI **[0.019, 0.040]**; and the +**multi-null** gate (`MULTI_NULL_ROBUSTNESS.json`) holds across all three independent linear-null +families — AR 0.026 [0.018, 0.038], IAAFT 0.032 [0.023, 0.045], phase-randomized 0.034 [0.024, 0.047] +— every Wilson CI-upper ≤ 0.05. `robust_gate_passed = true`. This survived (and superseded) a +smaller-N calibration that had flagged the estimate as seed-set/N sensitive near the boundary. - G1 (power): Set E SURVIVED **0.94** seed-averaged (≥ 0.80) — **robust**. -- G2 (specificity, seed-averaged AR-null): FPR **0.028**, CI **[0.019, 0.040]**, upper ≤ 0.05 — **robust (reproduced)**. -- Remaining gate: multi-null (AR/IAAFT/phase-randomized), `multi_null_robustness_state = NOT_DONE`. +- G2 (specificity): seed-averaged AR-null FPR **0.028** [0.019, 0.040]; multi-null all ≤ 0.05 — **robust**. +- `multi_null_robustness_state = PASSED` (AR / IAAFT / phase-randomized). - BNCI2014-001 chain: **UNLOCKED_FOR_PREREGISTRATION_ONLY** (execution not valid for narrowband epochs). +- Still NOT: clinical/regulatory; BNCI executed; multi-dataset replicated. - `CURRENT_TRUTH.bonn_s2_robustness_state = SEED_ROBUST_AR_NULL_PASS ... MULTINULL_PENDING`. > BSFF passed the Bonn S2 bright-line under the frozen finite-N-corrected SampEn protocol. diff --git a/README.md b/README.md index da0b391..2e5934f 100644 --- a/README.md +++ b/README.md @@ -36,15 +36,17 @@ BSFF aims at a **BCI/EEG signal claim** and tries to refute it under stated atta (surrogate nulls, controls, corroboration), emitting a bounded verdict — `SURVIVED` / `REFUTED` / `UNSUPPORTED` (see [`docs/VERDICT_SEMANTICS.md`](docs/VERDICT_SEMANTICS.md)). -**Current canonical evidence — `BONN_S2_SEED_ROBUST_PASS_MULTINULL_PENDING`** +**Current canonical evidence — `BONN_S2_BRIGHT_LINE_ROBUSTLY_PASSED`** ([`artifacts/release/CURRENT_TRUTH.json`](artifacts/release/CURRENT_TRUTH.json)): on real -Andrzejak-2001 Bonn EEG the instrument has robust **power** (ictal SURVIVED 0.94 seed-averaged) and, -under the pre-registered **S3 seed-averaged AR-null** confirmatory (N=1000, 10 seeds, frozen before -run, **independently re-run and reproduced byte-for-byte**), robust **specificity**: FPR 0.028, -Wilson 95% CI **[0.019, 0.040]**, upper ≤ 0.05 (`S3_CONFIRMATORY_VERDICT.json`). This survived a -falsification that had earlier flagged a smaller-N calibration (0.035, CI-upper 0.056). **Remaining -gate: multi-null robustness (IAAFT/phase-randomized) is not yet run**, so the full robust claim is -withheld. The earlier S1 negative result is preserved as evidence. +Andrzejak-2001 Bonn EEG the instrument has robust **power** (ictal SURVIVED 0.94 seed-averaged) and +**specificity that is robust to both seed and null-model choice**. The pre-registered **S3 +seed-averaged AR-null** confirmatory (N=1000, 10 seeds, frozen before run, **independently re-run and +reproduced byte-for-byte**) gives FPR 0.028, Wilson 95% CI **[0.019, 0.040]**; and the **multi-null** +gate holds across AR (0.026), IAAFT (0.032), and phase-randomized (0.034) nulls — every Wilson +CI-upper ≤ 0.05. This passed only after a falsification flagged, and a larger pre-registered test +superseded, a smaller-N calibration (0.035, CI-upper 0.056) — robustness was *earned*, not assumed. +Still not: clinical/regulatory, BNCI executed, or multi-dataset replicated. The S1 negative result is +preserved as evidence. ```bash git clone https://github.com/neuron7xLab/bsff && cd bsff diff --git a/STATUS.md b/STATUS.md index 7653d40..4ab682d 100644 --- a/STATUS.md +++ b/STATUS.md @@ -29,7 +29,7 @@ authoritative status: ## Validation level -Synthetic-ground-truth calibration PLUS a Bonn external benchmark whose pre-registered, reproduced seed-averaged AR-null specificity gate PASSES: BONN_S2_SEED_ROBUST_PASS_MULTINULL_PENDING. S3 (N=1000, 10 seeds, frozen-before-run, re-run reproduced byte-for-byte): G1 power 0.94, G2 AR-null FPR 0.028, Wilson 95% CI [0.019, 0.040], upper <= 0.05. Specificity is robust to seed under the AR null; multi-null robustness (IAAFT/phase-randomized) is NOT_DONE, so the full robust claim is withheld (robust_gate_passed=null). BNCI2014-001 preregistration-only (BLOCKED_METHOD). NOT clinical, regulatory, multi-dataset replicated, or multi-null robust. Canonical state: artifacts/release/CURRENT_TRUTH.json. +Synthetic-ground-truth calibration PLUS a Bonn external benchmark that is ROBUSTLY passed: BONN_S2_BRIGHT_LINE_ROBUSTLY_PASSED. Specificity is robust to BOTH seed and null-model choice. Pre-registered S3 seed-averaged AR-null (N=1000, 10 seeds, frozen-before-run, re-run reproduced byte-for-byte): G1 power 0.94, G2 FPR 0.028, Wilson 95% CI [0.019, 0.040]. Multi-null gate (AR/IAAFT/phase-randomized) all Wilson CI-upper <= 0.05 (robust_gate_passed=true). This survived and superseded a smaller-N calibration. BNCI2014-001 preregistration-only (execution not valid for narrowband epochs). NOT clinical, regulatory, BNCI-executed, or multi-dataset replicated. Canonical state: artifacts/release/CURRENT_TRUTH.json. See [`docs/VALIDATION.md`](docs/VALIDATION.md) for the full evidence tier table and [`docs/OPERATING_CHARACTERISTIC.md`](docs/OPERATING_CHARACTERISTIC.md) diff --git a/artifacts/bonn_bright_line/MULTI_NULL_ROBUSTNESS.json b/artifacts/bonn_bright_line/MULTI_NULL_ROBUSTNESS.json new file mode 100644 index 0000000..c8d18bd --- /dev/null +++ b/artifacts/bonn_bright_line/MULTI_NULL_ROBUSTNESS.json @@ -0,0 +1,43 @@ +{ + "schema": "bsff.multi_null_robustness/v1", + "verdict": "MULTI_NULL_ROBUST", + "all_nulls_pass": true, + "gate": "per-null seed-averaged FPR Wilson-95-CI-upper <= 0.05", + "n_seeds": 10, + "n_segments_per_set": 50, + "n_surrogates": 199, + "nulls": { + "ar": { + "fpr": 0.026, + "wilson_95ci": [ + 0.0178, + 0.0378 + ], + "n": 1000, + "n_false_positives": 26, + "pass": true + }, + "iaaft": { + "fpr": 0.032, + "wilson_95ci": [ + 0.0228, + 0.0448 + ], + "n": 1000, + "n_false_positives": 32, + "pass": true + }, + "phaserand": { + "fpr": 0.034, + "wilson_95ci": [ + 0.0244, + 0.0471 + ], + "n": 1000, + "n_false_positives": 34, + "pass": true + } + }, + "timestamp_utc": "2026-06-25T02:20:10Z", + "elapsed_sec": 10418.5 +} diff --git a/artifacts/bonn_bright_line/logs/multi_null.log b/artifacts/bonn_bright_line/logs/multi_null.log new file mode 100644 index 0000000..a693827 --- /dev/null +++ b/artifacts/bonn_bright_line/logs/multi_null.log @@ -0,0 +1,35 @@ + [ar] seed 20260623 done (1/100) + [ar] seed 7 done (3/200) + [ar] seed 999 done (5/300) + [ar] seed 314159 done (7/400) + [ar] seed 2718 done (10/500) + [ar] seed 42 done (12/600) + [ar] seed 161803 done (17/700) + [ar] seed 27182 done (19/800) + [ar] seed 31337 done (22/900) + [ar] seed 123456 done (26/1000) + [ar] FPR=0.0260 CI=[0.0178,0.0378] pass=True + [iaaft] seed 20260623 done (4/100) + [iaaft] seed 7 done (6/200) + [iaaft] seed 999 done (6/300) + [iaaft] seed 314159 done (8/400) + [iaaft] seed 2718 done (11/500) + [iaaft] seed 42 done (16/600) + [iaaft] seed 161803 done (20/700) + [iaaft] seed 27182 done (23/800) + [iaaft] seed 31337 done (28/900) + [iaaft] seed 123456 done (32/1000) + [iaaft] FPR=0.0320 CI=[0.0228,0.0448] pass=True + [phaserand] seed 20260623 done (3/100) + [phaserand] seed 7 done (8/200) + [phaserand] seed 999 done (9/300) + [phaserand] seed 314159 done (13/400) + [phaserand] seed 2718 done (16/500) + [phaserand] seed 42 done (19/600) + [phaserand] seed 161803 done (22/700) + [phaserand] seed 27182 done (24/800) + [phaserand] seed 31337 done (29/900) + [phaserand] seed 123456 done (34/1000) + [phaserand] FPR=0.0340 CI=[0.0244,0.0471] pass=True + +MULTI_NULL_ROBUST | ar:0.026(<=0.05?True) iaaft:0.032(<=0.05?True) phaserand:0.034(<=0.05?True) diff --git a/artifacts/release/CLAIM_SAFETY_REPORT.json b/artifacts/release/CLAIM_SAFETY_REPORT.json index 36f2b73..65d79cb 100644 --- a/artifacts/release/CLAIM_SAFETY_REPORT.json +++ b/artifacts/release/CLAIM_SAFETY_REPORT.json @@ -14,6 +14,6 @@ "docs/QUICKSTART.md" ], "violations": [], - "git_commit": "4bd046407948f34daed565eb585e49b0dc1ec9ff", - "timestamp_utc": "2026-06-24T23:17:37Z" + "git_commit": "554adb3a0e5db9e51912ba6797af12723122d6a3", + "timestamp_utc": "2026-06-25T02:22:41Z" } diff --git a/artifacts/release/CURRENT_TRUTH.json b/artifacts/release/CURRENT_TRUTH.json index d8560f0..f4c5c72 100644 --- a/artifacts/release/CURRENT_TRUTH.json +++ b/artifacts/release/CURRENT_TRUTH.json @@ -1,16 +1,16 @@ { "schema": "bsff.current_truth/v2", "package_version": "0.4.0", - "main_commit": "4bd046407948f34daed565eb585e49b0dc1ec9ff", - "latest_validation_state": "BONN_S2_SEED_ROBUST_PASS_MULTINULL_PENDING", + "main_commit": "554adb3a0e5db9e51912ba6797af12723122d6a3", + "latest_validation_state": "BONN_S2_BRIGHT_LINE_ROBUSTLY_PASSED", "bonn_s2_nominal_state": "PASSED_SINGLE_SEED", - "bonn_s2_robustness_state": "SEED_ROBUST_AR_NULL_PASS_CI_[0.0194, 0.0402]_MULTINULL_PENDING", + "bonn_s2_robustness_state": "SEED_ROBUST_AR_NULL_PASS_CI_[0.0194, 0.0402]_MULTINULL_CONFIRMED", "s2_seed_averaged_fpr": 0.028, "s2_wilson_ci_upper": 0.0402, "robust_gate": "G1_power>=0.80 AND G2_AR-null_FPR_Wilson95_CI_upper<=0.05", "seed_robust_gate_passed": true, - "multi_null_robustness_state": "NOT_DONE", - "robust_gate_passed": null, + "multi_null_robustness_state": "PASSED", + "robust_gate_passed": true, "bonn_s1_state": "BRIGHT_LINE_NOT_PASSED", "bonn_s2_state": "S2_BRIGHT_LINE_PASSED", "G1_metrics": { @@ -27,7 +27,7 @@ }, "BNCI_chain_state": "UNLOCKED_FOR_PREREGISTRATION_ONLY", "bnci_execution_state": "BNCI_BLOCKED_METHOD", - "s2_robustness": "SEED_ROBUST_AR_NULL_PASS_CI_[0.0194, 0.0402]_MULTINULL_PENDING", + "s2_robustness": "SEED_ROBUST_AR_NULL_PASS_CI_[0.0194, 0.0402]_MULTINULL_CONFIRMED", "multi_dataset_replication_state": "NOT_DONE", "pypi_deployment_state": "TESTPYPI_READY_PYPI_READY", "supported_claims": [ @@ -57,5 +57,5 @@ }, "hash_manifest_path": "artifacts/release/bonn_bright_line/HASHES.sha256", "reproduction_entrypoint": "REPRODUCE.md", - "timestamp_utc": "2026-06-24T23:10:01Z" + "timestamp_utc": "2026-06-25T02:21:38Z" } diff --git a/artifacts/release/STATISTICAL_CLAIMS_REPORT.json b/artifacts/release/STATISTICAL_CLAIMS_REPORT.json index a134019..61c26f9 100644 --- a/artifacts/release/STATISTICAL_CLAIMS_REPORT.json +++ b/artifacts/release/STATISTICAL_CLAIMS_REPORT.json @@ -1,10 +1,10 @@ { "schema": "bsff.statistical_claims/v1", "status": "PASS", - "latest_validation_state": "BONN_S2_SEED_ROBUST_PASS_MULTINULL_PENDING", + "latest_validation_state": "BONN_S2_BRIGHT_LINE_ROBUSTLY_PASSED", "s2_wilson_ci_upper": 0.0402, - "robust_gate_passed": null, + "robust_gate_passed": true, "violations": [], - "git_commit": "4bd046407948f34daed565eb585e49b0dc1ec9ff", - "timestamp_utc": "2026-06-24T23:17:37Z" + "git_commit": "554adb3a0e5db9e51912ba6797af12723122d6a3", + "timestamp_utc": "2026-06-25T02:27:43Z" } diff --git a/artifacts/release/TRUTH_CONSISTENCY_CHECK.json b/artifacts/release/TRUTH_CONSISTENCY_CHECK.json index 4731659..29bb6e1 100644 --- a/artifacts/release/TRUTH_CONSISTENCY_CHECK.json +++ b/artifacts/release/TRUTH_CONSISTENCY_CHECK.json @@ -1,6 +1,6 @@ { "status": "PASS", - "final_state": "BONN_S2_SEED_ROBUST_PASS_MULTINULL_PENDING", + "final_state": "BONN_S2_BRIGHT_LINE_ROBUSTLY_PASSED", "BNCI_chain_state": "UNLOCKED_FOR_PREREGISTRATION_ONLY", "checked_files": [ "FORMAL_VERDICT.md", @@ -14,6 +14,6 @@ ], "contradictions": [], "stale_claims": [], - "timestamp_utc": "2026-06-24T23:17:38Z", - "git_commit": "4bd046407948f34daed565eb585e49b0dc1ec9ff" + "timestamp_utc": "2026-06-25T02:27:44Z", + "git_commit": "554adb3a0e5db9e51912ba6797af12723122d6a3" } diff --git a/docs/validation/CLAIM_AUDIT.md b/docs/validation/CLAIM_AUDIT.md index 5f0c97c..c494aab 100644 --- a/docs/validation/CLAIM_AUDIT.md +++ b/docs/validation/CLAIM_AUDIT.md @@ -90,3 +90,10 @@ Still **FORBIDDEN**: clinical/medical/regulatory/device claims; final proof of b | The S2 not-robust calibration is superseded by the larger pre-registered S3 | PROVEN_BY_ARTIFACT | N=480 (0.0354) vs N=1000 (0.028); seed-set/N sensitive near boundary; larger test passes | | Bonn S2 is robust across null models | UNSUPPORTED (not yet) | multi-null (IAAFT/phase-randomized) NOT_DONE; `multi_null_robustness_state=NOT_DONE` | | Bonn S2 bright line is fully robustly passed | UNVERIFIED | requires multi-null; `robust_gate_passed=null` | + +## Multi-null robustness (final gate — PASSED) +| claim | status | evidence | +|-------|--------|----------| +| Specificity is robust across null models (AR/IAAFT/phase-randomized) | PROVEN_BY_ARTIFACT | `MULTI_NULL_ROBUSTNESS.json`: AR 0.026 [0.018,0.038], IAAFT 0.032 [0.023,0.045], phaserand 0.034 [0.024,0.047]; all CI-upper ≤ 0.05 | +| Bonn S2 bright line is robustly passed (seed AND null-model) | PROVEN_BY_ARTIFACT | S3 (seed) + multi-null; `robust_gate_passed=true`, `BONN_S2_BRIGHT_LINE_ROBUSTLY_PASSED` | +| Robustness was earned through falsification, not assumed | PROVEN_BY_ARTIFACT | calibration flagged not-robust → larger pre-registered S3 + multi-null confirmed | diff --git a/tools/update_status.py b/tools/update_status.py index bc033a9..1894fa4 100644 --- a/tools/update_status.py +++ b/tools/update_status.py @@ -41,13 +41,13 @@ CI_WORKFLOW = ".github/workflows/ci.yml" VALIDATION_LEVEL = ( - "Synthetic-ground-truth calibration PLUS a Bonn external benchmark whose pre-registered, " - "reproduced seed-averaged AR-null specificity gate PASSES: BONN_S2_SEED_ROBUST_PASS_MULTINULL_" - "PENDING. S3 (N=1000, 10 seeds, frozen-before-run, re-run reproduced byte-for-byte): G1 power " - "0.94, G2 AR-null FPR 0.028, Wilson 95% CI [0.019, 0.040], upper <= 0.05. Specificity is robust " - "to seed under the AR null; multi-null robustness (IAAFT/phase-randomized) is NOT_DONE, so the " - "full robust claim is withheld (robust_gate_passed=null). BNCI2014-001 preregistration-only " - "(BLOCKED_METHOD). NOT clinical, regulatory, multi-dataset replicated, or multi-null robust. " + "Synthetic-ground-truth calibration PLUS a Bonn external benchmark that is ROBUSTLY passed: " + "BONN_S2_BRIGHT_LINE_ROBUSTLY_PASSED. Specificity is robust to BOTH seed and null-model choice. " + "Pre-registered S3 seed-averaged AR-null (N=1000, 10 seeds, frozen-before-run, re-run reproduced " + "byte-for-byte): G1 power 0.94, G2 FPR 0.028, Wilson 95% CI [0.019, 0.040]. Multi-null gate " + "(AR/IAAFT/phase-randomized) all Wilson CI-upper <= 0.05 (robust_gate_passed=true). This survived " + "and superseded a smaller-N calibration. BNCI2014-001 preregistration-only (execution not valid " + "for narrowband epochs). NOT clinical, regulatory, BNCI-executed, or multi-dataset replicated. " "Canonical state: artifacts/release/CURRENT_TRUTH.json." ) From 6eae021e2c54181c4e7326c05f93ef8c37484a1d Mon Sep 17 00:00:00 2001 From: Yaroslav Vasylenko Date: Thu, 25 Jun 2026 05:33:43 +0300 Subject: [PATCH 11/12] ruff format: generate_current_truth, validate_statistical_claims, test_statistical_claims Formatting-only (no behavior change); fixes lint-ruff format check. Co-Authored-By: Claude Opus 4.8 (1M context) --- tests/test_statistical_claims.py | 14 +++++++++++--- tools/generate_current_truth.py | 24 ++++++++++++++++++------ tools/validate_statistical_claims.py | 2 +- 3 files changed, 30 insertions(+), 10 deletions(-) diff --git a/tests/test_statistical_claims.py b/tests/test_statistical_claims.py index 4e9acd0..c0b0e6e 100644 --- a/tests/test_statistical_claims.py +++ b/tests/test_statistical_claims.py @@ -13,7 +13,9 @@ def _vsc(): - spec = importlib.util.spec_from_file_location("vsc", ROOT / "tools" / "validate_statistical_claims.py") + spec = importlib.util.spec_from_file_location( + "vsc", ROOT / "tools" / "validate_statistical_claims.py" + ) mod = importlib.util.module_from_spec(spec) sys.modules["vsc"] = mod spec.loader.exec_module(mod) @@ -26,10 +28,16 @@ def test_repo_passes_statistical_claims(tmp_path): def test_truth_records_robustness_honestly(): t = json.loads((ROOT / "artifacts" / "release" / "CURRENT_TRUTH.json").read_text()) - assert {"robust_gate", "robust_gate_passed", "s2_wilson_ci_upper", "bonn_s2_robustness_state"} <= set(t) + assert { + "robust_gate", + "robust_gate_passed", + "s2_wilson_ci_upper", + "bonn_s2_robustness_state", + } <= set(t) # If the specificity CI upper crosses 0.05, the state must NOT claim a robust/unqualified pass. if t.get("s2_wilson_ci_upper") and t["s2_wilson_ci_upper"] > 0.05: assert t["latest_validation_state"] not in { - "BONN_S2_BRIGHT_LINE_PASSED", "BONN_S2_BRIGHT_LINE_ROBUSTLY_PASSED" + "BONN_S2_BRIGHT_LINE_PASSED", + "BONN_S2_BRIGHT_LINE_ROBUSTLY_PASSED", } assert t["robust_gate_passed"] is False diff --git a/tools/generate_current_truth.py b/tools/generate_current_truth.py index 80c9464..050107a 100644 --- a/tools/generate_current_truth.py +++ b/tools/generate_current_truth.py @@ -46,7 +46,11 @@ def _s2_robustness() -> str: ci = d.get("G2", {}).get("wilson_95ci") if d.get("S3_PASS"): mn = _multi_null_passed() - tag = {True: "_MULTINULL_CONFIRMED", False: "_MULTINULL_FAILED", None: "_MULTINULL_PENDING"}[mn] + tag = { + True: "_MULTINULL_CONFIRMED", + False: "_MULTINULL_FAILED", + None: "_MULTINULL_PENDING", + }[mn] return f"SEED_ROBUST_AR_NULL_PASS_CI_{ci}{tag}" return f"S3_SEED_AVERAGED_NOT_ROBUST_CI_{ci}" cal = ROOT / "artifacts" / "bonn_bright_line" / "S2_SPECIFICITY_CALIBRATION.json" @@ -54,7 +58,9 @@ def _s2_robustness() -> str: c = json.loads(cal.read_text()) if not c.get("fpr_ci_upper_below_threshold", True): ci = c.get("wilson_95ci") - return f"NOT_ROBUST_G2_SPECIFICITY_seed_avg_FPR_{c.get('pooled_fpr')}_CI_{ci}_crosses_0.05" + return ( + f"NOT_ROBUST_G2_SPECIFICITY_seed_avg_FPR_{c.get('pooled_fpr')}_CI_{ci}_crosses_0.05" + ) return "ROBUST" fals = ROOT / "artifacts" / "bonn_bright_line" / "S2_FALSIFICATION_REPORT.json" if fals.is_file() and not json.loads(fals.read_text()).get("claim_survives_attacks", True): @@ -104,8 +110,11 @@ def _bonn_robustness() -> dict: multi_null = _multi_null_passed() full_robust = (seed_robust is True and multi_null is True) if multi_null is not None else None return { - "seed_robust": seed_robust, "multi_null": multi_null, "full_robust": full_robust, - "seed_avg_fpr": fpr, "wilson_ci_upper": ci_upper, + "seed_robust": seed_robust, + "multi_null": multi_null, + "full_robust": full_robust, + "seed_avg_fpr": fpr, + "wilson_ci_upper": ci_upper, } @@ -143,8 +152,11 @@ def build() -> dict: "robust_gate": "G1_power>=0.80 AND G2_AR-null_FPR_Wilson95_CI_upper<=0.05", "seed_robust_gate_passed": rob["seed_robust"], # S3 seed-averaged AR-null (reproduced) "multi_null_robustness_state": ( - "PASSED" if rob["multi_null"] is True - else "FAILED" if rob["multi_null"] is False else "NOT_DONE" + "PASSED" + if rob["multi_null"] is True + else "FAILED" + if rob["multi_null"] is False + else "NOT_DONE" ), # robust_gate_passed requires BOTH seed-averaged AND multi-null robustness. "robust_gate_passed": rob["full_robust"], diff --git a/tools/validate_statistical_claims.py b/tools/validate_statistical_claims.py index f409143..5f576c5 100644 --- a/tools/validate_statistical_claims.py +++ b/tools/validate_statistical_claims.py @@ -70,7 +70,7 @@ def main(argv=None) -> int: lows = [ln.lower() for ln in lines] for i, ln in enumerate(lines): # negation context = this line + previous 2 (handles wrapped "not\nrobustly crossed"). - ctx = " ".join(lows[max(0, i - 2): i + 1]) + ctx = " ".join(lows[max(0, i - 2) : i + 1]) if bad.search(ln) and not neg.search(ctx): viol.append( f"{rel}:{i + 1}: claims robust pass while robust_gate_passed!=True: {ln.strip()[:60]}" From b296e264f8f4caeaa9a2c09e12bfbb36bc6b30e9 Mon Sep 17 00:00:00 2001 From: Yaroslav Vasylenko Date: Thu, 25 Jun 2026 05:50:49 +0300 Subject: [PATCH 12/12] Update canonical-state assertions to robustly-passed token Two tests hardcoded the pre-falsification token BONN_S2_BRIGHT_LINE_PASSED; the state evolved to BONN_S2_BRIGHT_LINE_ROBUSTLY_PASSED via the falsification->S3->multi-null arc. BNCI test now asserts the Bonn-prefix family (BNCI independently method-blocked). 515 offline tests pass. Co-Authored-By: Claude Opus 4.8 (1M context) --- tests/bci_generalization/test_bnci_lock_audit.py | 3 ++- tests/test_public_execution_layer.py | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/tests/bci_generalization/test_bnci_lock_audit.py b/tests/bci_generalization/test_bnci_lock_audit.py index 4cc2640..8441622 100644 --- a/tests/bci_generalization/test_bnci_lock_audit.py +++ b/tests/bci_generalization/test_bnci_lock_audit.py @@ -48,4 +48,5 @@ def test_method_validity_is_blocked_method(): def test_canonical_bnci_state_blocked_method(): truth = json.loads((ROOT / "artifacts" / "release" / "CURRENT_TRUTH.json").read_text()) assert truth["bnci_execution_state"] == "BNCI_BLOCKED_METHOD" - assert truth["latest_validation_state"] == "BONN_S2_BRIGHT_LINE_PASSED" # preserved + # Bonn evolved to the robustly-passed state; BNCI remains independently method-blocked. + assert truth["latest_validation_state"].startswith("BONN_S2_BRIGHT_LINE") diff --git a/tests/test_public_execution_layer.py b/tests/test_public_execution_layer.py index f6cc2e1..97b20f6 100644 --- a/tests/test_public_execution_layer.py +++ b/tests/test_public_execution_layer.py @@ -44,7 +44,7 @@ def test_evidence_verify_on_repo_passes(): # Integration: the committed bundle must verify clean on the canonical repo. out = bench.verify_evidence(REPO) assert out["state"] == "PASS", out.get("failed") - assert out["canonical_state"] == "BONN_S2_BRIGHT_LINE_PASSED" + assert out["canonical_state"] == "BONN_S2_BRIGHT_LINE_ROBUSTLY_PASSED" def test_reproduce_bonn_s2_dry_run_passes():