diff --git a/.github/workflows/benchmark-validation.yml b/.github/workflows/benchmark-validation.yml index c88db0a..7c6706c 100644 --- a/.github/workflows/benchmark-validation.yml +++ b/.github/workflows/benchmark-validation.yml @@ -3,11 +3,10 @@ name: Benchmark Validation on: pull_request: paths: - - 'benchmarks/**' - - 'corpus/**' - - 'runners/**' - - 'schemas/**' - - 'results/**' + - 'data/**' + - 'dali/**' + - 'tools/**' + - '.github/workflows/benchmark-validation.yml' push: branches: - main @@ -38,7 +37,7 @@ jobs: run: | python3 -c " import json, sys - with open('benchmarks/tier1/corpus/citation_failure_cases.json') as f: + with open('data/benchmark/tier1/corpus/citation_failure_cases.json') as f: data = json.load(f) cases = data.get('records', data) if isinstance(data, dict) else data assert len(cases) >= 1, 'Corpus must have at least 1 record' @@ -47,9 +46,9 @@ jobs: - name: Run Tier 1 deterministic evaluator run: | - python runners/run_integrity.py \ - --corpus benchmarks/tier1/corpus/citation_failure_cases.json \ - --output results/ci/integrity.json + python -m dali.runners.run_integrity \ + --corpus data/benchmark/tier1/corpus/citation_failure_cases.json \ + --output data/results/ci/integrity.json echo "Evaluator completed" - name: Validate result schema @@ -57,9 +56,9 @@ jobs: python3 -c " import json, jsonschema - with open('schemas/integrity-result.schema.json') as f: + with open('dali/schemas/integrity-result.schema.json') as f: schema = json.load(f) - with open('results/ci/integrity.json') as f: + with open('data/results/ci/integrity.json') as f: results = json.load(f) records = results if isinstance(results, list) else results.get('results', []) @@ -80,7 +79,7 @@ jobs: run: | python3 -c " import json - with open('results/ci/integrity.json') as f: + with open('data/results/ci/integrity.json') as f: results = json.load(f) records = results if isinstance(results, list) else results.get('results', []) eligible = [r for r in records if r.get('scoring_eligible', True)] @@ -95,7 +94,7 @@ jobs: if: always() with: name: tier1-integrity-results - path: results/ci/integrity.json + path: data/results/ci/integrity.json retention-days: 30 corpus-quality-gate: @@ -116,7 +115,7 @@ jobs: python3 -c " import json, sys, pathlib - corpus_path = pathlib.Path('benchmarks/tier1/corpus/citation_failure_cases.json') + corpus_path = pathlib.Path('data/benchmark/tier1/corpus/citation_failure_cases.json') with open(corpus_path) as f: data = json.load(f) @@ -166,8 +165,8 @@ jobs: python3 -c " import json, pathlib, sys - schema_files = list(pathlib.Path('schemas').glob('*.json')) + \ - list(pathlib.Path('results').glob('**/schema.json')) + schema_files = list(pathlib.Path('dali/schemas').glob('*.json')) + \ + list(pathlib.Path('data/results').glob('**/schema.json')) errors = [] for f in schema_files: @@ -191,7 +190,7 @@ jobs: errors = [] total = 0 - for jsonl_file in pathlib.Path('synthetic').rglob('*.jsonl'): + for jsonl_file in pathlib.Path('data/benchmark/tier2').rglob('*.jsonl'): with open(jsonl_file) as f: for i, line in enumerate(f, 1): line = line.strip() diff --git a/.github/workflows/replay-verification.yml b/.github/workflows/replay-verification.yml index 22469f2..6e54e5c 100644 --- a/.github/workflows/replay-verification.yml +++ b/.github/workflows/replay-verification.yml @@ -34,9 +34,9 @@ jobs: # test for Dali's determinism claim — if it fails, the project's core # property is broken and the PR must not merge. run: | - python runners/run_integrity.py \ - --corpus benchmarks/tier1/corpus/citation_failure_cases.json \ - --output results/ci/replay-verify.json \ + python -m dali.runners.run_integrity \ + --corpus data/benchmark/tier1/corpus/citation_failure_cases.json \ + --output data/results/ci/replay-verify.json \ --verify-replay - name: Upload artifact @@ -44,5 +44,5 @@ jobs: uses: actions/upload-artifact@v4 with: name: replay-verify-output - path: results/ci/replay-verify.json + path: data/results/ci/replay-verify.json retention-days: 7 diff --git a/data/results/ci/integrity.json b/data/results/ci/integrity.json new file mode 100644 index 0000000..8144e4b --- /dev/null +++ b/data/results/ci/integrity.json @@ -0,0 +1,91 @@ +{ + "run_timestamp": "2026-06-04T19:52:48.371513+00:00", + "policy_version": "taxonomy=2.0.0;rubric=1.0.0;scoring=1.0.0;normalization=1.0.0;schema=1.0.0", + "evaluator": "local-reference", + "cross_version_aggregation": false, + "policy_versions_present": [ + "taxonomy=2.0.0;rubric=1.0.0;scoring=1.0.0;normalization=1.0.0;schema=1.0.0" + ], + "summary": { + "total_evaluated": 3, + "defensibility_risk_distribution": { + "critical": 2, + "high": 1 + } + }, + "results": [ + { + "case_id": "mata-v-avianca-2023", + "citation_exists": false, + "authority_reachable": false, + "semantic_alignment": "misaligned", + "quote_fidelity": "fabricated", + "temporal_validity": "unknown", + "jurisdiction_match": "unknown", + "provenance_complete": false, + "failure_classes_detected": [ + "nonexistent_authority", + "fabricated_quote", + "reconstructability_failure" + ], + "defensibility_risk": "critical", + "workflow_reconstructable": false, + "verification_recoverable": "infeasible", + "mutation_lineage": [], + "policy_version": "taxonomy=2.0.0;rubric=1.0.0;scoring=1.0.0;normalization=1.0.0;schema=1.0.0", + "evidence_hash": "40182d0b7be3eece338b772831af7d528a4742a60f90278a31132b5d80b233ac", + "run_timestamp": "2026-06-04T19:52:48.370765+00:00", + "corpus_record_hash": "30dd70980404de1216bde548dfa7d67becf8bf9090df0a48f7b7c5876fb4eeb6", + "replay_hash": "b17c945f13e17c1d3a32c5c8805d717e1d3412ec0c316337d86f53eb2c6a4455" + }, + { + "case_id": "us-v-cohen-2023", + "citation_exists": false, + "authority_reachable": false, + "semantic_alignment": "misaligned", + "quote_fidelity": "fabricated", + "temporal_validity": "unknown", + "jurisdiction_match": "unknown", + "provenance_complete": false, + "failure_classes_detected": [ + "nonexistent_authority", + "reconstructability_failure", + "provenance_gap" + ], + "defensibility_risk": "critical", + "workflow_reconstructable": false, + "verification_recoverable": "infeasible", + "mutation_lineage": [], + "policy_version": "taxonomy=2.0.0;rubric=1.0.0;scoring=1.0.0;normalization=1.0.0;schema=1.0.0", + "evidence_hash": "701a28a39d742b2191a874954277f93cd12bb18dfb79a894524e08ebc4cf67cc", + "run_timestamp": "2026-06-04T19:52:48.371177+00:00", + "corpus_record_hash": "8642ed8585064218d035a9f4feab380198332e814018f882e87c8995e5711397", + "replay_hash": "9f1a6f3eb11227e1b793e5d25651b06ba912eca62425bdd550655bd240603294" + }, + { + "case_id": "mata-derivative-reporter-swap-001", + "citation_exists": false, + "authority_reachable": false, + "semantic_alignment": "misaligned", + "quote_fidelity": "fabricated", + "temporal_validity": "unknown", + "jurisdiction_match": "unknown", + "provenance_complete": false, + "failure_classes_detected": [ + "citation_mutation", + "nonexistent_authority" + ], + "defensibility_risk": "high", + "workflow_reconstructable": "unknown", + "verification_recoverable": "manual", + "mutation_lineage": [ + "mata-v-avianca-2023" + ], + "policy_version": "taxonomy=2.0.0;rubric=1.0.0;scoring=1.0.0;normalization=1.0.0;schema=1.0.0", + "evidence_hash": "0500f9593d7594f1df0e450ca87ee5ce39f137f5fb8c916cbcd8d2d708ce446f", + "run_timestamp": "2026-06-04T19:52:48.371412+00:00", + "corpus_record_hash": "99543d33ef7a6c236c81ce7725af62aa403f5578261b2f019bf75571d8ce58e3", + "replay_hash": "68f68d236c7a00e505606c517f16512c63d0e95759d10210b23b63b324c6bcd9" + } + ] +} \ No newline at end of file diff --git a/data/results/ci/replay-verify.json b/data/results/ci/replay-verify.json new file mode 100644 index 0000000..01e68de --- /dev/null +++ b/data/results/ci/replay-verify.json @@ -0,0 +1,91 @@ +{ + "run_timestamp": "2026-06-04T19:52:48.371765+00:00", + "policy_version": "taxonomy=2.0.0;rubric=1.0.0;scoring=1.0.0;normalization=1.0.0;schema=1.0.0", + "evaluator": "local-reference", + "cross_version_aggregation": false, + "policy_versions_present": [ + "taxonomy=2.0.0;rubric=1.0.0;scoring=1.0.0;normalization=1.0.0;schema=1.0.0" + ], + "summary": { + "total_evaluated": 3, + "defensibility_risk_distribution": { + "critical": 2, + "high": 1 + } + }, + "results": [ + { + "case_id": "mata-v-avianca-2023", + "citation_exists": false, + "authority_reachable": false, + "semantic_alignment": "misaligned", + "quote_fidelity": "fabricated", + "temporal_validity": "unknown", + "jurisdiction_match": "unknown", + "provenance_complete": false, + "failure_classes_detected": [ + "nonexistent_authority", + "fabricated_quote", + "reconstructability_failure" + ], + "defensibility_risk": "critical", + "workflow_reconstructable": false, + "verification_recoverable": "infeasible", + "mutation_lineage": [], + "policy_version": "taxonomy=2.0.0;rubric=1.0.0;scoring=1.0.0;normalization=1.0.0;schema=1.0.0", + "evidence_hash": "581bca10f3a10230815a106428e6f1584e41231d5a30da6e8ffcb538a7dcd4a5", + "run_timestamp": "2026-06-04T19:52:48.370536+00:00", + "corpus_record_hash": "30dd70980404de1216bde548dfa7d67becf8bf9090df0a48f7b7c5876fb4eeb6", + "replay_hash": "b17c945f13e17c1d3a32c5c8805d717e1d3412ec0c316337d86f53eb2c6a4455" + }, + { + "case_id": "us-v-cohen-2023", + "citation_exists": false, + "authority_reachable": false, + "semantic_alignment": "misaligned", + "quote_fidelity": "fabricated", + "temporal_validity": "unknown", + "jurisdiction_match": "unknown", + "provenance_complete": false, + "failure_classes_detected": [ + "nonexistent_authority", + "reconstructability_failure", + "provenance_gap" + ], + "defensibility_risk": "critical", + "workflow_reconstructable": false, + "verification_recoverable": "infeasible", + "mutation_lineage": [], + "policy_version": "taxonomy=2.0.0;rubric=1.0.0;scoring=1.0.0;normalization=1.0.0;schema=1.0.0", + "evidence_hash": "5d164660668c920c3bfe249d559b464cc80b5d64a5d882f0d77547bae15509f9", + "run_timestamp": "2026-06-04T19:52:48.371392+00:00", + "corpus_record_hash": "8642ed8585064218d035a9f4feab380198332e814018f882e87c8995e5711397", + "replay_hash": "9f1a6f3eb11227e1b793e5d25651b06ba912eca62425bdd550655bd240603294" + }, + { + "case_id": "mata-derivative-reporter-swap-001", + "citation_exists": false, + "authority_reachable": false, + "semantic_alignment": "misaligned", + "quote_fidelity": "fabricated", + "temporal_validity": "unknown", + "jurisdiction_match": "unknown", + "provenance_complete": false, + "failure_classes_detected": [ + "citation_mutation", + "nonexistent_authority" + ], + "defensibility_risk": "high", + "workflow_reconstructable": "unknown", + "verification_recoverable": "manual", + "mutation_lineage": [ + "mata-v-avianca-2023" + ], + "policy_version": "taxonomy=2.0.0;rubric=1.0.0;scoring=1.0.0;normalization=1.0.0;schema=1.0.0", + "evidence_hash": "11d7cd18d910188a4a2473e8dcb54338a69e8c660b7bb237f00122e7d1249be0", + "run_timestamp": "2026-06-04T19:52:48.371649+00:00", + "corpus_record_hash": "99543d33ef7a6c236c81ce7725af62aa403f5578261b2f019bf75571d8ce58e3", + "replay_hash": "68f68d236c7a00e505606c517f16512c63d0e95759d10210b23b63b324c6bcd9" + } + ] +} \ No newline at end of file