diff --git a/openverifiablellm/verify.py b/openverifiablellm/verify.py index 0431c41..28b1076 100644 --- a/openverifiablellm/verify.py +++ b/openverifiablellm/verify.py @@ -331,7 +331,6 @@ def verify_preprocessing( logger.info("Re-running preprocessing in temp dir: %s", tmp_dir) try: - env = os.environ.copy() env["PYTHONPATH"] = os.pathsep.join(p for p in sys.path if p) @@ -410,9 +409,7 @@ def verify_preprocessing( actual=reproduced_manifest.get("preprocessing_version"), detail="Preprocessing version tag", ) - # verify chunk size recorded in the manifest matches whatever - # the preprocessing run produced. this needs the reproduced - # manifest, so we only perform it once the file has been loaded. + if "chunk_size_bytes" in manifest: _check_field( report, "manifest_chunk_size_bytes", @@ -420,6 +417,12 @@ def verify_preprocessing( actual=reproduced_manifest.get("chunk_size_bytes"), detail="Merkle chunk size used during preprocessing", ) + else: + report.add(CheckResult( + name="manifest_chunk_size_bytes", + status=CheckStatus.SKIP, + detail="Field absent from manifest (older version)", + )) else: report.add(CheckResult( name="manifest_regenerated", @@ -478,4 +481,4 @@ def main(argv=None): if __name__ == "__main__": - main() + main() \ No newline at end of file diff --git a/tests/test_util.py b/tests/test_util.py index 4dcb989..430e1be 100644 --- a/tests/test_util.py +++ b/tests/test_util.py @@ -269,4 +269,27 @@ def test_export_and_load_merkle_proof(tmp_path): proof_file_path=proof_file, chunk_data=chunk, expected_root=root, - ) \ No newline at end of file + ) + +def test_extract_text_from_xml_malformed_xml(tmp_path, monkeypatch): + import defusedxml.ElementTree as ET + + malformed_xml_content = """ + + + + Hello [[Malformed]] + + + + """ + + input_file = tmp_path / "simplewiki-20260201-pages-malformed.xml" + + with open(input_file, "w", encoding="utf-8") as f: + f.write(malformed_xml_content) + + monkeypatch.chdir(tmp_path) + + with pytest.raises(ET.ParseError): + utils.extract_text_from_xml(input_file) diff --git a/tests/test_verify.py b/tests/test_verify.py index 784c60d..1d0832c 100644 --- a/tests/test_verify.py +++ b/tests/test_verify.py @@ -377,12 +377,11 @@ def setUp(self): def test_merkle_checks_are_skipped(self): r = verify_preprocessing(self.dump, project_root=self.tmp) - for name in ("raw_merkle_root", "processed_merkle_root"): + for name in ("raw_merkle_root", "processed_merkle_root", "manifest_chunk_size_bytes"): c = next((x for x in r.checks if x.name == name), None) self.assertIsNotNone(c, f"check '{name}' not found") self.assertEqual(c.status, CheckStatus.SKIP) - # legacy manifests should not even contain the chunk-size - self.assertFalse(any(c.name == "manifest_chunk_size_bytes" for c in r.checks)) + self.assertIn("Field absent from manifest (older version)", c.detail) def test_other_checks_still_pass(self): r = verify_preprocessing(self.dump, project_root=self.tmp) @@ -391,4 +390,4 @@ def test_other_checks_still_pass(self): if __name__ == "__main__": - unittest.main(verbosity=2) + unittest.main(verbosity=2) \ No newline at end of file