From 28048ec8223aba4d011ec22dc2460cca316dd22e Mon Sep 17 00:00:00 2001 From: RomirJ Date: Wed, 10 Jun 2026 16:23:42 -0700 Subject: [PATCH] fix(exporters): scope fp16 external-data cleanup to this model (data-loss) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Audit §3.3 / Part 1 #12. convert_fp32_to_fp16's pre-save cleanup globbed every `*.bin` and `*.data` file in `dst.parent` and unlinked them. The intent was only to clear leftover external data from a prior run of the model being converted — but when a user converts into a directory that also holds OTHER models' weight files (a shared export dir, a folder with an unrelated `model.onnx.data`), those get destroyed. Extracted `_remove_stale_external_data(dst)`: deletes only `.bin`/`.data` files whose name starts with `dst.stem`, never `dst` itself. Pure pathlib, so it's unit-testable without onnx/torch. Added tests/test_fp16_convert_external_data.py (5 cases): removes this model's stale .bin + legacy .data variants, preserves sibling models' weights (the data-loss guard), never unlinks dst, no-op when clean. All pass; ruff clean. Co-Authored-By: Claude Opus 4.7 (1M context) --- src/tether/exporters/fp16_convert.py | 42 ++++++++++++---- tests/test_fp16_convert_external_data.py | 61 ++++++++++++++++++++++++ 2 files changed, 94 insertions(+), 9 deletions(-) create mode 100644 tests/test_fp16_convert_external_data.py diff --git a/src/tether/exporters/fp16_convert.py b/src/tether/exporters/fp16_convert.py index 7c6bfe53..d389fcdf 100644 --- a/src/tether/exporters/fp16_convert.py +++ b/src/tether/exporters/fp16_convert.py @@ -115,6 +115,35 @@ def parity_gate( } +def _remove_stale_external_data(dst: Path) -> list[Path]: + """Delete external-data leftovers from a PRIOR conversion of *this* model. + + ``onnx.save`` writes the FP16 weights to ``{dst.stem}.bin`` next to ``dst``. + A leftover ``.bin``/``.data`` from an earlier run at the same path would sit + alongside the new one and inflate our size accounting, so we clear it first. + + Scope is strictly this model's stem. The previous implementation globbed + every ``*.bin`` / ``*.data`` in ``dst.parent`` and unlinked them — which + destroyed *other* models' weight files whenever a user converted into a + shared export directory (real data-loss bug). ``dst`` itself (the ``.onnx``) + is never touched. + + Returns the list of paths actually removed (for logging / tests). + """ + removed: list[Path] = [] + for old in dst.parent.glob(f"{dst.stem}*"): + if old == dst: + continue + if old.suffix not in (".bin", ".data"): + continue + try: + old.unlink() + removed.append(old) + except OSError: + pass + return removed + + def convert_fp32_to_fp16( fp32_onnx_path: str | Path, fp16_onnx_path: str | Path, @@ -205,15 +234,10 @@ def convert_fp32_to_fp16( ) logger.info("[fp16] Saving %s...", dst) - # Remove any leftover external data from a prior run at this path — - # onnx.save would otherwise leave both the old and new .bin on disk - # and our size accounting would double-count. - for pat in ("*.bin", "*.data"): - for old in dst.parent.glob(pat): - try: - old.unlink() - except Exception: - pass + # Clear leftover external data from a prior conversion of THIS model only. + # (Scoped to dst.stem — see _remove_stale_external_data; a blanket + # *.bin/*.data sweep would delete other models' weights in a shared dir.) + _remove_stale_external_data(dst) onnx.save( model_fp16, diff --git a/tests/test_fp16_convert_external_data.py b/tests/test_fp16_convert_external_data.py new file mode 100644 index 00000000..0c3776fb --- /dev/null +++ b/tests/test_fp16_convert_external_data.py @@ -0,0 +1,61 @@ +"""Regression tests for fp16_convert external-data cleanup scoping. + +Guards the data-loss bug where the pre-save cleanup globbed every *.bin/*.data +in the destination's parent directory and unlinked them — destroying OTHER +models' weight files when a user converted into a shared export dir. The fix +scopes deletion strictly to the model being converted (dst.stem). +""" +from __future__ import annotations + +from pathlib import Path + +from tether.exporters.fp16_convert import _remove_stale_external_data + + +def _touch(p: Path) -> Path: + p.write_bytes(b"x") + return p + + +def test_removes_this_models_stale_bin(tmp_path: Path) -> None: + dst = tmp_path / "model_fp16.onnx" + stale = _touch(tmp_path / "model_fp16.bin") + removed = _remove_stale_external_data(dst) + assert stale in removed + assert not stale.exists() + + +def test_removes_legacy_data_variants_for_this_model(tmp_path: Path) -> None: + dst = tmp_path / "model_fp16.onnx" + legacy1 = _touch(tmp_path / "model_fp16.data") + legacy2 = _touch(tmp_path / "model_fp16.onnx.data") + removed = _remove_stale_external_data(dst) + assert legacy1 in removed and legacy2 in removed + assert not legacy1.exists() and not legacy2.exists() + + +def test_preserves_other_models_weights(tmp_path: Path) -> None: + """The core data-loss guard: a sibling model's external data must survive.""" + dst = tmp_path / "model_fp16.onnx" + _touch(tmp_path / "model_fp16.bin") # ours — will be removed + other_bin = _touch(tmp_path / "other_model.bin") + other_data = _touch(tmp_path / "unrelated.onnx.data") + other_onnx = _touch(tmp_path / "other_model.onnx") + + removed = _remove_stale_external_data(dst) + + assert other_bin.exists(), "sibling model .bin was destroyed" + assert other_data.exists(), "unrelated .onnx.data was destroyed" + assert other_onnx.exists() + assert other_bin not in removed and other_data not in removed + + +def test_never_removes_dst_itself(tmp_path: Path) -> None: + dst = _touch(tmp_path / "model_fp16.onnx") + _remove_stale_external_data(dst) + assert dst.exists(), "dst .onnx must never be unlinked" + + +def test_no_leftovers_is_noop(tmp_path: Path) -> None: + dst = tmp_path / "model_fp16.onnx" + assert _remove_stale_external_data(dst) == []