diff --git a/src/tether/exporters/fp16_convert.py b/src/tether/exporters/fp16_convert.py index 7c6bfe5..d389fcd 100644 --- a/src/tether/exporters/fp16_convert.py +++ b/src/tether/exporters/fp16_convert.py @@ -115,6 +115,35 @@ def parity_gate( } +def _remove_stale_external_data(dst: Path) -> list[Path]: + """Delete external-data leftovers from a PRIOR conversion of *this* model. + + ``onnx.save`` writes the FP16 weights to ``{dst.stem}.bin`` next to ``dst``. + A leftover ``.bin``/``.data`` from an earlier run at the same path would sit + alongside the new one and inflate our size accounting, so we clear it first. + + Scope is strictly this model's stem. The previous implementation globbed + every ``*.bin`` / ``*.data`` in ``dst.parent`` and unlinked them — which + destroyed *other* models' weight files whenever a user converted into a + shared export directory (real data-loss bug). ``dst`` itself (the ``.onnx``) + is never touched. + + Returns the list of paths actually removed (for logging / tests). + """ + removed: list[Path] = [] + for old in dst.parent.glob(f"{dst.stem}*"): + if old == dst: + continue + if old.suffix not in (".bin", ".data"): + continue + try: + old.unlink() + removed.append(old) + except OSError: + pass + return removed + + def convert_fp32_to_fp16( fp32_onnx_path: str | Path, fp16_onnx_path: str | Path, @@ -205,15 +234,10 @@ def convert_fp32_to_fp16( ) logger.info("[fp16] Saving %s...", dst) - # Remove any leftover external data from a prior run at this path — - # onnx.save would otherwise leave both the old and new .bin on disk - # and our size accounting would double-count. - for pat in ("*.bin", "*.data"): - for old in dst.parent.glob(pat): - try: - old.unlink() - except Exception: - pass + # Clear leftover external data from a prior conversion of THIS model only. + # (Scoped to dst.stem — see _remove_stale_external_data; a blanket + # *.bin/*.data sweep would delete other models' weights in a shared dir.) + _remove_stale_external_data(dst) onnx.save( model_fp16, diff --git a/tests/test_fp16_convert_external_data.py b/tests/test_fp16_convert_external_data.py new file mode 100644 index 0000000..0c3776f --- /dev/null +++ b/tests/test_fp16_convert_external_data.py @@ -0,0 +1,61 @@ +"""Regression tests for fp16_convert external-data cleanup scoping. + +Guards the data-loss bug where the pre-save cleanup globbed every *.bin/*.data +in the destination's parent directory and unlinked them — destroying OTHER +models' weight files when a user converted into a shared export dir. The fix +scopes deletion strictly to the model being converted (dst.stem). +""" +from __future__ import annotations + +from pathlib import Path + +from tether.exporters.fp16_convert import _remove_stale_external_data + + +def _touch(p: Path) -> Path: + p.write_bytes(b"x") + return p + + +def test_removes_this_models_stale_bin(tmp_path: Path) -> None: + dst = tmp_path / "model_fp16.onnx" + stale = _touch(tmp_path / "model_fp16.bin") + removed = _remove_stale_external_data(dst) + assert stale in removed + assert not stale.exists() + + +def test_removes_legacy_data_variants_for_this_model(tmp_path: Path) -> None: + dst = tmp_path / "model_fp16.onnx" + legacy1 = _touch(tmp_path / "model_fp16.data") + legacy2 = _touch(tmp_path / "model_fp16.onnx.data") + removed = _remove_stale_external_data(dst) + assert legacy1 in removed and legacy2 in removed + assert not legacy1.exists() and not legacy2.exists() + + +def test_preserves_other_models_weights(tmp_path: Path) -> None: + """The core data-loss guard: a sibling model's external data must survive.""" + dst = tmp_path / "model_fp16.onnx" + _touch(tmp_path / "model_fp16.bin") # ours — will be removed + other_bin = _touch(tmp_path / "other_model.bin") + other_data = _touch(tmp_path / "unrelated.onnx.data") + other_onnx = _touch(tmp_path / "other_model.onnx") + + removed = _remove_stale_external_data(dst) + + assert other_bin.exists(), "sibling model .bin was destroyed" + assert other_data.exists(), "unrelated .onnx.data was destroyed" + assert other_onnx.exists() + assert other_bin not in removed and other_data not in removed + + +def test_never_removes_dst_itself(tmp_path: Path) -> None: + dst = _touch(tmp_path / "model_fp16.onnx") + _remove_stale_external_data(dst) + assert dst.exists(), "dst .onnx must never be unlinked" + + +def test_no_leftovers_is_noop(tmp_path: Path) -> None: + dst = tmp_path / "model_fp16.onnx" + assert _remove_stale_external_data(dst) == []