diff --git a/tests/contract/test_turboquant_estimate_contract.py b/tests/contract/test_turboquant_estimate_contract.py
new file mode 100644
index 0000000..5916d6e
--- /dev/null
+++ b/tests/contract/test_turboquant_estimate_contract.py
@@ -0,0 +1,85 @@
+from __future__ import annotations
+
+import numpy as np
+import pytest
+
+from semafold import EncodeMetric, EncodeObjective, VectorEncodeRequest
+from semafold.turboquant import (
+    TurboQuantMSEConfig,
+    TurboQuantMSEVectorCodec,
+    TurboQuantProdConfig,
+    TurboQuantProdVectorCodec,
+)
+
+
+def _normalized_rows(*, seed: int, shape: tuple[int, int], dtype: type[np.generic]) -> np.ndarray:
+    rng = np.random.default_rng(seed)
+    rows = rng.normal(size=shape).astype(np.float32)
+    norms = np.linalg.norm(rows.astype(np.float64), axis=1, keepdims=True).astype(np.float32)
+    norms = np.where(norms == 0.0, np.float32(1.0), norms)
+    return np.asarray(rows / norms, dtype=dtype)
+
+
+@pytest.mark.parametrize(
+    ("codec", "encode_request"),
+    [
+        (
+            TurboQuantMSEVectorCodec(
+                config=TurboQuantMSEConfig(default_bits_per_scalar=3, default_rotation_seed=7)
+            ),
+            VectorEncodeRequest(
+                data=np.random.default_rng(17).normal(size=(8, 32)).astype(np.float32),
+                objective=EncodeObjective.RECONSTRUCTION,
+                metric=EncodeMetric.MSE,
+                role="embedding",
+                seed=19,
+            ),
+        ),
+        (
+            TurboQuantProdVectorCodec(
+                config=TurboQuantProdConfig(total_bits_per_scalar=4, default_rotation_seed=7, default_qjl_seed=11)
+            ),
+            VectorEncodeRequest(
+                data=_normalized_rows(seed=23, shape=(8, 32), dtype=np.float32),
+                objective=EncodeObjective.INNER_PRODUCT_ESTIMATION,
+                metric=EncodeMetric.DOT_PRODUCT_ERROR,
+                role="embedding",
+                seed=29,
+            ),
+        ),
+    ],
+)
+def test_turboquant_estimate_contract_exposes_exact_accounting_fields(
+    codec,
+    encode_request: VectorEncodeRequest,
+) -> None:
+    estimate = codec.estimate(encode_request)
+    encoding = codec.encode(encode_request)
+
+    assert estimate.baseline_bytes == int(encode_request.data.nbytes)
+    assert estimate.estimated_payload_bytes is not None
+    assert estimate.estimated_metadata_bytes is not None
+    assert estimate.estimated_sidecar_bytes is not None
+    assert estimate.estimated_protected_passthrough_bytes == 0
+    assert estimate.estimated_decoder_state_bytes == 0
+    assert estimate.estimated_total_bytes is not None
+    assert estimate.estimated_compression_ratio is not None
+
+    assert estimate.estimated_total_bytes == (
+        estimate.estimated_payload_bytes
+        + estimate.estimated_metadata_bytes
+        + estimate.estimated_sidecar_bytes
+        + estimate.estimated_protected_passthrough_bytes
+        + estimate.estimated_decoder_state_bytes
+    )
+    assert estimate.estimated_compression_ratio == pytest.approx(
+        float(estimate.baseline_bytes) / float(estimate.estimated_total_bytes)
+    )
+
+    assert encoding.footprint.payload_bytes == estimate.estimated_payload_bytes
+    assert encoding.footprint.metadata_bytes == estimate.estimated_metadata_bytes
+    assert encoding.footprint.sidecar_bytes == estimate.estimated_sidecar_bytes
+    assert encoding.footprint.protected_passthrough_bytes == estimate.estimated_protected_passthrough_bytes
+    assert encoding.footprint.decoder_state_bytes == estimate.estimated_decoder_state_bytes
+    assert encoding.footprint.total_bytes == estimate.estimated_total_bytes
+    assert encoding.footprint.compression_ratio == pytest.approx(estimate.estimated_compression_ratio)
diff --git a/tests/integration/test_turboquant_estimate_consistency.py b/tests/integration/test_turboquant_estimate_consistency.py
new file mode 100644
index 0000000..22ebc5b
--- /dev/null
+++ b/tests/integration/test_turboquant_estimate_consistency.py
@@ -0,0 +1,105 @@
+from __future__ import annotations
+
+import numpy as np
+import pytest
+
+from semafold import VectorEncodeRequest
+from semafold.turboquant import (
+    TurboQuantMSEConfig,
+    TurboQuantMSEVectorCodec,
+    TurboQuantProdConfig,
+    TurboQuantProdVectorCodec,
+)
+from semafold.vector.models import EncodeMetric, EncodeObjective
+
+
+def _normalized_data(*, seed: int, shape: tuple[int, ...], dtype: type[np.generic]) -> np.ndarray:
+    rng = np.random.default_rng(seed)
+    data = rng.normal(size=shape).astype(np.float32)
+    if len(shape) == 2:
+        norms = np.linalg.norm(data.astype(np.float64), axis=1, keepdims=True).astype(np.float32)
+        norms = np.where(norms == 0.0, np.float32(1.0), norms)
+        data = np.asarray(data / norms, dtype=np.float32)
+    return data.astype(dtype)
+
+
+@pytest.mark.parametrize(
+    ("codec_factory", "request_factory", "seed"),
+    [
+        (
+            lambda: TurboQuantMSEVectorCodec(
+                config=TurboQuantMSEConfig(default_bits_per_scalar=1, default_rotation_seed=5)
+            ),
+            lambda seed: VectorEncodeRequest(
+                data=_normalized_data(seed=seed, shape=(16,), dtype=np.float32),
+                objective=EncodeObjective.RECONSTRUCTION,
+                metric=EncodeMetric.MSE,
+                role="embedding",
+                seed=11,
+            ),
+            101,
+        ),
+        (
+            lambda: TurboQuantMSEVectorCodec(
+                config=TurboQuantMSEConfig(default_bits_per_scalar=4, default_rotation_seed=5)
+            ),
+            lambda seed: VectorEncodeRequest(
+                data=_normalized_data(seed=seed, shape=(6, 32), dtype=np.float64),
+                objective=EncodeObjective.RECONSTRUCTION,
+                metric=EncodeMetric.MSE,
+                role="embedding",
+                seed=13,
+            ),
+            202,
+        ),
+        (
+            lambda: TurboQuantProdVectorCodec(
+                config=TurboQuantProdConfig(total_bits_per_scalar=2, default_rotation_seed=7, default_qjl_seed=17)
+            ),
+            lambda seed: VectorEncodeRequest(
+                data=_normalized_data(seed=seed, shape=(8, 32), dtype=np.float32),
+                objective=EncodeObjective.INNER_PRODUCT_ESTIMATION,
+                metric=EncodeMetric.DOT_PRODUCT_ERROR,
+                role="embedding",
+                seed=19,
+            ),
+            303,
+        ),
+        (
+            lambda: TurboQuantProdVectorCodec(
+                config=TurboQuantProdConfig(total_bits_per_scalar=5, default_rotation_seed=7, default_qjl_seed=17)
+            ),
+            lambda seed: VectorEncodeRequest(
+                data=_normalized_data(seed=seed, shape=(4, 64), dtype=np.float16),
+                objective=EncodeObjective.INNER_PRODUCT_ESTIMATION,
+                metric=EncodeMetric.DOT_PRODUCT_ERROR,
+                role="embedding",
+                seed=23,
+            ),
+            404,
+        ),
+    ],
+)
+def test_turboquant_estimate_matches_encode_across_supported_shapes_and_precisions(
+    codec_factory,
+    request_factory,
+    seed: int,
+) -> None:
+    codec = codec_factory()
+    encode_request = request_factory(seed)
+
+    estimate = codec.estimate(encode_request)
+    encoding = codec.encode(encode_request)
+
+    assert estimate.estimated_total_bytes is not None
+    assert estimate.estimated_payload_bytes is not None
+    assert estimate.estimated_metadata_bytes is not None
+    assert estimate.estimated_sidecar_bytes is not None
+    assert estimate.estimated_compression_ratio is not None
+
+    assert encoding.footprint.total_bytes == estimate.estimated_total_bytes
+    assert encoding.footprint.payload_bytes == estimate.estimated_payload_bytes
+    assert encoding.footprint.metadata_bytes == estimate.estimated_metadata_bytes
+    assert encoding.footprint.sidecar_bytes == estimate.estimated_sidecar_bytes
+    assert encoding.footprint.compression_ratio == pytest.approx(estimate.estimated_compression_ratio)
+    assert encoding.footprint.baseline_bytes == estimate.baseline_bytes
diff --git a/tests/integration/test_turboquant_kv_rate_distortion.py b/tests/integration/test_turboquant_kv_rate_distortion.py
new file mode 100644
index 0000000..5e27307
--- /dev/null
+++ b/tests/integration/test_turboquant_kv_rate_distortion.py
@@ -0,0 +1,181 @@
+from __future__ import annotations
+
+import numpy as np
+
+from semafold.turboquant.kv import TurboQuantKVConfig, TurboQuantKVPreviewCodec
+
+
+def _normalize_last_axis(array: np.ndarray) -> np.ndarray:
+    norms = np.linalg.norm(array.astype(np.float64), axis=-1, keepdims=True).astype(np.float32)
+    norms = np.where(norms == 0.0, np.float32(1.0), norms)
+    return np.asarray(array / norms, dtype=np.float32)
+
+
+def _softmax(array: np.ndarray, *, axis: int = -1) -> np.ndarray:
+    shifted = array - np.max(array, axis=axis, keepdims=True)
+    exp = np.exp(shifted)
+    return exp / np.sum(exp, axis=axis, keepdims=True)
+
+
+def _attention_output(queries: np.ndarray, keys: np.ndarray, values: np.ndarray) -> np.ndarray:
+    scale = float(np.sqrt(keys.shape[-1], dtype=np.float32))
+    scores = np.einsum("bhqd,bhkd->bhqk", queries.astype(np.float64), keys.astype(np.float64)) / scale
+    weights = _softmax(scores, axis=-1)
+    return np.einsum("bhqk,bhkd->bhqd", weights, values.astype(np.float64))
+
+
+def _sample_attention_inputs(*, seed: int = 123) -> tuple[np.ndarray, np.ndarray, np.ndarray]:
+    rng = np.random.default_rng(seed)
+    queries = _normalize_last_axis(rng.standard_normal((2, 2, 5, 16), dtype=np.float32))
+    keys = _normalize_last_axis(rng.standard_normal((2, 2, 7, 16), dtype=np.float32))
+    values = rng.standard_normal((2, 2, 7, 16), dtype=np.float32)
+    return queries, keys, values
+
+
+def _attention_quality(
+    *,
+    queries: np.ndarray,
+    keys: np.ndarray,
+    values: np.ndarray,
+    codec: TurboQuantKVPreviewCodec,
+) -> tuple[dict[str, float | int], float, float]:
+    artifact = codec.compress(keys, values)
+    restored_keys, restored_values = codec.decompress(artifact)
+    exact_output = _attention_output(queries, keys, values)
+    approx_output = _attention_output(queries, restored_keys, restored_values)
+    mse = float(np.mean(np.square(exact_output - approx_output)))
+    cosine_similarity = float(
+        np.sum(exact_output * approx_output)
+        / ((np.linalg.norm(exact_output) + 1e-12) * (np.linalg.norm(approx_output) + 1e-12))
+    )
+    return codec.memory_stats(artifact), mse, cosine_similarity
+
+
+def test_turboquant_kv_rate_distortion_tradeoff_is_visible_in_memory_stats_and_attention_quality() -> None:
+    queries, keys, values = _sample_attention_inputs()
+
+    low_codec = TurboQuantKVPreviewCodec(
+        config=TurboQuantKVConfig(
+            key_total_bits_per_scalar=2,
+            value_bits_per_scalar=1,
+            default_key_rotation_seed=7,
+            default_key_qjl_seed=11,
+            default_value_rotation_seed=17,
+        )
+    )
+    high_codec = TurboQuantKVPreviewCodec(
+        config=TurboQuantKVConfig(
+            key_total_bits_per_scalar=5,
+            value_bits_per_scalar=4,
+            default_key_rotation_seed=7,
+            default_key_qjl_seed=11,
+            default_value_rotation_seed=17,
+        )
+    )
+
+    low_stats, low_mse, low_cosine = _attention_quality(
+        queries=queries,
+        keys=keys,
+        values=values,
+        codec=low_codec,
+    )
+    high_stats, high_mse, high_cosine = _attention_quality(
+        queries=queries,
+        keys=keys,
+        values=values,
+        codec=high_codec,
+    )
+
+    assert int(low_stats["combined_bytes"]) < int(high_stats["combined_bytes"])
+    assert int(low_stats["key_bytes"]) < int(high_stats["key_bytes"])
+    assert int(low_stats["value_bytes"]) < int(high_stats["value_bytes"])
+    assert float(low_stats["combined_compression_ratio"]) > float(high_stats["combined_compression_ratio"])
+
+    assert high_mse < low_mse
+    assert high_cosine > low_cosine
+
+
+def test_turboquant_kv_key_bits_mainly_move_key_memory_and_attention_quality() -> None:
+    queries, keys, values = _sample_attention_inputs()
+
+    low_key_codec = TurboQuantKVPreviewCodec(
+        config=TurboQuantKVConfig(
+            key_total_bits_per_scalar=2,
+            value_bits_per_scalar=3,
+            default_key_rotation_seed=7,
+            default_key_qjl_seed=11,
+            default_value_rotation_seed=17,
+        )
+    )
+    high_key_codec = TurboQuantKVPreviewCodec(
+        config=TurboQuantKVConfig(
+            key_total_bits_per_scalar=5,
+            value_bits_per_scalar=3,
+            default_key_rotation_seed=7,
+            default_key_qjl_seed=11,
+            default_value_rotation_seed=17,
+        )
+    )
+
+    low_stats, low_mse, low_cosine = _attention_quality(
+        queries=queries,
+        keys=keys,
+        values=values,
+        codec=low_key_codec,
+    )
+    high_stats, high_mse, high_cosine = _attention_quality(
+        queries=queries,
+        keys=keys,
+        values=values,
+        codec=high_key_codec,
+    )
+
+    assert int(low_stats["key_bytes"]) < int(high_stats["key_bytes"])
+    assert int(low_stats["combined_bytes"]) < int(high_stats["combined_bytes"])
+    assert abs(int(low_stats["value_bytes"]) - int(high_stats["value_bytes"])) <= 16
+
+    assert high_mse < low_mse
+    assert high_cosine > low_cosine
+
+
+def test_turboquant_kv_value_bits_mainly_move_value_memory_and_attention_quality() -> None:
+    queries, keys, values = _sample_attention_inputs()
+
+    low_value_codec = TurboQuantKVPreviewCodec(
+        config=TurboQuantKVConfig(
+            key_total_bits_per_scalar=3,
+            value_bits_per_scalar=1,
+            default_key_rotation_seed=7,
+            default_key_qjl_seed=11,
+            default_value_rotation_seed=17,
+        )
+    )
+    high_value_codec = TurboQuantKVPreviewCodec(
+        config=TurboQuantKVConfig(
+            key_total_bits_per_scalar=3,
+            value_bits_per_scalar=4,
+            default_key_rotation_seed=7,
+            default_key_qjl_seed=11,
+            default_value_rotation_seed=17,
+        )
+    )
+
+    low_stats, low_mse, low_cosine = _attention_quality(
+        queries=queries,
+        keys=keys,
+        values=values,
+        codec=low_value_codec,
+    )
+    high_stats, high_mse, high_cosine = _attention_quality(
+        queries=queries,
+        keys=keys,
+        values=values,
+        codec=high_value_codec,
+    )
+
+    assert int(low_stats["value_bytes"]) < int(high_stats["value_bytes"])
+    assert int(low_stats["combined_bytes"]) < int(high_stats["combined_bytes"])
+    assert abs(int(low_stats["key_bytes"]) - int(high_stats["key_bytes"])) <= 16
+
+    assert high_mse < low_mse
+    assert high_cosine > low_cosine
diff --git a/tests/integration/test_turboquant_mse_rate_distortion.py b/tests/integration/test_turboquant_mse_rate_distortion.py
new file mode 100644
index 0000000..bb737cf
--- /dev/null
+++ b/tests/integration/test_turboquant_mse_rate_distortion.py
@@ -0,0 +1,46 @@
+from __future__ import annotations
+
+import numpy as np
+
+from semafold import VectorDecodeRequest
+from semafold import VectorEncodeRequest
+from semafold.turboquant import TurboQuantMSEConfig
+from semafold.turboquant import TurboQuantMSEVectorCodec
+from semafold.vector.models import EncodeMetric, EncodeObjective
+
+
+def _observed_mse(encoding) -> float:  # type: ignore[no-untyped-def]
+    value = next(guarantee.value for guarantee in encoding.guarantees if guarantee.metric == "observed_mse")
+    assert isinstance(value, float)
+    return value
+
+
+def _decode_mse(*, data: np.ndarray, encoding) -> float:  # type: ignore[no-untyped-def]
+    decoded = TurboQuantMSEVectorCodec().decode(VectorDecodeRequest(encoding=encoding)).data
+    return float(np.mean((data.astype(np.float64) - decoded.astype(np.float64)) ** 2))
+
+
+def test_turboquant_mse_rate_distortion_tradeoff_is_visible_in_artifact_size_and_decode_error() -> None:
+    rng = np.random.default_rng(7)
+    data = rng.normal(size=(12, 64)).astype(np.float32)
+    request = VectorEncodeRequest(
+        data=data,
+        objective=EncodeObjective.RECONSTRUCTION,
+        metric=EncodeMetric.MSE,
+        role="embedding",
+        seed=11,
+    )
+
+    low = TurboQuantMSEVectorCodec(
+        config=TurboQuantMSEConfig(default_bits_per_scalar=1, default_rotation_seed=5)
+    ).encode(request)
+    high = TurboQuantMSEVectorCodec(
+        config=TurboQuantMSEConfig(default_bits_per_scalar=4, default_rotation_seed=5)
+    ).encode(request)
+
+    assert low.footprint.total_bytes < high.footprint.total_bytes
+    assert low.footprint.payload_bytes < high.footprint.payload_bytes
+    assert low.footprint.compression_ratio > high.footprint.compression_ratio
+
+    assert _observed_mse(high) < _observed_mse(low)
+    assert _decode_mse(data=data, encoding=high) < _decode_mse(data=data, encoding=low)
diff --git a/tests/integration/test_turboquant_prod_rate_distortion.py b/tests/integration/test_turboquant_prod_rate_distortion.py
new file mode 100644
index 0000000..f6ce18e
--- /dev/null
+++ b/tests/integration/test_turboquant_prod_rate_distortion.py
@@ -0,0 +1,61 @@
+from __future__ import annotations
+
+import numpy as np
+
+from semafold import VectorDecodeRequest
+from semafold import VectorEncodeRequest
+from semafold.turboquant import TurboQuantProdConfig
+from semafold.turboquant import TurboQuantProdVectorCodec
+from semafold.vector.models import EncodeMetric, EncodeObjective
+
+
+def _unit_rows(*, seed: int, shape: tuple[int, int]) -> np.ndarray:
+    rng = np.random.default_rng(seed)
+    rows = rng.normal(size=shape).astype(np.float32)
+    norms = np.linalg.norm(rows.astype(np.float64), axis=1, keepdims=True).astype(np.float32)
+    norms = np.where(norms == 0.0, np.float32(1.0), norms)
+    return np.asarray(rows / norms, dtype=np.float32)
+
+
+def _mean_inner_product_error(*, queries: np.ndarray, data: np.ndarray, encoding) -> float:  # type: ignore[no-untyped-def]
+    decoded = TurboQuantProdVectorCodec().decode(VectorDecodeRequest(encoding=encoding)).data.astype(np.float64)
+    exact_scores = queries.astype(np.float64) @ data.astype(np.float64).T
+    approx_scores = queries.astype(np.float64) @ decoded.T
+    return float(np.mean(np.abs(approx_scores - exact_scores)))
+
+
+def _theory_proxy(encoding) -> float:  # type: ignore[no-untyped-def]
+    evidence = next(item for item in encoding.evidence if item.scope == "theory_proxy")
+    value = evidence.metrics["mean_query_free_variance_factor"]
+    assert isinstance(value, float)
+    return value
+
+
+def test_turboquant_prod_rate_distortion_tradeoff_is_visible_in_artifact_size_and_inner_product_quality() -> None:
+    data = _unit_rows(seed=123, shape=(12, 64))
+    queries = _unit_rows(seed=456, shape=(7, 64))
+    request = VectorEncodeRequest(
+        data=data,
+        objective=EncodeObjective.INNER_PRODUCT_ESTIMATION,
+        metric=EncodeMetric.DOT_PRODUCT_ERROR,
+        role="embedding",
+        seed=11,
+    )
+
+    low = TurboQuantProdVectorCodec(
+        config=TurboQuantProdConfig(total_bits_per_scalar=2, default_rotation_seed=5, default_qjl_seed=17)
+    ).encode(request)
+    high = TurboQuantProdVectorCodec(
+        config=TurboQuantProdConfig(total_bits_per_scalar=5, default_rotation_seed=5, default_qjl_seed=17)
+    ).encode(request)
+
+    assert low.footprint.total_bytes < high.footprint.total_bytes
+    assert low.footprint.payload_bytes < high.footprint.payload_bytes
+    assert low.footprint.compression_ratio > high.footprint.compression_ratio
+
+    assert _theory_proxy(high) < _theory_proxy(low)
+    assert _mean_inner_product_error(queries=queries, data=data, encoding=high) < _mean_inner_product_error(
+        queries=queries,
+        data=data,
+        encoding=low,
+    )
diff --git a/tests/regression/test_turboquant_kv_memory_stats_golden.py b/tests/regression/test_turboquant_kv_memory_stats_golden.py
new file mode 100644
index 0000000..2dec653
--- /dev/null
+++ b/tests/regression/test_turboquant_kv_memory_stats_golden.py
@@ -0,0 +1,40 @@
+from __future__ import annotations
+
+import numpy as np
+
+from semafold.turboquant.kv import TurboQuantKVConfig, TurboQuantKVPreviewCodec
+
+
+def _normalize_last_axis(array: np.ndarray) -> np.ndarray:
+    norms = np.linalg.norm(array.astype(np.float64), axis=-1, keepdims=True).astype(np.float32)
+    norms = np.where(norms == 0.0, np.float32(1.0), norms)
+    return np.asarray(array / norms, dtype=np.float32)
+
+
+def test_turboquant_kv_memory_stats_golden_snapshot() -> None:
+    rng = np.random.default_rng(31)
+    keys = _normalize_last_axis(rng.standard_normal((2, 2, 6, 16), dtype=np.float32))
+    values = rng.standard_normal((2, 2, 6, 16), dtype=np.float32)
+    codec = TurboQuantKVPreviewCodec(
+        config=TurboQuantKVConfig(
+            key_total_bits_per_scalar=3,
+            value_bits_per_scalar=3,
+            default_key_rotation_seed=7,
+            default_key_qjl_seed=11,
+            default_value_rotation_seed=17,
+        )
+    )
+
+    artifact = codec.compress(keys, values)
+
+    assert codec.memory_stats(artifact) == {
+        "baseline_bytes": 3072,
+        "baseline_fp16_bytes": 1536,
+        "baseline_bf16_bytes": 1536,
+        "key_bytes": 829,
+        "value_bytes": 722,
+        "combined_bytes": 1551,
+        "combined_compression_ratio": 1.9806576402321083,
+        "combined_compression_ratio_vs_fp16": 0.9903288201160542,
+        "combined_compression_ratio_vs_bf16": 0.9903288201160542,
+    }