From ecfd72f2db53986b7047ebf20c6441f8294bf448 Mon Sep 17 00:00:00 2001 From: gasoonjia Date: Wed, 10 Jun 2026 11:25:33 -0700 Subject: [PATCH] [cuda] int4: stabilize two-layer decode test via CUDA-seeded init _make_int4_linear built the throwaway nn.Linear on CPU, so reset_parameters() drew from the CPU RNG between the two layer constructions and shifted the stream that seeds the quantized weights. That pushed test_two_layer_mlp's genuine INT4 error from 0.1405 to 0.1556, crossing the 0.15 bound. Build the module with device=cuda so init draws from the CUDA RNG, leaving the CPU stream (and the measured error) deterministic. Test-only; dequant math is unchanged. --- backends/cuda/tests/test_int4_dispatch.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/backends/cuda/tests/test_int4_dispatch.py b/backends/cuda/tests/test_int4_dispatch.py index fd748ae8584..ecf1a53e48e 100644 --- a/backends/cuda/tests/test_int4_dispatch.py +++ b/backends/cuda/tests/test_int4_dispatch.py @@ -59,7 +59,10 @@ def _make_int4_linear(N, K, group_size=128, symmetric=False, bias=False): ) int4_w = quantize_weight(w_bf16, config) - module = nn.Linear(K, N, bias=bias, dtype=torch.bfloat16) + # device="cuda" so the random init draws from the CUDA RNG and leaves the + # CPU RNG stream that seeds the quantized weights untouched, keeping the + # measured INT4 error within the test's tolerance. + module = nn.Linear(K, N, bias=bias, dtype=torch.bfloat16, device="cuda") pack_linear_for_cuda(module, {"weight": int4_w}) module.cuda() return module, w_bf16.cuda()