From 114af7a5e8b7d21977847befa612bd5f2651ae31 Mon Sep 17 00:00:00 2001 From: Evgeny Kotov Date: Fri, 29 May 2026 12:27:45 +0200 Subject: [PATCH 1/4] [PT FE] Support torch_fused GPTQ quant type MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit gptqmodel auto-selects a TorchFusedQuantLinear backend whose layers report QUANT_TYPE == "torch_fused" while using the standard 4-bit/int32 GPTQ weight packing. The GPTQ patcher only accepted a fixed whitelist of quant types, so both the convert and torch.export paths rejected such models — convert then failed downstream with "No conversion rule for aten::bitwise_right_shift". Add "torch_fused" to supported_quant_types; the single whitelist gates both paths, so OpenVINO's decompression pattern is produced and the weights fold to a u4 constant as before. Pin and document the transformers/gptqmodel versions in the LLM model-hub env so the backend selection cannot drift silently, and re-enable the opt_gptq model-hub entries. Add hermetic PyTorch FE layer tests covering the convert (keeps u4, no live BitwiseRightShift) and export paths. 
--- .../src/openvino/frontend/pytorch/gptq.py | 2 +- .../py_frontend_tests/test_torch_frontend.py | 98 +++++++++++++++++++ tests/model_hub_tests/pytorch/envs/llm.txt | 6 ++ tests/model_hub_tests/pytorch/test_llm.py | 4 +- 4 files changed, 107 insertions(+), 3 deletions(-) diff --git a/src/bindings/python/src/openvino/frontend/pytorch/gptq.py b/src/bindings/python/src/openvino/frontend/pytorch/gptq.py index f185109b9945c2..329a64e891c51f 100644 --- a/src/bindings/python/src/openvino/frontend/pytorch/gptq.py +++ b/src/bindings/python/src/openvino/frontend/pytorch/gptq.py @@ -94,7 +94,7 @@ def patched_forward_sym(self, *args, **kwargs): # All the following AutoGPTQ/GPTQModel quant types are supposed to have the same weights packing schema -supported_quant_types = ["triton", "exllama", "exllamav2", "cuda-old", "hf_kernel"] +supported_quant_types = ["triton", "exllama", "exllamav2", "cuda-old", "hf_kernel", "torch_fused"] def patch_model(model): diff --git a/tests/layer_tests/py_frontend_tests/test_torch_frontend.py b/tests/layer_tests/py_frontend_tests/test_torch_frontend.py index 47c0296d09c079..a293347e2367be 100644 --- a/tests/layer_tests/py_frontend_tests/test_torch_frontend.py +++ b/tests/layer_tests/py_frontend_tests/test_torch_frontend.py @@ -2317,6 +2317,104 @@ def forward(self, x): assert not hasattr(m, "_openvino_quantized_patch_orig_forward") +def _make_torch_fused_gptq_model(in_features=32, out_features=64, group_size=32): + """Build a minimal GPTQ model whose linear layer mimics gptqmodel's + ``TorchFusedQuantLinear`` backend (``QUANT_TYPE == "torch_fused"``), using the + standard 4-bit/int32 weight packing the OpenVINO GPTQ patcher expects. The + layer's own ``forward`` is a placeholder — OpenVINO replaces it with its + decompression forward before tracing/export, so only the packed buffers and + attributes need to be realistic. 
+ """ + bits = 4 + pack_num = 32 // bits # 8 nibbles per int32 + + class FakeQuantConfig: + quant_method = "gptq" + sym = True + + class FakeConfig: + quantization_config = FakeQuantConfig() + + class TorchFusedLinear(torch.nn.Module): + QUANT_TYPE = "torch_fused" + + def __init__(self): + super().__init__() + self.bits = bits + self.group_size = group_size + # Real GPTQ backends register the packed tensors as buffers (not + # parameters); the OpenVINO patcher re-assigns plain tensors to them. + self.register_buffer("qweight", torch.randint( + 0, 2 ** 31, (in_features // pack_num, out_features), + dtype=torch.int32)) + self.register_buffer("qzeros", torch.randint( + 0, 2 ** 31, (in_features // group_size, out_features // pack_num), + dtype=torch.int32)) + self.register_buffer("scales", torch.randn( + in_features // group_size, out_features, dtype=torch.float16)) + self.bias = None + + def forward(self, x): + return torch.zeros(*x.shape[:-1], out_features, dtype=x.dtype) + + class GPTQModel(torch.nn.Module): + def __init__(self): + super().__init__() + self.config = FakeConfig() + self.linear = TorchFusedLinear() + + def forward(self, x): + return self.linear(x) + + return GPTQModel(), torch.randn(2, in_features) + + +def test_gptq_torch_fused_convert_keeps_u4(): + """A GPTQ model whose layers report ``QUANT_TYPE == "torch_fused"`` must convert + via the TorchScript path and keep its 4-bit weight packing: the resulting + ov::Model must contain a 4-bit (i4/u4) Constant and no live ``BitwiseRightShift`` + weight-unpacking op.""" + from openvino.frontend.pytorch.ts_decoder import TorchScriptPythonDecoder + + model, x = _make_torch_fused_gptq_model() + model.eval() + + # Convert through the frontend directly: TorchScriptPythonDecoder traces the + # model and auto-applies the GPTQ patch, and FrontEnd.convert keeps the u4 + # weight constant produced by the u4_compression_stack fold. 
The full + # openvino.convert_model MOC pipeline would constant-fold the all-constant + # dequant subgraph of this tiny fixture, hiding the packing under test. + decoder = TorchScriptPythonDecoder(model, example_input=(x,)) + fe = FrontEndManager().load_by_framework("pytorch") + ov_model = fe.convert(fe.load(decoder)) + assert ov_model + + ops = ov_model.get_ops() + type_names = [o.get_type_name() for o in ops] + # The GPTQ unpacking must have been folded away (no runtime bit-shift unpacking). + assert "BitwiseRightShift" not in type_names + # ...and the weights must be stored as a packed 4-bit constant. + four_bit_consts = [o for o in ops + if o.get_type_name() == "Constant" + and o.get_output_element_type(0) in (Type.i4, Type.u4)] + assert four_bit_consts, "expected a packed 4-bit (i4/u4) weight constant" + + +def test_gptq_torch_fused_export_supported(): + """``patch_quantized_for_export`` must accept ``QUANT_TYPE == "torch_fused"`` + rather than raising ``ValueError`` for the unsupported quant type.""" + from openvino.frontend.pytorch.quantized import ( + patch_quantized_for_export, unpatch_quantized_for_export) + + model, _ = _make_torch_fused_gptq_model() + + patch_quantized_for_export(model) # must not raise + try: + assert hasattr(model.linear, "_openvino_quantized_patch_orig_forward") + finally: + unpatch_quantized_for_export(model) + + # ────────────────────────────────────────────────────────────────────── # Tests for dynamo=True auto-patching of quantized models # ────────────────────────────────────────────────────────────────────── diff --git a/tests/model_hub_tests/pytorch/envs/llm.txt b/tests/model_hub_tests/pytorch/envs/llm.txt index d83136f8536d56..054d739fc2bb76 100644 --- a/tests/model_hub_tests/pytorch/envs/llm.txt +++ b/tests/model_hub_tests/pytorch/envs/llm.txt @@ -1,5 +1,11 @@ # Extra dependencies for test_llm.py (LLM quantized models) # These are NOT needed by test_hf_transformers.py +# +# Versions below are hard-pinned (==) intentionally. 
transformers/gptqmodel are bumped +# deliberately, not automatically: a silent upgrade changes the GPTQ backend selection +# (e.g. gptqmodel auto-selecting TorchFusedQuantLinear -> QUANT_TYPE "torch_fused") and the +# generated graph, which previously broke conversion. Bump these together and re-validate +# the opt_gptq entry in test_llm.py before raising them. transformers==5.5.3 huggingface-hub==1.10.1 diff --git a/tests/model_hub_tests/pytorch/test_llm.py b/tests/model_hub_tests/pytorch/test_llm.py index c3277efe5e98c5..f401824fc5a5cf 100644 --- a/tests/model_hub_tests/pytorch/test_llm.py +++ b/tests/model_hub_tests/pytorch/test_llm.py @@ -596,7 +596,7 @@ def get_supported_precommit_models(): ] if platform.machine() not in ['arm', 'armv7l', 'aarch64', 'arm64', 'ARM64']: models.extend([ - #("opt_gptq", "katuni4ka/opt-125m-gptq"), + ("opt_gptq", "katuni4ka/opt-125m-gptq"), ("llama", "TinyLlama/TinyLlama-1.1B-Chat-v1.0"), ("llama_awq", "casperhansen/tinyllama-1b-awq"), ]) @@ -658,7 +658,7 @@ def get_supported_export_precommit_models(): return [] return [ ("llama_awq", "casperhansen/tinyllama-1b-awq"), - #("opt_gptq", "katuni4ka/opt-125m-gptq"), + ("opt_gptq", "katuni4ka/opt-125m-gptq"), ] @pytest.mark.parametrize("type,name", get_supported_export_precommit_models()) From f9dbbd84fa2f5669f30cca6f3c94eb2e3dcf0929 Mon Sep 17 00:00:00 2001 From: Evgeny Kotov Date: Mon, 1 Jun 2026 14:10:49 +0200 Subject: [PATCH 2/4] [PT FE] Pin kernels to 0.14.1 in LLM model-hub env kernels is pulled transitively by transformers but was left unpinned, so it drifted to 0.15.x in CI. kernels>=0.15 made LayerRepository require a version or revision, which transformers 5.5.3's hub_kernels.py constructs without, breaking 'import transformers' and failing every LLM model-hub test. Pin kernels/kernels-data to the validated 0.14.1 so this transitive dependency cannot drift silently and break 'import transformers' again.
--- tests/model_hub_tests/pytorch/envs/llm.txt | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/tests/model_hub_tests/pytorch/envs/llm.txt b/tests/model_hub_tests/pytorch/envs/llm.txt index 054d739fc2bb76..652eddc89b62b8 100644 --- a/tests/model_hub_tests/pytorch/envs/llm.txt +++ b/tests/model_hub_tests/pytorch/envs/llm.txt @@ -10,6 +10,12 @@ transformers==5.5.3 huggingface-hub==1.10.1 +# kernels is pulled transitively by transformers; pin it too. kernels>=0.15 made +# LayerRepository require a version/revision, which transformers 5.5.3's hub_kernels.py +# constructs without -> ImportError at "import transformers". Keep at the validated 0.14.1. +kernels==0.14.1 +kernels-data==0.14.1 + # quantized model deps autoawq==0.2.9; platform_system == "Linux" and platform_machine == "x86_64" triton==3.6.0; platform_system == "Linux" and platform_machine == "x86_64" From ebd34c899ed6c7515799ed9a7b544afb09c4d325 Mon Sep 17 00:00:00 2001 From: Evgeny Kotov Date: Wed, 3 Jun 2026 11:50:35 +0200 Subject: [PATCH 3/4] [PT FE] Drop duplicate kernels pin in LLM model-hub env kernels==0.14.1 was pinned twice: a pre-existing conditional pin (scoped to x86_64 + python<3.12, for gptqmodel's uncapped kernels dependency) and a newer unconditional pin added because transformers also pulls kernels uncapped and import transformers breaks with kernels>=0.15. The unconditional pin is a strict superset, so the conditional one is redundant. Keep the single unconditional kernels/kernels-data pin and fold the gptqmodel rationale into its comment; remove the redundant conditional line. 
--- tests/model_hub_tests/pytorch/envs/llm.txt | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/tests/model_hub_tests/pytorch/envs/llm.txt b/tests/model_hub_tests/pytorch/envs/llm.txt index 652eddc89b62b8..aa1de5617bbc3c 100644 --- a/tests/model_hub_tests/pytorch/envs/llm.txt +++ b/tests/model_hub_tests/pytorch/envs/llm.txt @@ -10,9 +10,10 @@ transformers==5.5.3 huggingface-hub==1.10.1 -# kernels is pulled transitively by transformers; pin it too. kernels>=0.15 made -# LayerRepository require a version/revision, which transformers 5.5.3's hub_kernels.py -# constructs without -> ImportError at "import transformers". Keep at the validated 0.14.1. +# kernels (and kernels-data) are pulled transitively by transformers and gptqmodel, neither of +# which caps the version. kernels>=0.15 made LayerRepository require a version/revision, which +# transformers 5.5.3's hub_kernels.py constructs without -> ImportError at "import transformers". +# Keep at the validated 0.14.1. kernels==0.14.1 kernels-data==0.14.1 @@ -20,9 +21,6 @@ kernels-data==0.14.1 autoawq==0.2.9; platform_system == "Linux" and platform_machine == "x86_64" triton==3.6.0; platform_system == "Linux" and platform_machine == "x86_64" gptqmodel==6.0.3; platform_system == "Linux" and platform_machine == "x86_64" and python_version < "3.12" -# `gptqmodel` depends on `kernels`, but doesn't have upper boundary for version, which caused test failures after -# `kernels` was updated to 0.15.1 -kernels==0.14.1; platform_machine == "x86_64" and python_version < "3.12" peft==0.18.1; platform_system == "Linux" and platform_machine == "x86_64" and python_version < "3.12" From f88507afe09beb0eb897ba87c3c8bb91f8d0d686 Mon Sep 17 00:00:00 2001 From: Evgeny Kotov Date: Fri, 5 Jun 2026 15:44:54 +0200 Subject: [PATCH 4/4] [PT FE] Inherit input device in torch_fused GPTQ test fixture The placeholder forward in the torch_fused GPTQ test fixture built its output on the default device. 
Inherit the input's device alongside its dtype so the fixture stays correct if the example input is ever placed on a non-default device. OpenVINO replaces this forward before tracing, so behaviour on CPU is unchanged. --- tests/layer_tests/py_frontend_tests/test_torch_frontend.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/layer_tests/py_frontend_tests/test_torch_frontend.py b/tests/layer_tests/py_frontend_tests/test_torch_frontend.py index a293347e2367be..cda59c2a7bdf34 100644 --- a/tests/layer_tests/py_frontend_tests/test_torch_frontend.py +++ b/tests/layer_tests/py_frontend_tests/test_torch_frontend.py @@ -2355,7 +2355,7 @@ def __init__(self): self.bias = None def forward(self, x): - return torch.zeros(*x.shape[:-1], out_features, dtype=x.dtype) + return torch.zeros(*x.shape[:-1], out_features, dtype=x.dtype, device=x.device) class GPTQModel(torch.nn.Module): def __init__(self):