From 11571f93d398cb43391af8c2fd7bcedf92f7de02 Mon Sep 17 00:00:00 2001
From: Nitin Jain <jainnitin@meta.com>
Date: Thu, 28 Aug 2025 19:06:46 -0700
Subject: [PATCH] Add 16A8W support and test for mul operation

Add 16A8W quantization support and a test for the mul operation in the ExecuTorch ARM backend.

This follows the pattern established for linear operations, extending int16 support to mul operations.

Changes:
- Add INT16 dtype validation support in op_mul.py
- Add test_mul_tensor_16a8w_tosa_INT test function
- Enable test_mul.py in test targets configuration

The 16A8W configuration uses 16-bit activations with 8-bit weights, enabling higher precision for activations while maintaining weight efficiency.

Differential Revision: [D80510628](https://our.internmc.facebook.com/intern/diff/D80510628/)

[ghstack-poisoned]
---
 backends/arm/operators/op_mul.py  |  2 +-
 backends/arm/test/ops/test_mul.py | 51 ++++++++++++++++++++++++++++++-
 backends/arm/test/targets.bzl     |  1 +
 3 files changed, 52 insertions(+), 2 deletions(-)

diff --git a/backends/arm/operators/op_mul.py b/backends/arm/operators/op_mul.py
index 7d9f6eac6aa..669dd506fc2 100644
--- a/backends/arm/operators/op_mul.py
+++ b/backends/arm/operators/op_mul.py
@@ -51,7 +51,7 @@ def define_node(
         validate_valid_dtype(
             self.target,
             [*inputs, output],
-            [ts.DType.INT8, ts.DType.INT32],
+            [ts.DType.INT8, ts.DType.INT16, ts.DType.INT32],
             output.tosa_spec,
         )
 
diff --git a/backends/arm/test/ops/test_mul.py b/backends/arm/test/ops/test_mul.py
index d8f9e947ce3..ebe3ed7f8f4 100644
--- a/backends/arm/test/ops/test_mul.py
+++ b/backends/arm/test/ops/test_mul.py
@@ -9,8 +9,12 @@
 from typing import Tuple
 
 import torch
+from executorch.backends.arm.quantizer.arm_quantizer import (
+    get_symmetric_a16w8_quantization_config,
+    TOSAQuantizer,
+)
 
-from executorch.backends.arm.test import common
+from executorch.backends.arm.test import common, conftest
 from executorch.backends.arm.test.tester.test_pipeline import (
     EthosU55PipelineINT,
     EthosU85PipelineINT,
@@ -18,6 +22,8 @@
     TosaPipelineINT,
     VgfPipeline,
 )
+from executorch.backends.arm.tosa_specification import TosaSpecification
+from executorch.backends.xnnpack.test.tester import Quantize
 
 input_t1 = Tuple[torch.Tensor, torch.Tensor]  # Input x
 aten_op = "torch.ops.aten.mul.Tensor"
@@ -268,3 +274,46 @@ def test_mul_tensor_vgf_INT_int32(test_data: torch.Tensor):
     )
     pipeline.pop_stage("check.quant_nodes")
     pipeline.run()
+
+
+def get_symmetric_a16w8_mul_quantizer(u55_config=False, per_channel_quantization=False):
+    """Build the 16A8W ``Quantize`` stage for the TOSA version chosen via conftest."""
+    # NOTE: u55_config is accepted for signature compatibility but is not read here.
+    tosa_version = conftest.get_option("tosa_version")
+    tosa_profiles = {
+        "1.0": TosaSpecification.create_from_string("TOSA-1.0+INT+int16"),
+    }
+    if tosa_version not in tosa_profiles:
+        raise KeyError(f"No 16A8W TOSA profile for tosa_version={tosa_version!r}")
+
+    quantizer = TOSAQuantizer(tosa_profiles[tosa_version])
+    # Build the config once so set_global() and Quantize() share the same settings.
+    quant_config = get_symmetric_a16w8_quantization_config(
+        is_per_channel=per_channel_quantization
+    )
+    quantizer.set_global(quant_config)
+    return Quantize(quantizer, quant_config)
+
+
+@common.parametrize("test_data", test_data_suite)
+def test_mul_tensor_16a8w_tosa_INT(test_data: input_t1):
+    """Run Mul through the TOSA INT pipeline using 16A8W quantization."""
+    use_per_channel = False
+
+    pipeline = TosaPipelineINT[input_t1](
+        Mul(),
+        test_data(),
+        aten_op,
+        exir_op=[],
+        per_channel_quantization=use_per_channel,
+        use_to_edge_transform_and_lower=True,
+        tosa_extensions=["int16"],
+    )
+
+    # Replace the arguments of the "quantize" stage with the 16A8W quantize stage.
+    a16w8_stage = get_symmetric_a16w8_mul_quantizer(
+        per_channel_quantization=use_per_channel
+    )
+    pipeline.change_args("quantize", a16w8_stage)
+
+    pipeline.run()
diff --git a/backends/arm/test/targets.bzl b/backends/arm/test/targets.bzl
index 405f1bbf081..b438e556cca 100644
--- a/backends/arm/test/targets.bzl
+++ b/backends/arm/test/targets.bzl
@@ -16,6 +16,7 @@ def define_arm_tests():
         "ops/test_add.py",
         "ops/test_avg_pool2d.py",
         "ops/test_linear.py", 
+        "ops/test_mul.py",
         "ops/test_slice.py",
         "ops/test_sigmoid.py",
         "ops/test_tanh.py",