pytorch · psiddh · Jun 16, 2026 · Apr 16, 2026
diff --git a/.ci/scripts/test_model.sh b/.ci/scripts/test_model.sh
@@ -264,7 +264,7 @@ test_model_with_qnn() {
         ;;
   esac
 
-  "${PYTHON_EXECUTABLE}" -m examples.qualcomm.${SCRIPT_FOLDER}.${EXPORT_SCRIPT} -b ${CMAKE_OUTPUT_DIR} -m ${QNN_CHIPSET} --ci --compile_only $EXTRA_FLAGS
+  "${PYTHON_EXECUTABLE}" -m examples.qualcomm.${SCRIPT_FOLDER}.${EXPORT_SCRIPT} --build_folder ${CMAKE_OUTPUT_DIR} --soc_model ${QNN_CHIPSET} --ci --compile_only $EXTRA_FLAGS
   EXPORTED_MODEL=$(find "./${EXPORT_SCRIPT}" -type f -name "${MODEL_NAME}*.pte" -print -quit)
 }
 

@@ -58,11 +58,9 @@
 from .recompose_pad_maxpool2d import RecomposePadMaxPool2d
 from .recompose_pixel_unshuffle import RecomposePixelUnshuffle
 from .recompose_rms_norm import RecomposeRmsNorm
-from .reduce_dynamic_range import ReduceDynamicRange
 from .remove_0d_tensor import Remove0DTensor
 from .remove_redundancy import RemoveRedundancy
 from .replace_arange_args import ReplaceArangeArgs
-from .replace_inf_values import ReplaceInfValues
 from .resolve_debug_handle import ResolveDebugHandle
 from .seq_mse import SeqMSE
 from .tag_quant_io import TagQuantIO
@@ -122,11 +120,9 @@
     RecomposePadMaxPool2d,
     RecomposePixelUnshuffle,
     RecomposeRmsNorm,
-    ReduceDynamicRange,
     Remove0DTensor,
     RemoveRedundancy,
     ReplaceArangeArgs,
-    ReplaceInfValues,
     ResolveDebugHandle,
     SeqMSE,
     TagQuantIO,

@@ -30,6 +30,8 @@ def _decompose_im2col(self, graph_module: torch.fx.GraphModule):
             if node.target == self.im2col_op:
                 input_node = node.args[0]
                 kernel_size = node.args[1]
+                dilation = node.args[2]
+                padding = node.args[3]
                 stride = node.args[4]
                 batch_size = node.meta["val"].shape[0]
                 assert (
@@ -41,6 +43,12 @@ def _decompose_im2col(self, graph_module: torch.fx.GraphModule):
                 assert (
                     kernel_size[0] == kernel_size[1]
                 ), "im2col can only be converted when kernel height == width"
+                assert all(
+                    d == 1 for d in dilation
+                ), "col2im can only be converted when dilation equals to (1, 1)"
+                assert all(
+                    p == 0 for p in padding
+                ), "col2im can only be converted when padding equals to (0, 0)"
                 users = list(node.users.keys())
                 with graph_module.graph.inserting_after(input_node):
                     pixel_unshuffle_node = graph_module.graph.create_node(
@@ -77,6 +85,8 @@ def _decompose_col2im(self, graph_module: torch.fx.GraphModule):
                 input_node = node.args[0]
                 output_size = node.args[1]
                 kernel_size = node.args[2]
+                dilation = node.args[3]
+                padding = node.args[4]
                 stride = node.args[5]
                 batch_size = node.meta["val"].shape[0]
                 assert (
@@ -88,6 +98,13 @@ def _decompose_col2im(self, graph_module: torch.fx.GraphModule):
                 assert (
                     kernel_size[0] == kernel_size[1]
                 ), "col2im can only be converted when kernel height == width"
+                assert all(
+                    d == 1 for d in dilation
+                ), "col2im can only be converted when dilation equals to (1, 1)"
+                assert all(
+                    p == 0 for p in padding
+                ), "col2im can only be converted when padding equals to (0, 0)"
+
                 users = list(node.users.keys())
                 with graph_module.graph.inserting_after(input_node):
                     view_tensor = input_node.meta["val"].reshape(

@@ -25,6 +25,12 @@ def forward(self, x):
             self.dim = 0
 
         x = torch.abs(x)
+
+        # QNN cannot compute pow when the exponent is inf or -inf.
+        if self.exp == float("inf"):
+            return torch.amax(x, dim=self.dim, keepdim=self.keepdim)
+        if self.exp == float("-inf"):
+            return torch.amin(x, dim=self.dim, keepdim=self.keepdim)
         x = torch.pow(x, self.exp)
         x = torch.sum(x, dim=self.dim, keepdim=self.keepdim)
         return torch.pow(x, 1.0 / self.exp)

@@ -23,8 +23,10 @@ def __init__(self):
         super(DecomposeRemainder, self).__init__()
         self.remainder_targets = {
             torch.ops.aten.remainder.Scalar,
+            torch.ops.aten.remainder.Scalar_Tensor,
             torch.ops.aten.remainder.Tensor,
             exir_ops.edge.aten.remainder.Scalar,
+            exir_ops.edge.aten.remainder.Scalar_Tensor,
             exir_ops.edge.aten.remainder.Tensor,
         }
 
@@ -35,7 +37,7 @@ def call(self, graph_module: torch.fx.GraphModule):
 
         for node in list(graph.nodes):
             if node.op == "call_function" and node.target in self.remainder_targets:
-                x_node = node.args[0]
+                x_arg = node.args[0]
                 y_arg = node.args[1]
                 is_edge = isinstance(node.target, EdgeOpOverload)
                 meta = node.meta
@@ -61,8 +63,21 @@ def call(self, graph_module: torch.fx.GraphModule):
                     else torch.ops.aten.sub.Tensor
                 )
 
-                is_scalar = not isinstance(y_arg, torch.fx.Node)
-                if is_scalar and is_edge:
+                is_x_scalar = not isinstance(x_arg, torch.fx.Node)
+                if is_x_scalar and is_edge:
+                    if x_arg not in const_cache:
+                        attr_name = get_new_attr_name_with_prefix("_remainder_const_")(
+                            graph_module
+                        )
+                        const_cache[x_arg] = get_const_node(
+                            graph, graph_module, attr_name, x_arg, node
+                        )
+                    x_node = const_cache[x_arg]
+                else:
+                    x_node = x_arg
+
+                is_y_scalar = not isinstance(y_arg, torch.fx.Node)
+                if is_y_scalar and is_edge:
                     if y_arg not in const_cache:
                         attr_name = get_new_attr_name_with_prefix("_remainder_const_")(
                             graph_module

@@ -16,7 +16,7 @@ def __init__(self, val_shape, shifts, dims):
         super().__init__()
         self.val_shape = val_shape
         if dims[0] is None:
-            self.shifts = [shifts[0] % torch.numel(torch.tensor(val_shape))]
+            self.shifts = [shifts[0] % torch.numel(torch.empty(val_shape))]
         else:
             self.shifts = [shift % val_shape[dim] for shift, dim in zip(shifts, dims)]
         self.dims = dims

@@ -33,11 +33,17 @@ class TensorOpInfo:
 
 SCALAR_OPS = {
     aten.eq.Scalar: TensorOpInfo(aten.eq.Tensor, False, False),
+    aten.eq.Tensor: TensorOpInfo(aten.eq.Tensor, False, False),
     aten.ge.Scalar: TensorOpInfo(aten.ge.Tensor, False, False),
+    aten.ge.Tensor: TensorOpInfo(aten.ge.Tensor, False, False),
     aten.gt.Scalar: TensorOpInfo(aten.gt.Tensor, False, False),
+    aten.gt.Tensor: TensorOpInfo(aten.gt.Tensor, False, False),
     aten.le.Scalar: TensorOpInfo(aten.le.Tensor, False, False),
+    aten.le.Tensor: TensorOpInfo(aten.le.Tensor, False, False),
     aten.lt.Scalar: TensorOpInfo(aten.lt.Tensor, False, False),
+    aten.lt.Tensor: TensorOpInfo(aten.lt.Tensor, False, False),
     aten.ne.Scalar: TensorOpInfo(aten.ne.Tensor, False, False),
+    aten.ne.Tensor: TensorOpInfo(aten.ne.Tensor, False, False),
     aten.add.Scalar: TensorOpInfo(aten.add.Tensor, False, False),
     aten.add_.Scalar: TensorOpInfo(aten.add_.Tensor, False, False),
     # For below cases, refer to LiftAddTensor Model in UT for sample
@@ -88,6 +94,7 @@ def _build_tensor_constant(
     ) -> TensorConstant:
         # For dtype, in some cases, we cannot use node.args[0] as scalar dtype.
         # Ex: Where op args[0] can be bool, however, we probably want args[1] and args[2] to be dtype same as node.meta["val"] instead of bool type
+
         first_arg = node.args[0]
         tensor = torch.tensor(
             const_val,

@@ -61,11 +61,9 @@
     RecomposePadMaxPool2d,
     RecomposePixelUnshuffle,
     RecomposeRmsNorm,
-    ReduceDynamicRange,
     Remove0DTensor,
     RemoveRedundancy,
     ReplaceArangeArgs,
-    ReplaceInfValues,
     ResolveDebugHandle,
     TagQuantIO,
 )
@@ -158,7 +156,6 @@ def get_annotation_passes(cls):
         """Return annotation pipeline pass classes. Override in subclasses to add backend-specific passes."""
         return [
             RemoveRedundancy,
-            ReduceDynamicRange,
             RecomposePixelUnshuffle,
             RecomposeRmsNorm,
             ReplaceArangeArgs,
@@ -186,7 +183,6 @@ def get_annotation_passes(cls):
             DecomposeSelectScatter,
             DecomposeLinalgVectorNorm,
             DecomposeLogVariants,
-            ReplaceInfValues,
             LiftConstantScalarOperands,
             InsertReshapeForReduceOps,
         ]

@@ -57,9 +57,9 @@ def define_node(
 
         height = (output_height - 1) * stride_height + filter_height - input_height
         width = (output_width - 1) * stride_width + filter_width - input_width
-        if height % 2 != 0 or width % 2 != 0:
+        if any(x != 0 for x in (height, width)):
             warnings.warn(
-                "[QNN Delegate Op Builder]: Height or Width is not divisble by 2 with no remainder, fall back op",
+                "[QNN Delegate Op Builder]: Height or Width is not suitable, fall back op",
                 stacklevel=1,
             )
             return

@@ -27,8 +27,7 @@ def define_node(
     ) -> PyQnnManager.PyQnnOpWrapper:
         start, end = node.args[0:2]
         step = node.args[2] if len(node.args) > 2 else 1
-        arange_tensor = self.get_tensor(node, node)
-        out_tensor = torch.arange(start, end, step).to(arange_tensor.dtype)
+        out_tensor = torch.arange(start, end, step, dtype=node.meta["val"].dtype)
 
         # since we can derive the constant value of current op in AoT stage
         # we only build static tensor here for consumers of current node

@@ -54,35 +54,23 @@ def define_node(
             nodes_to_wrappers,
         )
 
-        pt_ceil_mode = node.args[4] if len(node.args) > 4 else False
-
         # kernel info
-        input_shape = input_node.meta["val"].shape
-        input_h, input_w = input_shape[2], input_shape[3]
         filter_size = self._get_filter_size(node)
-        if pt_ceil_mode:
-            # filter_size might larger than input_h, input_w, use min of them
-            filter_size = [min(filter_size[0], input_h), min(filter_size[1], input_w)]
         filter_size_shape = [len(filter_size)]
 
         padding = [0, 0]
         if len(node.args) > 3:
             padding = cast(List[int], node.args[3])
             if len(padding) == 1:
                 padding = padding + padding
-            if pt_ceil_mode:
-                ori_filter_h, ori_filter_w = self._get_filter_size(node)
-                padding = [
-                    0 if ori_filter_h > input_h else padding[0],
-                    0 if ori_filter_w > input_w else padding[1],
-                ]
 
         padding_shape = [len(padding), len(padding)]
 
         # if ceil mode is True, use ceil instead of floor to compute the output shape
+        ceil_mode = node.args[4] if len(node.args) > 4 else False
         mode = (
             OpPoolAvg2d.RoundingMode.CEIL
-            if pt_ceil_mode
+            if ceil_mode
             else OpPoolAvg2d.RoundingMode.FLOOR
         )
 
@@ -95,11 +83,6 @@ def define_node(
         count_include_pad = True
         if len(node.args) > 5:
             count_include_pad = cast(bool, node.args[5])
-        # TODO: If count_include_pad = False, it seems not to compute average with padding in Qnn.
-        # But it still compute average with padding value, and change divisor in torch
-        # if not count_include_pad:
-        #     print("Not support count_include_pad = False.")
-        #     return
 
         pooling_region = filter_size[0] * filter_size[1]
         divisor_override = pooling_region  # Default divisor is pooling_region
-Original file line number
+Diff line change
@@ Expand Up / @@ -264,7 +264,7 @@ test_model_with_qnn() { @@
             ;;
       esac
-      "${PYTHON_EXECUTABLE}" -m examples.qualcomm.${SCRIPT_FOLDER}.${EXPORT_SCRIPT} -b ${CMAKE_OUTPUT_DIR} -m ${QNN_CHIPSET} --ci --compile_only $EXTRA_FLAGS
+      "${PYTHON_EXECUTABLE}" -m examples.qualcomm.${SCRIPT_FOLDER}.${EXPORT_SCRIPT} --build_folder ${CMAKE_OUTPUT_DIR} --soc_model ${QNN_CHIPSET} --ci --compile_only $EXTRA_FLAGS
       EXPORTED_MODEL=$(find "./${EXPORT_SCRIPT}" -type f -name "${MODEL_NAME}*.pte" -print -quit)
     }
@@ Expand Down @@