Qualcomm AI Engine Direct - Tensor dumping on the LPAI backend

jethroqti · jethroqti · commit 4dc27a287fd8 · 2026-06-17T19:42:57.000-07:00
Summary:
Currently, using --dump_intermediate_outputs in RPC mode dumps intermediate tensors. Direct mode does not support per-tensor dumping yet.

Test plan:
python backends/qualcomm/tests/test_qnn_delegate.py -k TestQNNQuantizedUtils.test_qnn_backend_dump_intermediate_outputs_simple_model -b build-android -s ${SN} -m ${CHIPID} --backend lpai
python backends/qualcomm/tests/test_qnn_delegate.py -k TestQNNQuantizedUtils.test_qnn_backend_dump_intermediate_outputs_simple_model -b build-android -s ${SN} -m ${CHIPID} --dump_intermediate_outputs --backend lpai
diff --git a/backends/qualcomm/debugger/README.md b/backends/qualcomm/debugger/README.md
@@ -156,6 +156,8 @@ After `build_executorch_binary()`, the debugger holds:
 
 Ensure `dump_intermediate_outputs` is enabled in your `QnnConfig` (or pass `--dump_intermediate_outputs` via CLI). Only run **one inference** for debugging — multiple executions are not supported.
 
+**Note:** Intermediate tensor dumping is not currently supported in direct mode on HTP/LPAI backends.
+
 ```python
 from executorch.examples.qualcomm.utils import SimpleADB
 
@@ -266,7 +268,7 @@ python -m examples.qualcomm.util_scripts.qnn_intermediate_debugger_demo -b build
 3. Does not support graphs with partitions (partial delegation).
 4. Does not support LLM models.
 5. Does not support graphs with multiple methods.
-
+6. Intermediate tensor dumping is not currently supported in direct mode on HTP/LPAI backends.
 
 ## ExecuTorch QNN HTP Heap Profiling
 
diff --git a/backends/qualcomm/export_utils.py b/backends/qualcomm/export_utils.py
@@ -276,6 +276,10 @@ def __init__(
         self.skip_push = qnn_config.skip_push
         self.backend_library_paths = {}
 
+        if self.direct_build_folder and self.dump_intermediate_outputs:
+            raise ValueError(
+                "Per-tensor dumping is currently not supported in direct mode."
+            )
         if self.direct_build_folder:
             direct_general_artifacts = [
                 f"{self.build_path}/examples/qualcomm/direct_executor_runner/libqnn_executorch_stub.so",
@@ -437,9 +441,8 @@ def execute(
                         f"--input_list_path {self.input_list_filename}",
                         f"--etdump_path {self.etdump_path}",
                         "--shared_buffer" if self.shared_buffer else "",
-                        f"--debug_output_path {self.debug_output_path}",
                         (
-                            "--dump_intermediate_outputs"
+                            f"--debug_output_path {self.debug_output_path} --dump_intermediate_outputs"
                             if self.dump_intermediate_outputs
                             else ""
                         ),
diff --git a/backends/qualcomm/tests/test_qnn_delegate.py b/backends/qualcomm/tests/test_qnn_delegate.py
@@ -6110,6 +6110,10 @@ def test_qnn_backend_dump_intermediate_outputs_topk(self):
         )
 
     def test_qnn_backend_dump_intermediate_outputs_simple_model(self):
+        if self.direct_build_folder:
+            self.skipTest(
+                "Direct mode does not support per-tensor dumping (HTP/LPAI backends)."
+            )
         backend_options = generate_htp_compiler_spec(use_fp16=True)
         TestQNN.compiler_specs = generate_qnn_executorch_compiler_spec(
             soc_model=self.chipset_table[TestQNN.soc_model],
@@ -6840,20 +6844,38 @@ def output_callback(log_msg):
         )
 
     def test_qnn_backend_dump_intermediate_outputs_simple_model(self):
-        backend_options = generate_htp_compiler_spec(use_fp16=False)
+        # TODO: Support per-tensor dumping in LPAI direct mode.
+        if self.direct_build_folder:
+            self.skipTest(
+                "Direct mode does not support per-tensor dumping (HTP/LPAI backends)."
+            )
+        match get_backend_type(self.backend):
+            case QnnExecuTorchBackendType.kHtpBackend:
+                backend_options = generate_htp_compiler_spec(use_fp16=False)
+                expected_compared_events = 14
+            case QnnExecuTorchBackendType.kLpaiBackend:
+                backend_options = generate_lpai_compiler_spec(
+                    target_env=self.get_lpai_target_env()
+                )
+                # I/O q/dq nodes fall back to CPU via the FoldQDQ LPAI workaround and
+                # are excluded from the QNN etdump. TODO: update after first LPAI run.
+                expected_compared_events = 17
+            case _:
+                raise ValueError("Backend is not implemented yet")
         TestQNN.compiler_specs = generate_qnn_executorch_compiler_spec(
             soc_model=self.chipset_table[TestQNN.soc_model],
             backend_options=backend_options,
             dump_intermediate_outputs=True,
         )
         module = SimpleModel()  # noqa: F405
+        torch.manual_seed(8)
         sample_input = (torch.ones(1, 32, 28, 28), torch.ones(1, 32, 28, 28))
-        module = self.get_qdq_module(module, sample_input)
+        qdq_module = self.get_qdq_module(module, sample_input)
         self.lower_module_and_test_output(
-            module,
+            qdq_module,
             sample_input,
             expected_partitions=1,
-            expected_compared_events=14,
+            expected_compared_events=expected_compared_events,
         )
 
     def test_qnn_backend_dump_intermediate_outputs_topk(self):