diff --git a/backends/qualcomm/debugger/README.md b/backends/qualcomm/debugger/README.md index 8300920d1d5..09b4c1918df 100644 --- a/backends/qualcomm/debugger/README.md +++ b/backends/qualcomm/debugger/README.md @@ -156,6 +156,8 @@ After `build_executorch_binary()`, the debugger holds: Ensure `dump_intermediate_outputs` is enabled in your `QnnConfig` (or pass `--dump_intermediate_outputs` via CLI). Only run **one inference** for debugging — multiple executions are not supported. +**Note:** Intermediate tensor dumping is not currently supported in direct mode on HTP/LPAI backends. + ```python from executorch.examples.qualcomm.utils import SimpleADB @@ -266,7 +268,7 @@ python -m examples.qualcomm.util_scripts.qnn_intermediate_debugger_demo -b build 3. Does not support graphs with partitions (partial delegation). 4. Does not support LLM models. 5. Does not support graphs with multiple methods. - +6. Intermediate tensor dumping is not currently supported in direct mode on HTP/LPAI backends. ## ExecuTorch QNN HTP Heap Profiling diff --git a/backends/qualcomm/export_utils.py b/backends/qualcomm/export_utils.py index 28b7952ef33..bcba08ecc5a 100644 --- a/backends/qualcomm/export_utils.py +++ b/backends/qualcomm/export_utils.py @@ -276,6 +276,10 @@ def __init__( self.skip_push = qnn_config.skip_push self.backend_library_paths = {} + if self.direct_build_folder and self.dump_intermediate_outputs: + raise ValueError( + "Per-tensor dumping is currently not supported in direct mode." 
+ ) if self.direct_build_folder: direct_general_artifacts = [ f"{self.build_path}/examples/qualcomm/direct_executor_runner/libqnn_executorch_stub.so", @@ -437,9 +441,8 @@ def execute( f"--input_list_path {self.input_list_filename}", f"--etdump_path {self.etdump_path}", "--shared_buffer" if self.shared_buffer else "", - f"--debug_output_path {self.debug_output_path}", ( - "--dump_intermediate_outputs" + f"--debug_output_path {self.debug_output_path} --dump_intermediate_outputs" if self.dump_intermediate_outputs else "" ), diff --git a/backends/qualcomm/tests/test_qnn_delegate.py b/backends/qualcomm/tests/test_qnn_delegate.py index 00ea5546e13..ac2829fcf33 100644 --- a/backends/qualcomm/tests/test_qnn_delegate.py +++ b/backends/qualcomm/tests/test_qnn_delegate.py @@ -6110,6 +6110,10 @@ def test_qnn_backend_dump_intermediate_outputs_topk(self): ) def test_qnn_backend_dump_intermediate_outputs_simple_model(self): + if self.direct_build_folder: + self.skipTest( + "Direct mode does not support per-tensor dumping (HTP/LPAI backends)." + ) backend_options = generate_htp_compiler_spec(use_fp16=True) TestQNN.compiler_specs = generate_qnn_executorch_compiler_spec( soc_model=self.chipset_table[TestQNN.soc_model], @@ -6840,20 +6844,38 @@ def output_callback(log_msg): ) def test_qnn_backend_dump_intermediate_outputs_simple_model(self): - backend_options = generate_htp_compiler_spec(use_fp16=False) + # TODO: Support per-tensor dumping in LPAI direct mode. + if self.direct_build_folder: + self.skipTest( + "Direct mode does not support per-tensor dumping (HTP/LPAI backends)." 
+ ) + match get_backend_type(self.backend): + case QnnExecuTorchBackendType.kHtpBackend: + backend_options = generate_htp_compiler_spec(use_fp16=False) + expected_compared_events = 14 + case QnnExecuTorchBackendType.kLpaiBackend: + backend_options = generate_lpai_compiler_spec( + target_env=self.get_lpai_target_env() + ) + # I/O q/dq nodes fall back to CPU via FoldQDQ LPAI workaround + # and are excluded from QNN etdump; update after first LPAI run + expected_compared_events = 17 + case _: + raise ValueError("Backend is not implemented yet") TestQNN.compiler_specs = generate_qnn_executorch_compiler_spec( soc_model=self.chipset_table[TestQNN.soc_model], backend_options=backend_options, dump_intermediate_outputs=True, ) module = SimpleModel() # noqa: F405 + torch.manual_seed(8) sample_input = (torch.ones(1, 32, 28, 28), torch.ones(1, 32, 28, 28)) - module = self.get_qdq_module(module, sample_input) + qdq_module = self.get_qdq_module(module, sample_input) self.lower_module_and_test_output( - module, + qdq_module, sample_input, expected_partitions=1, - expected_compared_events=14, + expected_compared_events=expected_compared_events, ) def test_qnn_backend_dump_intermediate_outputs_topk(self):