-
Notifications
You must be signed in to change notification settings - Fork 3.2k
[NPU] Update commandlist for dynamic pipeline #35626
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: master
Are you sure you want to change the base?
Changes from all commits
d75095f
509707d
cb3d6d6
b80741c
e94eea3
cb8b75c
c0eee3f
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -10,12 +10,14 @@ | |
| #include "compiler_impl.hpp" | ||
| #include "intel_npu/common/compiler_adapter_factory.hpp" | ||
| #include "intel_npu/config/options.hpp" | ||
| #include "intel_npu/npu_private_properties.hpp" | ||
| #include "intel_npu/prefix.hpp" | ||
| #include "intel_npu/utils/utils.hpp" | ||
| #include "intel_npu/utils/zero/zero_api.hpp" | ||
| #include "intel_npu/utils/zero/zero_cmd_queue_pool.hpp" | ||
| #include "intel_npu/utils/zero/zero_utils.hpp" | ||
| #include "openvino/runtime/make_tensor.hpp" | ||
| #include "ze_graph_ext_wrappers.hpp" | ||
|
|
||
| namespace intel_npu { | ||
|
|
||
|
|
@@ -24,13 +26,19 @@ class DynamicGraphImpl : public DynamicGraph::Impl { | |
| using MemRefType = DynamicGraph::MemRefType; | ||
|
|
||
| public: | ||
| DynamicGraphImpl() : _engineProperties{}, _logger("DynamicGraphImpl", Logger::global().level()) {} | ||
| DynamicGraphImpl(const FilteredConfig& config) | ||
| : _engineProperties{}, | ||
| _bindingCommandListMode(config.get<COMMANDLIST_MODE>()), | ||
| _logger("DynamicGraphImpl", Logger::global().level()) {} | ||
| void initialize(std::optional<ov::Tensor>& blob, NetworkMetadata& metadata) override; | ||
| void createExecutionEngine(std::optional<ov::Tensor>& blob); | ||
| void prepareMetadata(NetworkMetadata& metadata); | ||
| void initializeDynamicGraphExecution(std::optional<ov::Tensor>& blob, NetworkMetadata& metadata); | ||
| void setArgumentValue(uint32_t argi, const void* argv) override; | ||
| void setArgumentValueWithStrides(uint32_t argi, const void* argv, const std::vector<size_t>& strides) override; | ||
| void setOptimizedDynamicStridesMode(bool enabled) override { | ||
| _optimizedDynamicStridesMode = enabled; | ||
| } | ||
| uint64_t getNumSubgraphs() override { | ||
| return _engineProperties.numOfSubGraphs; | ||
| } | ||
|
|
@@ -61,6 +69,8 @@ class DynamicGraphImpl : public DynamicGraph::Impl { | |
| npu_vm_runtime_handle_t _engine = nullptr; | ||
| npu_vm_runtime_properties_t _engineProperties; | ||
| DynamicGraph::GraphArguments _binding; | ||
| ov::intel_npu::CommandListMode _bindingCommandListMode; | ||
| bool _optimizedDynamicStridesMode = false; | ||
| bool _initialized = false; | ||
| Logger _logger; | ||
| }; | ||
|
|
@@ -305,13 +315,21 @@ void DynamicGraphImpl::executeGraph(const std::shared_ptr<ZeroInitStructsHolder> | |
| ze_event_handle_t event, | ||
| ze_graph_profiling_pool_handle_t profiling) { | ||
| _logger.debug("Start to execute graph with runtime engine"); | ||
|
|
||
| std::shared_ptr<DynamicGraph::GraphArgumentsImpl> argsImpl = | ||
| args._impl ? std::static_pointer_cast<DynamicGraph::GraphArgumentsImpl>(args._impl) | ||
| : std::make_shared<DynamicGraph::GraphArgumentsImpl>(); | ||
|
|
||
| bool noTensorChange = true; | ||
| // Force record commandlist for first execution or the mode is set to FORCE_COMMANDLIST_RECORDING_ONLY | ||
| bool commandListRecordingRequired = | ||
| (args._impl == nullptr) || | ||
| _bindingCommandListMode == ov::intel_npu::CommandListMode::FORCE_COMMANDLIST_RECORDING_ONLY; | ||
| std::vector<uint64_t> commandListIndexArray; | ||
|
|
||
| npu_vm_runtime_execute_params_t* params = &argsImpl->_executeParams; | ||
| for (auto& in : args._inputs) { | ||
| auto inputSize = args._inputs.size(); | ||
| for (size_t i = 0; i < inputSize; ++i) { | ||
| auto& in = args._inputs[i]; | ||
| std::shared_ptr<DynamicGraph::MemRefTypeImpl> inImpl = | ||
| std::static_pointer_cast<DynamicGraph::MemRefTypeImpl>(in._impl); | ||
| if (inImpl == nullptr) { | ||
|
|
@@ -320,12 +338,37 @@ void DynamicGraphImpl::executeGraph(const std::shared_ptr<ZeroInitStructsHolder> | |
| } | ||
| inImpl->UpdateMemRefHandleStatus(in); | ||
| if (args._impl == nullptr) { | ||
| // First execution | ||
| argsImpl->_inputMemRefs.push_back(inImpl->_memRef); | ||
| } else if (inImpl->_ptrUpdated || inImpl->_shapeUpdated || inImpl->_strideUpdated) { | ||
| noTensorChange = false; | ||
| } else if (_bindingCommandListMode == ov::intel_npu::CommandListMode::FORCE_UPDATE_MUTABLE_COMMANDLIST) { | ||
| if (!commandListRecordingRequired) { | ||
| if (inImpl->_shapeUpdated || inImpl->_strideUpdated) { | ||
| // If shape or stride change, need recording commandlist | ||
| commandListRecordingRequired = true; | ||
| } else { | ||
| // If force update commandlist, then pass all index info | ||
|
XinWangIntel marked this conversation as resolved.
|
||
| commandListIndexArray.push_back(i); | ||
| } | ||
| } | ||
| } else if (!commandListRecordingRequired && | ||
| (inImpl->_ptrUpdated || inImpl->_shapeUpdated || inImpl->_strideUpdated)) { | ||
| if (inImpl->_ptrUpdated && _optimizedDynamicStridesMode && | ||
| _bindingCommandListMode == ov::intel_npu::CommandListMode::ENABLE_MUTABLE_COMMANDLIST) { | ||
| _logger.debug( | ||
|
XinWangIntel marked this conversation as resolved.
|
||
| "Input tensor pointer change detected for index %d, and optimized dynamic stride is supported, " | ||
| "which can be updated with UpdateMutableCommandList API without recording a new command list.", | ||
| static_cast<int>(i)); | ||
| commandListIndexArray.push_back(i); | ||
| } else { | ||
| // For shape change, stride change, ptr change without optimized dynamic stride supported, need record | ||
| // commandlist | ||
| _logger.debug("Input tensor %d trigger command list recording", static_cast<int>(i)); | ||
| commandListRecordingRequired = true; | ||
| } | ||
| } | ||
| } | ||
| for (auto& out : args._outputs) { | ||
| for (size_t i = 0; i < args._outputs.size(); ++i) { | ||
| auto& out = args._outputs[i]; | ||
| std::shared_ptr<DynamicGraph::MemRefTypeImpl> outImpl = | ||
| std::static_pointer_cast<DynamicGraph::MemRefTypeImpl>(out._impl); | ||
| if (outImpl == nullptr) { | ||
|
|
@@ -334,28 +377,66 @@ void DynamicGraphImpl::executeGraph(const std::shared_ptr<ZeroInitStructsHolder> | |
| } | ||
| outImpl->UpdateMemRefHandleStatus(out); | ||
| if (args._impl == nullptr) { | ||
| // First execution | ||
| argsImpl->_outputMemRefs.push_back(outImpl->_memRef); | ||
| } else if (outImpl->_ptrUpdated || outImpl->_shapeUpdated || outImpl->_strideUpdated) { | ||
| noTensorChange = false; | ||
| } else if (_bindingCommandListMode == ov::intel_npu::CommandListMode::FORCE_UPDATE_MUTABLE_COMMANDLIST) { | ||
| if (!commandListRecordingRequired) { | ||
| if (outImpl->_shapeUpdated || outImpl->_strideUpdated) { | ||
| // If shape or stride change, need recording commandlist | ||
| commandListRecordingRequired = true; | ||
| } else { | ||
| // If force update commandlist, then pass all index info | ||
| commandListIndexArray.push_back(inputSize + i); | ||
| } | ||
| } | ||
| } else if (!commandListRecordingRequired && | ||
| (outImpl->_ptrUpdated || outImpl->_shapeUpdated || outImpl->_strideUpdated)) { | ||
| if (outImpl->_ptrUpdated && _optimizedDynamicStridesMode && | ||
| _bindingCommandListMode == ov::intel_npu::CommandListMode::ENABLE_MUTABLE_COMMANDLIST) { | ||
| _logger.debug( | ||
| "Output tensor pointer change detected for index %d, and optimized dynamic stride is supported, " | ||
| "which can be updated with UpdateMutableCommandList API without recording a new command list.", | ||
| static_cast<int>(i)); | ||
| commandListIndexArray.push_back(inputSize + i); | ||
| } else { | ||
| _logger.debug("Output tensor %d trigger command list recording", static_cast<int>(i)); | ||
| // For shape change, need record commandlist | ||
| commandListRecordingRequired = true; | ||
| } | ||
| } | ||
| } | ||
|
|
||
| if (args._impl == nullptr || !noTensorChange) { | ||
| if (args._impl == nullptr || commandListRecordingRequired) { | ||
| _logger.debug("Reset command list to run with runtime"); | ||
| // Reset commandLists since there are tensor with new shapes or it is the first execution, can not reuse command | ||
| // list with update | ||
| for (auto& cmdList : commandLists) { | ||
| zeCommandListReset(cmdList); | ||
| } | ||
| } else { | ||
| _logger.debug("Reuse command list without update since no tensor change detected"); | ||
|
|
||
| auto result = zeCommandQueueExecuteCommandLists(commandQueue, | ||
| static_cast<uint32_t>(commandLists.size()), | ||
| commandLists.data(), | ||
| fence); | ||
| if (result != ZE_RESULT_SUCCESS) { | ||
| OPENVINO_THROW("Failed to submit command lists"); | ||
| if (!commandListIndexArray.empty() || | ||
| _bindingCommandListMode == ov::intel_npu::CommandListMode::FORCE_UPDATE_MUTABLE_COMMANDLIST) { | ||
| _logger.debug("Update command list and execute directly"); | ||
| if (params->executionContext == nullptr) { | ||
| OPENVINO_THROW( | ||
| "Execution context is not created, can not reuse command list with UpdateMutableCommandList API"); | ||
| } | ||
|
|
||
| if (npuVMRuntimeUpdateMutableCommandList(_engine, | ||
| params, | ||
| const_cast<uint64_t*>(commandListIndexArray.data()), | ||
| commandListIndexArray.size()) != NPU_VM_RUNTIME_RESULT_SUCCESS) { | ||
| OPENVINO_THROW("Failed to execute VM runtime engine to update commandlist"); | ||
| } | ||
| } else { | ||
| _logger.debug("Reuse command list without update since no tensor change detected"); | ||
|
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. move command list submission here.
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Done, but this impact current test which only update tensor ptr and then we update commandlist too |
||
| auto result = zeCommandQueueExecuteCommandLists(commandQueue, | ||
| static_cast<uint32_t>(commandLists.size()), | ||
| commandLists.data(), | ||
| fence); | ||
| if (result != ZE_RESULT_SUCCESS) { | ||
| OPENVINO_THROW("Failed to submit command lists"); | ||
| } | ||
| } | ||
| return; | ||
| } | ||
|
|
@@ -451,7 +532,7 @@ DynamicGraph::DynamicGraph(const std::shared_ptr<ZeroInitStructsHolder>& zeroIni | |
| return; | ||
| } | ||
|
|
||
| _impl = std::make_unique<DynamicGraphImpl>(); | ||
| _impl = std::make_unique<DynamicGraphImpl>(config); | ||
|
|
||
| // TODO: metadata needs to be parsed even when CREATE_EXECUTOR is 0 or DEFER_WEIGHTS_LOAD is YES, keep here to | ||
| // support pure compilation without vm runtime initialize VM execution engine, metadata, input&output | ||
|
|
@@ -583,7 +664,7 @@ void DynamicGraph::initialize_impl(const FilteredConfig& config) { | |
| _logger.debug("Graph initialize start"); | ||
|
|
||
| if (!_impl) { | ||
| _impl = std::make_unique<DynamicGraphImpl>(); | ||
| _impl = std::make_unique<DynamicGraphImpl>(config); | ||
| // initialize VM execution engine, metadata, input&output descriptors | ||
| _impl->initialize(_blob, _metadata); | ||
| _num_of_subgraphs = _impl->getNumSubgraphs(); | ||
|
|
@@ -596,6 +677,8 @@ void DynamicGraph::initialize_impl(const FilteredConfig& config) { | |
|
|
||
| _logger.debug("Graph initialize without graph handle"); | ||
|
|
||
| _impl->setOptimizedDynamicStridesMode(ZeGraphExtWrappers(_zeroInitStruct).isOptimizedDynamicStridesSupported()); | ||
|
XinWangIntel marked this conversation as resolved.
XinWangIntel marked this conversation as resolved.
|
||
|
|
||
| uint32_t commandQueueOptions = 0; | ||
| if (config.has<TURBO>() && config.get<TURBO>()) { | ||
| OPENVINO_ASSERT(_zeroInitStruct->getCommandQueueDdiTable().version() >= ZE_MAKE_VERSION(1, 0), | ||
|
|
||
Uh oh!
There was an error while loading. Please reload this page.