Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -467,6 +467,50 @@ struct BATCH_MODE final : OptionBase<BATCH_MODE, ov::intel_npu::BatchMode> {
}
};

/**
 * @brief Option descriptor for the command list mode property (ov::intel_npu::commandlist_mode).
 * Non-public, run-time option whose value is an ov::intel_npu::CommandListMode.
 */
struct COMMANDLIST_MODE final : OptionBase<COMMANDLIST_MODE, ov::intel_npu::CommandListMode> {
    /// Name under which the option is registered; taken from the property definition.
    static std::string_view key() {
        return ov::intel_npu::commandlist_mode.name();
    }

    /// Human-readable name of the option's value type.
    static constexpr std::string_view getTypeName() {
        return "ov::intel_npu::CommandListMode";
    }

    /// Value used when the option is not set explicitly.
    static ov::intel_npu::CommandListMode defaultValue() {
        return ov::intel_npu::CommandListMode::DEFAULT;
    }

    /// The option is not part of the public property set.
    static bool isPublic() {
        return false;
    }

    /// The option is a run-time option.
    static OptionMode mode() {
        return OptionMode::RunTime;
    }

    /**
     * @brief Converts the textual form of the option into its enum value.
     * @param val Exact, case-sensitive name of the mode.
     * @return The matching ov::intel_npu::CommandListMode.
     * @throws ov::Exception (via OPENVINO_THROW) when `val` names no known mode.
     */
    static ov::intel_npu::CommandListMode parse(std::string_view val) {
        struct NamedMode {
            std::string_view name;
            ov::intel_npu::CommandListMode mode;
        };
        static constexpr NamedMode knownModes[] = {
            {"DEFAULT", ov::intel_npu::CommandListMode::DEFAULT},
            {"ENABLE_MUTABLE_COMMANDLIST", ov::intel_npu::CommandListMode::ENABLE_MUTABLE_COMMANDLIST},
            {"FORCE_COMMANDLIST_RECORDING_ONLY", ov::intel_npu::CommandListMode::FORCE_COMMANDLIST_RECORDING_ONLY},
            {"FORCE_UPDATE_MUTABLE_COMMANDLIST", ov::intel_npu::CommandListMode::FORCE_UPDATE_MUTABLE_COMMANDLIST},
        };

        for (const auto& candidate : knownModes) {
            if (val == candidate.name) {
                return candidate.mode;
            }
        }

        OPENVINO_THROW("Value '", val, "' is not a valid COMMANDLIST_MODE option");
    }

    /// Serializes a mode through its stream insertion operator.
    static std::string toString(const ov::intel_npu::CommandListMode& val) {
        std::stringstream formatted;
        formatted << val;
        return formatted.str();
    }
};

struct PROFILING_TYPE final : OptionBase<PROFILING_TYPE, ov::intel_npu::ProfilingType> {
static std::string_view key() {
return ov::intel_npu::profiling_type.name();
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -122,6 +122,47 @@ inline std::ostream& operator<<(std::ostream& out, const BatchMode& fmt) {
return out;
}

/**
* @brief [Only for NPU Plugin]
* Type: String. Default is "DEFAULT".
* Selects the command list update strategy used by the plugin.
* Possible values: "DEFAULT", "ENABLE_MUTABLE_COMMANDLIST", "FORCE_UPDATE_MUTABLE_COMMANDLIST",
* "FORCE_COMMANDLIST_RECORDING_ONLY"
*/
Comment thread
XinWangIntel marked this conversation as resolved.
// Command list handling strategies selectable through the NPU_COMMANDLIST_MODE property.
// NOTE(review): the per-value notes below are derived from how DynamicGraphImpl::executeGraph
// branches on this enum in the same change — confirm against the final implementation.
enum class CommandListMode {
// No special handling: a pointer, shape or stride change on any tensor triggers re-recording.
DEFAULT = 0,
// A pointer change may be applied by updating the mutable command list instead of re-recording,
// when optimized dynamic strides mode is enabled.
ENABLE_MUTABLE_COMMANDLIST = 1,
// The mutable command list is updated on every execution after the first (all tensor indexes are
// passed), unless a shape or stride change forces re-recording.
FORCE_UPDATE_MUTABLE_COMMANDLIST = 2,
// The command list is re-recorded on every execution.
FORCE_COMMANDLIST_RECORDING_ONLY = 3,
};

/**
* @brief Prints a string representation of ov::intel_npu::CommandListMode to a stream
* @param out An output stream to send to
* @param fmt A command list mode value to print to a stream
* @return A reference to the `out` stream
* @note Configuration API v 2.0
*/
inline std::ostream& operator<<(std::ostream& out, const CommandListMode& fmt) {
    // Each supported enumerator is written as the same string the option parser accepts.
    switch (fmt) {
    case CommandListMode::DEFAULT: {
        out << "DEFAULT";
    } break;
    case CommandListMode::ENABLE_MUTABLE_COMMANDLIST: {
        out << "ENABLE_MUTABLE_COMMANDLIST";
    } break;
    case CommandListMode::FORCE_UPDATE_MUTABLE_COMMANDLIST: {
        out << "FORCE_UPDATE_MUTABLE_COMMANDLIST";
    } break;
    case CommandListMode::FORCE_COMMANDLIST_RECORDING_ONLY: {
        out << "FORCE_COMMANDLIST_RECORDING_ONLY";
    } break;
    default:
        // Report the raw integer value, not `fmt` itself: OPENVINO_THROW builds its message by
        // streaming every argument, so passing `fmt` would re-enter this operator with the same
        // unsupported value and recurse without ever throwing.
        OPENVINO_THROW("Unsupported value for the command list mode:", static_cast<int>(fmt));
    }
    return out;
}

/**
* @brief [Only for NPU Plugin]
* Default is "ITERATIVE".
Expand Down Expand Up @@ -328,6 +369,14 @@ static constexpr ov::Property<ProfilingType> profiling_type{"NPU_PROFILING_TYPE"
*/
static constexpr ov::Property<BatchMode> batch_mode{"NPU_BATCH_MODE"};

/**
* @brief [Only for NPU Plugin]
* Type: String. Default is "DEFAULT".
* Selects the command list update strategy used by the plugin.
* Possible values: "DEFAULT", "ENABLE_MUTABLE_COMMANDLIST", "FORCE_UPDATE_MUTABLE_COMMANDLIST",
* "FORCE_COMMANDLIST_RECORDING_ONLY".
*/
static constexpr ov::Property<CommandListMode> commandlist_mode{"NPU_COMMANDLIST_MODE"};
Comment thread
XinWangIntel marked this conversation as resolved.

/**
* @brief [Experimental, only for NPU Plugin]
* Type: enum. Default is "ITERATIVE". If the compiler-in-plugin is used (intel_npu::compiler_type =
Expand Down Expand Up @@ -448,6 +497,5 @@ static constexpr ov::Property<bool> export_raw_blob{"NPU_EXPORT_RAW_BLOB"};
* models from each other, which can be required for some use cases.
*/
static constexpr ov::Property<bool> shared_common_queue{"NPU_SHARED_COMMON_QUEUE"};

} // namespace intel_npu
} // namespace ov
Original file line number Diff line number Diff line change
Expand Up @@ -132,6 +132,7 @@ class DynamicGraph final : public IDynamicGraph {
virtual void setArgumentValueWithStrides(uint32_t argi,
const void* argv,
const std::vector<size_t>& strides) = 0;
virtual void setOptimizedDynamicStridesMode(bool enabled) = 0;
virtual uint64_t getNumSubgraphs() = 0;
virtual void getBinding(GraphArguments& binding) = 0;
virtual void executeGraph(const std::shared_ptr<ZeroInitStructsHolder>& zeroInitStruct,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -90,6 +90,10 @@ class ZeGraphExtWrappers {

void evict_memory(const GraphDescriptor& graphDescriptor) const;

// Returns the cached result of the "OPTIMIZED_DYNAMIC_STRIDE" option-support query made in the
// constructor; the flag stays false when that query is not performed.
bool isOptimizedDynamicStridesSupported() const {
return _isOptimizedDynamicStridesSupported;
}

private:
void getMetadata(ze_graph_handle_t graphHandle,
uint32_t indexUsedByDriver,
Expand All @@ -103,6 +107,7 @@ class ZeGraphExtWrappers {
std::shared_ptr<ZeroInitStructsHolder> _zeroInitStruct;
uint32_t _graphExtVersion;
bool _isCompilerOptionQuerySupported;
bool _isOptimizedDynamicStridesSupported = false;

Logger _logger;
};
Expand Down
121 changes: 102 additions & 19 deletions src/plugins/intel_npu/src/compiler_adapter/src/dynamic_graph.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -10,12 +10,14 @@
#include "compiler_impl.hpp"
#include "intel_npu/common/compiler_adapter_factory.hpp"
#include "intel_npu/config/options.hpp"
#include "intel_npu/npu_private_properties.hpp"
#include "intel_npu/prefix.hpp"
#include "intel_npu/utils/utils.hpp"
#include "intel_npu/utils/zero/zero_api.hpp"
#include "intel_npu/utils/zero/zero_cmd_queue_pool.hpp"
#include "intel_npu/utils/zero/zero_utils.hpp"
#include "openvino/runtime/make_tensor.hpp"
#include "ze_graph_ext_wrappers.hpp"

namespace intel_npu {

Expand All @@ -24,13 +26,19 @@ class DynamicGraphImpl : public DynamicGraph::Impl {
using MemRefType = DynamicGraph::MemRefType;

public:
DynamicGraphImpl() : _engineProperties{}, _logger("DynamicGraphImpl", Logger::global().level()) {}
DynamicGraphImpl(const FilteredConfig& config)
: _engineProperties{},
_bindingCommandListMode(config.get<COMMANDLIST_MODE>()),
_logger("DynamicGraphImpl", Logger::global().level()) {}
void initialize(std::optional<ov::Tensor>& blob, NetworkMetadata& metadata) override;
void createExecutionEngine(std::optional<ov::Tensor>& blob);
void prepareMetadata(NetworkMetadata& metadata);
void initializeDynamicGraphExecution(std::optional<ov::Tensor>& blob, NetworkMetadata& metadata);
void setArgumentValue(uint32_t argi, const void* argv) override;
void setArgumentValueWithStrides(uint32_t argi, const void* argv, const std::vector<size_t>& strides) override;
// Records whether optimized dynamic strides mode is active. executeGraph reads this flag together
// with the ENABLE_MUTABLE_COMMANDLIST mode to decide whether a tensor pointer change can be applied
// by updating the mutable command list instead of re-recording it.
void setOptimizedDynamicStridesMode(bool enabled) override {
_optimizedDynamicStridesMode = enabled;
}
uint64_t getNumSubgraphs() override {
return _engineProperties.numOfSubGraphs;
}
Expand Down Expand Up @@ -61,6 +69,8 @@ class DynamicGraphImpl : public DynamicGraph::Impl {
npu_vm_runtime_handle_t _engine = nullptr;
npu_vm_runtime_properties_t _engineProperties;
DynamicGraph::GraphArguments _binding;
ov::intel_npu::CommandListMode _bindingCommandListMode;
bool _optimizedDynamicStridesMode = false;
bool _initialized = false;
Logger _logger;
};
Expand Down Expand Up @@ -305,13 +315,21 @@ void DynamicGraphImpl::executeGraph(const std::shared_ptr<ZeroInitStructsHolder>
ze_event_handle_t event,
ze_graph_profiling_pool_handle_t profiling) {
_logger.debug("Start to execute graph with runtime engine");

std::shared_ptr<DynamicGraph::GraphArgumentsImpl> argsImpl =
args._impl ? std::static_pointer_cast<DynamicGraph::GraphArgumentsImpl>(args._impl)
: std::make_shared<DynamicGraph::GraphArgumentsImpl>();

bool noTensorChange = true;
// Force record commandlist for first execution or the mode is set to FORCE_COMMANDLIST_RECORDING_ONLY
bool commandListRecordingRequired =
(args._impl == nullptr) ||
_bindingCommandListMode == ov::intel_npu::CommandListMode::FORCE_COMMANDLIST_RECORDING_ONLY;
std::vector<uint64_t> commandListIndexArray;

npu_vm_runtime_execute_params_t* params = &argsImpl->_executeParams;
for (auto& in : args._inputs) {
auto inputSize = args._inputs.size();
for (size_t i = 0; i < inputSize; ++i) {
auto& in = args._inputs[i];
std::shared_ptr<DynamicGraph::MemRefTypeImpl> inImpl =
std::static_pointer_cast<DynamicGraph::MemRefTypeImpl>(in._impl);
if (inImpl == nullptr) {
Expand All @@ -320,12 +338,37 @@ void DynamicGraphImpl::executeGraph(const std::shared_ptr<ZeroInitStructsHolder>
}
inImpl->UpdateMemRefHandleStatus(in);
if (args._impl == nullptr) {
// First execution
argsImpl->_inputMemRefs.push_back(inImpl->_memRef);
} else if (inImpl->_ptrUpdated || inImpl->_shapeUpdated || inImpl->_strideUpdated) {
noTensorChange = false;
} else if (_bindingCommandListMode == ov::intel_npu::CommandListMode::FORCE_UPDATE_MUTABLE_COMMANDLIST) {
if (!commandListRecordingRequired) {
if (inImpl->_shapeUpdated || inImpl->_strideUpdated) {
// If shape or stride change, need recording commandlist
commandListRecordingRequired = true;
} else {
// If force update commandlist, then pass all index info
Comment thread
XinWangIntel marked this conversation as resolved.
commandListIndexArray.push_back(i);
}
}
} else if (!commandListRecordingRequired &&
(inImpl->_ptrUpdated || inImpl->_shapeUpdated || inImpl->_strideUpdated)) {
if (inImpl->_ptrUpdated && _optimizedDynamicStridesMode &&
_bindingCommandListMode == ov::intel_npu::CommandListMode::ENABLE_MUTABLE_COMMANDLIST) {
_logger.debug(
Comment thread
XinWangIntel marked this conversation as resolved.
"Input tensor pointer change detected for index %d, and optimized dynamic stride is supported, "
"which can be updated with UpdateMutableCommandList API without recording a new command list.",
static_cast<int>(i));
commandListIndexArray.push_back(i);
} else {
// For shape change, stride change, ptr change without optimized dynamic stride supported, need record
// commandlist
_logger.debug("Input tensor %d trigger command list recording", static_cast<int>(i));
commandListRecordingRequired = true;
}
}
}
for (auto& out : args._outputs) {
for (size_t i = 0; i < args._outputs.size(); ++i) {
auto& out = args._outputs[i];
std::shared_ptr<DynamicGraph::MemRefTypeImpl> outImpl =
std::static_pointer_cast<DynamicGraph::MemRefTypeImpl>(out._impl);
if (outImpl == nullptr) {
Expand All @@ -334,28 +377,66 @@ void DynamicGraphImpl::executeGraph(const std::shared_ptr<ZeroInitStructsHolder>
}
outImpl->UpdateMemRefHandleStatus(out);
if (args._impl == nullptr) {
// First execution
argsImpl->_outputMemRefs.push_back(outImpl->_memRef);
} else if (outImpl->_ptrUpdated || outImpl->_shapeUpdated || outImpl->_strideUpdated) {
noTensorChange = false;
} else if (_bindingCommandListMode == ov::intel_npu::CommandListMode::FORCE_UPDATE_MUTABLE_COMMANDLIST) {
if (!commandListRecordingRequired) {
if (outImpl->_shapeUpdated || outImpl->_strideUpdated) {
// If shape or stride change, need recording commandlist
commandListRecordingRequired = true;
} else {
// If force update commandlist, then pass all index info
commandListIndexArray.push_back(inputSize + i);
}
}
} else if (!commandListRecordingRequired &&
(outImpl->_ptrUpdated || outImpl->_shapeUpdated || outImpl->_strideUpdated)) {
if (outImpl->_ptrUpdated && _optimizedDynamicStridesMode &&
_bindingCommandListMode == ov::intel_npu::CommandListMode::ENABLE_MUTABLE_COMMANDLIST) {
_logger.debug(
"Output tensor pointer change detected for index %d, and optimized dynamic stride is supported, "
"which can be updated with UpdateMutableCommandList API without recording a new command list.",
static_cast<int>(i));
commandListIndexArray.push_back(inputSize + i);
} else {
_logger.debug("Output tensor %d trigger command list recording", static_cast<int>(i));
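// For a shape change, a stride change, or a ptr change that cannot be applied through the optimized
// dynamic stride path, the command list must be recorded again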
commandListRecordingRequired = true;
}
}
}

if (args._impl == nullptr || !noTensorChange) {
if (args._impl == nullptr || commandListRecordingRequired) {
_logger.debug("Reset command list to run with runtime");
// Reset the command lists: either this is the first execution or command list recording is required, so the
// existing command lists cannot be reused through an in-place update
for (auto& cmdList : commandLists) {
zeCommandListReset(cmdList);
}
} else {
_logger.debug("Reuse command list without update since no tensor change detected");

auto result = zeCommandQueueExecuteCommandLists(commandQueue,
static_cast<uint32_t>(commandLists.size()),
commandLists.data(),
fence);
if (result != ZE_RESULT_SUCCESS) {
OPENVINO_THROW("Failed to submit command lists");
if (!commandListIndexArray.empty() ||
_bindingCommandListMode == ov::intel_npu::CommandListMode::FORCE_UPDATE_MUTABLE_COMMANDLIST) {
_logger.debug("Update command list and execute directly");
if (params->executionContext == nullptr) {
OPENVINO_THROW(
"Execution context is not created, can not reuse command list with UpdateMutableCommandList API");
}

if (npuVMRuntimeUpdateMutableCommandList(_engine,
params,
const_cast<uint64_t*>(commandListIndexArray.data()),
commandListIndexArray.size()) != NPU_VM_RUNTIME_RESULT_SUCCESS) {
OPENVINO_THROW("Failed to execute VM runtime engine to update commandlist");
}
} else {
_logger.debug("Reuse command list without update since no tensor change detected");
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

move command list submission here.
auto result = zeCommandQueueExecuteCommandLists(commandQueue,
static_cast<uint32_t>(commandLists.size()),
commandLists.data(),
fence);
if (result != ZE_RESULT_SUCCESS) {
OPENVINO_THROW("Failed to submit command lists");
}

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Done, but this impact current test which only update tensor ptr and then we update commandlist too

auto result = zeCommandQueueExecuteCommandLists(commandQueue,
static_cast<uint32_t>(commandLists.size()),
commandLists.data(),
fence);
if (result != ZE_RESULT_SUCCESS) {
OPENVINO_THROW("Failed to submit command lists");
}
}
return;
}
Expand Down Expand Up @@ -451,7 +532,7 @@ DynamicGraph::DynamicGraph(const std::shared_ptr<ZeroInitStructsHolder>& zeroIni
return;
}

_impl = std::make_unique<DynamicGraphImpl>();
_impl = std::make_unique<DynamicGraphImpl>(config);

// TODO: metadata needs to be parsed even when CREATE_EXECUTOR is 0 or DEFER_WEIGHTS_LOAD is YES, keep here to
// support pure compilation without vm runtime initialize VM execution engine, metadata, input&output
Expand Down Expand Up @@ -583,7 +664,7 @@ void DynamicGraph::initialize_impl(const FilteredConfig& config) {
_logger.debug("Graph initialize start");

if (!_impl) {
_impl = std::make_unique<DynamicGraphImpl>();
_impl = std::make_unique<DynamicGraphImpl>(config);
// initialize VM execution engine, metadata, input&output descriptors
_impl->initialize(_blob, _metadata);
_num_of_subgraphs = _impl->getNumSubgraphs();
Expand All @@ -596,6 +677,8 @@ void DynamicGraph::initialize_impl(const FilteredConfig& config) {

_logger.debug("Graph initialize without graph handle");

_impl->setOptimizedDynamicStridesMode(ZeGraphExtWrappers(_zeroInitStruct).isOptimizedDynamicStridesSupported());
Comment thread
XinWangIntel marked this conversation as resolved.
Comment thread
XinWangIntel marked this conversation as resolved.

uint32_t commandQueueOptions = 0;
if (config.has<TURBO>() && config.get<TURBO>()) {
OPENVINO_ASSERT(_zeroInitStruct->getCommandQueueDdiTable().version() >= ZE_MAKE_VERSION(1, 0),
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -146,6 +146,15 @@ ZeGraphExtWrappers::ZeGraphExtWrappers(const std::shared_ptr<ZeroInitStructsHold
nullptr) == ZE_RESULT_SUCCESS;
}
#endif
if (_isCompilerOptionQuerySupported) {
_isOptimizedDynamicStridesSupported =
_zeroInitStruct->getGraphDdiTable().pfnCompilerIsOptionSupported(_zeroInitStruct->getDevice(),
ZE_NPU_DRIVER_OPTIONS,
"OPTIMIZED_DYNAMIC_STRIDE",
nullptr) == ZE_RESULT_SUCCESS;
_logger.debug("OPTIMIZED_DYNAMIC_STRIDE compiler option is %s",
_isOptimizedDynamicStridesSupported ? "supported" : "not supported");
}
}

ZeGraphExtWrappers::~ZeGraphExtWrappers() {
Expand Down
1 change: 1 addition & 0 deletions src/plugins/intel_npu/src/plugin/src/plugin.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -247,6 +247,7 @@ void init_config(const IEngineBackend* backend, OptionsDesc& options, FilteredCo
REGISTER_OPTION(PROFILING_TYPE);
REGISTER_OPTION(BACKEND_COMPILATION_PARAMS);
REGISTER_OPTION(BATCH_MODE);
REGISTER_OPTION(COMMANDLIST_MODE);
REGISTER_OPTION(BYPASS_UMD_CACHING);
REGISTER_OPTION(DEFER_WEIGHTS_LOAD);
REGISTER_OPTION(WEIGHTS_PATH);
Expand Down
2 changes: 1 addition & 1 deletion src/plugins/intel_npu/src/plugin/src/properties.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ std::map<std::string, std::string> any_copy(const ov::AnyMap& params) {

inline bool isSpecialBothProperty(const std::string& key) {
return key == ov::hint::performance_mode.name() || key == ov::enable_profiling.name() ||
key == ov::log::level.name();
key == ov::log::level.name() || key == ov::intel_npu::commandlist_mode.name();
}

inline void logCpuPinningDeprecationWarning(intel_npu::Logger& logger) {
Expand Down
Loading
Loading