diff --git a/docs/articles_en/assets/snippets/gpu/remote_objects_creation.cpp b/docs/articles_en/assets/snippets/gpu/remote_objects_creation.cpp index 35e2c86af16c25..b2a7cb4170e3f7 100644 --- a/docs/articles_en/assets/snippets/gpu/remote_objects_creation.cpp +++ b/docs/articles_en/assets/snippets/gpu/remote_objects_creation.cpp @@ -1,3 +1,7 @@ +// Copyright (C) 2018-2026 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + #include #include #include @@ -14,6 +18,7 @@ cl_context get_cl_context(); cl_command_queue get_cl_queue(); cl::Buffer allocate_buffer(size_t size); cl::Image2D allocate_image(size_t size); +ov::intel_gpu::ocl::os_handle_param get_shared_handle(); #ifdef WIN32 @@ -62,6 +67,16 @@ int main() { //! [wrap_cl_image] } +{ + //! [wrap_shared_handle] + auto shared_handle = get_shared_handle(); + auto remote_tensor = gpu_context.create_tensor(in_element_type, + in_shape, + shared_handle, + ov::intel_gpu::MemType::SHARED_BUF); + //! [wrap_shared_handle] +} + { //! [allocate_usm_device] auto remote_tensor = gpu_context.create_usm_device_tensor(in_element_type, in_shape); diff --git a/docs/articles_en/openvino-workflow/running-inference/inference-devices-and-modes/gpu-device/remote-tensor-api-gpu-plugin.rst b/docs/articles_en/openvino-workflow/running-inference/inference-devices-and-modes/gpu-device/remote-tensor-api-gpu-plugin.rst index adac7c64a9e192..8a014404459f5f 100644 --- a/docs/articles_en/openvino-workflow/running-inference/inference-devices-and-modes/gpu-device/remote-tensor-api-gpu-plugin.rst +++ b/docs/articles_en/openvino-workflow/running-inference/inference-devices-and-modes/gpu-device/remote-tensor-api-gpu-plugin.rst @@ -254,6 +254,19 @@ For more details, see the code snippets below: :language: cpp :fragment: [wrap_cl_image] + .. tab-item:: external shared handle + :sync: external-shared-handle + + Use this overload when your application already owns an OS-level shared memory handle + (for example, DX12 NT handle on Windows or DMA-BUF file descriptor on Linux). + + .. doxygensnippet:: docs/articles_en/assets/snippets/gpu/remote_objects_creation.cpp + :language: cpp + :fragment: [wrap_shared_handle] + + The ``shape`` and ``element type`` must describe the same memory layout as the external buffer. + The handle must remain valid for the whole lifetime of the created remote tensor. + .. tab-item:: biplanar NV12 surface :sync: biplanar-nv12-surface diff --git a/src/inference/include/openvino/runtime/intel_gpu/ocl/ocl.hpp b/src/inference/include/openvino/runtime/intel_gpu/ocl/ocl.hpp index da8c296db76df7..5308255349120d 100644 --- a/src/inference/include/openvino/runtime/intel_gpu/ocl/ocl.hpp +++ b/src/inference/include/openvino/runtime/intel_gpu/ocl/ocl.hpp @@ -38,8 +38,19 @@ namespace ocl { * @brief Shortcut for defining a handle parameter * @ingroup ov_runtime_ocl_gpu_cpp_api */ + using gpu_handle_param = void*; +/** + * @brief Shortcut for defining a HANDLE on windows or file descriptor on linux + * @ingroup ov_runtime_ocl_gpu_cpp_api + */ +#ifdef __linux__ +using os_handle_param = int; +#else +using os_handle_param = void*; +#endif + /** * @brief This class represents an abstraction for GPU plugin remote tensor * which can be shared with user-supplied OpenCL buffer. @@ -58,6 +69,7 @@ class ClBufferTensor : public RemoteTensor { {{std::string(ov::intel_gpu::mem_handle.name()), {}}, {std::string(ov::intel_gpu::shared_mem_type.name()), {ov::Any(ov::intel_gpu::SharedMemType::OCL_BUFFER).as(), + ov::Any(ov::intel_gpu::SharedMemType::BUFFER_FROM_HANDLE).as(), ov::Any(ov::intel_gpu::SharedMemType::DX_BUFFER).as()}}}); } @@ -307,6 +319,30 @@ class ClContext : public RemoteContext { return create_tensor(type, shape, params).as(); } + /** + * @brief This function is used to obtain a remote tensor object from a user-supplied external memory handle + * The API mirrors the NPU pointer-based create_tensor form. + * @param type Tensor element type + * @param shape Tensor shape + * @param shared_buffer External memory handle from another API (DX12 shared NT handle on Windows passed as void*, + * DMA-BUF fd on Linux passed as int) + * @param memory_type Memory type to use; only MemType::SHARED_BUF is currently supported + * @return A remote tensor instance + */ + ClBufferTensor create_tensor(const element::Type type, + const Shape& shape, + os_handle_param shared_buffer, + const MemType memory_type) { +#ifndef __linux__ + OPENVINO_ASSERT(shared_buffer != nullptr, "shared_buffer must not be nullptr for SHARED_BUF memory type"); +#endif + OPENVINO_ASSERT(memory_type == MemType::SHARED_BUF, + "Only SHARED_BUF memory type is supported for raw buffer pointer or NT handle"); + AnyMap params = {{ov::intel_gpu::shared_mem_type.name(), ov::intel_gpu::SharedMemType::BUFFER_FROM_HANDLE}, + {ov::intel_gpu::mem_handle.name(), reinterpret_cast(shared_buffer)}}; + return create_tensor(type, shape, params).as(); + } + /** * @brief This function is used to obtain remote tensor object from user-supplied USM pointer * @param type Tensor element type diff --git a/src/inference/include/openvino/runtime/intel_gpu/remote_properties.hpp b/src/inference/include/openvino/runtime/intel_gpu/remote_properties.hpp index c44c2d2f0d5f4b..3dc4cb3c6195db 100644 --- a/src/inference/include/openvino/runtime/intel_gpu/remote_properties.hpp +++ b/src/inference/include/openvino/runtime/intel_gpu/remote_properties.hpp @@ -103,13 +103,23 @@ static constexpr Property va_device{"VA_DEVICE"}; * @ingroup ov_runtime_ocl_gpu_cpp_api */ enum class SharedMemType { - OCL_BUFFER = 0, //!< Shared OpenCL buffer blob - OCL_IMAGE2D = 1, //!< Shared OpenCL 2D image blob - USM_USER_BUFFER = 2, //!< Shared USM pointer allocated by user - USM_HOST_BUFFER = 3, //!< Shared USM pointer type with host allocation type allocated by plugin - USM_DEVICE_BUFFER = 4, //!< Shared USM pointer type with device allocation type allocated by plugin - VA_SURFACE = 5, //!< Shared video decoder surface or D3D 2D texture blob - DX_BUFFER = 6 //!< Shared D3D buffer blob + OCL_BUFFER = 0, //!< Shared OpenCL buffer blob + OCL_IMAGE2D = 1, //!< Shared OpenCL 2D image blob + USM_USER_BUFFER = 2, //!< Shared USM pointer allocated by user + USM_HOST_BUFFER = 3, //!< Shared USM pointer type with host allocation type allocated by plugin + USM_DEVICE_BUFFER = 4, //!< Shared USM pointer type with device allocation type allocated by plugin + VA_SURFACE = 5, //!< Shared video decoder surface or D3D 2D texture blob + DX_BUFFER = 6, //!< Shared D3D buffer blob + BUFFER_FROM_HANDLE = 7, //!< OS-level external memory handle (e.g. DX12 NT handle on Windows, + //!< DMA-BUF fd on Linux) imported by the plugin into a cl_mem +}; + +/** + * @brief Enum to define memory type for pointer-based tensor sharing API. + * @ingroup ov_runtime_ocl_gpu_cpp_api + */ +enum class MemType { + SHARED_BUF = 0, //!< Shared OpenCL buffer handle passed as void* }; /** @cond INTERNAL */ @@ -129,6 +139,8 @@ inline std::ostream& operator<<(std::ostream& os, const SharedMemType& share_mem return os << "VA_SURFACE"; case SharedMemType::DX_BUFFER: return os << "DX_BUFFER"; + case SharedMemType::BUFFER_FROM_HANDLE: + return os << "BUFFER_FROM_HANDLE"; default: OPENVINO_THROW("Unsupported memory type"); } @@ -151,6 +163,8 @@ inline std::istream& operator>>(std::istream& is, SharedMemType& share_mem_type) share_mem_type = SharedMemType::VA_SURFACE; } else if (str == "DX_BUFFER") { share_mem_type = SharedMemType::DX_BUFFER; + } else if (str == "BUFFER_FROM_HANDLE") { + share_mem_type = SharedMemType::BUFFER_FROM_HANDLE; } else { OPENVINO_THROW("Unsupported memory type: ", str); } diff --git a/src/plugins/intel_gpu/include/intel_gpu/plugin/common_utils.hpp b/src/plugins/intel_gpu/include/intel_gpu/plugin/common_utils.hpp index 90066acfc649a6..a5f1d9e3379d0e 100644 --- a/src/plugins/intel_gpu/include/intel_gpu/plugin/common_utils.hpp +++ b/src/plugins/intel_gpu/include/intel_gpu/plugin/common_utils.hpp @@ -23,6 +23,7 @@ enum class TensorType { BT_EMPTY, BT_BUF_INTERNAL, BT_BUF_SHARED, + BT_BUF_SHARED_FROM_HANDLE, BT_USM_SHARED, BT_USM_HOST_INTERNAL, BT_USM_DEVICE_INTERNAL, diff --git a/src/plugins/intel_gpu/include/intel_gpu/runtime/engine.hpp b/src/plugins/intel_gpu/include/intel_gpu/runtime/engine.hpp index b15dac3e2ff7d6..ea400e30ba7006 100644 --- a/src/plugins/intel_gpu/include/intel_gpu/runtime/engine.hpp +++ b/src/plugins/intel_gpu/include/intel_gpu/runtime/engine.hpp @@ -67,6 +67,9 @@ class engine { /// Create shared memory object using user-supplied memory buffer @p buf using specified @p layout memory_ptr share_buffer(const layout& layout, shared_handle buf); + //Create memory object from user-supplied shared handle e.g from system HANDLE created by DX12 + virtual memory_ptr import_buffer(const layout& layout, shared_handle external_handle) = 0; + /// Create shared memory object using user-supplied USM pointer @p usm_ptr using specified @p layout memory_ptr share_usm(const layout& layout, shared_handle usm_ptr); diff --git a/src/plugins/intel_gpu/src/plugin/remote_context.cpp b/src/plugins/intel_gpu/src/plugin/remote_context.cpp index c59149c898d2a9..95578ba895e866 100644 --- a/src/plugins/intel_gpu/src/plugin/remote_context.cpp +++ b/src/plugins/intel_gpu/src/plugin/remote_context.cpp @@ -169,6 +169,9 @@ ov::SoPtr RemoteContextImpl::create_tensor(const ov::element: mem = extract_object(params, ov::intel_gpu::dev_object_handle); check_if_shared(); #endif + } else if (ov::intel_gpu::SharedMemType::BUFFER_FROM_HANDLE == mem_type) { + tensor_type = TensorType::BT_BUF_SHARED_FROM_HANDLE; + mem = extract_object(params, ov::intel_gpu::mem_handle); } else { OPENVINO_THROW("[GPU] Unsupported shared object type ", mem_type); } diff --git a/src/plugins/intel_gpu/src/plugin/remote_tensor.cpp b/src/plugins/intel_gpu/src/plugin/remote_tensor.cpp index c8de7996cf02ae..dc472f238c2e3e 100644 --- a/src/plugins/intel_gpu/src/plugin/remote_tensor.cpp +++ b/src/plugins/intel_gpu/src/plugin/remote_tensor.cpp @@ -340,6 +340,10 @@ void RemoteTensorImpl::allocate() { m_memory_object = engine.share_buffer(m_layout, m_mem); break; } + case TensorType::BT_BUF_SHARED_FROM_HANDLE: { + m_memory_object = engine.import_buffer(m_layout, m_mem); + break; + } case TensorType::BT_USM_SHARED: { m_memory_object = engine.share_usm(m_layout, m_mem); break; @@ -380,6 +384,7 @@ const std::string& RemoteTensorImpl::get_device_name() const { bool RemoteTensorImpl::is_shared() const noexcept { return m_mem_type == TensorType::BT_BUF_SHARED || + m_mem_type == TensorType::BT_BUF_SHARED_FROM_HANDLE || m_mem_type == TensorType::BT_USM_SHARED || m_mem_type == TensorType::BT_IMG_SHARED || m_mem_type == TensorType::BT_SURF_SHARED || @@ -451,6 +456,13 @@ void RemoteTensorImpl::update_properties() { ov::intel_gpu::mem_handle(params.mem), }; break; + case TensorType::BT_BUF_SHARED_FROM_HANDLE: + m_properties = { + ov::intel_gpu::shared_mem_type(ov::intel_gpu::SharedMemType::BUFFER_FROM_HANDLE), + ov::intel_gpu::ocl_context(params.context), + ov::intel_gpu::mem_handle(params.mem), + }; + break; case TensorType::BT_USM_SHARED: m_properties = { ov::intel_gpu::shared_mem_type(ov::intel_gpu::SharedMemType::USM_USER_BUFFER), diff --git a/src/plugins/intel_gpu/src/runtime/ocl/ocl_engine.cpp b/src/plugins/intel_gpu/src/runtime/ocl/ocl_engine.cpp index 141c3fc2ccc877..1ecfecee43f1dd 100644 --- a/src/plugins/intel_gpu/src/runtime/ocl/ocl_engine.cpp +++ b/src/plugins/intel_gpu/src/runtime/ocl/ocl_engine.cpp @@ -13,7 +13,6 @@ #include #include #include -#include #include // NOTE: Due to buggy scope transition of warnings we need to disable warning in place of use/instantation @@ -46,6 +45,16 @@ ocl_error::ocl_error(cl::Error const& err) : ov::Exception("[GPU] " + std::string(err.what()) + std::string(", error code: ") + std::to_string(err.err())) {} OPENVINO_SUPPRESS_DEPRECATED_END +namespace { +cl_platform_id get_platform_id_for_device(const cl::Device& device) { + cl_platform_id platform = nullptr; + cl_int err = clGetDeviceInfo(device.get(), CL_DEVICE_PLATFORM, sizeof(platform), &platform, nullptr); + OPENVINO_ASSERT(err == CL_SUCCESS && platform != nullptr, + "[GPU] Failed to retrieve CL_DEVICE_PLATFORM, error: ", err); + return platform; +} +} // namespace + ocl_engine::ocl_engine(const device::ptr dev, runtime_types runtime_type) : engine(dev) { OPENVINO_ASSERT(runtime_type == runtime_types::ocl, "[GPU] Invalid runtime type specified for OCL engine. Only OCL runtime is supported"); @@ -96,6 +105,63 @@ allocation_type ocl_engine::detect_usm_allocation_type(const void* memory) const : allocation_type::unknown; } +memory::ptr ocl_engine::import_buffer(const layout& layout, shared_handle external_handle) { + OPENVINO_ASSERT(external_handle != nullptr, "[GPU] External memory handle must not be null"); + OPENVINO_ASSERT(extension_supported("cl_khr_external_memory"), + "[GPU] Selected OpenCL device does not advertise cl_khr_external_memory; " + "external memory import is not supported"); + +#ifndef CL_VERSION_3_0 + OPENVINO_THROW("[GPU] External memory import is not supported on this platform"); +#else +#ifdef _WIN32 + constexpr auto handle_type_token = CL_EXTERNAL_MEMORY_HANDLE_OPAQUE_WIN32_KHR; +#elif defined(__linux__) + constexpr auto handle_type_token = CL_EXTERNAL_MEMORY_HANDLE_DMA_BUF_KHR; +#else + OPENVINO_THROW("[GPU] External memory import is not supported on this platform"); +#endif + + cl_mem_properties props[] = { + static_cast(handle_type_token), + static_cast(reinterpret_cast(external_handle)), + 0, + }; + + cl_int errcode = CL_SUCCESS; + auto cl_ctx = static_cast(get_user_context()); + OPENVINO_ASSERT(cl_ctx != nullptr, "[GPU] OpenCL context is null while importing external buffer"); + const auto byte_size = layout.bytes_count(); + cl_mem imported = clCreateBufferWithProperties(cl_ctx, props, CL_MEM_READ_WRITE, byte_size, nullptr, &errcode); + OPENVINO_ASSERT(errcode == CL_SUCCESS && imported != nullptr, + "[GPU] Failed to import external memory handle via clCreateBufferWithProperties, error: ", + errcode); + + cl_platform_id platform = get_platform_id_for_device(get_cl_device()); + auto& svc_stream = downcast(get_service_stream()); + cl_command_queue q = svc_stream.get_cl_queue().get(); + cl_int acquire_err = cl::ExternalMemoryHelper::acquire(platform, q, imported); + if (acquire_err != CL_SUCCESS) { + clReleaseMemObject(imported); + OPENVINO_THROW("[GPU] clEnqueueAcquireExternalMemObjectsKHR failed or unavailable, error: ", acquire_err); + } + clFinish(q); + cl::Buffer buf(imported, true); + auto memory = std::make_shared(this, layout, buf, nullptr); + clReleaseMemObject(imported); + return memory; +#endif +} + +void ocl_engine::release_external_memory(cl_mem mem) const { + cl_platform_id platform = get_platform_id_for_device(get_cl_device()); + auto& opencl_stream = downcast(get_service_stream()); + cl_command_queue q = opencl_stream.get_cl_queue().get(); + // If the extension entrypoint is missing, the cl_mem refcount drop on dtor will still proceed. + cl::ExternalMemoryHelper::release(platform, q, mem); + clFinish(q); +} + memory::ptr ocl_engine::allocate_memory(const layout& layout, allocation_type type, bool reset) { OPENVINO_ASSERT(!layout.is_dynamic() || layout.has_upper_bound(), "[GPU] Can't allocate memory for dynamic layout"); diff --git a/src/plugins/intel_gpu/src/runtime/ocl/ocl_engine.hpp b/src/plugins/intel_gpu/src/runtime/ocl/ocl_engine.hpp index df6c34c11b0c73..0c615a29587aba 100644 --- a/src/plugins/intel_gpu/src/runtime/ocl/ocl_engine.hpp +++ b/src/plugins/intel_gpu/src/runtime/ocl/ocl_engine.hpp @@ -10,8 +10,6 @@ #include "ocl_device.hpp" #include -#include -#include #include #include @@ -28,8 +26,11 @@ class ocl_engine : public engine { memory_ptr reinterpret_handle(const layout& new_layout, shared_mem_params params) override; memory_ptr create_subbuffer(const memory& memory, const layout& new_layout, size_t offset) override; memory_ptr reinterpret_buffer(const memory& memory, const layout& new_layout) override; + memory_ptr import_buffer(const layout&, shared_handle external_handle) override; bool is_the_same_buffer(const memory& mem1, const memory& mem2) override; + void release_external_memory(cl_mem) const; + void* get_user_context() const override; allocation_type get_default_allocation_type() const override { return allocation_type::cl_mem; } diff --git a/src/plugins/intel_gpu/src/runtime/ocl/ocl_ext.hpp b/src/plugins/intel_gpu/src/runtime/ocl/ocl_ext.hpp index c5609b8fdf6cfd..4f11d7d28ff015 100644 --- a/src/plugins/intel_gpu/src/runtime/ocl/ocl_ext.hpp +++ b/src/plugins/intel_gpu/src/runtime/ocl/ocl_ext.hpp @@ -693,6 +693,56 @@ class BufferDX : public Buffer { }; #endif +class ExternalMemoryHelper { + typedef CL_API_ENTRY cl_int(CL_API_CALL * PFN_clEnqueueAcquireExternalMemObjectsKHR)( + cl_command_queue /* command_queue */, + cl_uint /* num_mem_objects */, + const cl_mem* /* mem_objects */, + cl_uint /* num_events_in_wait_list */, + const cl_event* /* event_wait_list */, + cl_event* /* event */); + + typedef CL_API_ENTRY cl_int(CL_API_CALL * PFN_clEnqueueReleaseExternalMemObjectsKHR)( + cl_command_queue /* command_queue */, + cl_uint /* num_mem_objects */, + const cl_mem* /* mem_objects */, + cl_uint /* num_events_in_wait_list */, + const cl_event* /* event_wait_list */, + cl_event* /* event */); +public: + + static cl_int acquire(cl_platform_id platform, cl_command_queue queue, const cl_mem& mem) { + auto pfn = get_acquire(platform); + if (pfn == nullptr) + return CL_INVALID_OPERATION; + return pfn(queue, 1, &mem, 0, nullptr, nullptr); + } + + static cl_int release(cl_platform_id platform, cl_command_queue queue, const cl_mem& mem) { + auto pfn = get_release(platform); + if (pfn == nullptr) + return CL_INVALID_OPERATION; + return pfn(queue, 1, &mem, 0, nullptr, nullptr); + } +private: + static PFN_clEnqueueAcquireExternalMemObjectsKHR get_acquire(cl_platform_id platform) { + static PFN_clEnqueueAcquireExternalMemObjectsKHR fn = nullptr; + if (!fn) { + fn = try_load_entrypoint(platform, "clEnqueueAcquireExternalMemObjectsKHR"); + } + return fn; + } + + static PFN_clEnqueueReleaseExternalMemObjectsKHR get_release(cl_platform_id platform) { + static PFN_clEnqueueReleaseExternalMemObjectsKHR fn = nullptr; + if (!fn) { + fn = try_load_entrypoint(platform, "clEnqueueReleaseExternalMemObjectsKHR"); + } + return fn; + } + +}; + class PlatformVA : public Platform { public: //! \brief Default constructor - initializes to NULL. diff --git a/src/plugins/intel_gpu/src/runtime/ocl/ocl_memory.cpp b/src/plugins/intel_gpu/src/runtime/ocl/ocl_memory.cpp index 57eedf9bb0413c..59070067cc9a43 100644 --- a/src/plugins/intel_gpu/src/runtime/ocl/ocl_memory.cpp +++ b/src/plugins/intel_gpu/src/runtime/ocl/ocl_memory.cpp @@ -219,6 +219,11 @@ dnnl::memory gpu_buffer::get_onednn_grouped_memory(dnnl::memory::desc desc, cons } #endif +gpu_buffer_from_handle::~gpu_buffer_from_handle() { + auto cl_engine = downcast(_engine); + cl_engine->release_external_memory(static_cast(_buffer.get())); +} + gpu_image2d::gpu_image2d(ocl_engine* engine, const layout& layout) : lockable_gpu_mem() , memory(engine, layout, allocation_type::cl_mem, nullptr) diff --git a/src/plugins/intel_gpu/src/runtime/ocl/ocl_memory.hpp b/src/plugins/intel_gpu/src/runtime/ocl/ocl_memory.hpp index c942e1062836e8..a2cc3db172c294 100644 --- a/src/plugins/intel_gpu/src/runtime/ocl/ocl_memory.hpp +++ b/src/plugins/intel_gpu/src/runtime/ocl/ocl_memory.hpp @@ -31,7 +31,8 @@ struct lockable_gpu_mem { }; struct gpu_buffer : public lockable_gpu_mem, public memory { - gpu_buffer(ocl_engine* engine, const layout& new_layout, const cl::Buffer& buffer, std::shared_ptr mem_tracker); + gpu_buffer(ocl_engine* engine, const layout& new_layout, const cl::Buffer& buffer, + std::shared_ptr mem_tracker); gpu_buffer(ocl_engine* engine, const layout& layout); void* lock(const stream& stream, mem_lock_type type = mem_lock_type::read_write) override; @@ -60,6 +61,11 @@ struct gpu_buffer : public lockable_gpu_mem, public memory { cl::Buffer _buffer; }; +struct gpu_buffer_from_handle : public gpu_buffer { + using gpu_buffer::gpu_buffer; // constructor inheritance + ~gpu_buffer_from_handle() override; +}; + struct gpu_image2d : public lockable_gpu_mem, public memory { gpu_image2d(ocl_engine* engine, const layout& new_layout, const cl::Image2D& buffer, std::shared_ptr mem_tracker); gpu_image2d(ocl_engine* engine, const layout& layout); diff --git a/src/plugins/intel_gpu/src/runtime/ze/ze_engine.cpp b/src/plugins/intel_gpu/src/runtime/ze/ze_engine.cpp index 6619fcd15fe4db..d068f8149b75e0 100644 --- a/src/plugins/intel_gpu/src/runtime/ze/ze_engine.cpp +++ b/src/plugins/intel_gpu/src/runtime/ze/ze_engine.cpp @@ -97,6 +97,10 @@ memory::ptr ze_engine::allocate_memory(const layout& layout, allocation_type typ } } +memory::ptr ze_engine::import_buffer(const layout& layout, shared_handle external_handle) { + OPENVINO_NOT_IMPLEMENTED; +} + memory::ptr ze_engine::reinterpret_buffer(const memory& memory, const layout& new_layout) { OPENVINO_ASSERT(memory.get_engine() == this, "[GPU] trying to reinterpret buffer allocated by a different engine"); OPENVINO_ASSERT(new_layout.format.is_image() == memory.get_layout().format.is_image(), diff --git a/src/plugins/intel_gpu/src/runtime/ze/ze_engine.hpp b/src/plugins/intel_gpu/src/runtime/ze/ze_engine.hpp index 0af7a2aac12554..73210fa3698ba9 100644 --- a/src/plugins/intel_gpu/src/runtime/ze/ze_engine.hpp +++ b/src/plugins/intel_gpu/src/runtime/ze/ze_engine.hpp @@ -22,6 +22,7 @@ class ze_engine : public engine { runtime_types runtime_type() const override { return runtime_types::ze; }; memory_ptr allocate_memory(const layout& layout, allocation_type type, bool reset = true) override; + memory_ptr import_buffer(const layout& layout, shared_handle external_handle) override; memory_ptr reinterpret_handle(const layout& new_layout, shared_mem_params params) override; memory_ptr create_subbuffer(const memory& memory, const layout& new_layout, size_t byte_offset) override; memory_ptr reinterpret_buffer(const memory& memory, const layout& new_layout) override; diff --git a/src/plugins/intel_gpu/tests/functional/CMakeLists.txt b/src/plugins/intel_gpu/tests/functional/CMakeLists.txt index f4db96602ca416..29655d0aa4ce04 100644 --- a/src/plugins/intel_gpu/tests/functional/CMakeLists.txt +++ b/src/plugins/intel_gpu/tests/functional/CMakeLists.txt @@ -55,8 +55,24 @@ if(libva_FOUND) endif() if(WIN32) - target_compile_definitions(${TARGET_NAME} PRIVATE ENABLE_DX11) - target_link_libraries(${TARGET_NAME} PRIVATE d3d11 dxgi) + include(CheckIncludeFileCXX) + # DX11 and DX12 SDK headers may be available independently; enable each + # interop test set only when its corresponding header is present to avoid + # build breaks on environments that ship only one of the SDKs. + check_include_file_cxx(d3d11.h OV_GPU_FUNC_TESTS_HAVE_D3D11_H) + check_include_file_cxx(d3d12.h OV_GPU_FUNC_TESTS_HAVE_D3D12_H) + + if(OV_GPU_FUNC_TESTS_HAVE_D3D11_H) + target_compile_definitions(${TARGET_NAME} PRIVATE ENABLE_DX11) + target_link_libraries(${TARGET_NAME} PRIVATE d3d11) + endif() + if(OV_GPU_FUNC_TESTS_HAVE_D3D12_H) + target_compile_definitions(${TARGET_NAME} PRIVATE ENABLE_DX12) + target_link_libraries(${TARGET_NAME} PRIVATE d3d12) + endif() + if(OV_GPU_FUNC_TESTS_HAVE_D3D11_H OR OV_GPU_FUNC_TESTS_HAVE_D3D12_H) + target_link_libraries(${TARGET_NAME} PRIVATE dxgi dxguid) + endif() endif() ov_build_target_faster(${TARGET_NAME} PCH) diff --git a/src/plugins/intel_gpu/tests/functional/remote_tensor_tests/dx11_nthandle.cpp b/src/plugins/intel_gpu/tests/functional/remote_tensor_tests/dx11_nthandle.cpp new file mode 100644 index 00000000000000..a758d2a34b348c --- /dev/null +++ b/src/plugins/intel_gpu/tests/functional/remote_tensor_tests/dx11_nthandle.cpp @@ -0,0 +1,302 @@ +// Copyright (C) 2018-2026 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#if defined(OV_GPU_WITH_OCL_RT) && defined(_WIN32) && defined(ENABLE_DX11) +#include +#include +#include +#include + +#ifndef NOMINMAX +#define NOMINMAX +#define NOMINMAX_DEFINED_SHARED_BUF_TEST +#endif +#include +#include +#include +#ifdef NOMINMAX_DEFINED_SHARED_BUF_TEST +#undef NOMINMAX +#undef NOMINMAX_DEFINED_SHARED_BUF_TEST +#endif +#include "openvino/runtime/core.hpp" +#include "openvino/runtime/intel_gpu/ocl/ocl.hpp" +#include "openvino/op/add.hpp" +#include "openvino/op/constant.hpp" +#include "openvino/op/parameter.hpp" +#include "openvino/op/result.hpp" + +namespace { +bool get_context_device_luid(cl_context cl_ctx, std::array& cl_luid) { + size_t devices_size = 0; + if (clGetContextInfo(cl_ctx, CL_CONTEXT_DEVICES, 0, nullptr, &devices_size) != CL_SUCCESS || + devices_size < sizeof(cl_device_id)) { + return false; + } + + std::vector cl_devices(devices_size / sizeof(cl_device_id)); + if (clGetContextInfo(cl_ctx, CL_CONTEXT_DEVICES, devices_size, cl_devices.data(), nullptr) != CL_SUCCESS || + cl_devices.empty()) { + return false; + } + + cl_bool cl_luid_valid = CL_FALSE; + if (clGetDeviceInfo(cl_devices[0], CL_DEVICE_LUID_VALID_KHR, sizeof(cl_luid_valid), &cl_luid_valid, nullptr) != + CL_SUCCESS || + cl_luid_valid != CL_TRUE) { + return false; + } + + return clGetDeviceInfo(cl_devices[0], CL_DEVICE_LUID_KHR, cl_luid.size(), cl_luid.data(), nullptr) == CL_SUCCESS; +} + +// Keep data unchanged while still forcing an explicit output tensor write path. +std::shared_ptr make_copy_model(const ov::Shape& shape) { + auto param = std::make_shared(ov::element::f32, shape); + auto zero = ov::op::v0::Constant::create(ov::element::f32, ov::Shape{1}, {0.0f}); + auto add = std::make_shared(param, zero); + auto result = std::make_shared(add); + return std::make_shared(ov::ResultVector{result}, ov::ParameterVector{param}); +} + + +struct Dx11TestContext { + CComPtr device; + CComPtr device_ctx; +}; + +struct Dx11SharedBuffer { + CComPtr buffer; + HANDLE shared_handle = nullptr; +}; + +void close_nt_handle(HANDLE& handle) { + if (handle != nullptr) { + CloseHandle(handle); + handle = nullptr; + } +} + +struct NtHandleGuard { + HANDLE& handle; + + ~NtHandleGuard() { + close_nt_handle(handle); + } +}; + +Dx11TestContext create_dx11_test_context(const std::array& target_luid) { + IDXGIFactory* raw_factory = nullptr; + HRESULT hr = CreateDXGIFactory(__uuidof(IDXGIFactory), reinterpret_cast(&raw_factory)); + EXPECT_FALSE(FAILED(hr)); + if (FAILED(hr)) { + return {}; + } + CComPtr factory(raw_factory); + if (!factory) { + return {}; + } + + UINT adapter_index = 0; + IDXGIAdapter* raw_adapter = nullptr; + while (factory->EnumAdapters(adapter_index, &raw_adapter) != DXGI_ERROR_NOT_FOUND) { + CComPtr adapter(raw_adapter); + DXGI_ADAPTER_DESC desc{}; + adapter->GetDesc(&desc); + + std::array adapter_luid{}; + memcpy(adapter_luid.data(), &desc.AdapterLuid, sizeof(desc.AdapterLuid)); + if (memcmp(adapter_luid.data(), target_luid.data(), target_luid.size()) != 0) { + ++adapter_index; + continue; + } + + D3D_FEATURE_LEVEL feature_levels[] = {D3D_FEATURE_LEVEL_11_1, D3D_FEATURE_LEVEL_11_0}; + D3D_FEATURE_LEVEL feature_level; + ID3D11Device* raw_device = nullptr; + ID3D11DeviceContext* raw_ctx = nullptr; + hr = D3D11CreateDevice(adapter, + D3D_DRIVER_TYPE_UNKNOWN, + nullptr, + 0, + feature_levels, + ARRAYSIZE(feature_levels), + D3D11_SDK_VERSION, + &raw_device, + &feature_level, + &raw_ctx); + if (FAILED(hr)) { + return {}; + } + + return {CComPtr(raw_device), CComPtr(raw_ctx)}; + } + + return {}; +} + +Dx11SharedBuffer create_dx11_shared_buffer(ID3D11Device* device, size_t byte_size, const void* data = nullptr) { + // D3D11 does not allow SHARED_NTHANDLE on ID3D11Buffer; use an R32_FLOAT 4x4 Texture2D as backing storage. + const UINT element_count = static_cast(byte_size / sizeof(float)); + const UINT tex_width = 4; + const UINT tex_height = element_count / tex_width; + D3D11_TEXTURE2D_DESC desc{}; + desc.Width = tex_width; + desc.Height = tex_height; + desc.MipLevels = 1; + desc.ArraySize = 1; + desc.Format = DXGI_FORMAT_R32_FLOAT; + desc.SampleDesc.Count = 1; + desc.SampleDesc.Quality = 0; + desc.Usage = D3D11_USAGE_DEFAULT; + // Keep UAV-capable shared buffer; CPU writes are done via UpdateSubresource. + desc.BindFlags = D3D11_BIND_UNORDERED_ACCESS; + desc.CPUAccessFlags = 0; + desc.MiscFlags = D3D11_RESOURCE_MISC_SHARED_NTHANDLE | D3D11_RESOURCE_MISC_SHARED; + + D3D11_SUBRESOURCE_DATA init_data{}; + init_data.pSysMem = data; + init_data.SysMemPitch = tex_width * static_cast(sizeof(float)); + init_data.SysMemSlicePitch = init_data.SysMemPitch * tex_height; + + ID3D11Texture2D* raw_texture = nullptr; + HRESULT hr = device->CreateTexture2D(&desc, data ? &init_data : nullptr, &raw_texture); + + if (FAILED(hr)) { + return {}; + } + CComPtr shared_texture(raw_texture); + + HANDLE shared_handle = nullptr; + CComPtr dxgi_resource; + hr = shared_texture->QueryInterface(__uuidof(IDXGIResource1), reinterpret_cast(&dxgi_resource)); + if (FAILED(hr)) { + return {}; + } + if (dxgi_resource) { + hr = dxgi_resource->CreateSharedHandle(nullptr, + DXGI_SHARED_RESOURCE_READ | DXGI_SHARED_RESOURCE_WRITE, + nullptr, + &shared_handle); + } + if (FAILED(hr)) { + return {}; + } + if (shared_handle == nullptr) { + return {}; + } + + return {shared_texture, shared_handle}; +} + +CComPtr open_dx11_shared_buffer(ID3D11Device* device, HANDLE shared_handle) { + CComPtr device1; + HRESULT hr = device->QueryInterface(__uuidof(ID3D11Device1), reinterpret_cast(&device1)); + EXPECT_FALSE(FAILED(hr)); + ID3D11Texture2D* raw_opened_texture = nullptr; + hr = device1->OpenSharedResource1(shared_handle, + __uuidof(ID3D11Texture2D), + reinterpret_cast(&raw_opened_texture)); + if(FAILED(hr)) { + return {}; + } + return CComPtr(raw_opened_texture); +} + +TEST(GpuSharedBufferRemoteTensor, smoke_Dx11RemoteInputToRemoteOutputCopyAndCompare) { +#ifndef CL_VERSION_3_0 + GTEST_SKIP() << "OpenCL version 3.0 is required for external memory sharing"; +#endif + //test work on 32.101.7076 - not tried with older driver + ov::Core core; + const ov::Shape shape{16}; + const size_t element_count = ov::shape_size(shape); + const size_t byte_size = element_count * sizeof(float); + + // Declare GPU device number + const std::string selected_gpu_id = "0"; + const std::string selected_gpu_device = "GPU." + selected_gpu_id; + + // Get OpenCL context for the selected GPU + auto candidate_ctx = core.get_default_context(selected_gpu_device).as(); + auto params = candidate_ctx.get_params(); + auto it = params.find(ov::intel_gpu::ocl_context.name()); + if (it == params.end()) { + GTEST_SKIP() << "Failed to get OpenCL context for " << selected_gpu_device; + } + + // Extract LUID from OpenCL context + auto cl_ctx = static_cast(it->second.as()); + std::array cl_luid{}; + if (!get_context_device_luid(cl_ctx, cl_luid)) { + GTEST_SKIP() << "Failed to get LUID for " << selected_gpu_device; + } + + // Create DX11 context for the selected GPU's LUID + Dx11TestContext dx11 = create_dx11_test_context(cl_luid); + if (!dx11.device) { + GTEST_SKIP() << "Failed to create DX11 context for " << selected_gpu_device; + } + + std::vector input_init(element_count, 2.0f); + auto dx_input_shared = create_dx11_shared_buffer(dx11.device, byte_size, input_init.data()); + NtHandleGuard input_handle_guard{dx_input_shared.shared_handle}; + std::vector output_init(element_count, 0.0f); + auto dx_output_shared = create_dx11_shared_buffer(dx11.device, byte_size, output_init.data()); + NtHandleGuard output_handle_guard{dx_output_shared.shared_handle}; + + auto dx_input_buffer = open_dx11_shared_buffer(dx11.device, + dx_input_shared.shared_handle); + if (dx_input_buffer == nullptr) { + GTEST_SKIP() << "Failed to open shared input buffer in DX11 context for " << selected_gpu_device; + } + + auto dx_output_buffer = open_dx11_shared_buffer(dx11.device, + dx_output_shared.shared_handle); + if (dx_output_buffer == nullptr) { + GTEST_SKIP() << "Failed to open shared output buffer in DX11 context for " << selected_gpu_device; + } + + // Initialize opened shared input texture explicitly to avoid driver-dependent init visibility. + const UINT row_pitch = 4u * static_cast(sizeof(float)); // 4 floats per row + dx11.device_ctx->UpdateSubresource(dx_input_buffer, + 0, + nullptr, + input_init.data(), + row_pitch, + static_cast(byte_size)); + dx11.device_ctx->Flush(); + + auto ocl_ctx = core.get_default_context(selected_gpu_device).as(); + + auto remote_input_tensor = ocl_ctx.create_tensor(ov::element::f32, + shape, + dx_input_shared.shared_handle, + ov::intel_gpu::MemType::SHARED_BUF); + auto remote_output_tensor = ocl_ctx.create_tensor(ov::element::f32, + shape, + dx_output_shared.shared_handle, + ov::intel_gpu::MemType::SHARED_BUF); + + auto model = make_copy_model(shape); + auto compiled = core.compile_model(model, ocl_ctx); + auto infer_req = compiled.create_infer_request(); + infer_req.set_tensor(compiled.input(), remote_input_tensor); + infer_req.set_tensor(compiled.output(), remote_output_tensor); + + ov::Tensor host_input(ov::element::f32, shape); + remote_input_tensor.copy_to(host_input); + const auto* input_values = host_input.data(); + for (size_t i = 0; i < element_count; ++i) { + EXPECT_FLOAT_EQ(input_values[i], 2.0f) << "Input mismatch at index " << i; + } + infer_req.infer(); + ov::Tensor host_output(ov::element::f32, shape); + remote_output_tensor.copy_to(host_output); + const auto* output_values = host_output.data(); + for (size_t i = 0; i < element_count; ++i) { + EXPECT_FLOAT_EQ(output_values[i], 2.0f) << "Mismatch at index " << i; + } +} +} // namespace +#endif diff --git a/src/plugins/intel_gpu/tests/functional/remote_tensor_tests/dx12_nthandle.cpp b/src/plugins/intel_gpu/tests/functional/remote_tensor_tests/dx12_nthandle.cpp new file mode 100644 index 00000000000000..305250d0b67b2d --- /dev/null +++ b/src/plugins/intel_gpu/tests/functional/remote_tensor_tests/dx12_nthandle.cpp @@ -0,0 +1,328 @@ +// Copyright (C) 2018-2026 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#if defined(OV_GPU_WITH_OCL_RT) && defined(_WIN32) && defined(ENABLE_DX12) +#include +#include +#include +#include +#include + + +#ifndef NOMINMAX +#define NOMINMAX +#define NOMINMAX_DEFINED_SHARED_BUF_TEST +#endif +#include +#include +#include +#ifdef NOMINMAX_DEFINED_SHARED_BUF_TEST +#undef NOMINMAX +#undef NOMINMAX_DEFINED_SHARED_BUF_TEST +#endif + +#include "openvino/runtime/core.hpp" +#include "openvino/runtime/intel_gpu/ocl/ocl.hpp" +#include "openvino/op/add.hpp" +#include "openvino/op/constant.hpp" +#include "openvino/op/parameter.hpp" +#include "openvino/op/result.hpp" + +namespace { +bool get_context_device_luid(cl_context cl_ctx, std::array& cl_luid) { + size_t devices_size = 0; + if (clGetContextInfo(cl_ctx, CL_CONTEXT_DEVICES, 0, nullptr, &devices_size) != CL_SUCCESS || + devices_size < sizeof(cl_device_id)) { + return false; + } + + std::vector cl_devices(devices_size / sizeof(cl_device_id)); + if (clGetContextInfo(cl_ctx, CL_CONTEXT_DEVICES, devices_size, cl_devices.data(), nullptr) != CL_SUCCESS || + cl_devices.empty()) { + return false; + } + + cl_bool cl_luid_valid = CL_FALSE; + if (clGetDeviceInfo(cl_devices[0], CL_DEVICE_LUID_VALID_KHR, sizeof(cl_luid_valid), &cl_luid_valid, nullptr) != CL_SUCCESS || + cl_luid_valid != CL_TRUE) { + return false; + } + + return clGetDeviceInfo(cl_devices[0], CL_DEVICE_LUID_KHR, cl_luid.size(), cl_luid.data(), nullptr) == CL_SUCCESS; +} + +// Keep data unchanged while still forcing an explicit output tensor write path. +std::shared_ptr make_copy_model(const ov::Shape& shape) { + auto param = std::make_shared(ov::element::f32, shape); + auto zero = ov::op::v0::Constant::create(ov::element::f32, ov::Shape{1}, {0.0f}); + auto add = std::make_shared(param, zero); + auto result = std::make_shared(add); + return std::make_shared(ov::ResultVector{result}, ov::ParameterVector{param}); +} + + +struct Dx12TestContext { + CComPtr adapter; + CComPtr device; + CComPtr command_queue; +}; + +struct Dx12SharedBuffer { + CComPtr resource; + HANDLE shared_handle = nullptr; // NT handle; caller must CloseHandle when done +}; + + +static bool gpu_wait(ID3D12CommandQueue* command_queue, ID3D12Device* device) { + ID3D12Fence* raw_fence = nullptr; + HRESULT hr = device->CreateFence(0, D3D12_FENCE_FLAG_NONE, IID_PPV_ARGS(&raw_fence)); + if (FAILED(hr)) return false; + CComPtr fence(raw_fence); + + HANDLE event = CreateEvent(nullptr, FALSE, FALSE, nullptr); + if (!event) return false; + + const UINT64 fence_value = 1; + command_queue->Signal(fence, fence_value); + if (fence->GetCompletedValue() < fence_value) { + fence->SetEventOnCompletion(fence_value, event); + WaitForSingleObject(event, INFINITE); + } + CloseHandle(event); + return true; +} + +Dx12TestContext create_dx12_test_context(const std::array& target_luid) { + IDXGIFactory4* raw_factory = nullptr; + HRESULT hr = CreateDXGIFactory1(IID_PPV_ARGS(&raw_factory)); + if (FAILED(hr)) { + return {}; + } + CComPtr factory(raw_factory); + if (!factory) { + return {}; + } + + UINT adapter_index = 0; + IDXGIAdapter1* raw_adapter = nullptr; + while (factory->EnumAdapters1(adapter_index, &raw_adapter) != DXGI_ERROR_NOT_FOUND) { + CComPtr adapter(raw_adapter); + DXGI_ADAPTER_DESC1 desc{}; + adapter->GetDesc1(&desc); + + std::array adapter_luid{}; + memcpy(adapter_luid.data(), &desc.AdapterLuid, sizeof(desc.AdapterLuid)); + if ((desc.Flags & DXGI_ADAPTER_FLAG_SOFTWARE) || + memcmp(adapter_luid.data(), target_luid.data(), target_luid.size()) != 0) { + ++adapter_index; + continue; + } + + ID3D12Device* raw_device = nullptr; + hr = D3D12CreateDevice(adapter, D3D_FEATURE_LEVEL_12_0, IID_PPV_ARGS(&raw_device)); + if (FAILED(hr)) { + return {}; + } + CComPtr device(raw_device); + + D3D12_COMMAND_QUEUE_DESC queue_desc{}; + queue_desc.Type = D3D12_COMMAND_LIST_TYPE_DIRECT; + ID3D12CommandQueue* raw_queue = nullptr; + hr = device->CreateCommandQueue(&queue_desc, IID_PPV_ARGS(&raw_queue)); + if (FAILED(hr)) { + return {}; + } + return {adapter, device, CComPtr(raw_queue)}; + } + + return {}; +} + +Dx12SharedBuffer create_dx12_shared_buffer(ID3D12Device* device, + ID3D12CommandQueue* command_queue, + size_t byte_size, + const void* data = nullptr) { + D3D12_HEAP_PROPERTIES heap_props{}; + heap_props.Type = D3D12_HEAP_TYPE_DEFAULT; + + D3D12_RESOURCE_DESC resource_desc{}; + resource_desc.Dimension = D3D12_RESOURCE_DIMENSION_BUFFER; + resource_desc.Alignment = 0; + resource_desc.Width = byte_size; + resource_desc.Height = 1; + resource_desc.DepthOrArraySize = 1; + resource_desc.MipLevels = 1; + resource_desc.Format = DXGI_FORMAT_UNKNOWN; + resource_desc.SampleDesc.Count = 1; + resource_desc.SampleDesc.Quality = 0; + resource_desc.Layout = D3D12_TEXTURE_LAYOUT_ROW_MAJOR; + resource_desc.Flags = D3D12_RESOURCE_FLAG_NONE; + + ID3D12Resource* raw_resource = nullptr; + HRESULT hr = device->CreateCommittedResource(&heap_props, + D3D12_HEAP_FLAG_SHARED, + &resource_desc, + D3D12_RESOURCE_STATE_COMMON, + nullptr, + IID_PPV_ARGS(&raw_resource)); + if(FAILED(hr)) { + return {}; + } + CComPtr resource(raw_resource); + if (!resource) { + return {}; + } + + HANDLE shared_handle = nullptr; + hr = device->CreateSharedHandle(resource, nullptr, GENERIC_ALL, nullptr, &shared_handle); + if (FAILED(hr)) { + return {}; + } + if (shared_handle == nullptr) { + return {}; + } + + if (data && resource) { + D3D12_HEAP_PROPERTIES upload_heap{}; + upload_heap.Type = D3D12_HEAP_TYPE_UPLOAD; + + D3D12_RESOURCE_DESC upload_desc = resource_desc; + upload_desc.Flags = D3D12_RESOURCE_FLAG_NONE; + + ID3D12Resource* raw_upload = nullptr; + hr = device->CreateCommittedResource(&upload_heap, + D3D12_HEAP_FLAG_NONE, + &upload_desc, + D3D12_RESOURCE_STATE_GENERIC_READ, + nullptr, + IID_PPV_ARGS(&raw_upload)); + if (FAILED(hr)) { + return {}; + } + CComPtr upload_resource(raw_upload); + + if (upload_resource) { + void* mapped = nullptr; + D3D12_RANGE read_range{0, 0}; + upload_resource->Map(0, &read_range, &mapped); + memcpy(mapped, data, byte_size); + upload_resource->Unmap(0, nullptr); + + ID3D12CommandAllocator* raw_allocator = nullptr; + device->CreateCommandAllocator(D3D12_COMMAND_LIST_TYPE_DIRECT, IID_PPV_ARGS(&raw_allocator)); + CComPtr allocator(raw_allocator); + + ID3D12GraphicsCommandList* raw_cmd_list = nullptr; + device->CreateCommandList(0, D3D12_COMMAND_LIST_TYPE_DIRECT, allocator, nullptr, + IID_PPV_ARGS(&raw_cmd_list)); + CComPtr cmd_list(raw_cmd_list); + + D3D12_RESOURCE_BARRIER barrier{}; + barrier.Type = D3D12_RESOURCE_BARRIER_TYPE_TRANSITION; + barrier.Flags = D3D12_RESOURCE_BARRIER_FLAG_NONE; + barrier.Transition.pResource = resource; + barrier.Transition.StateBefore = D3D12_RESOURCE_STATE_COMMON; + barrier.Transition.StateAfter = D3D12_RESOURCE_STATE_COPY_DEST; + barrier.Transition.Subresource = D3D12_RESOURCE_BARRIER_ALL_SUBRESOURCES; + cmd_list->ResourceBarrier(1, &barrier); + + cmd_list->CopyBufferRegion(resource, 0, upload_resource, 0, byte_size); + + barrier.Transition.StateBefore = D3D12_RESOURCE_STATE_COPY_DEST; + barrier.Transition.StateAfter = D3D12_RESOURCE_STATE_COMMON; + cmd_list->ResourceBarrier(1, &barrier); + cmd_list->Close(); + + ID3D12CommandList* cmd_lists[] = {cmd_list}; + command_queue->ExecuteCommandLists(1, cmd_lists); + gpu_wait(command_queue, device); + } + } + return {resource, shared_handle}; +} + +TEST(GpuSharedBufferRemoteTensor, smoke_Dx12RemoteInputToRemoteOutputCopyAndCompare) { +#ifndef CL_VERSION_3_0 + GTEST_SKIP() << "OpenCL version 3.0 is required for external memory sharing"; +#endif + //test work on 32.101.7076 - not tried with older driver + ov::Core core; + const ov::Shape shape{16'000}; + const size_t element_count = ov::shape_size(shape); + const size_t byte_size = element_count * sizeof(float); + + // Declare GPU device number + const std::string selected_gpu_id = "0"; + const std::string selected_gpu_device = "GPU." + selected_gpu_id; + + // Get OpenCL context for the selected GPU + auto candidate_ctx = core.get_default_context(selected_gpu_device).as(); + auto params = candidate_ctx.get_params(); + auto it = params.find(ov::intel_gpu::ocl_context.name()); + if (it == params.end()) { + GTEST_SKIP() << "Failed to get OpenCL context for " << selected_gpu_device; + } + + // Extract LUID from OpenCL context + auto cl_ctx = static_cast(it->second.as()); + std::array cl_luid{}; + if (!get_context_device_luid(cl_ctx, cl_luid)) { + GTEST_SKIP() << "Failed to get LUID for " << selected_gpu_device; + } + // Create DX12 context for the selected GPU's LUID + Dx12TestContext dx12 = create_dx12_test_context(cl_luid); + if (!dx12.device) { + GTEST_SKIP() << "Failed to create DX12 context for " << selected_gpu_device; + } + + std::vector input_init(element_count, 2.0f); + auto dx_input_shared = create_dx12_shared_buffer(dx12.device, dx12.command_queue, + byte_size, input_init.data()); + std::vector output_init(element_count, 0.0f); + auto dx_output_shared = create_dx12_shared_buffer(dx12.device, dx12.command_queue, byte_size); + ASSERT_NE(dx_input_shared.shared_handle, nullptr); + ASSERT_NE(dx_output_shared.shared_handle, nullptr); + + DXGI_ADAPTER_DESC1 dxgi_desc{}; + dx12.adapter->GetDesc1(&dxgi_desc); + std::array dxgi_luid{}; + memcpy(dxgi_luid.data(), &dxgi_desc.AdapterLuid, sizeof(dxgi_desc.AdapterLuid)); + auto ov_ctx = core.get_default_context(selected_gpu_device).as(); + + ov::RemoteTensor remote_input_tensor; + ov::RemoteTensor remote_output_tensor; + + remote_input_tensor = ov_ctx.create_tensor(ov::element::f32, shape, + dx_input_shared.shared_handle, + ov::intel_gpu::MemType::SHARED_BUF); + remote_output_tensor = ov_ctx.create_tensor(ov::element::f32, shape, + dx_output_shared.shared_handle, + ov::intel_gpu::MemType::SHARED_BUF); + + auto model = make_copy_model(shape); + auto compiled = core.compile_model(model, ov_ctx); + auto infer_req = compiled.create_infer_request(); + infer_req.set_tensor(compiled.input(), remote_input_tensor); + infer_req.set_tensor(compiled.output(), remote_output_tensor); + ov::Tensor host_input(ov::element::f32, shape); + remote_input_tensor.copy_to(host_input); + const auto* input_values = host_input.data(); + for (size_t i = 0; i < element_count; ++i) { + EXPECT_FLOAT_EQ(input_values[i], 2.0f) << "Input mismatch at index " << i; + } + infer_req.infer(); + ov::Tensor host_output(ov::element::f32, shape); + remote_output_tensor.copy_to(host_output); + const auto* output_values = host_output.data(); + for (size_t i = 0; i < element_count; ++i) { + EXPECT_FLOAT_EQ(output_values[i], 2.0f) << "Mismatch at index " << i; + } + + CloseHandle(dx_input_shared.shared_handle); + dx_input_shared.shared_handle = nullptr; + CloseHandle(dx_output_shared.shared_handle); + dx_output_shared.shared_handle = nullptr; +} +} // namespace +#endif diff --git a/src/plugins/intel_gpu/tests/functional/remote_tensor_tests/dx12_remote_run.cpp b/src/plugins/intel_gpu/tests/functional/remote_tensor_tests/dx12_remote_run.cpp new file mode 100644 index 00000000000000..41383feea9a0bc --- /dev/null +++ b/src/plugins/intel_gpu/tests/functional/remote_tensor_tests/dx12_remote_run.cpp @@ -0,0 +1,369 @@ +// Copyright (C) 2018-2026 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#if defined(_WIN32) && defined(ENABLE_DX12) +#include +#include +#include + +#include +#include "openvino/core/any.hpp" +#include "openvino/core/memory_util.hpp" +#include "openvino/runtime/compiled_model.hpp" +#include "openvino/runtime/core.hpp" +#include "openvino/runtime/intel_gpu/ocl/ocl.hpp" +#include "openvino/runtime/intel_gpu/remote_properties.hpp" +#include "openvino/op/add.hpp" +#include "openvino/op/constant.hpp" +#include "openvino/op/parameter.hpp" +#include "openvino/op/result.hpp" +#include "shared_test_classes/base/ov_behavior_test_utils.hpp" + +#include +#include +#include +#include +#include +#include + +using CompilationParams = std::tuple; + +namespace { + +std::shared_ptr make_model() { + std::vector inputShape = {1, 2, 32, 32}; + ov::element::Type_t ngPrc = ov::element::Type_t::f32; + return ov::test::utils::make_conv_pool_relu(inputShape, ngPrc); +} + +class DX12RemoteRunTests : public ov::test::behavior::OVPluginTestBase, + public testing::WithParamInterface { +protected: + std::shared_ptr core = ov::test::utils::PluginCache::get().core(); + ov::AnyMap configuration; + std::shared_ptr ov_model; + + Microsoft::WRL::ComPtr device; + Microsoft::WRL::ComPtr heap = nullptr; + Microsoft::WRL::ComPtr placed_resources = nullptr; + Microsoft::WRL::ComPtr comitted_resource; + + HANDLE shared_mem = nullptr; + +public: + static std::string getTestCaseName(const testing::TestParamInfo& obj) { + std::string targetDevice; + ov::AnyMap configuration; + std::tie(targetDevice, configuration) = obj.param; + std::replace(targetDevice.begin(), targetDevice.end(), ':', '_'); + targetDevice = "GPU"; + + std::ostringstream result; + result << "targetDevice=" << targetDevice << "_"; + if (!configuration.empty()) { + for (auto& configItem : configuration) { + result << "configItem=" << configItem.first << "_"; + configItem.second.print(result); + } + } + + return result.str(); + } + + void SetUp() override { +#ifndef CL_VERSION_3_0 + GTEST_SKIP() << "OpenCL version 3.0 is required for external memory sharing"; +#endif + //tests works on 32.101.7076 - not tried with older driver + std::tie(target_device, configuration) = this->GetParam(); + SKIP_IF_CURRENT_TEST_IS_DISABLED() + OVPluginTestBase::SetUp(); + ov_model = make_model(); + createDevice(); + } + + void TearDown() override { + if (!configuration.empty()) { + ov::test::utils::PluginCache::get().reset(); + } + + APIBaseTest::TearDown(); + } + + void createDevice() { + auto res = D3D12CreateDevice(nullptr, D3D_FEATURE_LEVEL_12_0, IID_PPV_ARGS(device.ReleaseAndGetAddressOf())); + if(FAILED(res)) { + GTEST_SKIP() << "D3D12CreateDevice failed"; + } + } + + void createHeap(const size_t byte_size) { + const size_t size = (byte_size + (static_cast(D3D12_DEFAULT_RESOURCE_PLACEMENT_ALIGNMENT) - 1)) & + ~(static_cast(D3D12_DEFAULT_RESOURCE_PLACEMENT_ALIGNMENT) - 1); + + D3D12_HEAP_DESC desc_heap{}; + desc_heap.SizeInBytes = size; + desc_heap.Properties.Type = D3D12_HEAP_TYPE_CUSTOM; + desc_heap.Properties.CPUPageProperty = D3D12_CPU_PAGE_PROPERTY_NOT_AVAILABLE; + desc_heap.Properties.MemoryPoolPreference = D3D12_MEMORY_POOL_L0; + desc_heap.Properties.CreationNodeMask = 1; + desc_heap.Properties.VisibleNodeMask = 1; + desc_heap.Alignment = D3D12_DEFAULT_RESOURCE_PLACEMENT_ALIGNMENT; + desc_heap.Flags = D3D12_HEAP_FLAG_SHARED_CROSS_ADAPTER | D3D12_HEAP_FLAG_SHARED; + auto res = device->CreateHeap(&desc_heap, IID_PPV_ARGS(heap.ReleaseAndGetAddressOf())); + if(FAILED(res)) { + GTEST_SKIP() << "CreateHeap failed."; + } + + res = device->CreateSharedHandle(heap.Get(), nullptr, GENERIC_ALL, nullptr, &shared_mem); + if(FAILED(res)) { + GTEST_SKIP() << "CreateSharedHandle failed."; + } + } + + void createPlacedResources(const size_t byte_size) { + D3D12_RESOURCE_DESC desc_resource{}; + desc_resource.Dimension = D3D12_RESOURCE_DIMENSION_BUFFER; + desc_resource.Alignment = D3D12_DEFAULT_RESOURCE_PLACEMENT_ALIGNMENT; + desc_resource.Width = byte_size; + desc_resource.Height = 1; + desc_resource.DepthOrArraySize = 1; + desc_resource.MipLevels = 1; + desc_resource.Format = DXGI_FORMAT_UNKNOWN; + desc_resource.SampleDesc.Count = 1; + desc_resource.Layout = D3D12_TEXTURE_LAYOUT_ROW_MAJOR; + desc_resource.Flags = D3D12_RESOURCE_FLAG_ALLOW_CROSS_ADAPTER | D3D12_RESOURCE_FLAG_ALLOW_UNORDERED_ACCESS; + auto res = device->CreatePlacedResource(heap.Get(), + 0, + &desc_resource, + D3D12_RESOURCE_STATE_UNORDERED_ACCESS, + nullptr, + IID_PPV_ARGS(placed_resources.ReleaseAndGetAddressOf())); + if(FAILED(res)) { + GTEST_SKIP() << "CreatePlacedResource failed."; + } + } + + void createComittedResources(const size_t byte_size) { + D3D12_HEAP_PROPERTIES heap_properties{}; + heap_properties.Type = D3D12_HEAP_TYPE_UPLOAD; + heap_properties.CreationNodeMask = 1; + heap_properties.VisibleNodeMask = 1; + + D3D12_RESOURCE_DESC resource_desc{}; + resource_desc.Dimension = D3D12_RESOURCE_DIMENSION_BUFFER; + resource_desc.Width = byte_size; + resource_desc.Height = 1; + resource_desc.DepthOrArraySize = 1; + resource_desc.MipLevels = 1; + resource_desc.SampleDesc.Count = 1; + resource_desc.Layout = D3D12_TEXTURE_LAYOUT_ROW_MAJOR; + + auto res = device->CreateCommittedResource(&heap_properties, + D3D12_HEAP_FLAG_NONE, + &resource_desc, + D3D12_RESOURCE_STATE_GENERIC_READ, + nullptr, + IID_PPV_ARGS(comitted_resource.ReleaseAndGetAddressOf())); + if(FAILED(res)) { + GTEST_SKIP() << "CreateCommittedResource failed."; + } + } + + void createResources(const size_t byte_size) { + createHeap(byte_size); + createPlacedResources(byte_size); + createComittedResources(byte_size); + } + + void copyResources(const size_t byte_size) { + Microsoft::WRL::ComPtr command_queue; + Microsoft::WRL::ComPtr command_allocator; + Microsoft::WRL::ComPtr command_list; + Microsoft::WRL::ComPtr fence; + uint32_t fence_value = 0; + + D3D12_COMMAND_QUEUE_DESC desc{}; + desc.Type = D3D12_COMMAND_LIST_TYPE_COMPUTE; + desc.Priority = D3D12_COMMAND_QUEUE_PRIORITY_NORMAL; + desc.Flags = D3D12_COMMAND_QUEUE_FLAG_NONE; + desc.NodeMask = 0; + auto res = device->CreateCommandQueue(&desc, IID_PPV_ARGS(command_queue.ReleaseAndGetAddressOf())); + if (FAILED(res)) { + GTEST_SKIP() << "CreateCommandQueue failed."; + } + + res = device->CreateFence(0, D3D12_FENCE_FLAG_SHARED, IID_PPV_ARGS(fence.ReleaseAndGetAddressOf())); + if(FAILED(res)) { + GTEST_SKIP() << "CreateFence failed."; + } + + res = device.Get()->CreateCommandAllocator(D3D12_COMMAND_LIST_TYPE_COMPUTE, + IID_PPV_ARGS(command_allocator.ReleaseAndGetAddressOf())); + if(FAILED(res)) { + GTEST_SKIP() << "CreateCommandAllocator failed."; + } + + res = device->CreateCommandList(0, + D3D12_COMMAND_LIST_TYPE_COMPUTE, + command_allocator.Get(), + nullptr, + IID_PPV_ARGS(command_list.ReleaseAndGetAddressOf())); + if(FAILED(res)) { + GTEST_SKIP() << "CreateCommandList failed."; + } + + command_list->CopyBufferRegion(placed_resources.Get(), 0, comitted_resource.Get(), 0, byte_size); + res = command_list->Close(); + if(FAILED(res)) { + GTEST_SKIP() << "Close command list failed."; + } + + ID3D12CommandList* command_lists[] = {command_list.Get()}; + command_queue->ExecuteCommandLists(ARRAYSIZE(command_lists), command_lists); + res = command_queue->Signal(fence.Get(), ++fence_value); + if(FAILED(res)) { + GTEST_SKIP() << "Signal command queue failed."; + } + + volatile auto event = CreateEvent(nullptr, FALSE, FALSE, nullptr); + res = fence->SetEventOnCompletion(fence_value, event); + if(FAILED(res)) { + GTEST_SKIP() << "SetEventOnCompletion failed."; + } + WaitForSingleObject(event, INFINITE); + } +}; + +TEST_P(DX12RemoteRunTests, smoke_CheckRemoteTensorSharedBuf) { + // Skip test according to plugin specific disabled_test_patterns() (if any) + SKIP_IF_CURRENT_TEST_IS_DISABLED() + ov::CompiledModel compiled_model; + ov::InferRequest inference_request; + + OV_ASSERT_NO_THROW(compiled_model = core->compile_model(ov_model, target_device, configuration)); + OV_ASSERT_NO_THROW(inference_request = compiled_model.create_infer_request()); + auto tensor = inference_request.get_input_tensor(); + + const auto byte_size = ov::util::get_memory_size(ov::element::f32, shape_size(tensor.get_shape())); + + auto context = core->get_default_context(target_device).as(); + + createHeap(byte_size); + + auto remote_tensor = context.create_tensor(ov::element::f32, tensor.get_shape(), shared_mem, ov::intel_gpu::MemType::SHARED_BUF); + + ov::Tensor check_remote_tensor; + ASSERT_NO_THROW(check_remote_tensor = remote_tensor); + ASSERT_THROW(check_remote_tensor.data(), ov::Exception); + + OV_ASSERT_NO_THROW(inference_request.set_input_tensor(check_remote_tensor)); + OV_ASSERT_NO_THROW(inference_request.infer()); +} + +TEST_P(DX12RemoteRunTests, smoke_CheckRemoteTensorSharedBuChangingTensors) { + // Skip test according to plugin specific disabled_test_patterns() (if any) + SKIP_IF_CURRENT_TEST_IS_DISABLED() + ov::CompiledModel compiled_model; + ov::InferRequest inference_request; + OV_ASSERT_NO_THROW(compiled_model = core->compile_model(ov_model, target_device, configuration)); + OV_ASSERT_NO_THROW(inference_request = compiled_model.create_infer_request()); + auto tensor = inference_request.get_input_tensor(); + const auto byte_size = ov::util::get_memory_size(ov::element::f32, shape_size(tensor.get_shape())); + auto context = core->get_default_context(target_device).as();; + createHeap(byte_size); + auto remote_tensor = context.create_tensor(ov::element::f32, tensor.get_shape(), shared_mem, ov::intel_gpu::MemType::SHARED_BUF); + ov::Tensor check_remote_tensor; + ASSERT_NO_THROW(check_remote_tensor = remote_tensor); + ASSERT_THROW(check_remote_tensor.data(), ov::Exception); + OV_ASSERT_NO_THROW(inference_request.set_input_tensor(check_remote_tensor)); + OV_ASSERT_NO_THROW(inference_request.infer()); + // set random input tensor + float* random_buffer_tensor = new float[byte_size / sizeof(float)]; + memset(random_buffer_tensor, 1, byte_size); + ov::Tensor random_tensor_input{ov::element::f32, tensor.get_shape(), random_buffer_tensor}; + + OV_ASSERT_NO_THROW(inference_request.set_input_tensor(random_tensor_input)); + OV_ASSERT_NO_THROW(inference_request.infer()); + + // set random output tensor + auto output_tensor = inference_request.get_output_tensor(); + const auto output_byte_size = ov::util::get_memory_size(ov::element::f32, shape_size(output_tensor.get_shape())); + + float* output_random_buffer_tensor = new float[output_byte_size / sizeof(float)]; + memset(output_random_buffer_tensor, 1, output_byte_size); + ov::Tensor outputrandom_tensor_input{ov::element::f32, output_tensor.get_shape(), output_random_buffer_tensor}; + OV_ASSERT_NO_THROW(inference_request.set_output_tensor(outputrandom_tensor_input)); + OV_ASSERT_NO_THROW(inference_request.infer()); + + delete[] random_buffer_tensor; + delete[] output_random_buffer_tensor; +} + +TEST_P(DX12RemoteRunTests, smoke_CheckOutputDataFromMultipleRuns) { + // Skip test according to plugin specific disabled_test_patterns() (if any) + SKIP_IF_CURRENT_TEST_IS_DISABLED() + + ov::CompiledModel compiled_model; + ov::InferRequest inference_request; + float* data; + + OV_ASSERT_NO_THROW(compiled_model = core->compile_model(ov_model, target_device, configuration)); + OV_ASSERT_NO_THROW(inference_request = compiled_model.create_infer_request()); + auto tensor = inference_request.get_input_tensor(); + + auto shape = tensor.get_shape(); + const auto byte_size = ov::util::get_memory_size(ov::element::f32, shape_size(shape)); + tensor = {}; + + createResources(byte_size); + void* mem; + comitted_resource.Get()->Map(0, nullptr, &mem); + memset(mem, 99, byte_size); + comitted_resource.Get()->Unmap(0, nullptr); + copyResources(byte_size); + + auto context = core->get_default_context(target_device).as(); + + auto output_tensor = inference_request.get_output_tensor(); + const auto output_byte_size = output_tensor.get_byte_size(); + float* output_data_one = new float[output_byte_size / sizeof(float)]; + ov::Tensor output_data_tensor_one{ov::element::f32, output_tensor.get_shape(), output_data_one}; + + auto remote_tensor = context.create_tensor(ov::element::f32, shape, shared_mem, ov::intel_gpu::MemType::SHARED_BUF); + OV_ASSERT_NO_THROW(inference_request.set_input_tensor(remote_tensor)); + OV_ASSERT_NO_THROW(inference_request.set_output_tensor(output_data_tensor_one)); + OV_ASSERT_NO_THROW(inference_request.infer()); + + float* output_data_two = new float[output_byte_size / sizeof(float)]; + ov::Tensor output_data_tensor_two{ov::element::f32, output_tensor.get_shape(), output_data_two}; + + data = new float[byte_size / sizeof(float)]; + memset(data, 99, byte_size); + ov::Tensor input_data_tensor{ov::element::f32, shape, data}; + OV_ASSERT_NO_THROW(inference_request.set_input_tensor(input_data_tensor)); + OV_ASSERT_NO_THROW(inference_request.set_output_tensor(output_data_tensor_two)); + OV_ASSERT_NO_THROW(inference_request.infer()); + + delete[] data; + + EXPECT_NE(output_data_one, output_data_two); + EXPECT_EQ(memcmp(output_data_one, output_data_two, output_byte_size), 0); + + delete[] output_data_one; + delete[] output_data_two; +} + +const std::vector remoteConfigs = {{}}; + +INSTANTIATE_TEST_SUITE_P(smoke_BehaviorTest, + DX12RemoteRunTests, + ::testing::Combine(::testing::Values(ov::test::utils::DEVICE_GPU), + ::testing::ValuesIn(remoteConfigs)), + DX12RemoteRunTests::getTestCaseName); + +} +#endif // defined(_WIN32) && defined(ENABLE_DX12)