diff --git a/src/plugins/intel_cpu/src/nodes/kernels/stack_allocator.hpp b/src/plugins/intel_cpu/src/nodes/kernels/stack_allocator.hpp
new file mode 100644
index 00000000000000..9ab1fc05f32a0f
--- /dev/null
+++ b/src/plugins/intel_cpu/src/nodes/kernels/stack_allocator.hpp
@@ -0,0 +1,345 @@
+// Copyright (C) 2022 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#pragma once
+
+// NOTE(review): the include targets and template arguments of this patch were lost in
+// extraction and are reconstructed from usage - confirm them against the original branch.
+#include <cpu/x64/jit_generator.hpp>
+#include <ie_common.h>
+
+#include <algorithm>
+#include <deque>
+#include <memory>
+#include <utility>
+#include <vector>
+
+namespace ov {
+namespace intel_cpu {
+
+using namespace dnnl::impl;
+namespace x64 = dnnl::impl::cpu::x64;
+
+/**
+ * @brief Generation-time bookkeeping for scratch slots on the stack of a JIT kernel.
+ *
+ * Slots are addressed relative to a base pointer register and are handed out through
+ * StackAllocator::Address handles. Requesting or releasing a slot only updates the
+ * bookkeeping; commit() emits the sub/add on rsp that makes the stack pointer match it.
+ * commit() emits at the current code position, so call it outside of generated loops.
+ * Only one instance may exist per thread at a time.
+ */
+class StackAllocator final {
+public:
+    using Ptr = std::shared_ptr<StackAllocator>;
+
+    class Address;
+
+    /// Uses rbp as the base pointer and emits "mov rbp, rsp" to initialise it.
+    StackAllocator(x64::jit_generator& code_gen)
+        : StackAllocator{code_gen, code_gen.rbp} {
+        code_gen.mov(code_gen.rbp, code_gen.rsp);
+    }
+
+    /// Uses @p bp as the base pointer; throws if the thread already owns a StackAllocator.
+    StackAllocator(x64::jit_generator& code_gen,
+                   const Xbyak::Reg& bp)
+        : code_generator{code_gen}
+        , base_pointer(bp) {
+        checkUnique(true);
+    }
+
+    // The destructor emits code and resets the per-thread flag; a copy would do both twice.
+    StackAllocator(const StackAllocator&) = delete;
+    StackAllocator& operator=(const StackAllocator&) = delete;
+
+    ~StackAllocator() {
+        release();
+        checkUnique(false);
+    }
+
+    /**
+     * @brief Forgets every slot and emits the instruction that gives back the committed space.
+     * Address handles that are still alive keep their, now stale, slot address.
+     */
+    void release() {
+        // Drop the records as well: otherwise a later Address::release() would subtract
+        // its slot size from an offset that is already zero and wrap around.
+        allocations.clear();
+        current_offset = {};
+        commit();
+    }
+
+    /// Emits the sub/add on rsp that brings it in line with the bookkeeping; no-op if in sync.
+    void commit() {
+        if (current_offset > offset) {
+            code_generator.sub(code_generator.rsp, current_offset - offset);
+        } else if (offset > current_offset) {
+            code_generator.add(code_generator.rsp, offset - current_offset);
+        }
+        offset = current_offset;
+    }
+
+    friend void stack_mov(Address& addr, const Xbyak::Xmm& vmm);
+    friend void stack_mov(Address& addr, const Xbyak::Reg& reg);
+    friend void stack_mov(const Xbyak::Xmm& vmm, const Address& addr);
+    friend void stack_mov(const Xbyak::Reg& reg, const Address& addr);
+
+private:
+    /// One stack slot: its address operand, its distance below the base pointer and its size.
+    struct Allocation {
+        using Ptr = std::shared_ptr<Allocation>;
+
+        Allocation(const Xbyak::Address& address,
+                   const size_t offset,
+                   const size_t size)
+            : address(address)
+            , offset(offset)
+            , size(size) {}
+
+        bool is_used = true;
+        Xbyak::Address address;
+        size_t offset{};
+        size_t size;
+    };
+
+    /// Returns the smallest released slot that fits @p alloc_size, or a new slot below the last one.
+    Allocation::Ptr allocate(const size_t alloc_size) {
+        Allocation::Ptr best_fit;
+        for (const auto& alloc : allocations) {
+            if (alloc->is_used || alloc->size < alloc_size) {
+                continue;
+            }
+            if (!best_fit || alloc->size < best_fit->size) {
+                best_fit = alloc;
+            }
+        }
+        if (best_fit) {
+            best_fit->is_used = true;
+            return best_fit;
+        }
+        current_offset += alloc_size;
+        const Xbyak::Address addr = code_generator.ptr[base_pointer - current_offset];
+        auto alloc = std::make_shared<Allocation>(addr, current_offset, alloc_size);
+        allocations.push_back(alloc);
+        return alloc;
+    }
+
+    /// Pops released slots from the end of the list and shrinks the bookkeeping accordingly.
+    void deallocate() {
+        while (!allocations.empty()) {
+            const auto& last = allocations.back();
+            if (last->is_used) {
+                break;
+            }
+            current_offset -= last->size;
+            allocations.pop_back();
+        }
+    }
+
+    /// Sets (ctor) or clears (dtor) the per-thread flag; throws if it is already set.
+    void checkUnique(bool isCtor) {
+        static thread_local bool isCreated = false;
+        if (isCtor) {
+            if (isCreated) {
+                IE_THROW() << "There should be only one instance of StackAllocator per thread !!";
+            }
+            isCreated = true;
+        } else {
+            isCreated = false;
+        }
+    }
+
+    x64::jit_generator& code_generator;
+    // Kept by value so that a temporary register object can be passed to the constructor.
+    const Xbyak::Reg base_pointer;
+
+    size_t offset{};          // stack space already reserved by emitted instructions
+    size_t current_offset{};  // stack space the current set of slots needs
+    std::deque<Allocation::Ptr> allocations{};
+};
+
+// Namespace-scope declarations for the qualified friend declarations in Address to refer to.
+void stack_mov(StackAllocator::Address& addr, const Xbyak::Xmm& vmm);
+void stack_mov(StackAllocator::Address& addr, const Xbyak::Reg& reg);
+void stack_mov(const Xbyak::Xmm& vmm, const StackAllocator::Address& addr);
+void stack_mov(const Xbyak::Reg& reg, const StackAllocator::Address& addr);
+
+/**
+ * @brief Move-only handle that owns one stack slot for as long as it lives.
+ *
+ * Converting the handle to Xbyak::Address or passing it to stack_mov() first commits any
+ * pending stack adjustment, so the slot is always inside the reserved stack area when used.
+ */
+class StackAllocator::Address final {
+public:
+    Address() = default;
+
+    /// Takes a slot of @p alloc_size bytes from @p stack_allocator; throws if it is null.
+    Address(StackAllocator::Ptr stack_allocator,
+            const size_t alloc_size)
+        : stack_allocator_{std::move(stack_allocator)} {
+        if (!stack_allocator_) {
+            IE_THROW() << "StackAllocator::Address requires a non-null StackAllocator !!";
+        }
+        allocation_ = stack_allocator_->allocate(alloc_size);
+    }
+
+    ~Address() {
+        release();
+    }
+
+    Address(const Address&) = delete;
+    Address& operator=(const Address&) = delete;
+
+    Address(Address&& addr) noexcept
+        : stack_allocator_{std::move(addr.stack_allocator_)}
+        , allocation_{std::move(addr.allocation_)} {}
+
+    Address& operator=(Address&& addr) noexcept {
+        // Self-move must keep the slot instead of releasing it.
+        if (this != &addr) {
+            release();
+            stack_allocator_ = std::move(addr.stack_allocator_);
+            allocation_ = std::move(addr.allocation_);
+        }
+        return *this;
+    }
+
+    /// Returns the slot to the allocator and leaves the handle empty; safe to call repeatedly.
+    void release() {
+        if (allocation_) {
+            allocation_->is_used = false;
+        }
+        if (stack_allocator_) {
+            stack_allocator_->deallocate();
+        }
+        allocation_ = {};
+        stack_allocator_ = {};
+    }
+
+    operator Xbyak::Address&() {
+        ensureValid();
+        stack_allocator_->commit();
+        return allocation_->address;
+    }
+
+    operator const Xbyak::Address&() const {
+        ensureValid();
+        stack_allocator_->commit();
+        return allocation_->address;
+    }
+
+    Address& operator=(const Xbyak::Xmm& vmm) {
+        stack_mov(*this, vmm);
+        return *this;
+    }
+
+    Address& operator=(const Xbyak::Reg& reg) {
+        stack_mov(*this, reg);
+        return *this;
+    }
+
+    friend void ::ov::intel_cpu::stack_mov(Address& addr, const Xbyak::Xmm& vmm);
+    friend void ::ov::intel_cpu::stack_mov(Address& addr, const Xbyak::Reg& reg);
+    friend void ::ov::intel_cpu::stack_mov(const Xbyak::Xmm& vmm, const Address& addr);
+    friend void ::ov::intel_cpu::stack_mov(const Xbyak::Reg& reg, const Address& addr);
+
+private:
+    void ensureSize(const Xbyak::Reg& reg) const {
+        ensureValid();
+        const size_t reg_size = reg.getBit() / 8;
+        if (reg_size > allocation_->size) {
+            IE_THROW() << "reg size is bigger than space allocated in StackAllocator !!";
+        }
+    }
+
+    void ensureValid() const {
+        if (!stack_allocator_ || !allocation_) {
+            IE_THROW() << "StackAllocator::Address is either not initialized or released !!";
+        }
+    }
+
+    /// Checks that @p reg fits the slot, commits pending stack changes and returns the slot.
+    const Xbyak::Address& slotFor(const Xbyak::Reg& reg) const {
+        ensureSize(reg);
+        stack_allocator_->commit();
+        return allocation_->address;
+    }
+
+    x64::jit_generator& generator() const {
+        return stack_allocator_->code_generator;
+    }
+
+    StackAllocator::Ptr stack_allocator_;
+    Allocation::Ptr allocation_;
+};
+
+inline
+void stack_mov(StackAllocator::Address& addr, const Xbyak::Xmm& vmm) {
+    // slotFor() commits first: a store emitted before the rsp adjustment would land below rsp.
+    const Xbyak::Address& slot = addr.slotFor(vmm);
+    x64::jit_generator& generator = addr.generator();
+    if (vmm.isXMM()) {
+        generator.uni_vmovdqu(slot, Xbyak::Xmm{vmm.getIdx()});
+    } else if (vmm.isYMM()) {
+        generator.uni_vmovdqu(slot, Xbyak::Ymm{vmm.getIdx()});
+    } else if (vmm.isZMM()) {
+        generator.uni_vmovdqu(slot, Xbyak::Zmm{vmm.getIdx()});
+    } else {
+        IE_THROW() << "Unknown simd register !!";
+    }
+}
+
+inline
+void stack_mov(StackAllocator::Address& addr, const Xbyak::Reg& reg) {
+    const Xbyak::Address& slot = addr.slotFor(reg);
+    x64::jit_generator& generator = addr.generator();
+    if (reg.isREG(8)) {
+        generator.mov(slot, Xbyak::Reg8{reg.getIdx()});
+    } else if (reg.isREG(16)) {
+        generator.mov(slot, Xbyak::Reg16{reg.getIdx()});
+    } else if (reg.isREG(32)) {
+        generator.mov(slot, Xbyak::Reg32{reg.getIdx()});
+    } else if (reg.isREG(64)) {
+        generator.mov(slot, Xbyak::Reg64{reg.getIdx()});
+    } else {
+        IE_THROW() << "Unknown general purpose register !!";
+    }
+}
+
+inline
+void stack_mov(const Xbyak::Xmm& vmm, const StackAllocator::Address& addr) {
+    const Xbyak::Address& slot = addr.slotFor(vmm);
+    x64::jit_generator& generator = addr.generator();
+    if (vmm.isXMM()) {
+        generator.uni_vmovdqu(Xbyak::Xmm{vmm.getIdx()}, slot);
+    } else if (vmm.isYMM()) {
+        generator.uni_vmovdqu(Xbyak::Ymm{vmm.getIdx()}, slot);
+    } else if (vmm.isZMM()) {
+        generator.uni_vmovdqu(Xbyak::Zmm{vmm.getIdx()}, slot);
+    } else {
+        IE_THROW() << "Unknown simd register !!";
+    }
+}
+
+inline
+void stack_mov(const Xbyak::Reg& reg, const StackAllocator::Address& addr) {
+    const Xbyak::Address& slot = addr.slotFor(reg);
+    x64::jit_generator& generator = addr.generator();
+    if (reg.isREG(8)) {
+        generator.mov(Xbyak::Reg8{reg.getIdx()}, slot);
+    } else if (reg.isREG(16)) {
+        generator.mov(Xbyak::Reg16{reg.getIdx()}, slot);
+    } else if (reg.isREG(32)) {
+        generator.mov(Xbyak::Reg32{reg.getIdx()}, slot);
+    } else if (reg.isREG(64)) {
+        generator.mov(Xbyak::Reg64{reg.getIdx()}, slot);
+    } else {
+        IE_THROW() << "Unknown general purpose register !!";
+    }
+}
+
+}   // namespace intel_cpu
+}   // namespace ov
diff --git a/src/plugins/intel_cpu/tests/unit/nodes/kernels/stack_allocator.cpp b/src/plugins/intel_cpu/tests/unit/nodes/kernels/stack_allocator.cpp
new file mode 100644
index 00000000000000..389d5ca88751d8
--- /dev/null
+++ b/src/plugins/intel_cpu/tests/unit/nodes/kernels/stack_allocator.cpp
@@ -0,0 +1,173 @@
+// Copyright (C) 2022 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include <gtest/gtest.h>
+
+#include <functional>
+#include <memory>
+#include <string>
+
+#include <nodes/kernels/stack_allocator.hpp>
+
+using namespace ov::intel_cpu;
+using namespace InferenceEngine;
+
+/// Fixture that is itself the JIT generator: generate() runs kernel_ with a fresh StackAllocator.
+class StackAllocatorTest : public ::testing::Test, public x64::jit_generator {
+protected:
+    DECLARE_CPU_JIT_AUX_FUNCTIONS(StackAllocatorTest)
+
+    void SetUp() override {
+    }
+
+    void TearDown() override {
+    }
+
+    /// Generates the code and returns its entry point cast to the function pointer type F.
+    template <typename F>
+    F create_kernel() {
+        const status_t code = jit_generator::create_kernel();
+        if (code != dnnl::impl::status::success) {
+            IE_THROW() << "Could not create kernel. Error code: " << std::to_string(code) << ". "
+                       << "Xbyak error code: " << Xbyak::ConvertErrorToString(Xbyak::GetError());
+        }
+        return reinterpret_cast<F>(jit_ker());
+    }
+
+    void generate() override {
+        this->preamble();
+        stack_allocator_ = std::make_shared<StackAllocator>(*this);
+        kernel_();
+        stack_allocator_->commit();
+        stack_allocator_.reset();
+        this->postamble();
+    }
+
+    std::function<void()> kernel_;
+    std::shared_ptr<StackAllocator> stack_allocator_;
+};
+
+TEST_F(StackAllocatorTest, ValueEqual) {
+    kernel_ = [this]() {
+        Xbyak::Label l_equal;
+        StackAllocator::Address reg_100_addr{stack_allocator_, sizeof(int32_t)};
+        mov(rbx.cvt32(), 100);
+        stack_mov(reg_100_addr, rbx.cvt32());
+        mov(rax, 1);
+        cmp(rbx.cvt32(), reg_100_addr);
+        je(l_equal);
+        mov(rax, 0);
+        L(l_equal);
+    };
+    auto f = create_kernel<int (*)()>();
+    int r = f();
+    EXPECT_EQ(r, 1);
+}
+
+TEST_F(StackAllocatorTest, ValueNotEqual) {
+    kernel_ = [this]() {
+        Xbyak::Label l_equal;
+        StackAllocator::Address reg_100_addr{stack_allocator_, sizeof(int32_t)};
+        mov(rbx.cvt32(), 100);
+        stack_mov(reg_100_addr, rbx.cvt32());
+        mov(rcx.cvt32(), reg_100_addr);
+        mov(rbx.cvt32(), 201);
+        mov(rax, 1);
+        cmp(rbx.cvt32(), rcx.cvt32());
+        je(l_equal);
+        mov(rax, 0);
+        L(l_equal);
+    };
+    auto f = create_kernel<int (*)()>();
+    int r = f();
+    EXPECT_EQ(r, 0);
+}
+
+TEST_F(StackAllocatorTest, AddressCheck) {
+    kernel_ = [this]() {
+        StackAllocator::Address reg_0_5_addr{stack_allocator_, sizeof(int32_t)};
+        EXPECT_EQ(static_cast<Xbyak::Address&>(reg_0_5_addr), ptr[rbp - sizeof(int32_t)]);
+    };
+    create_kernel<void (*)()>();
+}
+
+TEST_F(StackAllocatorTest, LoopSuccess) {
+    kernel_ = [this]() {
+        Xbyak::Label l_equal;
+        Xbyak::Label l_loop;
+        Xbyak::Label l_end;
+        xor_(rcx, rcx);
+        StackAllocator::Address reg_100_addr{stack_allocator_, sizeof(int32_t)};
+        stack_allocator_->commit();
+        L(l_loop);
+        {
+            cmp(rcx, 10);
+            je(l_end);
+            mov(rbx.cvt32(), 100);
+            stack_mov(reg_100_addr, rbx.cvt32());
+            mov(rdx.cvt32(), reg_100_addr);
+            mov(rbx.cvt32(), 201);
+            mov(rax, 1);
+            add(rcx, 1);
+            cmp(rbx.cvt32(), rdx.cvt32());
+            jne(l_equal);
+            {
+                mov(rax, 0);
+                jmp(l_loop);
+            }
+            L(l_equal);
+            {
+                jmp(l_loop);
+            }
+        }
+        L(l_end);
+        reg_100_addr.release();
+    };
+    auto f = create_kernel<int (*)()>();
+    const int r = f();
+    EXPECT_EQ(r, 1);
+}
+
+TEST_F(StackAllocatorTest, LoopFailed) {
+    kernel_ = [this]() {
+        Xbyak::Label l_equal;
+        Xbyak::Label l_loop;
+        Xbyak::Label l_end;
+        xor_(rcx, rcx);
+
+        StackAllocator::Address reg_temp0_addr{stack_allocator_, sizeof(float)};
+        StackAllocator::Address reg_temp1_addr{stack_allocator_, sizeof(int32_t)};
+        StackAllocator::Address reg_100_addr{stack_allocator_, sizeof(int32_t)};
+        StackAllocator::Address reg_200_addr{stack_allocator_, sizeof(int32_t)};
+        stack_allocator_->commit();
+        L(l_loop);
+        {
+            cmp(rcx, 10);
+            je(l_end);
+            mov(rbx.cvt32(), 100);
+            stack_mov(reg_100_addr, rbx.cvt32());
+            mov(rbx.cvt32(), 200);
+            reg_200_addr.release();
+            // NOTE: stack_mov() validates the handle first, so using the released one throws here
+            stack_mov(reg_200_addr, rbx.cvt32());
+            mov(rdx.cvt32(), reg_100_addr);
+            mov(rbx.cvt32(), 201);
+            cmp(rbx.cvt32(), rdx.cvt32());
+            mov(rax, 1);
+            add(rcx, 1);
+            jne(l_equal);
+            {
+                reg_temp1_addr.release();
+                mov(rax, 0);
+                jmp(l_loop);
+            }
+            L(l_equal);
+            {
+                jmp(l_loop);
+            }
+        }
+        L(l_end);
+    };
+    EXPECT_ANY_THROW(create_kernel<int (*)()>());
+}