diff --git a/src/plugins/intel_cpu/src/nodes/kernels/registers_pool.hpp b/src/plugins/intel_cpu/src/nodes/kernels/registers_pool.hpp index ed12fa71182bfb..9d087b2a4e5314 100644 --- a/src/plugins/intel_cpu/src/nodes/kernels/registers_pool.hpp +++ b/src/plugins/intel_cpu/src/nodes/kernels/registers_pool.hpp @@ -9,11 +9,12 @@ #include "ie_common.h" #include "utils/cpu_utils.hpp" #include +#include "stack_allocator.hpp" namespace ov { namespace intel_cpu { -using namespace dnnl::impl::cpu; +namespace x64 = dnnl::impl::cpu::x64; /** * The RegistersPool is the base class for the IsaRegistersPool template: @@ -49,16 +50,22 @@ class RegistersPool { Reg(const RegistersPool::Ptr& regPool) { initialize(regPool); } Reg(const RegistersPool::Ptr& regPool, int requestedIdx) { initialize(regPool, requestedIdx); } ~Reg() { release(); } + Reg(Reg&& other) noexcept { + this->operator=(std::move(other)); + } Reg& operator=(Reg&& other) noexcept { release(); reg = other.reg; regPool = std::move(other.regPool); return *this; } - Reg(Reg&& other) noexcept : reg(other.reg), regPool(std::move(other.regPool)) {} operator TReg&() { ensureValid(); return reg; } operator const TReg&() const { ensureValid(); return reg; } operator Xbyak::RegExp() const { ensureValid(); return reg; } + Reg& operator=(const StackAllocator::Address& addr) { + stack_mov(*this, addr); + return *this; + } int getIdx() const { ensureValid(); return reg.getIdx(); } friend Xbyak::RegExp operator+(const Reg& lhs, const Xbyak::RegExp& rhs) { lhs.ensureValid(); diff --git a/src/plugins/intel_cpu/src/nodes/kernels/stack_allocator.hpp b/src/plugins/intel_cpu/src/nodes/kernels/stack_allocator.hpp new file mode 100644 index 00000000000000..98139307d594b5 --- /dev/null +++ b/src/plugins/intel_cpu/src/nodes/kernels/stack_allocator.hpp @@ -0,0 +1,474 @@ +// Copyright (C) 2022 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include +#include +#include +#include + +namespace ov { +namespace 
intel_cpu {
+
+namespace x64 = dnnl::impl::cpu::x64;
+
+/**
+ * JIT-time helper that hands out scratch slots on the native stack, addressed relative to
+ * `base_pointer`. The constructor emits code that (optionally) aligns rsp and copies it into the
+ * base pointer; the destructor emits the code that restores rsp. Slots are requested through
+ * StackAllocator::Address / StackAllocator::Reg. Only one instance may be alive per thread.
+ */
+class StackAllocator final {
+public:
+    class Transaction;
+    class Address;
+    template <typename TReg>
+    class Reg;
+
+    StackAllocator(x64::jit_generator& code_gen)
+        : StackAllocator{code_gen, code_gen.rbp} {
+    }
+
+    StackAllocator(x64::jit_generator& code_gen, const Xbyak::Reg& bp)
+        : StackAllocator{code_gen, bp, 1} {
+    }
+
+    StackAllocator(x64::jit_generator& code_gen, const size_t alignment)
+        : StackAllocator{code_gen, code_gen.rbp, alignment} {
+    }
+
+    /**
+     * @param code_gen  generator the stack-handling instructions are emitted into
+     * @param bp        register used as base pointer for all slots (also used as scratch while
+     *                  aligning, so it must not be rax, rcx or rdx when alignment != 1)
+     * @param alignment alignment in bytes of the stack base; 1 disables the alignment code
+     */
+    StackAllocator(x64::jit_generator& code_gen,
+                   const Xbyak::Reg& bp,
+                   const size_t alignment)
+        : code_generator{code_gen}
+        , base_pointer{Xbyak::Reg64{bp.getIdx()}}
+        , alignment{alignment} {
+        // Validate before taking the per-thread flag: a throwing constructor never runs the destructor.
+        if (alignment == 0) {
+            IE_THROW() << "StackAllocator alignment should not be zero !!";
+        }
+        checkUnique(true);
+        try {
+            alignStack(true);
+            code_generator.mov(base_pointer, code_generator.rsp);
+        } catch (...) {
+            // Give the per-thread slot back, otherwise no StackAllocator could be created on this thread again.
+            checkUnique(false);
+            throw;
+        }
+    }
+
+    ~StackAllocator() {
+        release();
+        alignStack(false);
+        checkUnique(false);
+    }
+
+    /** Drops every slot and emits the rsp adjustment that returns the reserved stack space. */
+    void release() {
+        current_offset = {};
+        // Forget the slot records as well: otherwise an Address destroyed after release() would
+        // subtract its size from the already zeroed offset and wrap it around.
+        allocations.clear();
+        commit();
+    }
+
+    friend void stack_mov(Address& addr, const Xbyak::Xmm& vmm);
+    friend void stack_mov(Address& addr, const Xbyak::Reg& reg);
+    friend void stack_mov(const Xbyak::Xmm& vmm, const Address& addr);
+    friend void stack_mov(const Xbyak::Reg& reg, const Address& addr);
+
+private:
+    /** Book-keeping record of one stack slot. */
+    struct Allocation {
+        using Ptr = std::shared_ptr<Allocation>;
+
+        Allocation(const Xbyak::Address& address,
+                   const size_t offset,
+                   const size_t size)
+            : address(address)
+            , offset(offset)
+            , size(size) {}
+
+        bool is_used = true;
+        bool is_transaction = false;  // true until the owning transaction is committed
+        Xbyak::Address address;       // [base_pointer - offset]
+        size_t offset{};              // distance of the slot start below the base pointer
+        size_t size{};                // slot size including alignment padding
+    };
+
+    /**
+     * Emits the stack alignment prologue (isCtor == true) or the matching epilogue.
+     * Prologue: rsp is lowered by 8 + ((rsp - 8) % alignment) and that amount is stored at [rsp],
+     * so the epilogue can undo it with a single `add rsp, [rsp]`.
+     */
+    void alignStack(bool isCtor) {
+        if (1 == alignment) {
+            return;
+        }
+        if (!isCtor) {
+            code_generator.add(code_generator.rsp, code_generator.ptr[code_generator.rsp]);
+            return;
+        }
+
+        constexpr size_t kReg64Size = 0x08;
+        Xbyak::Label l_stack_aligned;
+        // The base pointer register is free at this point, so it carries the computed adjustment.
+        const Xbyak::Reg64 reg_base_stack_offset{base_pointer.getIdx()};
+        code_generator.mov(reg_base_stack_offset, kReg64Size);
+
+        // idiv takes the dividend in rdx:rax and leaves the remainder in rdx.
+        const Xbyak::Reg64 reg_base_addr{Xbyak::Operand::RAX};
+        const Xbyak::Reg64 reg_remainder{Xbyak::Operand::RDX};
+        const Xbyak::Reg64 reg_alignment{Xbyak::Operand::RCX};
+
+        code_generator.push(reg_base_addr);
+        code_generator.push(reg_remainder);
+        code_generator.push(reg_alignment);
+
+        code_generator.xor_(reg_remainder, reg_remainder);
+
+        // rsp moved by the three pushes above; step back to (original rsp - 8).
+        code_generator.mov(reg_base_addr, code_generator.rsp);
+        code_generator.add(reg_base_addr, 3 * kReg64Size - kReg64Size);
+        code_generator.mov(reg_alignment, alignment);
+        code_generator.idiv(reg_alignment);
+        code_generator.cmp(reg_remainder, 0);
+        code_generator.je(l_stack_aligned);
+        code_generator.add(reg_base_stack_offset, reg_remainder);
+        code_generator.L(l_stack_aligned);
+
+        code_generator.pop(reg_alignment);
+        code_generator.pop(reg_remainder);
+        code_generator.pop(reg_base_addr);
+
+        code_generator.sub(code_generator.rsp, reg_base_stack_offset);
+        code_generator.mov(code_generator.ptr[code_generator.rsp], reg_base_stack_offset);
+    }
+
+    /**
+     * Returns a slot of at least `alloc_size` bytes whose offset is a multiple of
+     * `requested_alignment`. The smallest suitable released slot is reused; otherwise the stack grows.
+     * Throws when `requested_alignment` is zero or does not divide the allocator alignment.
+     */
+    Allocation::Ptr allocate(const size_t alloc_size,
+                             const size_t requested_alignment,
+                             const bool is_transaction = false) {
+        if (requested_alignment == 0 || alignment % requested_alignment != 0) {
+            IE_THROW() << "Requested alignment should have 0 reminder of alignment % align !!";
+        }
+
+        // Best fit among the released slots, found in one pass.
+        Allocation::Ptr best_fit;
+        for (const auto& alloc : allocations) {
+            const bool fits = !alloc->is_used &&
+                              alloc_size <= alloc->size &&
+                              (alloc->offset % requested_alignment) == 0;
+            if (fits && (!best_fit || alloc->size < best_fit->size)) {
+                best_fit = alloc;
+            }
+        }
+        if (best_fit) {
+            best_fit->is_used = true;
+            best_fit->is_transaction = is_transaction;
+            return best_fit;
+        }
+
+        // Pad only when the new end offset is not already a multiple of the requested alignment.
+        const size_t misalignment = (current_offset + alloc_size) % requested_alignment;
+        const size_t padding = misalignment == 0 ? 0 : requested_alignment - misalignment;
+        const size_t aligned_alloc_size = padding + alloc_size;
+        current_offset += aligned_alloc_size;
+        Xbyak::Address addr = code_generator.ptr[base_pointer - current_offset];
+        const auto alloc = std::make_shared<Allocation>(addr, current_offset, aligned_alloc_size);
+        alloc->is_transaction = is_transaction;
+        allocations.push_back(alloc);
+        return alloc;
+    }
+
+    /** Pops released slots from the top of the stack and shrinks the pending offset accordingly. */
+    void deallocate() {
+        while (!allocations.empty()) {
+            const auto& last = allocations.back();
+            if (last->is_used) {
+                break;
+            }
+            current_offset -= last->size;
+            allocations.pop_back();
+        }
+    }
+
+    /** Enforces the one-instance-per-thread rule through a thread_local flag. */
+    void checkUnique(bool isCtor) {
+        static thread_local bool isCreated = false;
+        if (isCtor) {
+            if (isCreated) {
+                IE_THROW() << "There should be only one instance of StackAllocator per thread !!";
+            }
+            isCreated = true;
+        } else {
+            isCreated = false;
+        }
+    }
+
+    bool isTransaction() const {
+        return is_transaction_;
+    }
+
+    void begin() {
+        is_transaction_ = true;
+    }
+
+    /** Emits one `sub`/`add rsp` covering all pending size changes and closes the transaction. */
+    void commit() {
+        if (current_offset > offset) {
+            code_generator.sub(code_generator.rsp, current_offset - offset);
+            offset = current_offset;
+        } else if (offset > current_offset) {
+            code_generator.add(code_generator.rsp, offset - current_offset);
+            offset = current_offset;
+        }
+        is_transaction_ = false;
+        for (auto& alloc : allocations) {
+            alloc->is_transaction = false;
+        }
+    }
+
+    x64::jit_generator& code_generator;
+    const Xbyak::Reg base_pointer;
+
+    bool is_transaction_{};
+    size_t offset{};          // stack depth already applied to rsp by emitted code
+    size_t current_offset{};  // stack depth requested so far (applied on commit)
+    size_t alignment{};
+    std::vector<Allocation::Ptr> allocations{};
+};
+
+void stack_mov(StackAllocator::Address& addr, const Xbyak::Xmm& vmm);
+void stack_mov(StackAllocator::Address& addr, const Xbyak::Reg& reg);
+void stack_mov(const Xbyak::Xmm& vmm, const StackAllocator::Address& addr);
+void stack_mov(const Xbyak::Reg& reg, const StackAllocator::Address& addr);
+
+class StackAllocator::Transaction {
+public:
+    friend class StackAllocator::Address;
+
+    Transaction(StackAllocator& stack_allocator)
+        : stack_allocator_{stack_allocator} {
+        checkUnique(true);
+    }
+
+    ~Transaction() {
+
checkUnique(false); + commit(); + } + + void begin() { + stack_allocator_.begin(); + } + + void commit() { + stack_allocator_.commit(); + } + +private: + void checkUnique(bool isCtor) { + static thread_local bool isCreated = false; + if (isCtor) { + if (isCreated) { + IE_THROW() << "There should be only one instance of Transaction per thread !!"; + } + isCreated = true; + } else { + isCreated = false; + } + } + + StackAllocator& stack_allocator_; +}; + +class StackAllocator::Address { +public: + Address(Transaction& transaction, + const size_t alloc_size, + const size_t requested_alignment = 1) + : transaction_{&transaction} + , stack_allocator_{transaction.stack_allocator_} + , allocation_{stack_allocator_.allocate(alloc_size, requested_alignment, true)} { + transaction.begin(); + } + + Address(StackAllocator& stack_allocator, + const size_t alloc_size, + const size_t requested_alignment = 1) + : stack_allocator_{stack_allocator} + , allocation_{stack_allocator_.allocate(alloc_size, requested_alignment)} { + if (stack_allocator_.isTransaction()) { + IE_THROW() << "Cannot allocate Address out of transaction. Please, finish first transaction !!"; + } + stack_allocator_.commit(); + } + + Address(const Address& addr) = delete; + Address& operator=(const Address& addr) = delete; + Address(Address&& addr) noexcept = delete; + Address& operator=(Address&& addr) noexcept = delete; + + virtual ~Address() { + if (transaction_) { + release(*transaction_); + } else { + release(); + stack_allocator_.commit(); + } + } + + void release(Transaction& transaction) { + if (allocation_) { + transaction.begin(); + allocation_->is_used = false; + stack_allocator_.deallocate(); + } + allocation_ = {}; + } + + void release() { + if (allocation_) { + if (stack_allocator_.isTransaction()) { + IE_THROW() << "Cannot release Address out of transaction. 
Please, finish first transaction !!"; + } + allocation_->is_used = false; + stack_allocator_.deallocate(); + stack_allocator_.commit(); + } + allocation_ = {}; + } + + operator Xbyak::Address&() { + ensureValid(); + return allocation_->address; + } + + operator const Xbyak::Address&() const { + ensureValid(); + return allocation_->address; + } + + virtual Address& operator=(const Xbyak::Xmm& vmm) { + stack_mov(*this, vmm); + return *this; + } + + virtual Address& operator=(const Xbyak::Reg& reg) { + stack_mov(*this, reg); + return *this; + } + + explicit operator bool() const { + return isInitialized(); + } + + bool isInitialized() const { + return allocation_ && !allocation_->is_transaction; + } + + friend void ::ov::intel_cpu::stack_mov(Address& addr, const Xbyak::Xmm& vmm); + friend void ::ov::intel_cpu::stack_mov(Address& addr, const Xbyak::Reg& reg); + friend void ::ov::intel_cpu::stack_mov(const Xbyak::Xmm& vmm, const Address& addr); + friend void ::ov::intel_cpu::stack_mov(const Xbyak::Reg& reg, const Address& addr); + +private: + void ensureSize(const Xbyak::Reg& reg) const { + ensureValid(); + const size_t reg_size = reg.getBit() / 8; + if (reg_size > allocation_->size) { + IE_THROW() << "reg size is bigger than space allocated in StackAllocator !!"; + } + } + + void ensureValid() const { + if (!isInitialized()) { + IE_THROW() << "StackAllocator::Address is either not initialized or released !!"; + } + } + + x64::jit_generator& generator() const { + return stack_allocator_.code_generator; + } + + Transaction* transaction_{}; + StackAllocator& stack_allocator_; + Allocation::Ptr allocation_; +}; + +template +class StackAllocator::Reg : public StackAllocator::Address { +public: + static_assert(std::is_base_of::value, "TReg should be a Xbyak::Reg based !!"); + + Reg(StackAllocator::Transaction& transaction) + : Address{transaction, TReg{}.getBit() / 8, getAlignment()} { + } + + Reg(StackAllocator& stack_allocator) + : Address{stack_allocator, TReg{}.getBit() 
/ 8, getAlignment()} { + } + + Reg& operator=(const Xbyak::Xmm& vmm) override { + Address::operator=(vmm); + return *this; + } + + Reg& operator=(const Xbyak::Reg& reg) override { + Address::operator=(reg); + return *this; + } + +private: + static size_t getAlignment() { + if (std::is_same::value) { + return x64::cpu_isa_traits::vlen; + } else if (std::is_same::value) { + return x64::cpu_isa_traits::vlen; + } else if (std::is_same::value) { + return x64::cpu_isa_traits::vlen; + } else { + return 1; + } + } +}; + +inline +void stack_mov(StackAllocator::Address& addr, const Xbyak::Xmm& vmm) { + addr.ensureSize(vmm); + x64::jit_generator& generator = addr.generator(); + if (vmm.isXMM()) { + generator.uni_vmovdqu(addr.allocation_->address, Xbyak::Xmm{vmm.getIdx()}); + } else if (vmm.isYMM()) { + generator.uni_vmovdqu(addr.allocation_->address, Xbyak::Ymm{vmm.getIdx()}); + } else if (vmm.isZMM()) { + generator.uni_vmovdqu(addr.allocation_->address, Xbyak::Zmm{vmm.getIdx()}); + } else { + IE_THROW() << "Unknown simd register !!"; + } +} + +inline +void stack_mov(StackAllocator::Address& addr, const Xbyak::Reg& reg) { + addr.ensureSize(reg); + x64::jit_generator& generator = addr.generator(); + if (reg.isREG(8)) { + generator.mov(addr.allocation_->address, Xbyak::Reg8{reg.getIdx()}); + } else if (reg.isREG(16)) { + generator.mov(addr.allocation_->address, Xbyak::Reg16{reg.getIdx()}); + } else if (reg.isREG(32)) { + generator.mov(addr.allocation_->address, Xbyak::Reg32{reg.getIdx()}); + } else if (reg.isREG(64)) { + generator.mov(addr.allocation_->address, Xbyak::Reg64{reg.getIdx()}); + } else { + IE_THROW() << "Unknown general purpose register !!"; + } +} + +inline +void stack_mov(const Xbyak::Xmm& vmm, const StackAllocator::Address& addr) { + addr.ensureSize(vmm); + x64::jit_generator& generator = addr.generator(); + if (vmm.isXMM()) { + generator.uni_vmovdqu(Xbyak::Xmm{vmm.getIdx()}, addr.allocation_->address); + } else if (vmm.isYMM()) { + 
generator.uni_vmovdqu(Xbyak::Ymm{vmm.getIdx()}, addr.allocation_->address); + } else if (vmm.isZMM()) { + generator.uni_vmovdqu(Xbyak::Zmm{vmm.getIdx()}, addr.allocation_->address); + } else { + IE_THROW() << "Unknown simd register !!"; + } +} + +inline +void stack_mov(const Xbyak::Reg& reg, const StackAllocator::Address& addr) { + addr.ensureSize(reg); + x64::jit_generator& generator = addr.generator(); + if (reg.isREG(8)) { + generator.mov(Xbyak::Reg8{reg.getIdx()}, addr.allocation_->address); + } else if (reg.isREG(16)) { + generator.mov(Xbyak::Reg16{reg.getIdx()}, addr.allocation_->address); + } else if (reg.isREG(32)) { + generator.mov(Xbyak::Reg32{reg.getIdx()}, addr.allocation_->address); + } else if (reg.isREG(64)) { + generator.mov(Xbyak::Reg64{reg.getIdx()}, addr.allocation_->address); + } else { + IE_THROW() << "Unknown general purpose register !!"; + } +} + +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/tests/unit/nodes/kernels/stack_allocator.cpp b/src/plugins/intel_cpu/tests/unit/nodes/kernels/stack_allocator.cpp new file mode 100644 index 00000000000000..e4963ddef9a534 --- /dev/null +++ b/src/plugins/intel_cpu/tests/unit/nodes/kernels/stack_allocator.cpp @@ -0,0 +1,501 @@ +// Copyright (C) 2022 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#if __GNUC__ >= 5 +// Disable -Wsuggest-override warnings in gtest +#pragma GCC diagnostic ignored "-Wsuggest-override" +#endif +#include + +#include +#include +#include + +#include + +using namespace ov::intel_cpu; + +constexpr int x64::cpu_isa_traits::vlen; +constexpr int x64::cpu_isa_traits::vlen; +constexpr int x64::cpu_isa_traits::vlen; + +class StackAllocatorTest : public ::testing::Test, public x64::jit_generator { +protected: + DECLARE_CPU_JIT_AUX_FUNCTIONS(StackAllocatorTest) + + StackAllocatorTest() + : StackAllocatorTest(x64::isa_all) { + } + + explicit StackAllocatorTest(x64::cpu_isa_t max_cpu_isa) + : x64::jit_generator(jit_name(), nullptr, 256 * 
1024, true, max_cpu_isa) {
+    }
+
+    void SetUp() override {
+    }
+
+    void TearDown() override {
+    }
+
+    template  // NOTE(review): template parameter list (presumably declaring F) is missing here - confirm
+    F create_kernel() {  // builds the JIT kernel and returns its entry point cast to F; throws on failure
+        const dnnl::impl::status_t code = jit_generator::create_kernel();
+        if (code != dnnl::impl::status::success) {
+            IE_THROW() << "Could not create kernel. Error code: " << std::to_string(code) << ". "
+                       << "Xbyak error code: " << Xbyak::ConvertErrorToString(Xbyak::GetError());
+        }
+        return reinterpret_cast(jit_ker());
+    }
+
+    void generate() override {  // preamble, allocator setup, test-specific body, allocator teardown, postamble
+        this->preamble();
+        stack_allocator_ = std::make_shared(*this);
+        kernel_();
+        stack_allocator_->release();
+        stack_allocator_.reset();  // runs ~StackAllocator so its epilogue is emitted before postamble()
+        this->postamble();
+    }
+
+    StackAllocator& stack_allocator() {
+        return *stack_allocator_;
+    }
+
+    std::function kernel_;  // code-emitting body supplied by each test, invoked from generate()
+    std::shared_ptr stack_allocator_;  // alive only while generate() runs
+};
+
+TEST_F(StackAllocatorTest, Address_Value_Equal) {  // value written to a 4-byte slot compares equal when read back
+    kernel_ = [this]() {
+        Xbyak::Label l_equal;
+        StackAllocator::Address reg_100_addr{stack_allocator(), sizeof(int32_t)};
+        mov(rbx.cvt32(), 100);
+        stack_mov(reg_100_addr, rbx.cvt32());
+        mov(rax, 1);
+        cmp(rbx.cvt32(), reg_100_addr);
+        je(l_equal);
+        mov(rax, 0);
+        L(l_equal);
+    };
+    auto f = create_kernel();
+    int r = f();
+    EXPECT_EQ(r, 1);
+}
+
+TEST_F(StackAllocatorTest, Reg32_Value_Equal) {  // same check through the StackAllocator::Reg wrapper
+    kernel_ = [this]() {
+        Xbyak::Label l_equal;
+        StackAllocator::Reg reg_100_addr{stack_allocator()};
+        mov(rbx.cvt32(), 100);
+        stack_mov(reg_100_addr, rbx.cvt32());
+        mov(rax, 1);
+        cmp(rbx.cvt32(), reg_100_addr);
+        je(l_equal);
+        mov(rax, 0);
+        L(l_equal);
+    };
+    auto f = create_kernel();
+    int r = f();
+    EXPECT_EQ(r, 1);
+}
+
+TEST_F(StackAllocatorTest, Address_Value_NotEqual) {  // slot keeps 100 after rbx is overwritten with 201, so rax ends as 0
+    kernel_ = [this]() {
+        Xbyak::Label l_equal;
+        StackAllocator::Address reg_100_addr{stack_allocator(), sizeof(int32_t)};
+        mov(rbx.cvt32(), 100);
+        stack_mov(reg_100_addr, rbx.cvt32());
+        mov(rcx.cvt32(), reg_100_addr);
+        mov(rbx.cvt32(), 201);
+        mov(rax, 1);
+        cmp(rbx.cvt32(), rcx.cvt32());
+        je(l_equal);
+        mov(rax, 0);
+        L(l_equal);
+    };
+    auto f = create_kernel();
+    int r = f();
+    EXPECT_EQ(r, 0);
+}
+
+TEST_F(StackAllocatorTest, Reg32_Value_NotEqual) {  // Reg-wrapper variant of Address_Value_NotEqual
+    kernel_ = [this]() {
+        Xbyak::Label l_equal;
+        StackAllocator::Reg reg_100_addr{stack_allocator()};
+        mov(rbx.cvt32(), 100);
+        stack_mov(reg_100_addr, rbx.cvt32());
+        mov(rcx.cvt32(), reg_100_addr);
+        mov(rbx.cvt32(), 201);
+        mov(rax, 1);
+        cmp(rbx.cvt32(), rcx.cvt32());
+        je(l_equal);
+        mov(rax, 0);
+        L(l_equal);
+    };
+    auto f = create_kernel();
+    int r = f();
+    EXPECT_EQ(r, 0);
+}
+
+TEST_F(StackAllocatorTest, Address_PtrCheck_Success) {  // first 4-byte slot is expected at [rbp - 4]
+    kernel_ = [this]() {
+        Xbyak::Label l_equal;
+        StackAllocator::Address dword_addr{stack_allocator(), sizeof(int32_t)};
+        EXPECT_EQ(static_cast(dword_addr), ptr[rbp - sizeof(int32_t)]);
+    };
+    create_kernel();
+}
+
+TEST_F(StackAllocatorTest, Loop_Success) {  // slot is written and read inside a 10-iteration loop
+    kernel_ = [this]() {
+        Xbyak::Label l_equal;
+        Xbyak::Label l_loop;
+        Xbyak::Label l_end;
+        xor_(rcx, rcx);
+        StackAllocator::Address reg_100_addr{stack_allocator(), sizeof(int32_t)};
+        L(l_loop);
+        {
+            cmp(rcx, 10);
+            je(l_end);
+            mov(rbx.cvt32(), 100);
+            stack_mov(reg_100_addr, rbx.cvt32());
+            mov(rdx.cvt32(), reg_100_addr);
+            mov(rbx.cvt32(), 201);
+            mov(rax, 1);
+            add(rcx, 1);
+            cmp(rbx.cvt32(), rdx.cvt32());
+            jne(l_equal);
+            {
+                mov(rax, 0);
+                jmp(l_loop);
+            }
+            L(l_equal);
+            {
+                jmp(l_loop);
+            }
+        }
+        L(l_end);
+        reg_100_addr.release();
+    };
+    auto f = create_kernel();
+    const int r = f();
+    EXPECT_EQ(r, 1);
+}
+
+TEST_F(StackAllocatorTest, Loop_Fail) {  // using a slot after release() must throw while the code is generated
+    kernel_ = [this]() {
+        Xbyak::Label l_equal;
+        Xbyak::Label l_loop;
+        Xbyak::Label l_end;
+        xor_(rcx, rcx);
+
+        StackAllocator::Transaction transaction{stack_allocator()};
+        StackAllocator::Address reg_temp0_addr{transaction, sizeof(float)};
+        StackAllocator::Address reg_temp1_addr{transaction, sizeof(int32_t)};
+        StackAllocator::Address reg_100_addr{transaction, sizeof(int32_t)};
+        StackAllocator::Address reg_200_addr{transaction, sizeof(int32_t)};
+        transaction.commit();
+        L(l_loop);
+        {
+            cmp(rcx, 10);
+            je(l_end);
+            mov(rbx.cvt32(), 100);
+            stack_mov(reg_100_addr, rbx.cvt32());
+            mov(rbx.cvt32(), 200);
+            reg_200_addr.release();
+            // NOTE: During implicit conversion to Xbyak::Address& will be thrown the exception
+            stack_mov(reg_200_addr, rbx.cvt32());
+            mov(rdx.cvt32(), reg_100_addr);
+            mov(rbx.cvt32(), 201);
+            cmp(rbx.cvt32(), rdx.cvt32());
+            mov(rax, 1);
+            add(rcx, 1);
+            jne(l_equal);
+            {
+                reg_temp1_addr.release();
+                mov(rax, 0);
+                jmp(l_loop);
+            }
+            L(l_equal);
+            {
+                jmp(l_loop);
+            }
+        }
+        L(l_end);
+    };
+    EXPECT_ANY_THROW(create_kernel());
+}
+
+TEST_F(StackAllocatorTest, Address_CheckAlignment_Fail) {  // presumably addps faults on the unaligned 16-byte slot - confirm
+    kernel_ = [this]() {
+        Xbyak::Label l_equal;
+        StackAllocator::Transaction transaction{stack_allocator()};
+        StackAllocator::Address byte0_addr{transaction, sizeof(int8_t)};
+        StackAllocator::Address xmm0_addr{transaction, 16};
+        transaction.commit();
+        addps(xmm0, xmm0_addr);
+    };
+    auto f = create_kernel();
+    ASSERT_DEATH({
+        f();
+    }, "");
+}
+
+template  // NOTE(review): template parameter list (presumably declaring T) is missing here - confirm
+class AlignedStackAllocatorTest : public StackAllocatorTest {  // fixture whose allocator is built with an explicit alignment
+public:
+    DECLARE_CPU_JIT_AUX_FUNCTIONS(AlignedStackAllocatorTest)
+
+    AlignedStackAllocatorTest()
+        : StackAllocatorTest(T::isa) {
+    }
+
+    void SetUp() override {
+    }
+
+    void TearDown() override {
+    }
+
+    void generate() override {  // same sequence as the base fixture, but passes a vlen value as allocator alignment
+        this->preamble();
+        stack_allocator_ = std::make_shared(*this, x64::cpu_isa_traits::vlen);
+        kernel_();
+        stack_allocator_->release();
+        stack_allocator_.reset();
+        this->postamble();
+    }
+};
+
+template
+struct StackAllocatorTestIsaParam { static constexpr x64::cpu_isa_t isa = Isa; };  // carries the ISA as a type for TYPED_TEST
+
+using StackAllocatorTestIsaParamTypes = ::testing::Types<
+                                        StackAllocatorTestIsaParam,
+                                        StackAllocatorTestIsaParam,
+                                        StackAllocatorTestIsaParam >;
+
+TYPED_TEST_SUITE(AlignedStackAllocatorTest, StackAllocatorTestIsaParamTypes);
+
+TYPED_TEST(AlignedStackAllocatorTest, Address_CheckAlignment_Success) {  // 16-byte aligned slot used as addps operand
+    this->kernel_ = [this]() {
+        Xbyak::Label l_equal;
+        StackAllocator::Transaction transaction{this->stack_allocator()};
+        StackAllocator::Address byte0_addr{transaction, sizeof(int8_t)};
+        StackAllocator::Address xmm0_addr{transaction, 16, 16};
+        transaction.commit();
+        this->addps(this->xmm0, xmm0_addr);
+    };
+    auto f = this->template create_kernel();
+    f();
+}
+
+TYPED_TEST(AlignedStackAllocatorTest, Reg_CheckAlignment_Success) {  // Reg wrapper picks its own alignment
+    this->kernel_ = [this]() {
+        Xbyak::Label l_equal;
+        StackAllocator::Address byte0_addr{this->stack_allocator(), sizeof(int8_t)};
+        StackAllocator::Reg xmm0_addr{this->stack_allocator()};
+        this->addps(this->xmm0, xmm0_addr);
+    };
+    auto f = this->template create_kernel();
+    f();
+}
+
+TYPED_TEST(AlignedStackAllocatorTest, Address_Reuse_Success) {  // a new Reg is requested after xmm0_addr was released
+    this->kernel_ = [this]() {
+        Xbyak::Label l_equal;
+        StackAllocator::Transaction transaction{this->stack_allocator()};
+        StackAllocator::Address byte0_addr{transaction, sizeof(int8_t)};
+        StackAllocator::Reg xmm0_addr{transaction};
+        StackAllocator::Address byte1_addr{transaction, sizeof(int8_t)};
+        transaction.commit();
+        this->addps(this->xmm0, xmm0_addr);
+        xmm0_addr.release();
+        StackAllocator::Reg xmm1_addr{transaction};
+        transaction.commit();
+        this->addps(this->xmm0, xmm1_addr);
+    };
+    auto f = this->template create_kernel();
+    f();
+}
+
+TYPED_TEST(AlignedStackAllocatorTest, Transaction_CheckAllocation_Fail) {  // direct allocation while a transaction is open must throw
+    this->kernel_ = [this]() {
+        Xbyak::Label l_equal;
+        StackAllocator::Transaction transaction{this->stack_allocator()};
+        StackAllocator::Address byte0_addr{transaction, sizeof(int8_t)};
+        StackAllocator::Reg xmm0_addr{transaction};
+        StackAllocator::Address byte1_addr{transaction, sizeof(int8_t)};
+        StackAllocator::Reg xmm1_addr{this->stack_allocator()};
+        transaction.commit();
+        this->addps(this->xmm0, xmm0_addr);
+        xmm0_addr.release();
+        StackAllocator::Reg xmm2_addr{transaction};
+        transaction.commit();
+        this->addps(this->xmm0, xmm2_addr);
+    };
+
+    EXPECT_ANY_THROW(this->template create_kernel());
+}
+
+TYPED_TEST(AlignedStackAllocatorTest, Transaction_UseAddressBeforeCommit_Fail) {  // slot is used before commit(), which must throw
+    this->kernel_ = [this]() {
+        Xbyak::Label l_equal;
+        StackAllocator::Transaction transaction{this->stack_allocator()};
+        StackAllocator::Address byte0_addr{transaction, sizeof(int8_t)};
+        StackAllocator::Reg xmm0_addr{transaction};
+        StackAllocator::Address byte1_addr{transaction, sizeof(int8_t)};
+        this->addps(this->xmm0, xmm0_addr);
+        transaction.commit();
+        xmm0_addr.release();
+        StackAllocator::Reg xmm1_addr{transaction};
+        transaction.commit();
+        this->addps(this->xmm0, xmm1_addr);
+    };
+
+    EXPECT_ANY_THROW(this->template create_kernel());
+}
+
+TYPED_TEST(AlignedStackAllocatorTest, Transaction_UseAddressAfterCommit_Success) {  // releases go through the transaction, uses follow commit()
+    this->kernel_ = [this]() {
+        Xbyak::Label l_equal;
+        StackAllocator::Transaction transaction{this->stack_allocator()};
+        StackAllocator::Address byte0_addr{transaction, sizeof(int8_t)};
+        StackAllocator::Reg xmm0_addr{transaction};
+        StackAllocator::Address byte1_addr{transaction, sizeof(int8_t)};
+        transaction.commit();
+        this->addps(this->xmm0, xmm0_addr);
+        xmm0_addr.release(transaction);
+        byte1_addr.release(transaction);
+        StackAllocator::Reg xmm1_addr{transaction};
+        transaction.commit();
+        this->addps(this->xmm0, xmm1_addr);
+    };
+
+    auto f = this->template create_kernel();
+    f();
+}
+
+TYPED_TEST(AlignedStackAllocatorTest, Transaction_ReleaseAddressBeforeCommit_Fail) {  // plain release() inside an open transaction must throw
+    this->kernel_ = [this]() {
+        Xbyak::Label l_equal;
+        StackAllocator::Transaction transaction{this->stack_allocator()};
+        StackAllocator::Address byte0_addr{transaction, sizeof(int8_t)};
+        StackAllocator::Reg xmm0_addr{transaction};
+        StackAllocator::Address byte1_addr{transaction, sizeof(int8_t)};
+        transaction.commit();
+        this->uni_vaddps(this->xmm0, this->xmm0, xmm0_addr);
+        xmm0_addr.release(transaction);
+        byte1_addr.release();
+        StackAllocator::Reg xmm1_addr{transaction};
+        transaction.commit();
+        this->addps(this->xmm0, xmm1_addr);
+    };
+
+    EXPECT_ANY_THROW(this->template create_kernel());
+}
+
+TYPED_TEST(AlignedStackAllocatorTest, Xmm_Value_Equal) {  // 128-bit value survives a round trip through a stack slot
+    static const uint32_t data[4] = {1024, 2135, 3246, 4357};
+    this->kernel_ = [this]() {
+        Xbyak::Label l_not_equal;
+        Xbyak::Xmm vmm0{0};
+        Xbyak::Xmm vmm1{1};
+        StackAllocator::Reg value_on_stack{this->stack_allocator()};
+        this->mov(this->rbx, reinterpret_cast(data));
+        this->uni_vmovups(vmm0, this->ptr[this->rbx]);
+        value_on_stack = vmm0;
+        this->uni_vpxor(vmm1, vmm1, vmm1);
+        this->uni_vmovups(vmm1, value_on_stack);
+        this->mov(this->rax, 0);
+        this->uni_vpcmpeqd(vmm0, vmm0, vmm1);
+        this->uni_vtestps(vmm0, vmm0);
+        this->jz(l_not_equal);
+        this->mov(this->rax, 1);
+        this->L(l_not_equal);
+    };
+    auto f = this->template create_kernel();
+    int r = f();
+    EXPECT_EQ(r, 1);
+}
+
+TYPED_TEST(AlignedStackAllocatorTest, Xmm_Value_NotEqual) {  // vmm0 is zeroed before the compare, so rax stays 0
+    static const uint32_t data[4] = {1024, 2135, 3246, 4357};
+    this->kernel_ = [this]() {
+        Xbyak::Label l_not_equal;
+        Xbyak::Xmm vmm0{0};
+        Xbyak::Xmm vmm1{1};
+        StackAllocator::Reg value_on_stack{this->stack_allocator()};
+        this->mov(this->rbx, reinterpret_cast(data));
+        this->uni_vmovups(vmm0, this->ptr[this->rbx]);
+        value_on_stack = vmm0;
+        this->uni_vpxor(vmm1, vmm1, vmm1);
+        this->uni_vmovups(vmm1, value_on_stack);
+        this->uni_vpxor(vmm0, vmm0, vmm0);
+        this->mov(this->rax, 0);
+        this->uni_vpcmpeqd(vmm0, vmm0, vmm1);
+        this->uni_vtestps(vmm0, vmm0);
+        this->jz(l_not_equal);
+        this->mov(this->rax, 1);
+        this->L(l_not_equal);
+    };
+    auto f = this->template create_kernel();
+    int r = f();
+    EXPECT_EQ(r, 0);
+}
+
+TYPED_TEST(AlignedStackAllocatorTest, Ymm_Value_Equal) {  // 256-bit round trip; runs only for the avx2 parameter
+    if (TypeParam::isa != x64::avx2) {
+        GTEST_SKIP() << "Skipping test for isa = " << static_cast(TypeParam::isa);
+    }
+    static const uint32_t data[8] = {1024, 2135, 3246, 4357,
+                                     2124, 3235, 4346, 5457};
+    this->kernel_ = [this]() {
+        Xbyak::Label l_not_equal;
+        Xbyak::Ymm vmm0{0};
+        Xbyak::Ymm vmm1{1};
+        StackAllocator::Reg value_on_stack{this->stack_allocator()};
+        this->mov(this->rbx, reinterpret_cast(data));
+        this->uni_vmovups(vmm0, this->ptr[this->rbx]);
+        value_on_stack = vmm0;
+        this->uni_vpxor(vmm1, vmm1, vmm1);
+        this->uni_vmovups(vmm1, value_on_stack);
+        this->mov(this->rax, 0);
+        this->uni_vpcmpeqd(vmm0, vmm0, vmm1);
+        this->uni_vtestps(vmm0, vmm0);
+        this->jz(l_not_equal);
+        this->mov(this->rax, 1);
+        this->L(l_not_equal);
+    };
+    auto f = this->template create_kernel();
+    int r = f();
+    EXPECT_EQ(r, 1);
+}
+
+TYPED_TEST(AlignedStackAllocatorTest, Ymm_Value_NotEqual) {  // 256-bit variant of Xmm_Value_NotEqual; avx2 only
+    if (TypeParam::isa != x64::avx2) {
+        GTEST_SKIP() << "Skipping test for isa = " << static_cast(TypeParam::isa);
+    }
+    static const uint32_t data[8] = {1024, 2135, 3246, 4357,
+                                     2124, 3235, 4346, 5457};
+    this->kernel_ = [this]() {
+        Xbyak::Label l_not_equal;
+        Xbyak::Ymm vmm0{0};
+        Xbyak::Ymm vmm1{1};
+        StackAllocator::Reg value_on_stack{this->stack_allocator()};
+        this->mov(this->rbx, reinterpret_cast(data));
+        this->uni_vmovups(vmm0, this->ptr[this->rbx]);
+        value_on_stack = vmm0;
+        this->uni_vpxor(vmm1, vmm1, vmm1);
+        this->uni_vmovups(vmm1, value_on_stack);
+        this->uni_vpxor(vmm0, vmm0, vmm0);
+        this->mov(this->rax, 0);
+        this->uni_vpcmpeqd(vmm0, vmm0, vmm1);
+        this->uni_vtestps(vmm0, vmm0);
+        this->jz(l_not_equal);
+        this->mov(this->rax, 1);
+        this->L(l_not_equal);
+    };
+    auto f = this->template create_kernel();
+    int r = f();
+    EXPECT_EQ(r, 0);
+}